gistgen 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/Gemfile +4 -0
- data/Rakefile +2 -0
- data/gistgen.gemspec +24 -0
- data/lib/gistgen.rb +8 -0
- data/lib/gistgen/crunch_view.rb +44 -0
- data/lib/gistgen/hacker_news.rb +61 -0
- data/lib/gistgen/page.rb +18 -0
- data/lib/gistgen/reddit.rb +48 -0
- data/lib/gistgen/string.rb +39 -0
- data/lib/gistgen/url.rb +32 -0
- data/lib/gistgen/version.rb +3 -0
- data/lib/gistgen/wiki_abstract.rb +34 -0
- data/tests/crunch_view_test.rb +5 -0
- data/tests/hubs_test.rb +8 -0
- data/tests/wiki_test.rb +3 -0
- metadata +102 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
data/gistgen.gemspec
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
|
3
|
+
require "gistgen/version"
|
|
4
|
+
|
|
5
|
+
# Gem specification for gistgen.
#
# The file lists below are read from git, so the gem has to be built from
# inside a git checkout of the project.
Gem::Specification.new do |s|
  s.name        = "gistgen"
  s.version     = Gistgen::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Quan Nguyen"]
  s.email       = ["mquannie@gmail.com"]
  s.homepage    = "http://github.com/mquan/gistgen"
  s.summary     = %q{generate different types of summaries for a text}
  s.description = %q{gistgen has several modules to generate summaries from wikipedia and crunchbase}

  s.rubyforge_project = "gistgen"

  s.files         = `git ls-files`.split("\n")
  # This project keeps its test scripts under `tests/`, which the previous
  # `{test,spec,features}/*` pattern never matched. Plain directory pathspecs
  # are used instead of a brace pattern because brace expansion is a shell
  # feature that not every /bin/sh provides.
  s.test_files    = `git ls-files -- test tests spec features`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "mechanize"
  s.add_dependency "json"
  s.add_dependency "htmlentities"
end
|
data/lib/gistgen.rb
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
require 'json'
|
|
2
|
+
require 'htmlentities'
|
|
3
|
+
|
|
4
|
+
require "gistgen/page"
|
|
5
|
+
require "gistgen/string"
|
|
6
|
+
require "gistgen/url"
|
|
7
|
+
|
|
8
|
+
module Gistgen
  # Read-only view over one CrunchBase company record.
  #
  # The record is fetched once, when the object is built. Every reader
  # returns nil when the record could not be loaded or when the requested
  # field is missing.
  class CrunchView
    # Fetches and parses the company record for +name+.
    #
    # @param name [String] company identifier, interpolated as-is into the API URL
    def initialize(name)
      @json = nil
      body = Gistgen::Page.get_page("http://api.crunchbase.com/v/1/company/#{name}.js")
      return if body.nil?

      record = JSON.parse(body)
      # Validate before storing: only a Hash carrying an 'overview' entry is
      # kept, so the readers never see an Array or a partially checked payload.
      @json = record if record.is_a?(Hash) && record['overview']
    rescue StandardError
      # Best effort: a failed request or unparsable body leaves the view empty.
      @json = nil
    end

    # Plain-text company overview, trimmed to at most +length+ characters.
    #
    # @param length [Integer] maximum size of the returned passage
    # @return [String, nil] the passage, or nil when no overview is available
    def overview(length = 500)
      raw = @json && @json['overview']
      return nil unless raw.is_a?(String)

      # Strip markup in both forms it can arrive in: still-escaped
      # "\u003C...\u003E" sequences and real "<...>" tags; then drop newlines.
      text = raw.gsub(/\\u003C.*?\\u003E/i, '').gsub(/<.*?>/, '').gsub("\n", '')
      HTMLEntities.new.decode(text).extract_passage(0, length)
    rescue StandardError
      nil
    end

    # @return [String, nil] standardized CrunchBase URL of the company, or nil
    #   when the record has no permalink
    def permalink
      slug = @json && @json['permalink']
      return nil if slug.nil? || slug.to_s.empty?

      Gistgen::URL.standardize("http://www.crunchbase.com/company/#{slug}")
    end

    # @return [String, nil] standardized company homepage, or nil when the
    #   record has no homepage URL
    def homepage
      url = @json && @json['homepage_url']
      return nil if url.nil? || url.to_s.empty?

      Gistgen::URL.standardize(url)
    end
  end
end
|
|
44
|
+
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
require 'json'
|
|
2
|
+
|
|
3
|
+
require 'gistgen/page'
|
|
4
|
+
require 'gistgen/url'
|
|
5
|
+
|
|
6
|
+
module Gistgen
  # Client for the unofficial Hacker News API (http://api.ihackernews.com/).
  class HackerNews
    API_ROOT = "http://api.ihackernews.com".freeze

    # Length in seconds of each time unit accepted by parse_time. A month is
    # counted as 30 days and a year as 365.25 days.
    SECONDS_PER_UNIT = {
      'second' => 1,
      'minute' => 60,
      'hour'   => 3_600,
      'day'    => 86_400,
      'week'   => 604_800,
      'month'  => 2_592_000,
      'year'   => 31_557_600
    }.freeze

    # @return [Array<Hash>, nil] the front-page posts, or nil when the request
    #   or the parsing fails
    def self.frontpage
      fetch_listing("page")
    end

    # Newest posts. Don't use this too often (low score shouldn't be added).
    #
    # @return [Array<Hash>, nil] the newest posts, or nil on failure
    def self.new_posts
      fetch_listing("new")
    end

    # Looks up the current score of a post.
    #
    # @param hn_url [String] discussion URL ending in the numeric post id
    # @return [Integer, nil] the 'points' value reported by the API, or nil
    #   when the URL carries no trailing id or the lookup fails
    def self.get_score(hn_url)
      id = hn_url.to_s.strip[/\d+\z/]
      return nil if id.nil?

      JSON.parse(Gistgen::Page.get_page("#{API_ROOT}/post/#{id}"))['points']
    rescue StandardError
      nil
    end

    # Converts an API listing response into an array of post hashes with the
    # keys "title", "url", "score", "time" and "discussion_url".
    #
    # @param res [String] raw JSON body of a listing response
    # @return [Array<Hash>]
    # @raise [StandardError] when +res+ is not a valid listing; frontpage and
    #   new_posts turn that into nil
    def self.get_hash(res)
      JSON.parse(res)['items'].map do |item|
        {
          "title"          => item['title'],
          "url"            => Gistgen::URL.standardize(item['url']),
          "score"          => item['points'],
          "time"           => parse_time(item['postedAgo']),
          "discussion_url" => "http://news.ycombinator.com/item?id=#{item['id']}"
        }
      end
    end

    # Turns a relative timestamp such as "3 hours ago" into an absolute time.
    #
    # The unit is looked up in a fixed table rather than dispatched with
    # `send`: the text comes from a remote service, and `send` would invoke
    # whatever method it names, private ones included.
    #
    # @param time_ago [String] "<count> <unit> ago"
    # @return [Time] UTC time that far in the past, or the current UTC time
    #   when the text cannot be interpreted
    def self.parse_time(time_ago)
      count, unit = time_ago.to_s.split(' ')
      seconds = SECONDS_PER_UNIT[unit.to_s.downcase.sub(/s\z/, '')]
      return Time.now.utc unless seconds && count.to_s =~ /\A\d+\z/

      Time.now.utc - (count.to_i * seconds)
    end

    # Fetches one listing endpoint and converts it, returning nil on any failure.
    def self.fetch_listing(path)
      get_hash(Gistgen::Page.get_page("#{API_ROOT}/#{path}"))
    rescue StandardError
      nil
    end
    private_class_method :fetch_listing
  end
end
|
data/lib/gistgen/page.rb
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
require 'mechanize'
|
|
2
|
+
|
|
3
|
+
module Gistgen
  # Thin HTTP helper built on Mechanize.
  class Page
    # Performs a GET request and hands back the response body.
    #
    # @param url [String] address to fetch
    # @param user_agent [String] value sent as the User-Agent header
    # @return [String, nil] body of the fetched page, or nil when anything
    #   goes wrong while fetching it
    def self.get_page(url, user_agent = 'gistgen gem request')
      browser = Mechanize.new
      browser.user_agent = user_agent
      browser.get(url)
      browser.page.body
    rescue
      # Callers treat nil as "page unavailable".
      nil
    end
  end
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
require 'json'
|
|
2
|
+
|
|
3
|
+
require 'gistgen/page'
|
|
4
|
+
require 'gistgen/url'
|
|
5
|
+
|
|
6
|
+
module Gistgen
  # Client for reddit's JSON listings (http://code.reddit.com/wiki/API).
  #
  # Any reddit page has a data-only form reachable by appending ".json" to its
  # address, e.g. http://www.reddit.com/.json
  class Reddit
    # Fetches the listing behind +url+.
    #
    # @param url [String] address of a reddit listing page
    # @return [Array<Hash>, nil] the posts, or nil when the request or the
    #   parsing fails
    def self.fetch(url)
      get_hash(Gistgen::Page.get_page(json_endpoint(url)))
    rescue StandardError
      nil
    end

    # Reads the score of the post behind +reddit_url+.
    #
    # The first "score" field is taken straight from the raw response text
    # (reddit nested comments is too deep for json).
    #
    # @param reddit_url [String] address of a reddit comments page
    # @return [Integer, nil] the score, or nil when none could be read
    def self.get_score(reddit_url)
      body = Gistgen::Page.get_page(json_endpoint(reddit_url))
      score = body.to_s[/"score"\s*:\s*(-?\d+)/, 1]
      score && score.to_i
    rescue StandardError
      nil
    end

    # Converts a raw listing response into an array of post hashes with the
    # keys "title", "url", "score", "time" and "discussion_url".
    #
    # @param res [String] raw JSON body of a listing response
    # @return [Array<Hash>]
    # @raise [StandardError] when +res+ is not a valid listing; fetch turns
    #   that into nil
    def self.get_hash(res)
      JSON.parse(res)['data']['children'].map do |child|
        post = child['data']
        {
          "title"          => post['title'],
          "url"            => Gistgen::URL.standardize(post['url']),
          "score"          => post['score'],
          "time"           => Time.at(post['created_utc']),
          "discussion_url" => "http://reddit.com#{post['permalink']}"
        }
      end
    end

    # Address of the JSON form of a reddit page: one trailing slash is
    # dropped, then "/.json" is appended.
    def self.json_endpoint(url)
      "#{url.to_s.sub(/\/\z/, '')}/.json"
    end
    private_class_method :json_endpoint

    # ban digg: they link to their url shortener
    # http://developers.digg.com/documentation
    # require 'uri'
    # def self.get_diggs(url)
    #   res = Gistgen::Page.get_page(URI.escape(url)) # need to encode url
    #   json = JSON.parse(res)
    #   json['stories'][0]['diggs']
    # end
  end
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Text-summarising helpers added to String.
class String
  # Returns a passage of at most +length+ characters, built from whole
  # sentences starting at sentence number +start_index+.
  #
  # Sentences are re-joined with ". ", so the passage carries no final period.
  # Only the first line of the joined text is kept, leading non-word
  # characters are stripped, and #limit is applied as a last safeguard (it
  # matters when a single sentence is longer than +length+).
  #
  # @param start_index [Integer] index of the first sentence to include
  # @param length [Integer] maximum size of the passage
  # @return [String] the passage, or '' when there is no such sentence
  def extract_passage(start_index = 0, length = 500)
    sentences = split_sentences
    return '' if start_index >= sentences.size

    # First sentence index at which the joined text no longer fits.
    overflow = ((start_index + 1)...sentences.size).find do |i|
      sentences[start_index..i].join('. ').size > length
    end
    # Keep everything before the overflowing sentence, or everything when
    # the whole text fits. The range is inclusive so the last fitting
    # sentence is part of the passage.
    last = overflow ? overflow - 1 : sentences.size - 1

    first_line = sentences[start_index..last].join('. ').split("\n").first
    first_line ? first_line.sub(/\A[^\w]+/, '').limit(length) : ''
  end

  # Splits the text into sentences, taking into account that Dr./Mr./Ms./Mrs.
  # do not end a sentence.
  #
  # @return [Array<String>] sentences without their terminating period
  def split_sentences
    # Break the text by paragraph, then into chunks delimited by a period
    # that is followed by a non-word character. These are not quite
    # sentences yet: honorifics are still cut in two.
    chunks = split(/\n+/).map { |para| "#{para}\n".split(/\.(?:[^\w])/) }.flatten.compact

    sentences = []
    pending = '' # text held back because it ended in an honorific
    chunks.each do |chunk|
      candidate = pending.empty? ? chunk : "#{pending}. #{chunk}"
      # \b keeps words that merely end in these letters (e.g. "ATMs") from
      # being taken for an honorific.
      if chunk =~ /\b(?:Dr|Mr|Ms|Mrs)$/ # what about John F. Kennedy ([A-Z])
        pending = candidate
      else
        sentences << candidate
        pending = ''
      end
    end
    # Text that ends on an honorific has nothing to be merged with; keep it
    # rather than losing it.
    sentences << pending unless pending.empty?
    sentences
  end

  # Constrains the string to +length+ characters or fewer.
  #
  # When the string is too long it is cut at +length+, and everything from
  # the last punctuation mark (one of , : ; ) / \ |) that occurs before the
  # cut is discarded. The regexp uses a look-ahead to find that last mark.
  #
  # @param length [Integer] maximum size
  # @return [String] self when it already fits, otherwise the shortened copy
  def limit(length)
    return self unless self.length > length

    self[0...length].gsub(/(?![\s\S]+?[,:;)\/\\\|])([,:;)\/\\\|].*)/, '')
  end
end
|
data/lib/gistgen/url.rb
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
module Gistgen
  # URL normalisation and classification helpers.
  #
  # All patterns are anchored with \A and \z so that the whole string has to
  # match; with ^ and $ a multi-line string containing one acceptable line
  # would pass.
  class URL
    # Host name: alphanumeric labels separated by single "-" or "." characters
    # and ending in a 2-5 letter top-level domain.
    HOST = /[a-z0-9]+(?:[\-.][a-z0-9]+)*\.[a-z]{2,5}/i
    # Optional ":port" suffix.
    PORT = /(?::[0-9]{1,5})?/

    # scheme://host[:port] with nothing after it.
    BARE_HOST_URL = %r{\Ahttps?://#{HOST}#{PORT}\z}i
    # scheme://host[:port]/ exactly.
    ROOT_URL = %r{\Ahttps?://#{HOST}#{PORT}/\z}i
    # scheme://host, optionally followed by [:port]/anything.
    VALID_URL = %r{\Ahttps?://#{HOST}(?:#{PORT}/.*)?\z}i

    IMAGE_EXTENSION = /\.(?:jpg|jpeg|png|gif|tiff|raw|bmp|webp|ai|psd|svg)\z/i
    MULTIMEDIA_EXTENSION = /\.(?:js|css|mp3|swf|wmv|mov|doc|pdf|ppt|xls|xlsx|docx|eps|ps|ttf|xml)\z/i

    # Brings a URL into a canonical form:
    # * "http://" is prepended when no scheme is present,
    # * a leading "www." is removed while the scheme (http or https) is kept,
    # * a trailing "/" is added to a bare host, so that google.com and
    #   google.com/ are the same thing.
    #
    # @param url [String, nil]
    # @return [String, nil] the canonical URL, or nil for nil/empty input
    def self.standardize(url)
      return nil if url.nil? || url.to_s.empty?

      url = url.to_s
      # Only the text before the first "." is inspected, so a "://" that
      # appears later (for instance inside a query string) is not taken
      # for the scheme.
      has_scheme = url.split('.').first.to_s.include?('://')
      with_scheme = has_scheme ? url : "http://#{url}"

      # remove www subdomain if exist
      without_www = with_scheme.sub(%r{\A(https?)://www\.}i) { "#{$1.downcase}://" }

      without_www =~ BARE_HOST_URL ? "#{without_www}/" : without_www
    end

    # @return [MatchData, nil] truthy when +url+ is an http(s) URL on a
    #   well-formed host
    def self.is_valid?(url)
      url.to_s.match(VALID_URL)
    end

    # @return [MatchData, nil] truthy when +url+ ends in an image file extension
    def self.is_image?(url)
      url.to_s.match(IMAGE_EXTENSION)
    end

    # @return [MatchData, nil] truthy when +url+ ends in one of the listed
    #   script, stylesheet, media, font or document extensions
    def self.is_multimedia?(url)
      url.to_s.match(MULTIMEDIA_EXTENSION)
    end

    # @return [MatchData, nil] truthy when +url+ is a base url (nothing after
    #   the first "/")
    def self.is_root?(url)
      url.to_s.match(ROOT_URL)
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
require 'uri'
|
|
2
|
+
require 'json'
|
|
3
|
+
|
|
4
|
+
require "gistgen/page"
|
|
5
|
+
require "gistgen/string"
|
|
6
|
+
|
|
7
|
+
module Gistgen
  # Builds short plain-text abstracts from the lead section of English
  # Wikipedia articles.
  class WikiAbstract
    # @param name [String] article title
    # @return [String] address of the article on en.wikipedia.org
    def self.permalink(name)
      "http://en.wikipedia.org/wiki/#{name}"
    end

    # Fetches section 0 of the article titled +query+, strips the wiki markup
    # and returns a passage of at most +length+ characters.
    #
    # A "#REDIRECT [[Target]]" page is followed to its target with the same
    # +length+.
    #
    # @param query [String] article title
    # @param length [Integer] maximum size of the returned passage
    # @param max_redirects [Integer] how many redirects may still be followed;
    #   bounds the recursion should two pages redirect to each other
    # @return [String, nil] the abstract, or nil when the request fails, the
    #   response is not valid JSON, the page has no revision text, or the
    #   redirect limit is exhausted
    def self.search(query, length = 500, max_redirects = 5)
      # URI.escape no longer exists in Ruby 3, and it left "&" unescaped,
      # which would cut the title short inside the query string.
      q = ::URI.encode_www_form_component(query.to_s)
      res = Gistgen::Page.get_page("http://en.wikipedia.org/w/api.php?format=json&action=query&prop=revisions&titles=#{q}&rvprop=content&rvsection=0")
      return nil if res.nil?

      text = revision_text(JSON.parse(res))
      return nil if text.nil?

      if text =~ /#REDIRECT/i
        target = text[/\[\[(.*?)\]\]/, 1]
        return nil if target.nil? || max_redirects <= 0

        # The recursive call already returns a finished passage.
        return search(target, length, max_redirects - 1)
      end

      strip_markup(text).extract_passage(0, length)
    rescue JSON::ParserError
      nil
    end

    # Digs the wikitext of the first revision out of an API response. When
    # the response lists several pages the last one is used.
    #
    # @return [String, nil] the wikitext, or nil when the response does not
    #   have the expected shape (for instance for a missing page)
    def self.revision_text(json)
      pages = json.is_a?(Hash) && json['query'].is_a?(Hash) && json['query']['pages']
      return nil unless pages.is_a?(Hash) && !pages.empty?

      page = pages.values.last
      revisions = page.is_a?(Hash) ? page['revisions'] : nil
      return nil unless revisions.is_a?(Array) && revisions.first.is_a?(Hash)

      revisions.first['*'].to_s
    end
    private_class_method :revision_text

    # Reduces lead-section wikitext to plain text.
    def self.strip_markup(text)
      # The abstract starts at the first bold marker ('''); whatever precedes
      # it is skipped. Without a marker the whole text is used.
      lead = text[(text.index("'''") || 0)..-1].gsub(/^\s+/, '')
      plain = lead.gsub(/<ref>(.*?)<\/ref>/i, '')   # inline references
                  .gsub(/<small>(.*?)<\/small>/, '') # small print
                  .gsub(/<(.*?)>/, '')               # any remaining tag
                  .gsub(/\{\{(.*?)\}\}/, '')         # templates
                  .gsub(/\(stylized(.*?)\)/, '')     # "(stylized as ...)" notes
                  .gsub(/\[\[([^\]\]]*?)\|/, '')     # link target, keeping the label
      ["[", "]", "'''"].inject(plain) { |cleaned, token| cleaned.gsub(token, '') }
    end
    private_class_method :strip_markup
  end
end
|
data/tests/hubs_test.rb
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
require "#{File.dirname(__FILE__)}/../lib/gistgen"
|
|
2
|
+
|
|
3
|
+
# Manual smoke run: prints what each hub client returns. Nothing is asserted;
# the output has to be inspected by eye.
hacker_news = Gistgen::HackerNews
puts hacker_news.get_score('http://news.ycombinator.com/item?id=2458202')
puts hacker_news.frontpage
puts hacker_news.new_posts

reddit = Gistgen::Reddit
puts reddit.fetch("http://reddit.com/")
puts reddit.get_score('http://www.reddit.com/r/funny/comments/gu7jw/dictator/')
|
data/tests/wiki_test.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: gistgen
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
prerelease:
|
|
5
|
+
version: 0.1.1
|
|
6
|
+
platform: ruby
|
|
7
|
+
authors:
|
|
8
|
+
- Quan Nguyen
|
|
9
|
+
autorequire:
|
|
10
|
+
bindir: bin
|
|
11
|
+
cert_chain: []
|
|
12
|
+
|
|
13
|
+
date: 2011-05-01 00:00:00 Z
|
|
14
|
+
dependencies:
|
|
15
|
+
- !ruby/object:Gem::Dependency
|
|
16
|
+
name: mechanize
|
|
17
|
+
prerelease: false
|
|
18
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
|
19
|
+
none: false
|
|
20
|
+
requirements:
|
|
21
|
+
- - ">="
|
|
22
|
+
- !ruby/object:Gem::Version
|
|
23
|
+
version: "0"
|
|
24
|
+
type: :runtime
|
|
25
|
+
version_requirements: *id001
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: json
|
|
28
|
+
prerelease: false
|
|
29
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
|
30
|
+
none: false
|
|
31
|
+
requirements:
|
|
32
|
+
- - ">="
|
|
33
|
+
- !ruby/object:Gem::Version
|
|
34
|
+
version: "0"
|
|
35
|
+
type: :runtime
|
|
36
|
+
version_requirements: *id002
|
|
37
|
+
- !ruby/object:Gem::Dependency
|
|
38
|
+
name: htmlentities
|
|
39
|
+
prerelease: false
|
|
40
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
|
41
|
+
none: false
|
|
42
|
+
requirements:
|
|
43
|
+
- - ">="
|
|
44
|
+
- !ruby/object:Gem::Version
|
|
45
|
+
version: "0"
|
|
46
|
+
type: :runtime
|
|
47
|
+
version_requirements: *id003
|
|
48
|
+
description: gistgen has several modules to generate summaries from wikipedia and crunchbase
|
|
49
|
+
email:
|
|
50
|
+
- mquannie@gmail.com
|
|
51
|
+
executables: []
|
|
52
|
+
|
|
53
|
+
extensions: []
|
|
54
|
+
|
|
55
|
+
extra_rdoc_files: []
|
|
56
|
+
|
|
57
|
+
files:
|
|
58
|
+
- .gitignore
|
|
59
|
+
- Gemfile
|
|
60
|
+
- Rakefile
|
|
61
|
+
- gistgen.gemspec
|
|
62
|
+
- lib/gistgen.rb
|
|
63
|
+
- lib/gistgen/crunch_view.rb
|
|
64
|
+
- lib/gistgen/hacker_news.rb
|
|
65
|
+
- lib/gistgen/page.rb
|
|
66
|
+
- lib/gistgen/reddit.rb
|
|
67
|
+
- lib/gistgen/string.rb
|
|
68
|
+
- lib/gistgen/url.rb
|
|
69
|
+
- lib/gistgen/version.rb
|
|
70
|
+
- lib/gistgen/wiki_abstract.rb
|
|
71
|
+
- tests/crunch_view_test.rb
|
|
72
|
+
- tests/hubs_test.rb
|
|
73
|
+
- tests/wiki_test.rb
|
|
74
|
+
homepage: http://github.com/mquan/gistgen
|
|
75
|
+
licenses: []
|
|
76
|
+
|
|
77
|
+
post_install_message:
|
|
78
|
+
rdoc_options: []
|
|
79
|
+
|
|
80
|
+
require_paths:
|
|
81
|
+
- lib
|
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
83
|
+
none: false
|
|
84
|
+
requirements:
|
|
85
|
+
- - ">="
|
|
86
|
+
- !ruby/object:Gem::Version
|
|
87
|
+
version: "0"
|
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
89
|
+
none: false
|
|
90
|
+
requirements:
|
|
91
|
+
- - ">="
|
|
92
|
+
- !ruby/object:Gem::Version
|
|
93
|
+
version: "0"
|
|
94
|
+
requirements: []
|
|
95
|
+
|
|
96
|
+
rubyforge_project: gistgen
|
|
97
|
+
rubygems_version: 1.7.2
|
|
98
|
+
signing_key:
|
|
99
|
+
specification_version: 3
|
|
100
|
+
summary: generate different types of summaries for a text
|
|
101
|
+
test_files: []
|
|
102
|
+
|