gistgen 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/Gemfile +4 -0
- data/Rakefile +2 -0
- data/gistgen.gemspec +24 -0
- data/lib/gistgen.rb +8 -0
- data/lib/gistgen/crunch_view.rb +44 -0
- data/lib/gistgen/hacker_news.rb +61 -0
- data/lib/gistgen/page.rb +18 -0
- data/lib/gistgen/reddit.rb +48 -0
- data/lib/gistgen/string.rb +39 -0
- data/lib/gistgen/url.rb +32 -0
- data/lib/gistgen/version.rb +3 -0
- data/lib/gistgen/wiki_abstract.rb +34 -0
- data/tests/crunch_view_test.rb +5 -0
- data/tests/hubs_test.rb +8 -0
- data/tests/wiki_test.rb +3 -0
- metadata +102 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
data/gistgen.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "gistgen/version"
|
4
|
+
|
5
|
+
# Gem specification for gistgen. File lists are taken from the git index, so
# the gem must be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "gistgen"
  s.version     = Gistgen::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Quan Nguyen"]
  s.email       = ["mquannie@gmail.com"]
  s.homepage    = "http://github.com/mquan/gistgen"
  s.summary     = %q{generate different types of summaries for a text}
  s.description = %q{gistgen has several modules to generate summaries from wikipedia and crunchbase}

  s.rubyforge_project = "gistgen"

  s.files         = `git ls-files`.split("\n")
  # This project keeps its scripts under tests/, which the previous
  # {test,spec,features} pattern never matched (test_files ended up empty).
  # Plain directory pathspecs are used instead of a brace pattern so the
  # result does not depend on the shell performing brace expansion.
  s.test_files    = `git ls-files -- test tests spec features`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "mechanize"
  s.add_dependency "json"
  s.add_dependency "htmlentities"
end
|
data/lib/gistgen.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'htmlentities'
|
3
|
+
|
4
|
+
require "gistgen/page"
|
5
|
+
require "gistgen/string"
|
6
|
+
require "gistgen/url"
|
7
|
+
|
8
|
+
module Gistgen
  # Read-only view over a company profile fetched from the CrunchBase v1 JSON
  # API. Every accessor returns nil when the profile could not be loaded or
  # the requested field is unavailable.
  class CrunchView
    # Fetches and parses the profile for +name+ (the CrunchBase company slug).
    # The profile is kept only when it is a JSON object with an 'overview'
    # entry; any fetch or parse failure leaves the view empty.
    def initialize(name)
      res = Gistgen::Page.get_page("http://api.crunchbase.com/v/1/company/#{name}.js")
      parsed = JSON.parse(res)
      @json = (parsed.is_a?(Hash) && parsed['overview']) ? parsed : nil
    rescue StandardError
      @json = nil
    end

    # Returns the company overview as plain text (markup stripped, HTML
    # entities decoded), cut down to at most +length+ characters by
    # String#extract_passage. Returns nil on any failure.
    def overview(length=500)
      return nil unless @json
      text = @json['overview'].gsub(/\u003C(.*?)\\u003E/,'').gsub(/<(.*?)>/,'').gsub("\n",'')
      text = HTMLEntities.new.decode(text) #decode_html
      text.extract_passage(0, length)
    rescue StandardError
      nil
    end

    # Returns the standardized CrunchBase URL of the company, or nil when no
    # profile was loaded.
    def permalink
      return nil unless @json
      Gistgen::URL.standardize("http://www.crunchbase.com/company/#{@json['permalink']}")
    rescue StandardError
      nil
    end

    # Returns the standardized company homepage URL, or nil when no profile
    # was loaded or the profile has no homepage. Previously a profile with a
    # null 'homepage_url' made this method raise, unlike its siblings.
    def homepage
      url = @json && @json['homepage_url']
      return nil if url.nil? || url.to_s.empty?
      Gistgen::URL.standardize(url)
    rescue StandardError
      nil
    end

  end
end
|
44
|
+
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
require 'gistgen/page'
|
4
|
+
require 'gistgen/url'
|
5
|
+
|
6
|
+
module Gistgen
  # Thin client for the unofficial Hacker News API at api.ihackernews.com.
  # Fetching methods return nil on any failure.
  class HackerNews
    # Length in seconds of each unit accepted by parse_time. Months and years
    # are approximated as 30 and 365.25 days.
    SECONDS_PER_UNIT = {
      'second' => 1,
      'minute' => 60,
      'hour'   => 3_600,
      'day'    => 86_400,
      'week'   => 604_800,
      'month'  => 2_592_000,
      'year'   => 31_557_600
    }.freeze

    #http://api.ihackernews.com/
    # Returns the front-page posts as an array of hashes (see get_hash),
    # or nil when the request or parsing fails.
    def self.frontpage
      fetch_listing("http://api.ihackernews.com/page")
    end

    #don't use this too often (low score shouldn't be added)
    # Returns the newest posts as an array of hashes (see get_hash),
    # or nil when the request or parsing fails.
    def self.new_posts
      fetch_listing("http://api.ihackernews.com/new")
    end

    # Returns the points of the post whose discussion URL is +hn_url+ (the
    # post id is the run of digits at the end of the URL). Returns nil when
    # the URL carries no id or the lookup fails; previously a URL without a
    # trailing id raised NoMethodError instead.
    def self.get_score(hn_url)
      id = hn_url.to_s[/\d+$/]
      return nil unless id
      res = Gistgen::Page.get_page("http://api.ihackernews.com/post/#{id}")
      JSON.parse(res)['points']
    rescue StandardError
      nil
    end

    # Converts an API listing response (JSON text with an 'items' array) into
    # an array of hashes with the keys "title", "url", "score", "time" and
    # "discussion_url". Raises if +res+ is not valid listing JSON.
    def self.get_hash(res)
      json = JSON.parse(res)
      json['items'].map do |i|
        {"title" => i['title'],
         "url" => Gistgen::URL.standardize(i['url']),
         "score" => i['points'],
         "time" => Gistgen::HackerNews.parse_time(i['postedAgo']),
         "discussion_url" => "http://news.ycombinator.com/item?id=#{i['id']}"
        }
      end
    end

    # Converts a relative age such as "2 hours ago" into an absolute UTC Time.
    # Unrecognised or missing input yields the current UTC time.
    #
    # The unit is looked up in SECONDS_PER_UNIT rather than dispatched with
    # +send+: the old form needed ActiveSupport's Integer extensions (never
    # required by this gem, so it always fell back to the current time) and
    # invoked a method named by remote data.
    def self.parse_time(time_ago)
      amount, unit = time_ago.to_s.split(' ')
      seconds = SECONDS_PER_UNIT[unit.to_s.downcase.sub(/s\z/, '')]
      return Time.now.utc unless seconds
      Time.now.utc - (amount.to_i * seconds)
    end

    # Fetches +url+ and converts the listing; nil on any failure.
    def self.fetch_listing(url)
      res = Gistgen::Page.get_page(url)
      Gistgen::HackerNews.get_hash(res)
    rescue StandardError
      nil
    end
    private_class_method :fetch_listing

  end
end
|
data/lib/gistgen/page.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
module Gistgen
  # HTTP helper shared by the other Gistgen classes.
  class Page
    # Requests +url+ with Mechanize, identifying itself with +user_agent+,
    # and returns the body of the resulting page. Any error raised along the
    # way is swallowed and reported as nil.
    def self.get_page(url, user_agent='gistgen gem request')
      browser = Mechanize.new
      browser.user_agent = user_agent
      browser.get(url)
      browser.page.body
    rescue
      nil
    end

  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
require 'gistgen/page'
|
4
|
+
require 'gistgen/url'
|
5
|
+
|
6
|
+
module Gistgen
  # Thin client for reddit's JSON listings.
  class Reddit
    #http://code.reddit.com/wiki/API
    #reddit api is so nice, you just pick a page and add .json to get just the data
    #ex: http://www.reddit.com/.json

    # Returns the posts listed at the reddit page +url+ as an array of hashes
    # (see get_hash), or nil when the page cannot be fetched or parsed.
    # Previously a failed fetch surfaced as a TypeError from JSON.parse,
    # unlike get_score, which already reported failures as nil.
    def self.fetch(url)
      res = Gistgen::Page.get_page(json_url(url))
      Gistgen::Reddit.get_hash(res)
    rescue StandardError
      nil
    end

    # Returns the first "score" value found in the JSON of the reddit page
    # +reddit_url+ as an Integer, or nil when the page cannot be fetched or
    # contains no score.
    def self.get_score(reddit_url)
      res = Gistgen::Page.get_page(json_url(reddit_url))
      #reddit nested comments is too deep for json, so scan the raw text instead
      score = res[/"score"\s*:\s*(\d+)/, 1]
      score && score.to_i
    rescue StandardError
      nil
    end

    # Converts a listing response (JSON text) into an array of hashes with the
    # keys "title", "url", "score", "time" and "discussion_url".
    # Raises if +res+ is not valid listing JSON.
    def self.get_hash(res)
      json = JSON.parse(res)
      items = json['data']['children']
      items.map do |i|
        post = i['data']
        {"title" => post['title'],
         "url" => Gistgen::URL.standardize(post['url']),
         "score" => post['score'],
         "time" => Time.at(post['created_utc']),
         "discussion_url" => "http://reddit.com#{post['permalink']}"
        }
      end
    end

    # Builds the JSON endpoint for a reddit page: one trailing slash is
    # dropped and "/.json" appended.
    def self.json_url(url)
      "#{url.gsub(/\/$/,'')}/.json"
    end
    private_class_method :json_url

    #ban digg: they link to their url shortener
    #http://developers.digg.com/documentation
    #require 'uri'
    #def self.get_diggs(url)
    #  res = Gistgen::Page.get_page(URI.escape(url)) #need to encode url
    #  json = JSON.parse(res)
    #  json['stories'][0]['diggs']
    #end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Text-summarisation helpers added to String by gistgen.
class String
  #return a passage of size <= length from start_indexth sentence
  #
  # Takes as many whole sentences as fit in +length+ characters, starting at
  # sentence number +start_index+, keeps only the first paragraph of that
  # selection, strips leading non-word characters and finally applies #limit.
  # Returns '' when there is nothing to extract.
  #
  # The selection used to stop one sentence too early (exclusive range on an
  # already decremented bound), so the last fitting sentence was always
  # dropped and single-sentence text produced ''.
  def extract_passage(start_index=0,length=500)
    sentences = split_sentences
    #index of the first sentence that pushes the passage past the limit
    overflow = ((start_index+1)...sentences.size).detect { |i| (sentences[start_index..i].join('. ')).size > length }
    last = overflow ? overflow - 1 : sentences.size - 1
    selected = sentences[start_index..last] || [] #nil when start_index is out of range
    first_paragraph = selected.join('. ').split("\n").first
    first_paragraph ? first_paragraph.gsub(/^[^\w]+/,'').limit(length) : ''
  end

  #split text into sentences, take into account Mr.|Ms. endings are not end of sentence
  #
  # Returns an array of sentences without their terminating period. The last
  # sentence of a paragraph keeps a trailing "\n" unless the paragraph ended
  # with a period.
  def split_sentences
    #break text first by paragraph then into chunks delimited by a period
    #but these are not quite sentences yet
    chunks = split(/\n+/).flat_map { |paragraph| "#{paragraph}\n".split(/\.(?:[^\w])/) }

    #if a sentence is split at Mr.|Ms.|Dr.|Mrs.
    #then hold it back and recombine it with its remaining part
    pending = ''
    sentences = []
    chunks.each do |chunk|
      candidate = pending.empty? ? chunk : "#{pending}. #{chunk}"
      if chunk.match(/(?:Dr|Mr|Ms|Mrs)$/) #what about John F. Kennedy ([A-Z])
        pending = candidate
      else
        sentences << candidate
        pending = ''
      end
    end
    #text ending right after an honorific used to be silently discarded
    sentences << pending unless pending.empty?
    sentences
  end

  #constrain a string to a fixed length or less
  #discard everything after the last punctuation that occurs right before length limit
  #the regexp looks ahead for any punctuation
  def limit(length)
    (self.length > length)? self[0...length].gsub(/(?![\s\S]+?[,:;)\/\\\|])([,:;)\/\\\|].*)/,'') : self
  end
end
|
data/lib/gistgen/url.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
module Gistgen
  # URL normalisation and classification helpers. The predicate methods
  # return a MatchData (truthy) or nil rather than true/false.
  class URL
    # Returns +url+ in a canonical form: "http://" is prepended when no
    # protocol is present, a leading "www." host label is removed, and a bare
    # host gets a trailing slash so that google.com and google.com/ compare
    # equal. nil and '' are returned unchanged instead of raising.
    #
    # The protocol is kept (lower-cased) when "www." is stripped; the old
    # replacement hard-coded "http://" and so silently downgraded
    # https://www.* URLs while leaving other https URLs alone.
    def self.standardize(url)
      return url if url.nil? || url.empty?

      protocol = url.split('.').first.to_s.match(/^(.*):\/\//)
      with_protocol = protocol ? url : "http://#{url}"
      #raise error if protocol && protocol[0] != 'http'

      #remove www subdomain if exist
      without_www = with_protocol.gsub(/^(http|https):\/\/www\./ix) { "#{Regexp.last_match(1).downcase}://" }

      #make sure google.com and google.com/ are the same thing
      if without_www.match(/^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?$/ix)
        "#{without_www}/"
      else
        without_www
      end
    end

    # Truthy when +url+ is an http(s) URL with a dotted host name, optionally
    # followed by a port and a path.
    def self.is_valid?(url)
      url.match(/^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}((:[0-9]{1,5})?\/.*)?$/ix)
    end

    # Truthy when +url+ ends with a known image file extension.
    def self.is_image?(url)
      url.match(/\.(?:jpg|jpeg|png|gif|tiff|raw|bmp|webp|ai|psd|svg)$/i)
    end

    # Truthy when +url+ ends with one of the listed script, stylesheet, media
    # or document file extensions.
    def self.is_multimedia?(url)
      url.match(/\.(?:js|css|mp3|swf|wmv|mov|doc|pdf|ppt|xls|xlsx|docx|eps|ps|ttf|xml)$/i)
    end

    #return truthy if it's a base url (nothing after first '/')
    #the sub-domain check is currently disabled, so sub-domains are accepted
    def self.is_root?(url)
      url.match(/^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?\/$/ix) #and url.split('.').size == 2
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'uri'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
require "gistgen/page"
|
5
|
+
require "gistgen/string"
|
6
|
+
|
7
|
+
module Gistgen
  # Builds short plain-text abstracts from the lead section of English
  # Wikipedia articles.
  class WikiAbstract

    # Returns the English Wikipedia URL for the article titled +name+.
    def self.permalink(name)
      "http://en.wikipedia.org/wiki/#{name}"
    end

    # Returns an abstract of at most +length+ characters for the article
    # titled +query+, following "#REDIRECT" pages. Returns nil when the page
    # cannot be fetched or the API reports no revisions for it (these cases
    # used to raise TypeError/NoMethodError).
    #
    # The abstract starts at the first bold marker (''') of the lead section,
    # or at the beginning of the section when there is none, and has
    # references, tags, templates and link markup removed.
    def self.search(query, length=500)
      #URI.escape no longer exists in Ruby 3, so form-encode the title instead
      q = URI.encode_www_form_component(query)
      res = Gistgen::Page.get_page("http://en.wikipedia.org/w/api.php?format=json&action=query&prop=revisions&titles=#{q}&rvprop=content&rvsection=0")
      return nil if res.nil?

      pages = JSON.parse(res)['query']['pages']
      rev = pages.keys.last
      revisions = pages[rev]['revisions']
      return nil unless revisions

      text = revisions[0]['*'].to_s
      if text.include?('#REDIRECT')
        target = text.match(/\[\[(.*?)\]\]/)[1].delete('[]')
        #pass length on so a redirect honours the caller's limit
        return WikiAbstract.search(target, length)
      end

      start = text.index("'''") || 0
      t = text[start...text.size].gsub(/^\s+/,'')
      all_text = t.gsub(/<ref>(.*?)<\/ref>/i,'').gsub(/<small>(.*?)<\/small>/,'').gsub(/<(.*?)>/,'').gsub(/\{\{(.*?)\}\}/,'').gsub(/\(stylized(.*?)\)/,'').gsub(/\[\[([^\]\]]*?)\|/,'')
      ["[","]","'''"].each { |markup| all_text = all_text.gsub(markup,'') }
      all_text.extract_passage(0, length)
    end

  end
end
|
data/tests/hubs_test.rb
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
# Manual smoke script: prints live results from the Hacker News and Reddit
# helpers. It performs real HTTP requests and asserts nothing.
require File.join(File.dirname(__FILE__), '..', 'lib', 'gistgen')

hacker_news = Gistgen::HackerNews
puts hacker_news.get_score('http://news.ycombinator.com/item?id=2458202')
puts hacker_news.frontpage
puts hacker_news.new_posts

reddit = Gistgen::Reddit
puts reddit.fetch("http://reddit.com/")
puts reddit.get_score('http://www.reddit.com/r/funny/comments/gu7jw/dictator/')
|
data/tests/wiki_test.rb
ADDED
metadata
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gistgen
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Quan Nguyen
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-05-01 00:00:00 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: mechanize
|
17
|
+
prerelease: false
|
18
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
19
|
+
none: false
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
type: :runtime
|
25
|
+
version_requirements: *id001
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: json
|
28
|
+
prerelease: false
|
29
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
30
|
+
none: false
|
31
|
+
requirements:
|
32
|
+
- - ">="
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: "0"
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id002
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: htmlentities
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: "0"
|
46
|
+
type: :runtime
|
47
|
+
version_requirements: *id003
|
48
|
+
description: gistgen has several modules to generate summaries from wikipedia and crunchbase
|
49
|
+
email:
|
50
|
+
- mquannie@gmail.com
|
51
|
+
executables: []
|
52
|
+
|
53
|
+
extensions: []
|
54
|
+
|
55
|
+
extra_rdoc_files: []
|
56
|
+
|
57
|
+
files:
|
58
|
+
- .gitignore
|
59
|
+
- Gemfile
|
60
|
+
- Rakefile
|
61
|
+
- gistgen.gemspec
|
62
|
+
- lib/gistgen.rb
|
63
|
+
- lib/gistgen/crunch_view.rb
|
64
|
+
- lib/gistgen/hacker_news.rb
|
65
|
+
- lib/gistgen/page.rb
|
66
|
+
- lib/gistgen/reddit.rb
|
67
|
+
- lib/gistgen/string.rb
|
68
|
+
- lib/gistgen/url.rb
|
69
|
+
- lib/gistgen/version.rb
|
70
|
+
- lib/gistgen/wiki_abstract.rb
|
71
|
+
- tests/crunch_view_test.rb
|
72
|
+
- tests/hubs_test.rb
|
73
|
+
- tests/wiki_test.rb
|
74
|
+
homepage: http://github.com/mquan/gistgen
|
75
|
+
licenses: []
|
76
|
+
|
77
|
+
post_install_message:
|
78
|
+
rdoc_options: []
|
79
|
+
|
80
|
+
require_paths:
|
81
|
+
- lib
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: "0"
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ">="
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: "0"
|
94
|
+
requirements: []
|
95
|
+
|
96
|
+
rubyforge_project: gistgen
|
97
|
+
rubygems_version: 1.7.2
|
98
|
+
signing_key:
|
99
|
+
specification_version: 3
|
100
|
+
summary: generate different types of summaries for a text
|
101
|
+
test_files: []
|
102
|
+
|