gistgen 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+ .DS_Store
4
+ Gemfile.lock
5
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in gistgen.gemspec
gemspec
@@ -0,0 +1,2 @@
1
# Load Bundler and let its gem helper register its rake tasks for this project.
require 'bundler'
Bundler::GemHelper.install_tasks
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "gistgen/version"
4
+
5
# Packaging metadata for the gistgen gem: identity, file manifest and runtime dependencies.
Gem::Specification.new do |s|
  s.name        = "gistgen"
  s.version     = Gistgen::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Quan Nguyen"]
  s.email       = ["mquannie@gmail.com"]
  s.homepage    = "http://github.com/mquan/gistgen"
  s.summary     = %q{generate different types of summaries for a text}
  s.description = %q{gistgen has several modules to generate summaries from wikipedia and crunchbase}

  # RubyForge no longer exists and newer RubyGems deprecate this attribute,
  # so only set it on versions that still accept it.
  s.rubyforge_project = "gistgen" if s.respond_to?(:rubyforge_project=)

  # The manifest is whatever git tracks, so the gem has to be built from a git checkout.
  s.files         = `git ls-files`.split("\n")
  # This project keeps its scripts under tests/, which the previous
  # {test,spec,features} pattern never matched.
  s.test_files    = `git ls-files -- {test,tests,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "mechanize"
  s.add_dependency "json"
  s.add_dependency "htmlentities"
end
@@ -0,0 +1,8 @@
1
+ $: << File.dirname(__FILE__)
2
+ require 'gistgen/wiki_abstract'
3
+ require 'gistgen/crunch_view'
4
+ require 'gistgen/hacker_news'
5
+ require 'gistgen/reddit'
6
+
7
# Top-level namespace of the gistgen library. It is intentionally empty here:
# the classes living in it are defined by the gistgen/* files.
module Gistgen
end
@@ -0,0 +1,44 @@
1
+ require 'json'
2
+ require 'htmlentities'
3
+
4
+ require "gistgen/page"
5
+ require "gistgen/string"
6
+ require "gistgen/url"
7
+
8
module Gistgen
  # Read-only view over one company record fetched from the CrunchBase v1 JSON API.
  #
  # Every reader returns nil instead of raising when the record could not be
  # loaded or the requested field is unavailable.
  class CrunchView
    # Fetch and parse the record for the given company.
    #
    # @param name [String] company identifier interpolated into the API URL, e.g. 'google'
    def initialize(name)
      res = Gistgen::Page.get_page("http://api.crunchbase.com/v/1/company/#{name}.js")
      json = JSON.parse(res)
      # A record without an overview is treated as "no record at all".
      @json = (json && json['overview']) ? json : nil
    rescue
      # Failed request (nil body), invalid JSON or an unexpected payload shape.
      @json = nil
    end

    # Plain-text company overview.
    #
    # @param length [Integer] maximum size of the returned passage
    # @return [String, nil] the passage, or nil when no record is loaded
    def overview(length=500)
      text = @json['overview'].
        # Tags that are still JSON-escaped (\u003C ... \u003E); an opening "<"
        # closed by an escaped \u003E is accepted as well.
        gsub(/(?:<|\\u003[Cc]).*?\\u003[Ee]/, '').
        # Ordinary HTML tags.
        gsub(/<(.*?)>/, '').
        gsub("\n", '')
      text = HTMLEntities.new.decode(text) #decode_html
      text.extract_passage(0, length)
    rescue
      nil
    end

    # Standardized URL of the company's CrunchBase page.
    #
    # @return [String, nil] nil when no record is loaded
    def permalink
      Gistgen::URL.standardize("http://www.crunchbase.com/company/#{@json['permalink']}")
    rescue
      nil
    end

    # Standardized URL of the company's own website.
    #
    # @return [String, nil] nil when no record is loaded or it has no homepage_url
    def homepage
      url = @json && @json['homepage_url']
      return nil if url.nil? || url.to_s.empty?
      Gistgen::URL.standardize(url)
    rescue
      nil
    end

  end
end
44
+
@@ -0,0 +1,61 @@
1
+ require 'json'
2
+
3
+ require 'gistgen/page'
4
+ require 'gistgen/url'
5
+
6
module Gistgen
  # Client for the unofficial Hacker News JSON API (http://api.ihackernews.com/).
  #
  # The lookup methods return nil when the request or the JSON handling fails.
  class HackerNews
    # Length in seconds of every unit accepted in a "postedAgo" string.
    # Units outside this table are never dispatched as method names.
    SECONDS_PER_UNIT = {
      'second' => 1,
      'minute' => 60,
      'hour'   => 3600,
      'day'    => 86_400,
      'week'   => 604_800
    }.freeze

    # Posts currently on the front page.
    #
    # @return [Array<Hash>, nil] post hashes as built by get_hash; nil on failure
    def self.frontpage
      fetch_listing("http://api.ihackernews.com/page")
    end

    # Newest submissions.
    # don't use this too often (low score shouldn't be added)
    #
    # @return [Array<Hash>, nil] post hashes as built by get_hash; nil on failure
    def self.new_posts
      fetch_listing("http://api.ihackernews.com/new")
    end

    # Points of a single post.
    #
    # @param hn_url [String] discussion URL ending in the numeric post id
    # @return [Object, nil] the 'points' value of the API response; nil when the
    #   URL carries no trailing id or the lookup fails
    def self.get_score(hn_url)
      id = hn_url.to_s[/\d+$/]
      return nil unless id
      res = Gistgen::Page.get_page("http://api.ihackernews.com/post/#{id}")
      JSON.parse(res)['points']
    rescue
      nil
    end

    # Convert a raw API listing into an array of post hashes.
    #
    # @param res [String] JSON document with an 'items' array
    # @return [Array<Hash>] one hash per item with the keys
    #   "title", "url", "score", "time" and "discussion_url"
    # @raise [StandardError] when res is not valid JSON or has no 'items'
    def self.get_hash(res)
      json = JSON.parse(res)
      json['items'].map do |i|
        {"title" => i['title'],
         "url" => Gistgen::URL.standardize(i['url']),
         "score" => i['points'],
         "time" => Gistgen::HackerNews.parse_time(i['postedAgo']),
         "discussion_url" => "http://news.ycombinator.com/item?id=#{i['id']}"
        }
      end
    end

    # Turn a relative timestamp such as "3 hours ago" into an absolute time.
    #
    # The unit is looked up in SECONDS_PER_UNIT instead of being sent to the
    # Integer as a method name: the string comes from a remote service, and
    # dispatching it could invoke arbitrary methods (e.g. "1 exit").
    #
    # @param time_ago [String, nil] "<count> <unit> ago"
    # @return [Time] UTC time; the current time when the input is not understood
    def self.parse_time(time_ago)
      count, unit = time_ago.to_s.split(' ')
      seconds = SECONDS_PER_UNIT[unit.to_s.downcase.sub(/s\z/, '')]
      return Time.now.utc unless seconds
      Time.now.utc - (count.to_i * seconds)
    end

    # Download one listing endpoint and convert it; nil on any failure.
    def self.fetch_listing(url)
      get_hash(Gistgen::Page.get_page(url))
    rescue
      nil
    end
    private_class_method :fetch_listing

  end
end
@@ -0,0 +1,18 @@
1
+ require 'mechanize'
2
+
3
module Gistgen
  # HTTP helper used by the fetchers of this library.
  class Page
    # Make an HTTP request and return the body of the fetched page.
    #
    # @param url [String] address to fetch
    # @param user_agent [String] value assigned to the Mechanize agent's user_agent
    # @return [String, nil] the page body; nil when anything raised a StandardError
    def self.get_page(url, user_agent='gistgen gem request')
      agent = Mechanize.new
      agent.user_agent = user_agent
      agent.get(url)
      agent.page.body
    rescue
      # Best effort: callers receive nil rather than an exception.
      nil
    end

  end
end
@@ -0,0 +1,48 @@
1
+ require 'json'
2
+
3
+ require 'gistgen/page'
4
+ require 'gistgen/url'
5
+
6
module Gistgen
  # Client for reddit's JSON listings.
  #
  # http://code.reddit.com/wiki/API
  # reddit api is so nice, you just pick a page and add .json to get just the data
  # ex: http://www.reddit.com/.json
  class Reddit
    # Posts listed on a reddit page.
    #
    # @param url [String] address of a reddit listing page
    # @return [Array<Hash>, nil] post hashes as built by get_hash; nil when the
    #   page cannot be fetched or parsed
    def self.fetch(url)
      res = Gistgen::Page.get_page(json_url(url))
      return nil if res.nil?
      get_hash(res)
    rescue
      nil
    end

    # Score of a single post.
    #
    # The first "score" field is read with a regexp rather than a JSON parser:
    # reddit nested comments is too deep for json.
    #
    # @param reddit_url [String] address of the post's discussion page
    # @return [Integer, nil] nil when the page cannot be fetched or has no score
    def self.get_score(reddit_url)
      res = Gistgen::Page.get_page(json_url(reddit_url))
      # -? so that a negative score is read instead of being skipped
      score = res[/"score"\s*:\s*(-?\d+)/, 1]
      score && score.to_i
    rescue
      nil
    end

    # Convert a raw JSON listing into an array of post hashes.
    #
    # @param res [String] JSON document shaped as {"data": {"children": [{"data": {...}}]}}
    # @return [Array<Hash>] one hash per child with the keys
    #   "title", "url", "score", "time" and "discussion_url"
    # @raise [StandardError] when res is not valid JSON or lacks that structure
    def self.get_hash(res)
      json = JSON.parse(res)
      items = json['data']['children']
      items.map do |i|
        post = i['data']
        {"title" => post['title'],
         "url" => Gistgen::URL.standardize(post['url']),
         "score" => post['score'],
         "time" => Time.at(post['created_utc']),
         "discussion_url" => "http://reddit.com#{post['permalink']}"
        }
      end
    end

    # Address of the JSON rendition of a page: one trailing slash is dropped
    # and "/.json" is appended.
    def self.json_url(url)
      "#{url.gsub(/\/$/,'')}/.json"
    end
    private_class_method :json_url

    #ban digg: they link to their url shortener
    #http://developers.digg.com/documentation
    #require 'uri'
    #def self.get_diggs(url)
    #  res = Gistgen::Page.get_page(URI.escape(url)) #need to encode url
    #  json = JSON.parse(res)
    #  json['stories'][0]['diggs']
    #end
  end
end
@@ -0,0 +1,39 @@
1
# Text helpers added to String for building short summaries.
class String
  # Return a passage of size <= length made of whole sentences, starting at
  # the start_index-th sentence. Only the first line of the selection is kept.
  #
  # @param start_index [Integer] index of the first sentence to include
  # @param length [Integer] maximum size of the passage
  # @return [String] the passage; '' when there is nothing to return
  def extract_passage(start_index=0,length=500)
    sentences = split_sentences
    # Index of the first sentence that pushes the passage past the limit;
    # without one, every remaining sentence fits.
    stop = ((start_index + 1)...sentences.size).detect do |i|
      sentences[start_index..i].join('. ').size > length
    end
    stop ||= sentences.size
    # stop is exclusive: the sentence that overflows is left out, and nothing else.
    # The slice is nil when start_index lies beyond the last sentence.
    selected = sentences[start_index...stop] || []
    passages = selected.join('. ').split("\n")
    passages.empty? ? '' : passages[0].gsub(/^[^\w]+/,'').limit(length)
  end

  # Split text into sentences, taking into account that Dr.|Mr.|Ms.|Mrs. do
  # not end a sentence. The terminating period of each sentence is dropped.
  #
  # @return [Array<String>]
  def split_sentences
    #break text first by paragraph then into chunks delimited by a period
    #but these are not quite sentences yet
    chunks = (self.split(/\n+/).map { |p| "#{p}\n".split(/\.(?:[^\w])/) }).flatten.compact

    #if a sentence is split at Mr.|Ms.|Dr.|Mrs.
    #then hold it back and recombine it with its remaining part
    pending = ''
    sentences = []
    chunks.each do |chunk|
      sentence = pending.empty? ? chunk : "#{pending}. #{chunk}"
      if chunk =~ /\b(?:Dr|Mr|Ms|Mrs)$/ #what about John F. Kennedy ([A-Z])
        pending = sentence
      else
        sentences << sentence
        pending = ''
      end
    end
    # A title at the very end of the text has no continuation; keep it
    # rather than losing the whole sentence.
    sentences << pending unless pending.empty?
    sentences
  end

  # Constrain a string to a fixed length or less.
  # Discard everything from the last punctuation mark that occurs before the
  # length limit; the regexp looks ahead to make sure no later punctuation exists.
  #
  # @param length [Integer] maximum size
  # @return [String]
  def limit(length)
    (self.length > length)? self[0...length].gsub(/(?![\s\S]+?[,:;)\/\\\|])([,:;)\/\\\|].*)/,'') : self
  end
end
@@ -0,0 +1,32 @@
1
module Gistgen
  # URL normalization and classification helpers.
  #
  # The predicates return a MatchData (truthy) or nil, and treat nil input as
  # "no match". Patterns are anchored with \A and \z so that a valid-looking
  # line inside a multi-line string cannot make the whole string pass.
  class URL
    # Put a URL into a canonical form so that equivalent spellings compare equal:
    # a missing scheme becomes http://, a leading "www." is removed (the scheme
    # is kept, so https stays https), and a bare host gets a trailing slash.
    #
    # @param url [String, nil]
    # @return [String, nil] nil when url is nil or blank
    def self.standardize(url)
      return nil if url.nil?
      url = url.to_s.strip
      return nil if url.empty?

      # Only the text before the first dot is inspected, so a "://" that
      # appears later (e.g. inside a query string) is not taken for a scheme.
      head = url.split('.')[0].to_s
      with_scheme = (head =~ /\A.*:\/\//) ? url : "http://#{url}"
      #raise error if protocol && protocol[0] != 'http'

      #remove www subdomain if exist
      without_www = with_scheme.sub(/\A(http|https):\/\/www\./i) { "#{$1.downcase}://" }

      #make sure google.com and google.com/ are the same thing
      bare_host = /\A(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?\z/i
      (without_www =~ bare_host) ? "#{without_www}/" : without_www
    end

    # Whether url is an http(s) URL with a dotted host, optionally followed by
    # a port and a path.
    def self.is_valid?(url)
      url.to_s.match(/\A(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}((:[0-9]{1,5})?\/.*)?\z/i)
    end

    # Whether url ends with an image file extension.
    def self.is_image?(url)
      url.to_s.match(/\.(?:jpg|jpeg|png|gif|tiff|raw|bmp|webp|ai|psd|svg)\z/i)
    end

    # Whether url ends with the extension of a script, stylesheet, media file
    # or document.
    def self.is_multimedia?(url)
      url.to_s.match(/\.(?:js|css|mp3|swf|wmv|mov|doc|pdf|ppt|xls|xlsx|docx|eps|ps|ttf|xml)\z/i)
    end

    #return a match if it's a base url (nothing after first '/')
    def self.is_root?(url)
      url.to_s.match(/\A(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?\/\z/i) #and url.split('.').size == 2
    end
  end
end
@@ -0,0 +1,3 @@
1
module Gistgen
  # Release number of the gem; gistgen.gemspec reads it as the package version.
  # Frozen so that the shared constant cannot be mutated in place.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,34 @@
1
+ require 'uri'
2
+ require 'json'
3
+
4
+ require "gistgen/page"
5
+ require "gistgen/string"
6
+
7
module Gistgen
  # Builds a short plain-text abstract from the lead section of an English
  # Wikipedia article.
  class WikiAbstract
    # Upper bound on chained redirects, so that a redirect cycle terminates.
    MAX_REDIRECTS = 5

    # Address of the article page for a title.
    #
    # @param name [String] article title, used as given
    # @return [String]
    def self.permalink(name)
      "http://en.wikipedia.org/wiki/#{name}"
    end

    # Abstract of the article matching query, following redirects.
    #
    # @param query [String] article title
    # @param length [Integer] maximum size of the abstract
    # @return [String, nil] nil when the article cannot be fetched, does not
    #   exist, or redirects more than MAX_REDIRECTS times
    def self.search(query, length=500)
      text = lead_wikitext(query, MAX_REDIRECTS)
      return nil if text.nil?
      # The passage is extracted exactly once, so length also applies to
      # articles that were reached through a redirect.
      strip_markup(text).extract_passage(0, length)
    end

    # Raw wikitext of section 0 of an article, resolving redirects.
    def self.lead_wikitext(title, redirects_left)
      # Escapes reserved characters such as "&" as well; spaces are sent as %20.
      q = URI.encode_www_form_component(title.to_s).gsub('+', '%20')
      res = Gistgen::Page.get_page("http://en.wikipedia.org/w/api.php?format=json&action=query&prop=revisions&titles=#{q}&rvprop=content&rvsection=0")
      return nil if res.nil?

      json = JSON.parse(res)
      pages = json['query'] && json['query']['pages']
      return nil unless pages
      # The pages hash is keyed by page id; the last entry is used.
      page = pages.values.last
      revisions = page && page['revisions']
      # An entry without revisions is treated as "no such article".
      return nil unless revisions && revisions[0]
      text = revisions[0]['*'].to_s

      return text unless text =~ /\A\s*#REDIRECT/i
      target = text[/\[\[(.*?)\]\]/, 1]
      return nil if target.nil? || redirects_left <= 0
      # Drop a section anchor or link label from the redirect target.
      lead_wikitext(target.sub(/[#|].*\z/m, ''), redirects_left - 1)
    rescue JSON::ParserError
      nil
    end
    private_class_method :lead_wikitext

    # Reduce wikitext to plain text, starting at the first bold marker (''')
    # or at the very beginning when the text has none.
    def self.strip_markup(text)
      start = text.index("'''") || 0
      cleaned = text[start..-1].gsub(/^\s+/,'').
        gsub(/<ref>(.*?)<\/ref>/i,'').
        gsub(/<small>(.*?)<\/small>/,'').
        gsub(/<(.*?)>/,'').
        gsub(/\{\{(.*?)\}\}/,'').
        gsub(/\(stylized(.*?)\)/,'').
        # [[target|label]] keeps only the label
        gsub(/\[\[([^\]\]]*?)\|/,'')
      # Remaining link brackets and bold markers, removed in this order.
      ["[", "]", "'''"].each { |token| cleaned = cleaned.gsub(token, '') }
      cleaned
    end
    private_class_method :strip_markup

  end
end
@@ -0,0 +1,5 @@
1
# Manual smoke script: prints the CrunchBase permalink and overview of 'google'.
# The path is expanded to an absolute one so that the script can be started
# from any working directory.
require File.expand_path("../../lib/gistgen", __FILE__)

cv = Gistgen::CrunchView.new('google')
puts cv.permalink
puts cv.overview
@@ -0,0 +1,8 @@
1
# Manual smoke script: prints what the Hacker News and reddit fetchers return.
# The path is expanded to an absolute one so that the script can be started
# from any working directory.
require File.expand_path("../../lib/gistgen", __FILE__)

puts Gistgen::HackerNews.get_score('http://news.ycombinator.com/item?id=2458202')
puts Gistgen::HackerNews.frontpage
puts Gistgen::HackerNews.new_posts

puts Gistgen::Reddit.fetch("http://reddit.com/")
puts Gistgen::Reddit.get_score('http://www.reddit.com/r/funny/comments/gu7jw/dictator/')
@@ -0,0 +1,3 @@
1
# Manual smoke script: prints the Wikipedia abstract found for 'google.com'.
# The path is expanded to an absolute one so that the script can be started
# from any working directory.
require File.expand_path("../../lib/gistgen", __FILE__)

puts Gistgen::WikiAbstract.search('google.com')
metadata ADDED
@@ -0,0 +1,102 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gistgen
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.1
6
+ platform: ruby
7
+ authors:
8
+ - Quan Nguyen
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-05-01 00:00:00 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: mechanize
17
+ prerelease: false
18
+ requirement: &id001 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ type: :runtime
25
+ version_requirements: *id001
26
+ - !ruby/object:Gem::Dependency
27
+ name: json
28
+ prerelease: false
29
+ requirement: &id002 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: "0"
35
+ type: :runtime
36
+ version_requirements: *id002
37
+ - !ruby/object:Gem::Dependency
38
+ name: htmlentities
39
+ prerelease: false
40
+ requirement: &id003 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0"
46
+ type: :runtime
47
+ version_requirements: *id003
48
+ description: gistgen has several modules to generate summaries from wikipedia and crunchbase
49
+ email:
50
+ - mquannie@gmail.com
51
+ executables: []
52
+
53
+ extensions: []
54
+
55
+ extra_rdoc_files: []
56
+
57
+ files:
58
+ - .gitignore
59
+ - Gemfile
60
+ - Rakefile
61
+ - gistgen.gemspec
62
+ - lib/gistgen.rb
63
+ - lib/gistgen/crunch_view.rb
64
+ - lib/gistgen/hacker_news.rb
65
+ - lib/gistgen/page.rb
66
+ - lib/gistgen/reddit.rb
67
+ - lib/gistgen/string.rb
68
+ - lib/gistgen/url.rb
69
+ - lib/gistgen/version.rb
70
+ - lib/gistgen/wiki_abstract.rb
71
+ - tests/crunch_view_test.rb
72
+ - tests/hubs_test.rb
73
+ - tests/wiki_test.rb
74
+ homepage: http://github.com/mquan/gistgen
75
+ licenses: []
76
+
77
+ post_install_message:
78
+ rdoc_options: []
79
+
80
+ require_paths:
81
+ - lib
82
+ required_ruby_version: !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: "0"
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: "0"
94
+ requirements: []
95
+
96
+ rubyforge_project: gistgen
97
+ rubygems_version: 1.7.2
98
+ signing_key:
99
+ specification_version: 3
100
+ summary: generate different types of summaries for a text
101
+ test_files: []
102
+