gistgen 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+ .DS_Store
4
+ Gemfile.lock
5
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in gistgen.gemspec
gemspec
@@ -0,0 +1,2 @@
1
# Rakefile: load Bundler and let its gem helper register its rake tasks.
require "bundler"

Bundler::GemHelper.install_tasks()
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "gistgen/version"
4
+
5
# Gem specification for gistgen. The version comes from Gistgen::VERSION
# (lib/gistgen/version.rb) and the packaged file list is whatever git tracks.
Gem::Specification.new do |s|
  s.name        = "gistgen"
  s.version     = Gistgen::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Quan Nguyen"]
  s.email       = ["mquannie@gmail.com"]
  s.homepage    = "http://github.com/mquan/gistgen"
  s.summary     = %q{generate different types of summaries for a text}
  s.description = %q{gistgen has several modules to generate summaries from wikipedia and crunchbase}

  # NOTE: the deprecated `rubyforge_project` attribute is intentionally not set.

  s.files         = `git ls-files`.split("\n")
  # Derive test files and executables from the tracked file list instead of
  # relying on shell brace expansion; "tests/" is where this gem keeps them.
  s.test_files    = s.files.grep(%r{\A(?:test|tests|spec|features)/})
  s.executables   = s.files.grep(%r{\Abin/}).map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "mechanize"
  s.add_dependency "json"
  s.add_dependency "htmlentities"
end
@@ -0,0 +1,8 @@
1
+ $: << File.dirname(__FILE__)
2
+ require 'gistgen/wiki_abstract'
3
+ require 'gistgen/crunch_view'
4
+ require 'gistgen/hacker_news'
5
+ require 'gistgen/reddit'
6
+
7
# Root namespace of the gem. It is declared empty here; the classes living
# under it are defined in the files required above.
module Gistgen; end
@@ -0,0 +1,44 @@
1
+ require 'json'
2
+ require 'htmlentities'
3
+
4
+ require "gistgen/page"
5
+ require "gistgen/string"
6
+ require "gistgen/url"
7
+
8
module Gistgen
  # Read-only view over a company record fetched from the CrunchBase JSON API.
  #
  # Every accessor is best-effort: when the record could not be fetched or
  # parsed, or has no 'overview' field, the accessors return nil.
  class CrunchView
    # Fetch and parse the record for +name+ (the company identifier used in
    # the API URL). Failures are swallowed and leave the view empty.
    def initialize(name)
      res = Gistgen::Page.get_page("http://api.crunchbase.com/v/1/company/#{name}.js")
      parsed = JSON.parse(res)
      # Only keep records that actually carry an overview.
      @json = (parsed && parsed['overview']) ? parsed : nil
    rescue StandardError
      @json = nil
    end

    # Plain-text company overview, at most +length+ characters.
    #
    # Strips escaped tags (literal "\u003C...\u003E" sequences), real HTML
    # tags and newlines, decodes HTML entities, then keeps the leading
    # sentences via String#extract_passage. Returns nil on any failure.
    def overview(length=500)
      return nil unless @json
      text = @json['overview'].gsub(/\\u003C.*?\\u003E/i, '').gsub(/<(.*?)>/, '').gsub("\n", '')
      text = HTMLEntities.new.decode(text)
      text.extract_passage(0, length)
    rescue StandardError
      nil
    end

    # Standardized URL of the company's CrunchBase page, or nil on failure.
    def permalink
      return nil unless @json
      Gistgen::URL.standardize("http://www.crunchbase.com/company/#{@json['permalink']}")
    rescue StandardError
      nil
    end

    # Standardized URL of the company's own homepage, or nil when the record
    # is missing or has no (or an empty) 'homepage_url'.
    def homepage
      return nil unless @json
      url = @json['homepage_url']
      return nil if url.nil? || url.to_s.empty?
      Gistgen::URL.standardize(url)
    end

  end
end
44
+
@@ -0,0 +1,61 @@
1
+ require 'json'
2
+
3
+ require 'gistgen/page'
4
+ require 'gistgen/url'
5
+
6
module Gistgen
  # Client for the ihackernews JSON API (http://api.ihackernews.com/).
  class HackerNews
    API_ROOT = "http://api.ihackernews.com"

    # Length in seconds of each time unit accepted by parse_time.
    # Months and years are approximated as 30 and 365 days.
    SECONDS_PER_UNIT = {
      "second" => 1,
      "minute" => 60,
      "hour"   => 3600,
      "day"    => 86_400,
      "week"   => 604_800,
      "month"  => 2_592_000,
      "year"   => 31_536_000
    }.freeze

    # Posts currently on the front page, as an array of hashes (see get_hash),
    # or nil when the request or parsing fails.
    def self.frontpage
      fetch_listing("page")
    end

    # Newest posts, in the same format as frontpage.
    # don't use this too often (low score shouldn't be added)
    def self.new_posts
      fetch_listing("new")
    end

    # Points of the post whose id is the run of digits ending +hn_url+.
    # Returns nil when the URL carries no id or the lookup fails.
    def self.get_score(hn_url)
      id = hn_url.to_s[/\d+$/]
      return nil unless id
      res = Gistgen::Page.get_page("#{API_ROOT}/post/#{id}")
      JSON.parse(res)['points']
    rescue StandardError
      nil
    end

    # Convert an API listing (JSON text with an 'items' array) into an array
    # of hashes with the keys "title", "url", "score", "time" and
    # "discussion_url". Raises when +res+ is not a valid listing.
    def self.get_hash(res)
      json = JSON.parse(res)
      json['items'].map do |i|
        {"title" => i['title'],
         "url" => Gistgen::URL.standardize(i['url']),
         "score" => i['points'],
         "time" => Gistgen::HackerNews.parse_time(i['postedAgo']),
         "discussion_url" => "http://news.ycombinator.com/item?id=#{i['id']}"
        }
      end
    end

    # Turn a relative age such as "2 hours ago" into a UTC Time.
    #
    # The unit is looked up in a fixed table rather than dispatched with
    # `send`: the text comes from a remote API and must never be able to
    # invoke arbitrary methods. Unknown or missing input yields the current
    # UTC time.
    def self.parse_time(time_ago)
      amount, unit = time_ago.to_s.split(' ')
      seconds = SECONDS_PER_UNIT[unit.to_s.downcase.sub(/s\z/, '')]
      return Time.now.utc unless seconds
      Time.now.utc - amount.to_i * seconds
    end

    # Fetch the listing at API_ROOT/+path+ and convert it with get_hash;
    # nil on any failure.
    def self.fetch_listing(path)
      res = Gistgen::Page.get_page("#{API_ROOT}/#{path}")
      Gistgen::HackerNews.get_hash(res)
    rescue StandardError
      nil
    end
    private_class_method :fetch_listing

  end
end
@@ -0,0 +1,18 @@
1
+ require 'mechanize'
2
+
3
module Gistgen
  # HTTP fetching helper built on Mechanize.
  class Page
    # Request +url+, identifying as +user_agent+, and return the body of the
    # resulting page. Any error raised along the way is swallowed and nil is
    # returned instead.
    def self.get_page(url, user_agent='gistgen gem request')
      browser = Mechanize.new
      browser.user_agent = user_agent
      browser.get(url)
      browser.page.body
    rescue
      nil
    end

  end
end
@@ -0,0 +1,48 @@
1
+ require 'json'
2
+
3
+ require 'gistgen/page'
4
+ require 'gistgen/url'
5
+
6
module Gistgen
  # Reader for reddit's JSON views (http://code.reddit.com/wiki/API).
  # Any reddit page has a data-only form reachable by appending ".json",
  # e.g. http://www.reddit.com/.json
  class Reddit
    # Fetch the listing behind +url+ and return its posts (see get_hash).
    # Errors are not rescued here.
    def self.fetch(url)
      listing_url = url.gsub(/\/$/,'') + "/.json"
      Reddit.get_hash(Gistgen::Page.get_page(listing_url))
    end

    # Score of the post at +reddit_url+ as an Integer, or nil on any failure.
    # The body is scanned with a regexp rather than parsed: reddit's nested
    # comments are too deep for json. The first "score" found is used.
    def self.get_score(reddit_url)
      body = Gistgen::Page.get_page(reddit_url.gsub(/\/$/,'') + "/.json")
      first_hit = body.scan(/"score"\s*:\s*(\d+)/)[0]
      first_hit.join('').to_i
    rescue
      nil
    end

    # Convert listing JSON text into an array of hashes with the keys
    # "title", "url", "score", "time" and "discussion_url".
    def self.get_hash(res)
      JSON.parse(res)['data']['children'].map do |child|
        post = child['data']
        {
          "title" => post['title'],
          "url" => Gistgen::URL.standardize(post['url']),
          "score" => post['score'],
          "time" => Time.at(post['created_utc']),
          "discussion_url" => "http://reddit.com#{post['permalink']}"
        }
      end
    end

    #ban digg: they link to their url shortener
    #http://developers.digg.com/documentation
    #require 'uri'
    #def self.get_diggs(url)
    #  res = Gistgen::Page.get_page(URI.escape(url)) #need to encode url
    #  json = JSON.parse(res)
    #  json['stories'][0]['diggs']
    #end
  end
end
@@ -0,0 +1,39 @@
1
# Text-summarising helpers added to the core String class.
class String
  # Return a passage of at most +length+ characters made of whole sentences,
  # starting at the +start_index+th sentence (0-based).
  #
  # Sentences are added for as long as the joined text fits in +length+. The
  # first sentence is always kept; when it alone is too long it is shortened
  # by #limit. Only the first line of the joined text is returned, with
  # leading non-word characters removed.
  #
  # Returns '' when there is nothing to extract (empty text, or
  # +start_index+ past the last sentence).
  def extract_passage(start_index=0, length=500)
    sentences = split_sentences

    # Index of the first sentence whose inclusion would exceed +length+.
    overflow = ((start_index + 1)...sentences.size).detect do |i|
      sentences[start_index..i].join('. ').size > length
    end

    # Keep everything up to and including the last sentence that still fits.
    last = overflow ? overflow - 1 : sentences.size - 1
    selected = sentences[start_index..last] || []

    lines = selected.join('. ').split("\n")
    lines.empty? ? '' : lines[0].gsub(/^[^\w]+/, '').limit(length)
  end

  # Split the text into sentences (returned without their final period).
  # A period after Dr, Mr, Ms or Mrs is not treated as the end of a sentence.
  def split_sentences
    # Break the text by paragraph, then into chunks delimited by a period
    # followed by a non-word character. These are not quite sentences yet.
    chunks = split(/\n+/).map { |para| "#{para}\n".split(/\.(?:[^\w])/) }.flatten

    sentences = []
    pending = '' # text held back because it ended in an honorific
    chunks.each do |chunk|
      combined = pending.empty? ? chunk : "#{pending}. #{chunk}"
      if chunk =~ /\b(?:Dr|Mr|Ms|Mrs)\z/ # what about John F. Kennedy ([A-Z])
        pending = combined
      else
        sentences << combined
        pending = ''
      end
    end
    # Do not lose text when the very last chunk ended in an honorific.
    sentences << pending unless pending.empty?
    sentences
  end

  # Constrain the string to +length+ characters or fewer.
  #
  # A string that is too long is cut at +length+, then everything from the
  # last punctuation mark (one of , : ; ) / \ |) to the end of that line is
  # discarded. The negative look-ahead is what selects the last such mark.
  def limit(length)
    return self unless self.length > length
    self[0...length].gsub(/(?![\s\S]+?[,:;)\/\\\|])([,:;)\/\\\|].*)/, '')
  end
end
@@ -0,0 +1,32 @@
1
module Gistgen
  # URL normalisation and classification helpers.
  #
  # The predicates return a MatchData (truthy) or nil, as they always have.
  # All patterns are anchored with \A and \z so that a multi-line string
  # cannot pass by having just one matching line.
  class URL
    # scheme://host[:port] with nothing after it (no trailing slash).
    BARE_ROOT = /\A(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?\z/i
    # scheme://host[:port]/ and nothing else.
    ROOT = /\A(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?\/\z/i
    # scheme://host, optionally followed by [:port]/path.
    VALID = /\A(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}((:[0-9]{1,5})?\/.*)?\z/i

    # Bring +url+ into a canonical form:
    # * prefix "http://" when no protocol is given,
    # * drop a leading "www." while keeping the http/https scheme,
    # * append "/" to a bare host so google.com and google.com/ are the same.
    #
    # Returns nil for a nil or empty +url+.
    def self.standardize(url)
      return nil if url.nil? || url.empty?

      # Only look before the first dot, so a "://" inside the path or query
      # string is not mistaken for the URL's own protocol.
      has_protocol = url.split('.').first.to_s.include?('://')
      with_protocol = has_protocol ? url : "http://#{url}"
      #raise error if protocol && protocol[0] != 'http'

      without_www = with_protocol.sub(/\A(https?):\/\/www\./i) { "#{$1.downcase}://" }

      without_www =~ BARE_ROOT ? "#{without_www}/" : without_www
    end

    # Truthy when +url+ is an http(s) URL with a dotted host name.
    def self.is_valid?(url)
      url.match(VALID)
    end

    # Truthy when +url+ ends in a known image file extension.
    def self.is_image?(url)
      url.match(/\.(?:jpg|jpeg|png|gif|tiff|raw|bmp|webp|ai|psd|svg)\z/i)
    end

    # Truthy when +url+ ends in one of the listed script, stylesheet, media,
    # document or font extensions.
    def self.is_multimedia?(url)
      url.match(/\.(?:js|css|mp3|swf|wmv|mov|doc|pdf|ppt|xls|xlsx|docx|eps|ps|ttf|xml)\z/i)
    end

    # Truthy when +url+ is a base URL, i.e. has nothing after the first '/'.
    # (Sub-domains are currently accepted.)
    def self.is_root?(url)
      url.match(ROOT) #and url.split('.').size == 2
    end
  end
end
@@ -0,0 +1,3 @@
1
# Release version of the gistgen gem, kept in its own file so that the
# gemspec can read it.
module Gistgen
  # Current version string.
  VERSION = '0.1.1'
end
@@ -0,0 +1,34 @@
1
+ require 'uri'
2
+ require 'json'
3
+
4
+ require "gistgen/page"
5
+ require "gistgen/string"
6
+
7
module Gistgen
  # Builds a short plain-text abstract from the lead section of an English
  # Wikipedia article.
  class WikiAbstract

    # URL of the English Wikipedia article called +name+.
    def self.permalink(name)
      "http://en.wikipedia.org/wiki/#{name}"
    end

    # Fetch the lead section of the article titled +query+ and return a
    # passage of at most +length+ characters with the wiki markup removed.
    #
    # Redirect pages are followed, keeping the requested +length+. Errors
    # from fetching or parsing (e.g. the page has no revisions) are raised
    # to the caller.
    def self.search(query, length=500)
      # URI.escape no longer exists in current Ruby; percent-encode the title
      # (reserved characters such as '&' included) and keep spaces as %20.
      q = URI.encode_www_form_component(query).gsub('+', '%20')
      res = Gistgen::Page.get_page("http://en.wikipedia.org/w/api.php?format=json&action=query&prop=revisions&titles=#{q}&rvprop=content&rvsection=0")
      pages = JSON.parse(res)['query']['pages']
      # Pages are keyed by id; the last entry is the one used.
      text = pages[pages.keys.last]['revisions'][0]['*'].to_s

      # "#REDIRECT [[Target]]" (in any letter case): look up the target.
      target = text[/#REDIRECT.*?\[\[(.*?)\]\]/im, 1]
      return search(target, length) if target && target != query

      # The abstract starts at the first bold term (''') when there is one;
      # otherwise the whole section is used.
      start = text.index("'''") || 0
      lead = text[start..-1].gsub(/^\s+/, '')

      # Drop references, <small> notes, remaining tags, {{templates}},
      # "(stylized ...)" notes and the target half of [[target|label]] links.
      cleaned = lead.gsub(/<ref>(.*?)<\/ref>/i, '').
                     gsub(/<small>(.*?)<\/small>/, '').
                     gsub(/<(.*?)>/, '').
                     gsub(/\{\{(.*?)\}\}/, '').
                     gsub(/\(stylized(.*?)\)/, '').
                     gsub(/\[\[([^\]\]]*?)\|/, '')
      # Remove what is left of the link and bold markup.
      ["[", "]", "'''"].each { |markup| cleaned = cleaned.gsub(markup, '') }

      cleaned.extract_passage(0, length)
    end

  end
end
@@ -0,0 +1,5 @@
1
+ require "#{File.dirname(__FILE__)}/../lib/gistgen"
2
+
3
# Manual smoke test: look up 'google' on CrunchBase and print its permalink
# followed by its overview.
company = Gistgen::CrunchView.new('google')
[:permalink, :overview].each { |field| puts company.send(field) }
@@ -0,0 +1,8 @@
1
+ require "#{File.dirname(__FILE__)}/../lib/gistgen"
2
+
3
# Manual smoke test for the Hacker News and reddit readers: each check is run
# in turn and its result printed.
[
  lambda { Gistgen::HackerNews.get_score('http://news.ycombinator.com/item?id=2458202') },
  lambda { Gistgen::HackerNews.frontpage },
  lambda { Gistgen::HackerNews.new_posts },
  lambda { Gistgen::Reddit.fetch("http://reddit.com/") },
  lambda { Gistgen::Reddit.get_score('http://www.reddit.com/r/funny/comments/gu7jw/dictator/') }
].each { |check| puts check.call }
@@ -0,0 +1,3 @@
1
+ require "#{File.dirname(__FILE__)}/../lib/gistgen"
2
+
3
# Manual smoke test: print the Wikipedia abstract found for 'google.com'.
abstract = Gistgen::WikiAbstract.search('google.com')
puts abstract
metadata ADDED
@@ -0,0 +1,102 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gistgen
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.1
6
+ platform: ruby
7
+ authors:
8
+ - Quan Nguyen
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-05-01 00:00:00 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: mechanize
17
+ prerelease: false
18
+ requirement: &id001 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ type: :runtime
25
+ version_requirements: *id001
26
+ - !ruby/object:Gem::Dependency
27
+ name: json
28
+ prerelease: false
29
+ requirement: &id002 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: "0"
35
+ type: :runtime
36
+ version_requirements: *id002
37
+ - !ruby/object:Gem::Dependency
38
+ name: htmlentities
39
+ prerelease: false
40
+ requirement: &id003 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0"
46
+ type: :runtime
47
+ version_requirements: *id003
48
+ description: gistgen has several modules to generate summaries from wikipedia and crunchbase
49
+ email:
50
+ - mquannie@gmail.com
51
+ executables: []
52
+
53
+ extensions: []
54
+
55
+ extra_rdoc_files: []
56
+
57
+ files:
58
+ - .gitignore
59
+ - Gemfile
60
+ - Rakefile
61
+ - gistgen.gemspec
62
+ - lib/gistgen.rb
63
+ - lib/gistgen/crunch_view.rb
64
+ - lib/gistgen/hacker_news.rb
65
+ - lib/gistgen/page.rb
66
+ - lib/gistgen/reddit.rb
67
+ - lib/gistgen/string.rb
68
+ - lib/gistgen/url.rb
69
+ - lib/gistgen/version.rb
70
+ - lib/gistgen/wiki_abstract.rb
71
+ - tests/crunch_view_test.rb
72
+ - tests/hubs_test.rb
73
+ - tests/wiki_test.rb
74
+ homepage: http://github.com/mquan/gistgen
75
+ licenses: []
76
+
77
+ post_install_message:
78
+ rdoc_options: []
79
+
80
+ require_paths:
81
+ - lib
82
+ required_ruby_version: !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: "0"
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: "0"
94
+ requirements: []
95
+
96
+ rubyforge_project: gistgen
97
+ rubygems_version: 1.7.2
98
+ signing_key:
99
+ specification_version: 3
100
+ summary: generate different types of summaries for a text
101
+ test_files: []
102
+