chomchom 0.3.1

data/.DS_Store ADDED
Binary file
data/.gitignore ADDED
@@ -0,0 +1,4 @@
+ *.gem
+ .bundle
+ Gemfile.lock
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source "http://rubygems.org"
+
+ # Specify your gem's dependencies in chomchom.gemspec
+ gemspec
data/README ADDED
@@ -0,0 +1,15 @@
+ Usage
+
+   require 'chomchom'
+   require 'mechanize'
+   agent = Mechanize.new
+   agent.get("http://gistpoint.com/")
+   html = agent.page.body
+
+   doc = Chomchom::Document.new(html)
+   doc.title
+   doc.publish_date
+   doc.author
+   doc.fulltext
+   doc.all_topics
+   doc.center_of_gravity
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ require 'bundler'
+ Bundler::GemHelper.install_tasks
data/chomchom.gemspec ADDED
@@ -0,0 +1,29 @@
+ # -*- encoding: utf-8 -*-
+ $:.push File.expand_path("../lib", __FILE__)
+ require "chomchom/version"
+
+ Gem::Specification.new do |s|
+   s.name        = "chomchom"
+   s.version     = Chomchom::VERSION
+   s.platform    = Gem::Platform::RUBY
+   s.authors     = ["Quan Nguyen"]
+   s.email       = ["mquannie@gmail.com"]
+   s.homepage    = "http://github.com/mquan/chomchom"
+   s.summary     = %q{chomchom is a ruby gem that extracts key information from an html page}
+   s.description = %q{chomchom extracts an article's title, publish date, author, and fulltext. It also detects videos and audio to classify the media type of a given page}
+
+   s.rubyforge_project = "chomchom"
+
+   s.files         = `git ls-files`.split("\n")
+   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+   s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
+   s.require_paths = ["lib"]
+
+   s.add_dependency(%q<mechanize>)
+   s.add_dependency(%q<nokogiri>)
+   s.add_dependency(%q<ruby-stemmer>)
+   s.add_dependency(%q<ruby-readability>)
+   s.add_dependency(%q<htmlentities>)
+   s.add_dependency(%q<json>)
+
+ end
data/lib/.DS_Store ADDED
Binary file
data/lib/chomchom.rb ADDED
@@ -0,0 +1,59 @@
+ $: << File.dirname(__FILE__)
+ require 'chomchom/summary'
+ require 'chomchom/topic'
+ require 'chomchom/extractor'
+
+ require 'chomchom/scorer'
+ require 'chomchom/social_analytics'
+
+ module Chomchom
+   class Document
+     def initialize(html)
+       @extr = Chomchom::Extractor.new(html)
+       @title = @extr.title
+       @fulltext = @extr.fulltext
+     end
+
+     def title
+       @title
+     end
+
+     def fulltext
+       @fulltext
+     end
+
+     def publish_date
+       @extr.publish_date
+     end
+
+     def author
+       @extr.author
+     end
+
+     def consume_duration
+       @extr.consume_duration
+     end
+
+     #caches @all_topics, which the summary methods below read,
+     #so call all_topics before any of them
+     def all_topics
+       @all_topics = Chomchom::Topic.new(@fulltext, @title, 1).singles
+     end
+
+     def center_of_gravity(length=400)
+       Chomchom::Summary.new.center_of_gravity(@fulltext, @all_topics, length)
+     end
+
+     def first_mentions(length=400)
+       Chomchom::Summary.first_mentions(@fulltext, @all_topics, length)
+     end
+
+     def topic_sentences(length=400)
+       Chomchom::Summary.topic_sentences(@fulltext, @all_topics, length)
+     end
+
+     def best_sentences(length=400)
+       Chomchom::Summary.best_sentences(@fulltext, @all_topics, length)
+     end
+   end
+ end
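
A minimal end-to-end sketch of the Document API above (the URL is a placeholder; note that all_topics must be called before the summary methods, since they read the @all_topics list it caches):

    require 'chomchom'
    require 'mechanize'

    agent = Mechanize.new
    agent.get("http://example.com/some-article")
    doc = Chomchom::Document.new(agent.page.body)

    topics = doc.all_topics          #[[word, frequency], ...]; also caches @all_topics
    puts doc.center_of_gravity(300)  #extractive summary of roughly 300 characters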
data/lib/chomchom/.DS_Store ADDED
Binary file
data/lib/chomchom/extractor.rb ADDED
@@ -0,0 +1,146 @@
+ require 'nokogiri'
+ require 'readability'
+ require 'date'
+ require 'htmlentities'
+ require 'iconv'
+
+ require "chomchom/regex_path"
+
+ module Chomchom
+   class Extractor
+     WPM = 250 #average reading speed
+
+     #parameters for max number of topics to retrieve
+     MAX_MONOS = 5
+     MAX_MULTIS = 3
+
+     #TODO: the current ruby-readability doesn't pull next pages' text
+     def initialize(html_txt)
+       #fix utf-8 invalid string
+       #http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
+       ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
+       html = ic.iconv(html_txt + ' ')[0..-2]
+
+       begin
+         @fulltext = Readability::Document.new(html).content
+       rescue
+         @fulltext = ''
+       end
+
+       @fulltext = @fulltext.gsub(/\s+/," ").gsub(/<\/.*?>/, "\n").gsub(/<.*?>/,'')
+       @fulltext = HTMLEntities.new.decode(@fulltext) #decode html
+
+       @title = Nokogiri::XML(html.scan(/<title.*>(?:\n|.)*?<\/title>/i)[0])
+       @title = (@title)? @title.inner_text.gsub(/^\s+/,'').gsub(/\s+$/,'').gsub(/\n+/,' ') : ''
+       @title = HTMLEntities.new.decode(@title)
+
+       #use greedy match for <body> to cover embedded frames
+       @body = html.match(/<body.*?>(?:\n|.)*<\/body>/i)
+       @body = (@body)? @body[0] : '<body></body>'
+
+       #remove scripts, styles, frames, and comments (all non-greedy)
+       @body.gsub!(/<script.*?>(?:\n|.)*?<\/script>/i,'')
+       @body.gsub!(/<style.*?>(?:\n|.)*?<\/style>/i,'')
+       @body.gsub!(/<frame.*?>(?:\n|.)*?<\/frame>/i,'')
+       @body.gsub!(/<iframe.*?>(?:\n|.)*?<\/iframe>/i,'')
+
+       @body = @body.gsub(/<!\-\-(?:\n|.)*?\-\->/,'').gsub(/\s+/,' ').gsub(/\n+/,"\n")
+
+       @body_dom = Nokogiri::XML(@body)
+     end
+
+     #readability getArticleTitle
+     def readability_title
+       title = ''
+       if @title.match(/[\|\-]/)
+         title = @title.scan(/(.*)[\|\-].*/).flatten[0]
+         title = @title.scan(/[^\|\-]*[\|\-](.*)/).flatten[0] if title.split(' ').size < 3
+       elsif @title.index(': ')
+         title = @title.scan(/.*:(.*)/).flatten[0]
+       elsif @title.length > 150 or @title.length < 15
+         h1s = @body_dom.xpath(".//h1")
+         title = h1s[0].inner_text if h1s and h1s.size > 0
+       end
+       title = @title if title.split(' ').size <= 4
+       title.gsub(/^\s+/,'')
+     end
+
+     #title retrieval strategy:
+     #1. get all the elements with class/id="...title|head..." and h1-h3
+     #2. match them against the page title to get a bunch of candidates
+     #3. take the longest candidate; fall back to the original title if no candidate is available
+     def title
+       titles1 = @body_dom.xpath(".//*[regex(.,'.*title|head.*','id|class')]", Chomchom::RegexPath.new).map { |n| n.inner_text }
+       titles2 = @body_dom.search('//h1','//h2').map { |n| n.inner_text }
+       titles = (titles1 + titles2).flatten.compact
+       candidates = titles.select { |t| @title.downcase.include?(t.downcase) }
+       #select the longest candidate as title
+       if candidates.size > 0
+         title = ''
+         candidates.each { |c| title = c if c.length > title.length }
+         title.gsub(/\s+/,' ').gsub(/\n+/,'')
+       else
+         @title
+       end
+     end
+
+     #match and select the publish date. Strategy:
+     #1. scan for the most-used patterns
+     #2. take the one at the very top (usually the one near the title) - this fails for pages displaying today's date
+     #3. parse to a Date object (ruby amazingly handles all the different formats)
+     #Note: won't work for pages using javascript to write the date
+     #agent.page.response['Last-Modified'] doesn't work b/c most pages now are dynamically generated
+     MONTHS_RE = "(?:#{(Date::MONTHNAMES + Date::ABBR_MONTHNAMES).compact.join("|")})"
+     def publish_date
+       dates = @body.scan(/(?:(#{MONTHS_RE}[^\w]+\d{1,2}(?:th|st|nd|rd)?[^\w]+(?:\d{4}|\d{2})?)[^\w]) |
+         (?:(\d{1,2}(?:th|st|nd|rd)?\s#{MONTHS_RE}[^\w]+(?:\d{4}|\d{2})?)[^\w]) |
+         (?:(\d{1,2}\-\d{1,2}\-\d{4})[^\w]) | (?:(\d{1,2}\.\d{1,2}\.\d{4})[^\w]) | (?:(\d{1,2}\/\d{1,2}\/\d{4})[^\w]) |
+         (?:(\d{4}\-\d{1,2}\-\d{1,2})[^\w]) | (?:(\d{4}\.\d{1,2}\.\d{1,2})[^\w]) | (?:(\d{4}\/\d{1,2}\/\d{1,2})[^\w])
+         /ix).flatten.compact
+
+       dates.delete_if { |d| is_not_date(d) } if dates
+       begin
+         Date.parse(dates[0])
+       rescue
+         Date.today
+       end
+     end
+
+     def author
+       writers = @body_dom.xpath(".//*[regex(.,'.*author|byline|auth.*','id|class|href')]", Chomchom::RegexPath.new).map { |n| n.inner_text }
+       writers = writers.flatten.compact
+       (writers and writers[0])? writers[0].gsub(/^\s+/,'') : ''
+     end
+
+     def fulltext
+       @fulltext
+     end
+
+     #return time in minutes
+     #factor in other embedded media duration
+     def consume_duration
+       (@fulltext)? (@fulltext.gsub(/<.*?>/,'').split(/[\s\n]/).size.to_f/WPM).ceil : 0
+     end
+
+     private
+     #eliminate things that aren't dates: 0/0/2001 1/1/7493
+     def is_not_date(date)
+       if date.match(/(\d{4}[\-\.\/]\d{1,2}[\-\.\/]\d{1,2})/)
+         tmp = date.split(/[\-\.\/]/)
+         y = tmp[0].to_i
+         m = tmp[1].to_i
+         d = tmp[2].to_i
+         y > Date.today.year or m <= 0 or m > 12 or d <= 0 or d > 31
+       elsif date.match(/(\d{1,2}[\-\.\/]\d{1,2}[\-\.\/]\d{4})/)
+         tmp = date.split(/[\-\.\/]/)
+         y = tmp[2].to_i
+         m = tmp[1].to_i
+         d = tmp[0].to_i
+         y > Date.today.year or m <= 0 or d <= 0
+       else
+         false
+       end
+     end
+
+   end
+ end
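
For illustration, a hedged sketch of driving Extractor directly (it is normally reached through Chomchom::Document; the file name is a stand-in):

    require 'chomchom/extractor'

    extr = Chomchom::Extractor.new(File.read('article.html'))
    puts extr.title
    puts extr.publish_date      #a Date; falls back to Date.today when nothing parses
    puts extr.consume_duration  #estimated reading time in minutes at WPM = 250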
data/lib/chomchom/regex_path.rb ADDED
@@ -0,0 +1,11 @@
+ #custom xpath function for regexp matching of element attributes
+ #atts is a "|"-delimited string of attribute names to OR together ("id|class")
+ module Chomchom
+   class RegexPath
+     def regex(node_set, re, atts)
+       node_set.find_all do |node|
+         atts.split('|').detect { |att| node[att] =~ /#{re}/ }
+       end
+     end
+   end
+ end
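
For illustration, a minimal sketch of how Nokogiri dispatches the regex() call used in Extractor's XPath queries to this handler (the HTML snippet is made up):

    require 'nokogiri'
    require 'chomchom/regex_path'

    html = '<div class="post-title">Hello</div><div class="meta">2011</div>'
    dom = Nokogiri::HTML(html)
    #regex(., re, atts) runs against each candidate node via Chomchom::RegexPath#regex
    nodes = dom.xpath(".//*[regex(.,'.*title.*','id|class')]", Chomchom::RegexPath.new)
    puts nodes.map { |n| n.inner_text }  #=> Hello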
data/lib/chomchom/scorer.rb ADDED
@@ -0,0 +1,189 @@
+ #coding: utf-8
+ require 'iconv'
+ require 'lingua/stemmer' #https://github.com/aurelian/ruby-stemmer
+ require 'chomchom/string'
+
+ module Chomchom
+   class Scorer
+     def score(text, summary, topics)
+       #solve the utf-8 invalid string error
+       ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
+       text = ic.iconv(text + ' ')[0..-2]
+       summary = ic.iconv(summary + ' ')[0..-2]
+
+       #step 1: prep the texts for analysis
+       stemmer = Lingua::Stemmer.new(:language => 'en')
+
+       stem_topics = topics.map { |t| stemmer.stem(t) }
+
+       text_sentences = text.downcase.split_sentences
+       tss = text_sentences.map { |ts|
+         #stemmer.stem(ts) if ts.match(/\p{Word}+/)
+         words = ts.downcase.split(/[^\p{Word}]/).map { |w| stemmer.stem(w) if w and w.size > 1 and !w.is_common? }.compact
+         words if ts.match(/\p{Word}+/) and words.size > 0
+       }.compact
+
+       #rudimentary sentence scoring (number of non-common words)
+       #another scoring approach is to manually go through each sentence and mark the important ones
+       #do they have an identifiable pattern (have a topic and some other words?)
+       #or are the first and last paragraphs important? the first sentence in a paragraph?
+       tss_scores = tss.map { |ts| ts.uniq.size }
+
+       #File.open("fulltexts/#{title}.txt", "w") do |f|
+       #  text_sentences.map { |ts| ts if ts.match(/\p{Word}+/) }.compact.each_with_index { |ts,i| f.puts "#{i} #{ts}" }
+       #end
+
+       #step 2: coverage analysis by performing exact word matches (with stemming)
+
+       #evaluate the whole summary; this will more likely increase the score
+       #coverages = find_coverages(summary, ts)
+
+       #separating by sentences has the effect of designating each sentence to a section
+       coverages = []
+       #ss = summary.downcase.split(/(?:\.+[^\p{Word}])|\n+/).each { |s|
+       ss = summary.downcase.split_sentences.each { |s|
+         coverages.push(find_coverages(s, tss, stem_topics)) if s.match(/\p{Word}+/)
+       }
+
+       #step 3: synonym analysis and domain-specific fusion on words that didn't match
+       #since the matched ones are already established, it's less likely that a word carries double meanings in the same story
+
+       #step 4: compute the coverage score
+       covered = coverages.flatten.uniq
+
+       #redundancy = coverages.flatten.size - coverages.flatten.uniq.size
+       #uncovered = (0...ts.size).to_a.select { |i| i if !covered.delete(i) }
+
+       #this treats every uncommon word as 1 unit
+       total_score = tss_scores.inject(0) { |sum, score| sum + score }
+       summary_score = covered.inject(0) { |sum, i| sum + tss_scores[i] }
+       #puts "#{total_score} #{tss_scores}"
+       #puts "#{summary_score} #{covered.map{|i| tss_scores[i]}}"
+
+       #this treats every sentence as 1 unit (all sentences created equal)
+       #puts "#{covered.size.to_f/tss.size*100}"
+
+       #punish for length with the idea of a length_tax: no tax below 100 characters, then progressively increase
+       summary_score.to_f/total_score*100*(1-length_tax(summary.size))
+
+       #algo weaknesses:
+       #a passage extracted from the text often scores higher (b/c of exact word matches)
+       #people listing the most-occurring words in every sentence. check for proper grammar and coherence?
+       #negation: take a high scoring summary, say the same thing but negate its meaning? check for meaning?
+     end
+
+     private
+     #progressive length tax
+     #max = .025 + .05 + .1 = .175 (17.5%)
+     #no punishment for a short summary b/c it won't be able to cover as much by itself
+     def length_tax(summary_size)
+       if summary_size <= 100
+         0
+       elsif summary_size <= 200
+         (summary_size-100)*0.00025
+       elsif summary_size <= 300
+         0.025 + (summary_size-200)*0.0005
+       else
+         0.025 + 0.05 + (summary_size-300)*0.001
+       end
+     end
+
+     def find_coverages(summary, text_sentences, topics)
+       terms = []
+       hits = [] #array of indexes of sentences with matched terms
+       stemmer = Lingua::Stemmer.new(:language => 'en')
+       summary.split(/[^\p{Word}]/).each do |w|
+         positions = []
+         if !w.is_common?
+           #take word stemming, synonyms, AND domain-specific fusion into consideration
+           word = stemmer.stem(w)
+           text_sentences.each_with_index { |sentence, i| positions.push(i) if sentence.index(word) }
+           if positions.size > 0 and word != "" and word.size > 1
+             terms.push(word)
+             hits.push(positions)
+           end
+         end
+         #need to get word positions, not just sentence positions,
+         #to see words chain into phrases (2 uncommon consecutive words is very good)
+       end
+       terms.uniq!
+
+       #puts "----------#{terms}"
+       stretches = possible_stretches(hits.flatten)
+       stretches = stretches.map do |stretch|
+         stretch_text = " " + text_sentences[stretch.first..stretch.last].flatten.join(" ") + " "
+         count = 0
+         nontopics = 0
+         terms.each { |term|
+           if stretch_text.match(/\b#{Regexp.quote(term)}\b/)
+             count += 1
+             nontopics += 1 if !topics.index(term)
+           end
+         }
+
+         #a long stretch may fail the count >= stretch.size requirement,
+         #but some sentence in there might be valuable, so break it up and analyze each sentence
+         second_chances = []
+         if (count < stretch.size and stretch.size > 3)
+           stretch.each do |i|
+             sentence = text_sentences[i].join(" ")
+             topic_count = 0
+             nontopic_count = 0
+             second_chances.push(i) if terms.detect { |term|
+               topic_count += 1 if sentence.index(term) and topics.index(term)
+               nontopic_count += 1 if sentence.index(term) and !topics.index(term)
+               (topic_count >= 1 and nontopic_count > 1) or nontopic_count > 1
+             }
+           end
+         end
+
+         #uniq_terms = []
+         #terms.each { |term| uniq_terms.push(term) if stretch_text.match(/\b#{term}\b/) }
+         #puts "#{stretch} #{uniq_terms} #{count >= stretch.size and nontopics > 0}" if count >= stretch.size and count > 1 and nontopics > 0
+         #puts "2nd: #{second_chances}" if second_chances.size > 0
+
+         if second_chances.size > 0
+           second_chances
+         elsif (count >= stretch.size) and count > 1 and nontopics > 0
+           stretch
+         else
+           nil
+         end
+       end
+       stretches.compact
+
+       #right now it's either covered or not
+       #think up a strategy to compute confidence in coverage (1 as very confident, then gradually less than one)
+
+       #it's possible, though unlikely, to have 1 sentence covering 2 different sections far apart
+       #knowing this, a sentence should only have a few stretches, not too far apart
+       #and corresponding to the summary sentence index (i=0 ==> stretch [7..13] rather than [103...109])
+     end
+
+     #find coverage stretches by the number of consecutive sentences covered
+     #give more probability to the stretch whose sentences have the most repetitions
+     #give more weight to a summary sentence corresponding to the equivalent text section
+     #(the first sentence is more probable for the first few sentences of the article)
+     def possible_stretches(hits)
+       coverages = hits.uniq.map { |hit| [hit, hits.count(hit)] }.sort { |a,b| a[0] <=> b[0] }
+       stretches = []
+       last = -1
+       count = 0
+       stretch = []
+       coverages.each { |cover|
+         if (last == cover[0] - 1) #stitch continuous stretches together
+           stretch.push(cover[0])
+           count += cover[1]
+         else
+           stretches.push([stretch, count]) #the first element in stretches will be junk
+           stretch = [cover[0]]
+           count = cover[1]
+         end
+         last = cover[0]
+       }
+       #remove insignificant short stretches with low hit counts
+       stretches = stretches.delete_if { |s| s[1] <= s[0].size and s[0].size < 3 }
+       stretches.map { |s| s[0] }
+     end
+   end
+ end
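
A hedged usage sketch for the scorer (the inputs are illustrative; topics is a plain word list, as in tests/scoring.rb, and the result is a 0-100 coverage percentage reduced by the length tax):

    require 'chomchom/scorer'

    text = File.read('fulltext.txt')   #article fulltext, placeholder path
    summary = "A short candidate summary of the article."
    topics = ["article", "summary"]

    puts Chomchom::Scorer.new.score(text, summary, topics)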
data/lib/chomchom/social_analytics.rb ADDED
@@ -0,0 +1,106 @@
+ require 'mechanize'
+ require 'json'
+ require 'digest/md5'
+
+ module Chomchom
+   class SocialAnalytics
+
+     #http://news.ycombinator.com/item?id=2347428
+     #http://sharedcount.com/documentation.php
+     def initialize(url)
+       @url = url
+       @agent = Mechanize.new
+       @agent.user_agent = "chomchom social analytics"
+     end
+
+     def facebook
+       begin
+         @agent.get("http://graph.facebook.com/#{@url}")
+         facebook = JSON.parse(@agent.page.body)['shares']
+       rescue
+         0
+       end
+     end
+
+     #this breaks down the counts and possibly allows multiple urls
+     def facebook_more
+       @agent.get("http://api.ak.facebook.com/restserver.php?v=1.0&method=links.getStats&urls=#{@url}&format=json")
+       json = JSON.parse(@agent.page.body)
+       share_count = json['share_count']
+       like_count = json['like_count']
+       comment_count = json['comment_count']
+       total_count = json['total_count'] #same as above
+       click_count = json['click_count']
+     end
+
+     def twitter
+       begin
+         @agent.get("http://urls.api.twitter.com/1/urls/count.json?url=#{@url}")
+         twitter = JSON.parse(@agent.page.body)['count'].to_i
+       rescue
+         0
+       end
+     end
+
+     def digg
+       begin
+         @agent.get("http://widgets.digg.com/buttons/count?url=#{@url}")
+         digg = JSON.parse(@agent.page.body.match(/\{.*?\}/)[0])['diggs']
+       rescue
+         0
+       end
+     end
+
+     def delicious
+       begin
+         @agent.get("http://feeds.delicious.com/v2/json/urlinfo/#{Digest::MD5.hexdigest(@url)}")
+         json = JSON.parse(@agent.page.body)[0]
+         #puts tags = json['top_tags']
+         delicious = json['total_posts']
+       rescue
+         0
+       end
+     end
+
+     def stumbleupon
+       begin
+         @agent.get("http://www.stumbleupon.com/services/1.01/badge.getinfo?url=#{@url}")
+         stumbleupon = JSON.parse(@agent.page.body)['result']['views']
+       rescue
+         0
+       end
+     end
+
+     def google_buzz
+       begin
+         @agent.get("http://www.googleapis.com/buzz/v1/activities/count?alt=json&url=#{@url}")
+         google_buzz = JSON.parse(@agent.page.body)['data']['counts'][@url][0]['count']
+         #check multiple sites
+         #"https://www.googleapis.com/buzz/v1/activities/count?alt=json&url=http://news.ycombinator.com&url=http://www.techcrunch.com&url=http://www.cnn.com"
+       rescue
+         0
+       end
+     end
+
+     def linkedin
+       begin
+         @agent.get("http://www.linkedin.com/cws/share-count?url=#{@url}")
+         linkedin = JSON.parse(@agent.page.body.match(/\{.*?\}/)[0])['count']
+       rescue
+         0
+       end
+     end
+
+     #http://code.google.com/p/bitly-api/wiki/ApiDocumentation#/v3/clicks
+     #needs the shortened url and a key (free signup)
+     def bitly
+
+     end
+
+     #they need a key, no commercial use
+     #http://www.backtype.com/developers
+     def backtweets
+       "http://backtweets.com/search.json?q=http://news.ycombinator.com/&key=key"
+     end
+   end
+ end
data/lib/chomchom/string.rb ADDED
@@ -0,0 +1,44 @@
+ #coding: utf-8
+
+ class String
+   #split text into sentences, taking into account that Mr.|Ms. endings are not ends of sentences
+   def split_sentences
+     #break the text first by paragraph, then into chunks delimited by a period
+     #but these are not quite sentences yet
+     chunks = (self.split(/\n+/).map { |p| "#{p}\n".split(/[!?]+|(?:\.+(?:[^\p{Word}]))/) }).flatten.compact
+
+     #if a sentence is split at Mr.|Ms.|Dr.|Mrs.
+     #then recombine it with its remaining part and nil it to delete later
+     tmp = ''
+     sentences = chunks.map { |c|
+       ss = (tmp != '')? "#{tmp}. #{c}" : c
+       if c.match(/(?:Dr|Mr|Ms|Mrs)$/)
+         #what about John F. Kennedy ([A-Z])
+         #I finish at 5 a.m. today.
+         #At 5 p.m. I have to go to the bank
+         #rule 1: every sentence starts with a cap (what about iPhone?)
+         #just check if a sentence is too short, then combine with the previous or next?
+         tmp = ss
+         ss = nil
+       else
+         tmp = ''
+       end
+       ss
+     }
+     sentences.compact #delete nil elements
+   end
+
+   #constrain a string to a fixed length or less
+   #discard everything after the last punctuation that occurs right before the length limit
+   #the regexp looks ahead for any punctuation
+   def limit(length)
+     (self.length > length)? self[0...length].gsub(/(?![\s\S]+?[,:;)\/\\\|])([,:;)\/\\\|].*)/,'') : self
+   end
+
+   #common dictionary built from google's top 300 1-grams
+   #hand-removed some such as english, god, american, united, states, john
+   def is_common?
+     common = " the of and to in a is that was for as with be by it his which on i not he or are from at this have had but were an their they all one been we has you who so more will her him them would its no may other there when than into any only time if some can these my such out two our very up should she me made about upon what most said could also do must then those great same being after man much many now over before well between where like under us through own life men even your work did see good without people part t little day shall each found new every make long mr might three against place both because himself down never used while still too how old case given however use another world know de called right take here last general whole though water country number state large come say form year less few far order does came during small again just back among yet give hand left different having thought always fact end high go per taken often within p things course certain others off cannot means think find above therefore side since am ever known themselves once set thus seen following nothing until whom house four away second itself whose put possible either rather several best took went done d almost subject words become true head necessary young better get common whether half cases brought least nor early five later full thing already together "
+     common.include?(" #{self.downcase} ")
+   end
+ end
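
A brief sketch of the String extensions above (outputs are indicative, assuming the implementation as written):

    require 'chomchom/string'

    "Dr. Smith arrived.\nHe spoke. Everyone listened.".split_sentences
    #=> sentence chunks, with the split at "Dr." stitched back together

    "the".is_common?      #=> true
    "stemmer".is_common?  #=> false

    "a long line, with a trailing clause that gets cut".limit(20)
    #=> at most 20 characters, trimmed back to the last punctuation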
data/lib/chomchom/summary.rb ADDED
@@ -0,0 +1,158 @@
+ require "chomchom/string"
+
+ module Chomchom
+   class Summary
+     #select the stretch with the highest scoring sentences; basically captures the center of gravity of the article
+     #pros: very coherent and computationally feasible
+     #cons: not good with coverage; only good when the captured passage is a summarizing intro/conclusion, otherwise it's just the key paragraph, not the whole
+     def center_of_gravity(text, topics, length=500)
+       sentences = text.split_sentences
+       summary = ''
+       if sentences.size > 0
+         start_index = 0
+         stop_index = 0
+         best_score = 0
+         (0...sentences.size).each do |i|
+           j = passage_last_index(sentences, i, length) #this returns the index of the last sentence
+           #avoid extracting a passage from 2 different paragraphs
+           #this usually lowers the score b/c less text means less match against topics
+           #but if a short passage has a higher score then more power to it
+           passage = get_passage(sentences, i, j)
+
+           #the following score computation doesn't account for diversity,
+           #so it often gives passages where the main topics are repeated in every sentence
+           #current_score = scores[i..j].inject { |sum, sc| sum + sc }
+
+           #this computation counts all topics once per passage
+           current_score = Chomchom::Summary.compute_score(passage, topics)
+           if best_score < current_score
+             best_score = current_score
+             start_index = i
+             stop_index = j
+           end
+         end
+
+         #use the intro if the score is too low
+         if best_score < 3
+           start_index = 0
+           stop_index = passage_last_index(sentences, start_index, length)
+
+           #the following avoids using intros that are too short (usually the title)
+           #start_index = (0...sentences.size).detect { |i| get_passage(sentences,i,stop_index=passage_last_index(sentences, i, length)).split(' ').size > 5 }
+         end
+
+         #the .limit(length) prevents a single sentence that's longer than the allowable length
+         #select a substring within the max length by removing everything after the last occurrence of a punctuation
+         summary = get_passage(sentences, start_index, stop_index).limit(length)
+       end
+       summary
+     end
+
+     #a variation of topic sentence extraction: this starts with the most important topic,
+     #extracts the first sentence mentioning it, and does the same for the next topic unless it's already mentioned by a previous sentence
+     #continue until the length is reached or all topics are covered
+     #pros: fairly coherent and decent coverage
+     #cons: an irrelevant long intro mentioning the main topics will throw this off
+     def self.first_mentions(text, topics, length=500)
+       sentences = text.split_sentences
+       summary = Chomchom::Summary.love_at_first_sight(sentences, topics, length)
+     end
+
+     #the result of this is almost identical to first_mentions, except this runs a greater risk of not reaching the length
+     #it is also marginally more computationally expensive
+     def self.topic_sentences(text, topics, length=400)
+       topic_sentences = []
+       paragraphs = text.split(/\n+/).each do |p|
+         sentences = p.split_sentences
+         topic_sentences.push(sentences[0]) if sentences[0] and Chomchom::Summary.compute_score(sentences[0], topics) > topics.last[1]
+       end
+
+       summary = Chomchom::Summary.love_at_first_sight(topic_sentences, topics, length)
+     end
+
+     #select the highest scoring sentence from each paragraph, then run love_at_first_sight
+     def self.best_sentences(text, topics, length=400)
+       paragraphs = text.split(/\n+/)
+       best_sentences = []
+       paragraphs.each do |p|
+         sentences = p.split_sentences
+         best_score = 0
+         index = 0
+         sentences.each_with_index do |s, i|
+           current_score = Chomchom::Summary.compute_score(s, topics)
+           if best_score < current_score
+             index = i
+             best_score = current_score
+           end
+         end
+         best_sentences.push(sentences[index]) if sentences[index] and best_score > topics.last[1]
+       end
+
+       summary = Chomchom::Summary.love_at_first_sight(best_sentences, topics, length)
+     end
+
+     #sum up the score of each topic that occurs in the text
+     def self.compute_score(text, topics)
+       begin
+         sum = 0
+         #compute a geometric sum of occurrences (1 occurrence = 1/2*score, 2 occurrences = (1/2+1/4)*score)...
+         #SUM(score*r^k)k:0..n = a*(1-r^(n+1))/(1-r), a=score/2 and r=1/2
+         #this is to limit too much diversity; a single mention of a topic shouldn't get all the score
+         #if a topic has a high score that means it's important, and mentioning it several times in the summary should be rewarded regressively
+         topics.each do |t|
+           f = text.scan(/\b#{Regexp.quote(t[0])}\b/).size
+           sum += t[1]*(1-(1/2.0)**(f+1))/(1-1/2.0) if f > 0
+         end
+         sum
+       #rescue
+       #  0
+       end
+     end
+
+     #for each topic, select the first sentence that has the topic, unless the summary already covers it
+     def self.love_at_first_sight(sentences, topics, length)
+       separator = "~@#"
+       summary = ''
+       t = 0
+       points = []
+       while summary.size < length and t < topics.size
+         if summary.match(/\b#{Regexp.quote(topics[t][0])}\b/)
+           #find the next occurrence in a sentence not already in the summary
+           #what if this sentence will be covered by the next topics?
+         else
+           match_sentence = sentences.detect { |s| s.match(/\b#{Regexp.quote(topics[t][0])}\b/) }
+           if match_sentence and (new_summary = summary + match_sentence + separator).size < length
+             summary = new_summary
+             points.push(sentences.index(match_sentence)) #track sentence order
+           end
+         end
+         t += 1
+       end
+       #have a strategy to include other sentences when the summary is less than half the length
+       #a backups array which stores possible candidates, sorted by score
+       #run a loop and add to points if summary is < length
+       #for a low-topic article like the reddit one (no candidates) just use the unused topic sentences
+
+       #or unused = points.each { |i| sentences.delete_at(i) } #must delete from the highest index back
+       #then rerun this first_sight search
+
+       #reorder the summary
+       points.sort! { |a,b| a <=> b }
+       summary = points.map { |i| sentences[i] }.join(separator).gsub(/\n+/,"").gsub(/\s+/," ")
+     end
+
+     private
+     #start from start_index until the combined sentences exceed max_length
+     #return the index of the last sentence in that passage
+     def passage_last_index(sentences, start_index, max_length=500)
+       stop = ((start_index+1)...sentences.size).detect { |i| (sentences[start_index..i].join('. ')).size > max_length }
+       (stop)? stop-1 : sentences.size-1
+     end
+
+     def get_passage(sentences, start_index, stop_index)
+       passages = sentences[start_index..stop_index].join('. ').split("\n")
+       (passages.size > 0)? passages[0].gsub(/^[^\w]+/,'') : ''
+     end
+
+   end
+ end
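
A worked instance of the geometric scoring in compute_score above (topics are [word, score] pairs as produced by Topic#singles; the strings are illustrative):

    #topic "ruby" has score 4 and occurs twice, so f = 2
    Chomchom::Summary.compute_score("ruby is fast. ruby rocks", [["ruby", 4]])
    #=> 4*(1-(1/2.0)**3)/(1-1/2.0) = 7.0; each extra mention adds half as much as the last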
data/lib/chomchom/topic.rb ADDED
@@ -0,0 +1,44 @@
+ #coding: utf-8
+ require "chomchom/string"
+
+ module Chomchom
+   class Topic
+     MAX = 8
+
+     def initialize(text, title='', title_weight=1)
+       #support unicode (requires ruby 1.9.x)
+       text = text.force_encoding("UTF-8")
+       title = title.force_encoding("UTF-8")
+       @content = title * title_weight + text.gsub(/\n+/,"\n")
+       @content = @content.force_encoding("UTF-8").downcase
+     end
+
+     def singles
+       words = @content.split(' ').map { |w| w.downcase.gsub(/[^\p{Word}]/, '') }.uniq.delete_if { |w| !w or w.length < 2 or w.is_common? }
+       @singles = words.map { |w| [w, frequency(w)] }
+       @singles = @singles.delete_if { |g| g[1] < 3 }.sort { |a,b| b[1] <=> a[1] }
+       @singles[0..MAX]
+     end
+
+     #this is not for the benefit of summary (but for db storage, so move this into a topic method in chomchom.rb)
+     #merge words before sorting (this keeps the order of words as they appear)
+     #look at each word in single_groups and merge with the others O(n^2) (this is inefficient)
+     #just go through the list in order; for each pair, combine them and switch the order, and take whichever order generates more counts
+     #merge for 2-word, then 3-word only
+     #for 3 (triples) just build from the doubles, then combine with non-overlapping singles
+     #subtract from the count every time you legally take away (the combination is more than 2 and the remainder is more than 2)
+     def multiples
+
+     end
+
+     private
+     def is_substring(mono, multis)
+       multis.detect { |m| m[0].include?(mono[0]) and mono[1] == m[1] }
+     end
+
+     #count the no. of times a word occurs in the text
+     def frequency(word)
+       @content.scan(/\b#{Regexp.quote(word)}[^\p{Word}\.]\b/).size
+     end
+   end
+ end
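
A hedged sketch of the topic extractor (the text path is a placeholder; singles returns up to MAX+1 [word, frequency] pairs, most frequent first, keeping only non-common words seen at least 3 times, with the title weighted by string repetition):

    require 'chomchom/topic'

    text = File.read('fulltext.txt')
    topic = Chomchom::Topic.new(text, "Ruby Memory Management", 2)
    p topic.singles  #e.g. [["ruby", 12], ["memory", 9], ...]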
data/lib/chomchom/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Chomchom
+   VERSION = "0.3.1"
+ end
data/tests/.DS_Store ADDED
Binary file
data/tests/analytics_test.rb ADDED
@@ -0,0 +1,17 @@
+ require "#{File.dirname(__FILE__)}/../lib/chomchom/social_analytics"
+
+ urls = ["http://news.ycombinator.com/", "http://news.ycombinator.com", "mydomain.local",
+   "http://itmanagement.earthweb.com/entdev/article.php/3930466/That-Developers-Salary-is-Bigger-than-Mine.htm"
+ ]
+
+ urls.each do |url|
+   puts url
+   analytics = Chomchom::SocialAnalytics.new(url)
+   puts "facebook: #{analytics.facebook}"
+   puts "twitter: #{analytics.twitter}"
+   puts "digg: #{analytics.digg}"
+   puts "delicious: #{analytics.delicious}"
+   puts "stumbleupon: #{analytics.stumbleupon}"
+   puts "google buzz: #{analytics.google_buzz}"
+   puts "linkedin: #{analytics.linkedin}"
+ end
data/tests/benchmark.rb ADDED
@@ -0,0 +1,51 @@
+ require "#{File.dirname(__FILE__)}/../lib/chomchom"
+ require 'mechanize'
+
+ #conclusion: use single center_of_gravity, which is better than cap_summary in every case
+ #topic_sentences is only marginally better than first_mentions in most cases, but it's too costly
+ #both topic_sentences and first_mentions are rarely coherent; they also often pick up metadata (photo of..., date and location at the beginning)
+ begin
+   urls = [ "http://37signals.com/svn/posts/2800-bootstrapped-profitable-proud-braintree",
+     "http://www.bbc.co.uk/news/world-africa-12726032",
+     "http://www.nytimes.com/2011/03/14/world/asia/14japan.html?_r=1&hp",
+     "http://www.nytimes.com/2006/01/15/magazine/15japanese.html?_r=2&pagewanted=all",
+     "http://www.theatlanticwire.com/national/2011/03/who-said-it-julian-assange-james-okeefe/35759/",
+     "http://blog.reddit.com/2011/03/so-long-and-thanks-for-all-postcards.html",
+     "http://www.mediapost.com/publications/?fa=Articles.showArticle&art_aid=146801&nid=124777",
+     "http://petewarden.typepad.com/searchbrowser/2010/04/how-i-got-sued-by-facebook.html"
+   ]
+
+   agent = Mechanize.new
+   agent.user_agent = "chomchom request client"
+   urls.each do |url|
+     begin
+       agent.get(url)
+       html = agent.page.body
+     rescue
+       html = ''
+     end
+     doc = Chomchom::Document.new(html)
+     puts doc.title
+
+     single_topics = doc.all_topics
+     puts "single topics: " + single_topics.map { |t| "[#{t[0]},#{t[1]}]" }.join(', ')
+
+     single_summary = doc.center_of_gravity
+     puts "score = #{Chomchom::Summary.compute_score(single_summary, single_topics)}"
+     puts "single topics center of gravity: #{single_summary} \n\n"
+
+     first_mentions = doc.first_mentions
+     puts "score = #{Chomchom::Summary.compute_score(first_mentions, single_topics)}"
+     puts "first mentions: #{first_mentions}\n\n"
+
+     topic_sentences = doc.topic_sentences
+     puts "score = #{Chomchom::Summary.compute_score(topic_sentences, single_topics)}"
+     puts "topic_sentences: #{topic_sentences}\n\n"
+
+     cherry_pick = doc.best_sentences
+     puts "score = #{Chomchom::Summary.compute_score(cherry_pick, single_topics)}"
+     puts "cherry pick: #{cherry_pick}\n\n"
+
+     puts "__________________"
+   end
+ end
data/tests/files/.DS_Store ADDED
Binary file
data/tests/files/summaries.txt ADDED
@@ -0,0 +1,29 @@
+ http://www.theatlantic.com/magazine/print/2011/01/hard-core/8327/|||Broadband Internet makes hardcore porn accessible to everyone, and thus influences our sex lives. However, online porn, which often caters to predominantly male consumers' desires, poses a threat to equality between men and women as hardcore sex is legitimized and becomes the norm in everyone's bedroom
+ http://deborahcampbell.org/writing/politics/the-most-hated-name-in-news/|||A look into Al Jazeera's difficulties in breaking into the US market, mainly caused by major cable networks' refusal to carry it after the channel was labeled a terrorist mouthpiece post 9/11. However, a handful of small cable companies have decided to broadcast it, and it has been well received for its fresh and informative worldview coverage.
+ http://www.theatlantic.com/magazine/archive/2003/03/caring-for-your-introvert/2696/|||The self-described introvert author explains introversion and his view on extroverts' treatment of introverts. He describes extroverts as people energized by other people, while introverts find energy in deep thoughts and find socializing exhausting. He believes introverts are misunderstood b/c most of society's activities and expectations are dominated by extroverts.
+ http://www.nybooks.com/articles/archives/2010/nov/25/generation-why/?pagination=false|||The article uses the Social Network movie as a template to contemplate Zuckerberg's motives in creating Facebook and his creation's implications for human interaction, specifically privacy, human relationships and feelings, and your representation as a person online. The author argues that digital profiles aren't adequate to foster the genuine relationships real life offers; this medium even misrepresents your image with a shallow format of trivial personal info and photos.
+ http://www.wired.com/wired/archive/1.04/gibson_pr.html|||Singapore has established its prosperity and cleanliness through an overly controlling government at the cost of free expression. Its population and culture are based on forced conformity that makes it look more like a bland theme park. Finally, it has a harsh attitude toward drug trafficking
+ http://www.newyorker.com/reporting/2008/02/11/080211fa_fact_orlean?currentPage=all|||Hollinger is a passionate inventor who simultaneously wrestles with his numerous projects and ideas. His current project is to reinvent the umbrella, which suffers many design flaws. His solution is a hip umbrella with a stylish shape that uses the principles of aerodynamics to withstand strong wind.
+ http://www.nytimes.com/2001/02/25/magazine/25STOCK-TRADER.html?pagewanted=all|||A 15-year-old boy made close to $1M by plugging stocks he owned on sites such as Yahoo Finance. He exploited the fact that most people trading stocks have no clue what they're doing but are eager to trade based on pundits' and manipulators' info. The SEC gets in and vilifies the kid, because a kid easily rigging the system exposes its flaws and newfound complexities in an Internet age where amateurs can appear as professional as the pros.
+ http://www.stevenberlinjohnson.com/2010/04/the-glass-box-and-the-commonplace-book.html|||Today's Internet and search engines, much like classical texts, are collections of texts from a variety of sources stitched together in ways the original authors may never have imagined. The free mixing of information creates new meaning and value for a healthy information ecosystem. But a new crop of apps such as the iBook, NYT, and WSJ apps deliberately erect a glass box that makes it impossible to copy and paste text for collection. The author advocates for an open web where information can freely flow, connect, and be remixed in new imaginative ways.
+ http://www.wired.com/wired/archive/15.02/yahoo_pr.html|||A look at the path that led to Yahoo falling behind Google, from its failure to recognize and monetize search, to its inability to buy Google, to its failure to integrate its two search-related acquisitions, Inktomi and Overture. These failures are often attributed to ex-CEO Terry Semel and his lack of deep technical understanding, coming from the media world.
+ http://www.gq.com/entertainment/movies-and-tv/201102/the-day-the-movies-died-mark-harris?currentPage=all|||Today's generation of summer blockbuster movies, the genre which dominates the movie landscape, lacks creativity and risk. Movie making is reduced to marketing and packaging, and less about the content. But ultimately, it's about us moviegoers: movies are made for people, and if people stop going then the studios have to make movies for those who do, the under-25 males who always crave superhero comics and refuse to grow up.
+ http://www.theregister.co.uk/2011/03/01/the_rise_and_rise_of_node_dot_js/print.html|||Node.js is a server-side JavaScript framework built to handle dynamic real-time applications. It's based on JavaScript's event-based architecture, which is different from the traditional blocking I/O used in typical web servers. The end result is that developers can now code their high performance stack, both backend and frontend, in the same language.
+ http://www.lycaeum.org/~martins/M2/ventura.html|||Most people work at unfulfilling jobs out of necessity rather than choice. The economy dictates that for businesses to run efficiently, workers at the bottom should have no decision-making power. They have no power and no upside; ultimately their time and economic wellness are compromised by decisions made at the top, by those who profit mightily from actions that lead to the collapse of many industries.
+ http://www.esquire.com/print-this/steve-jobs-1008?page=all|||As Jobs took the stage to introduce the iPhone 3G, his mortality, revealed by his struggle against a rare form of pancreatic cancer, was more apparent than ever. Yet his immortality has been in the making for the last 30 years with each new Apple invention. Each invention strives to surpass its predecessor in ways that exceed everyone's expectations; each carries Jobs' DNA and vision in his quest to redefine the human-machine relationship, and with that his existence, forever.
+ http://www.nytimes.com/2010/08/22/magazine/22Adulthood-t.html?_r=1&pagewanted=print|||The phenomenon of today's young generation taking forever to reach adulthood is defined as a new life stage driven by economic and social changes. Just like a century ago, when psychologists had to make a case for adolescence as a new developmental stage, today's 20-something stage will also lead to changes to accommodate their unique needs and profile. People at this stage tend to be more self-focused, engage in identity exploration, and are optimistic about the future
+ http://www.theatlantic.com/technology/archive/2011/03/from-lulz-to-labor-unions-the-evolution-of-anonymous/72001/|||Anonymous used to center around Internet memes to get laughs within its forum, but recently they have been in the media for promoting freedom of speech against powerful enemies. Their most recent action against security firm HBGary led to the resignation of the company's CEO. But this raises questions about their own agenda: is Anonymous using their DDoS power to suppress freedom of speech?
+ http://www.theatlantic.com/magazine/print/2011/04/north-korea-8217-s-digital-underground/8414/|||North Korea is an isolated country with no access to outside internet or media. But crops of underground reporters run by North Korean defectors and outside activists are trying to bring news into and out of the country. Their operations, while effective in giving the outside world an accurate look into NK, often put their lives at great risk
+ http://ycombinator.com/atyc.html|||A look into the inner workings of Y Combinator, from dinners with famous startup founders, office hours, and fund-raising strategy, to launch and Demo Day preps. YC also offers introductions to valuable angels, VCs, and YC alumni. Ultimately, it's the 3 months of intense focus on everything startup that fuel the productivity and energy of YC founders.
+ http://english.caing.com/2010-07-28/100164846.html|||A group of conspirators plotted elaborate plans to extort money from illegal mine owners by killing mine workers in staged accidents; most of the victims were the conspirators' relatives. In a case where the mine owner refused to pay, the group mistakenly reported the death, which led to their eventual investigation and conviction
+ http://www.dailykos.com/story/2011/02/26/950079/-I-Dont-Want-to-be-a-Teacher-Any-More|||As a student the author already aspired to be a teacher. She taught passionately for 30 years despite many challenges, such as budget cutbacks that resulted in bigger class sizes, her taking a pay cut, cleaning her classroom, and buying school supplies with her own money. When the district failed her school for not meeting its test scores without considering these difficult challenges, it all compounded to finally make her stop being a teacher
+ http://www.xconomy.com/san-francisco/inside-googles-age-of-augmented-humanity/|||Google has the largest and most advanced infrastructure to support its mission of organizing all the information around us. The scope of its operation has expanded beyond search to areas from speech recognition and language translation to visual search. But the backbone of all its innovation is machine learning and the mining of huge amounts of data.
+ http://www.time.com/time/printout/0,8816,2053595,00.html|||Germany's reforms focusing on keeping manufacturing at home have helped its economy grow and kept unemployment low. Like China, Germany carries a trade surplus with other EU nations such as France, Spain, and Ireland, which affects the health of the EU economy and hence the value of the euro.
+ http://thestandard.org.nz/what-will-future-generations-condemn-us-for/|||Like wife beating, slavery, and the hanging of homosexuals, which used to be accepted by the masses and are squarely condemned nowadays, the US prison system, industrial meat production, the institutionalization of the elderly, and the polluting of our environment are destined for future condemnation.
+ http://www.theatlantic.com/magazine/print/2010/07/the-politically-incorrect-guide-to-ending-poverty/8134/|||Light regulation and fair laws in the medieval town of Lubeck attracted merchants and turned it into a prosperous town, with many neighbors adopting its model. Now a Stanford economist, Paul Romer, is advocating the same model to end poverty: creating charter cities with new laws in poor countries. Romer often cites Hong Kong as an example of a successful charter city. Like Lubeck, Hong Kong inspired other nearby Chinese cities to adopt its model, which transformed China into the manufacturing powerhouse it is today. But his plan is often opposed by regional nationalists balking at the idea of giving land to foreigners.
+ http://www.nytimes.com/2011/03/01/science/01angier.html|||As a student, Portman made it to the semifinal round of the Intel Science Talent Search, a grueling competition that requires dedicated hard work. At the same time, she managed to star in many movies, most notably as Queen Amidala in Star Wars. She went on to major in neuroscience at Harvard.
+ http://www.vanityfair.com/business/features/2011/04/jack-dorsey-201104?currentPage=all|||Jack Dorsey, the original Twitter creator, is now CEO of his second creation, Square - a company that allows individuals to process credit cards with their phones. He holds a variety of interests, is friends with many actors like Ashton Kutcher, and aspires to become the mayor of New York City.
+ http://www.independent.co.uk/news/world/modern-art-was-cia-weapon-1578808.html?service=Print|||In a campaign known as the "long leash", the CIA promoted Abstract Expressionist paintings, without the artists' knowledge, as a propaganda weapon during the Cold War. The decision was made to counter Communist propaganda, which often attracted artists and intellectuals, while American democracy was at odds with McCarthy's intolerance. With its many covert operations bringing exhibitions everywhere, the CIA helped Abstract Expressionism become the prominent art movement post WW2.
+ http://www.theatlantic.com/magazine/print/2009/09/how-american-health-care-killed-my-father/7617/|||After the needless death of his father, the author, a business executive, began a personal exploration of a health-care industry that for years has delivered poor service and irregular quality at astonishingly high cost. It is a system, he argues, that is not worth preserving in anything like its current form. And the health-care reform now being contemplated will not fix it. Here's a radical solution to an agonizing problem.
+ http://www.theatlantic.com/magazine/print/2011/03/mind-vs-machine/8386/|||In the race to build computers that can think like humans, the proving ground is the Turing Test, an annual battle between the world's most advanced artificial-intelligence programs and ordinary people. The objective? To find out whether a computer can act "more human" than a person. In his own quest to beat the machines, the author discovers that the march of technology isn't just changing how we live, it's raising new questions about what it means to be human.
+ http://37signals.com/svn/posts/2800-bootstrapped-profitable-proud-braintree|||Bryan Johnson worked as a salesperson selling credit services to businesses, but he hated the job and decided to create Braintree. Braintree avoids the freemium model and charges a premium, a lot higher than its competitors, which helps filter out the inexperienced merchants who don't recognize the pain yet. It generated $4.5M in revenue, grew to 24 employees, and doubled its customer base.
data/tests/functionals.rb ADDED
@@ -0,0 +1,40 @@
+ require "#{File.dirname(__FILE__)}/../lib/chomchom"
+ require 'mechanize'
+
+ begin
+   urls = ["http://www.foundersatwork.com/blog.html",
+     "http://www.independent.co.uk/opinion/commentators/johann-hari/the-dark-side-of-dubai-1664368.html",
+     "http://37signals.com/svn/posts/2800-bootstrapped-profitable-proud-braintree",
+     "http://www.bbc.co.uk/news/world-africa-12726032",
+     "http://danluan.org/node/7967",
+     "http://www.nytimes.com/2011/03/14/world/asia/14japan.html?_r=1&hp",
+     "http://www.nytimes.com/2006/01/15/magazine/15japanese.html?_r=2&pagewanted=all",
+     "http://www.theatlanticwire.com/national/2011/03/who-said-it-julian-assange-james-okeefe/35759/",
+     "http://www.google.com/",
+     "http://blog.reddit.com/2011/03/so-long-and-thanks-for-all-postcards.html",
+     "http://www.mediapost.com/publications/?fa=Articles.showArticle&art_aid=146801&nid=124777",
+     "http://petewarden.typepad.com/searchbrowser/2010/04/how-i-got-sued-by-facebook.html"
+   ]
+
+   agent = Mechanize.new
+   agent.user_agent = "chomchom request client"
+   urls.each do |url|
+     begin
+       agent.get(url)
+       html = agent.page.body
+     rescue
+       html = ''
+     end
+
+     puts url
+
+     doc = Chomchom::Document.new(html)
+     puts doc.title
+     puts "date: #{doc.publish_date}"
+     puts "author: " + doc.author
+     puts doc.fulltext
+     puts "duration: #{doc.consume_duration}"
+     puts "topics: #{doc.all_topics}"
+     puts "summary: " + doc.center_of_gravity
+   end
+ end
data/tests/scoring.rb ADDED
@@ -0,0 +1,41 @@
+ require "#{File.dirname(__FILE__)}/../lib/chomchom"
+ require "#{File.dirname(__FILE__)}/../lib/chomchom/scorer"
+
+ require 'mechanize'
+
+ urls = []
+ summaries = []
+ File.open('files/summaries.txt', 'r') do |file|
+   while line = file.gets
+     tmp = line.split("|||")
+     urls.push(tmp[0])
+     summaries.push(tmp[1])
+   end
+ end
+
+ agent = Mechanize.new
+ agent.user_agent = "chomchom request client"
+ fake_summary = "A look into Al Jazeera's difficulties in breaking into the US market, because they are the terrorist mouthpiece post 9/11. However, there are a handful of small cable companies who are unpatriotic by broadcasting it, and it hasn't been well received for its fresh and informative worldview coverage."
+ urls.each_with_index do |url, i|
+   if i == i #debug toggle: always true; compare against a fixed index to test a single url
+     begin
+       agent.get(url)
+       html = agent.page.body
+     rescue
+       html = ''
+     end
+
+     doc = Chomchom::Document.new(html)
+     puts title = doc.title
+     topics = doc.all_topics
+     puts "#{topics}"
+     text = doc.fulltext
+
+     puts summaries[i]
+
+     topic_words = topics.map { |t| t[0] }
+     scorer = Chomchom::Scorer.new
+     puts scorer.score(text, summaries[i], topic_words)
+   end
+ end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: chomchom
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.3.1
6
+ platform: ruby
7
+ authors:
8
+ - Quan Nguyen
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-05-01 00:00:00 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: mechanize
17
+ prerelease: false
18
+ requirement: &id001 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ type: :runtime
25
+ version_requirements: *id001
26
+ - !ruby/object:Gem::Dependency
27
+ name: nokogiri
28
+ prerelease: false
29
+ requirement: &id002 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: "0"
35
+ type: :runtime
36
+ version_requirements: *id002
37
+ - !ruby/object:Gem::Dependency
38
+ name: ruby-stemmer
39
+ prerelease: false
40
+ requirement: &id003 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0"
46
+ type: :runtime
47
+ version_requirements: *id003
48
+ - !ruby/object:Gem::Dependency
49
+ name: ruby-readability
50
+ prerelease: false
51
+ requirement: &id004 !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: "0"
57
+ type: :runtime
58
+ version_requirements: *id004
59
+ - !ruby/object:Gem::Dependency
60
+ name: htmlentities
61
+ prerelease: false
62
+ requirement: &id005 !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "0"
68
+ type: :runtime
69
+ version_requirements: *id005
70
+ - !ruby/object:Gem::Dependency
71
+ name: json
72
+ prerelease: false
73
+ requirement: &id006 !ruby/object:Gem::Requirement
74
+ none: false
75
+ requirements:
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ version: "0"
79
+ type: :runtime
80
+ version_requirements: *id006
81
+ description: chomchom extracts article's title, published_date, author, and fulltext. It also detects videos and audio for classifying the media type of a given page
82
+ email:
83
+ - mquannie@gmail.com
84
+ executables: []
85
+
86
+ extensions: []
87
+
88
+ extra_rdoc_files: []
89
+
90
+ files:
91
+ - .DS_Store
92
+ - .gitignore
93
+ - Gemfile
94
+ - README
95
+ - Rakefile
96
+ - chomchom.gemspec
97
+ - lib/.DS_Store
98
+ - lib/chomchom.rb
99
+ - lib/chomchom/.DS_Store
100
+ - lib/chomchom/extractor.rb
101
+ - lib/chomchom/regex_path.rb
102
+ - lib/chomchom/scorer.rb
103
+ - lib/chomchom/social_analytics.rb
104
+ - lib/chomchom/string.rb
105
+ - lib/chomchom/summary.rb
106
+ - lib/chomchom/topic.rb
107
+ - lib/chomchom/version.rb
108
+ - tests/.DS_Store
109
+ - tests/analytics_test.rb
110
+ - tests/benchmark.rb
111
+ - tests/files/.DS_Store
112
+ - tests/files/summaries.txt
113
+ - tests/functionals.rb
114
+ - tests/scoring.rb
115
+ homepage: http://github.com/mquan/chomchom
116
+ licenses: []
117
+
118
+ post_install_message:
119
+ rdoc_options: []
120
+
121
+ require_paths:
122
+ - lib
123
+ required_ruby_version: !ruby/object:Gem::Requirement
124
+ none: false
125
+ requirements:
126
+ - - ">="
127
+ - !ruby/object:Gem::Version
128
+ version: "0"
129
+ required_rubygems_version: !ruby/object:Gem::Requirement
130
+ none: false
131
+ requirements:
132
+ - - ">="
133
+ - !ruby/object:Gem::Version
134
+ version: "0"
135
+ requirements: []
136
+
137
+ rubyforge_project: chomchom
138
+ rubygems_version: 1.7.2
139
+ signing_key:
140
+ specification_version: 3
141
+ summary: chomchom is a ruby gem that extracts key information from an html page
142
+ test_files: []
143
+