chomchom 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.DS_Store +0 -0
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README +14 -0
- data/Rakefile +2 -0
- data/chomchom.gemspec +29 -0
- data/lib/.DS_Store +0 -0
- data/lib/chomchom.rb +57 -0
- data/lib/chomchom/.DS_Store +0 -0
- data/lib/chomchom/extractor.rb +145 -0
- data/lib/chomchom/regex_path.rb +11 -0
- data/lib/chomchom/scorer.rb +187 -0
- data/lib/chomchom/social_analytics.rb +106 -0
- data/lib/chomchom/string.rb +44 -0
- data/lib/chomchom/summary.rb +158 -0
- data/lib/chomchom/topic.rb +44 -0
- data/lib/chomchom/version.rb +3 -0
- data/tests/.DS_Store +0 -0
- data/tests/analytics_test.rb +17 -0
- data/tests/benchmark.rb +51 -0
- data/tests/files/.DS_Store +0 -0
- data/tests/files/summaries.txt +29 -0
- data/tests/functionals.rb +40 -0
- data/tests/scoring.rb +41 -0
- metadata +143 -0
data/.DS_Store
ADDED
Binary file
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README
ADDED
data/Rakefile
ADDED
data/chomchom.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for chomchom, an HTML key-information extractor.
$:.push File.expand_path("../lib", __FILE__)
require "chomchom/version"

Gem::Specification.new do |s|
  s.name = "chomchom"
  s.version = Chomchom::VERSION
  s.platform = Gem::Platform::RUBY
  s.authors = ["Quan Nguyen"]
  s.email = ["mquannie@gmail.com"]
  s.homepage = "http://github.com/mquan/chomchom"
  s.summary = %q{chomchom is a ruby gem that extracts key information from an html page}
  s.description = %q{chomchom extracts article's title, published_date, author, and fulltext. It also detects videos and audio for classifying the media type of a given page}

  s.rubyforge_project = "chomchom"

  # Package every git-tracked file; executables come from bin/.
  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Runtime dependencies; NOTE(review): unversioned — consider pinning at
  # least a major version for reproducible installs.
  s.add_dependency(%q<mechanize>)
  s.add_dependency(%q<nokogiri>)
  s.add_dependency(%q<ruby-stemmer>)
  s.add_dependency(%q<ruby-readability>)
  s.add_dependency(%q<htmlentities>)
  s.add_dependency(%q<json>)

end
|
data/lib/.DS_Store
ADDED
Binary file
|
data/lib/chomchom.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
$: << File.dirname(__FILE__)
|
2
|
+
require 'chomchom/summary'
|
3
|
+
require 'chomchom/topic'
|
4
|
+
require 'chomchom/extractor'
|
5
|
+
|
6
|
+
require 'chomchom/scorer'
|
7
|
+
require 'chomchom/social_analytics'
|
8
|
+
|
9
|
+
module Chomchom
  # Facade over the extraction pipeline: wraps an Extractor for a single
  # HTML page and exposes title, fulltext, metadata, topics, and several
  # summarization strategies.
  class Document
    # Extracted page title (String, possibly '').
    attr_reader :title
    # Extracted article body text (String, possibly '').
    attr_reader :fulltext

    # html: raw HTML source of the page.
    def initialize(html)
      @extr = Chomchom::Extractor.new(html)
      @title = @extr.title
      @fulltext = @extr.fulltext
    end

    def publish_date
      @extr.publish_date
    end

    def author
      @extr.author
    end

    def consume_duration
      @extr.consume_duration
    end

    # [word, frequency] pairs of the document's top single-word topics.
    # Memoized: previously the summary methods below dereferenced a bare
    # @all_topics and crashed with nil unless the caller had invoked
    # all_topics first; routing them through this accessor fixes that.
    def all_topics
      @all_topics ||= Chomchom::Topic.new(@fulltext, @title, 1).singles
    end

    # Highest-scoring contiguous passage of at most `length` characters.
    def center_of_gravity(length=400)
      Chomchom::Summary.new.center_of_gravity(@fulltext, all_topics, length)
    end

    # First sentence mentioning each topic, in topic-importance order.
    def first_mentions(length=400)
      Chomchom::Summary.first_mentions(@fulltext, all_topics, length)
    end

    # Leading sentence of each paragraph that scores above threshold.
    def topic_sentences(length=400)
      Chomchom::Summary.topic_sentences(@fulltext, all_topics, length)
    end

    # Best-scoring sentence per paragraph, then first-mention selection.
    def best_sentences(length=400)
      Chomchom::Summary.best_sentences(@fulltext, all_topics, length)
    end
  end
end
|
Binary file
|
@@ -0,0 +1,145 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'readability'
|
3
|
+
require 'date'
|
4
|
+
require 'htmlentities'
|
5
|
+
|
6
|
+
require "chomchom/regex_path"
|
7
|
+
|
8
|
+
module Chomchom
  # Pulls structured fields out of a raw HTML page: article fulltext (via
  # ruby-readability), title (xpath/regex heuristics), publish date,
  # author, and an estimated reading duration.
  class Extractor
    WPM = 250 #average reading speed (words per minute)

    #parameters for max number of topics to retrieve
    MAX_MONOS = 5
    MAX_MULTIS = 3

    #TODO: the current ruby-readability doesn't pull next pages' text
    # html_txt: raw HTML source; invalid UTF-8 bytes are dropped up front so
    # the regexes below don't raise on malformed input.
    def initialize(html_txt)
      # Scrub invalid UTF-8 via a UTF-16 round trip. The original code used
      # Iconv ('UTF-8//IGNORE') without requiring it — and iconv was removed
      # from the Ruby stdlib — so this raised NameError on modern rubies.
      html = html_txt.encode('UTF-16BE', :invalid => :replace, :undef => :replace, :replace => '').encode('UTF-8')

      begin
        @fulltext = Readability::Document.new(html).content
      rescue
        # best effort: readability can choke on odd markup; fall back to empty
        @fulltext = ''
      end

      # collapse whitespace, turn closing tags into newlines (paragraph
      # boundaries), then strip all remaining tags and decode entities
      @fulltext = @fulltext.gsub(/\s+/," ").gsub(/<\/.*?>/, "\n").gsub(/<.*?>/,'')
      @fulltext = HTMLEntities.new.decode(@fulltext) #decode html

      @title = Nokogiri::XML(html.scan(/<title.*>(?:\n|.)*?<\/title>/i)[0])
      @title = (@title)? @title.inner_text.gsub(/^\s+/,'').gsub(/\s+$/,'').gsub(/\n+/,' ') : ''
      @title = HTMLEntities.new.decode(@title)

      #use greedy match for <body> to cover embeded frames
      @body = html.match(/<body.*?>(?:\n|.)*<\/body>/i)
      @body = (@body)? @body[0] : '<body></body>'

      #remove scripts, styles, frames, and comments (all non greedy)
      @body.gsub!(/<script.*?>(?:\n|.)*?<\/script>/i,'')
      @body.gsub!(/<style.*?>(?:\n|.)*?<\/style>/i,'')
      @body.gsub!(/<frame.*?>(?:\n|.)*?<\/frame>/i,'')
      @body.gsub!(/<iframe.*?>(?:\n|.)*?<\/iframe>/i,'')

      @body = @body.gsub(/<!\-\-(?:\n|.)*?\-\->/,'').gsub(/\s+/,' ').gsub(/\n+/,"\n")

      @body_dom = Nokogiri::XML(@body)
    end

    # readability's getArticleTitle heuristic: strip site-name suffixes
    # ("Article | Site", "Site: Article"), fall back to the first <h1> for
    # too-long/too-short titles, and keep the raw title when the trimmed
    # candidate ends up too short.
    def readability_title
      title = ''
      if @title.match(/[\|\-]/)
        title = @title.scan(/(.*)[\|\-].*/).flatten[0]
        title = @title.scan(/[^\|\-]*[\|\-](.*)/).flatten[0] if title.split(' ').size < 3
      elsif @title.index(': ')
        title = @title.scan(/.*:(.*)/).flatten[0]
      elsif @title.length > 150 or @title.length < 15
        h1s = @body_dom.xpath(".//h1")
        title = h1s[0].inner_text if h1s and h1s.size > 0
      end
      title = @title if title.split(' ').size <= 4
      title.gsub(/^\s+/,'')
    end

    #retrieving title strategy:
    #1. get all the elements with class/id="...title|head..." and h1-h3
    #2. match them against the page title to get a bunch of candidates
    #3. take the longest candidate, take original title if no candidate avail
    def title
      titles1 = @body_dom.xpath(".//*[regex(.,'.*title|head.*','id|class')]", Chomchom::RegexPath.new).map { |n| n.inner_text }
      titles2 = @body_dom.search('//h1','//h2').map {|n| n.inner_text }
      titles = (titles1 + titles2).flatten.compact
      candidates = titles.select { |t| @title.downcase.include?(t.downcase) }
      #select the longest candidate as title
      if candidates.size > 0
        title = ''
        candidates.each { |c| title = c if c.length > title.length }
        title.gsub(/\s+/,' ').gsub(/\n+/,'')
      else
        @title
      end
    end

    #match and select publish date. Strategy:
    #1. scan for the most used patterns
    #2. take the one at the very top (usually the one near title) - this fails for pages displaying today date
    #3. parse to date object (ruby amazingly handles all the different formats)
    #Note: won't work for pages using javascript to write date
    #agent.page.response['Last-Modified'] doesn't work b/c most pages now are dynamically generated
    MONTHS_RE = "(?:#{(Date::MONTHNAMES + Date::ABBR_MONTHNAMES).compact.join("|")})"
    def publish_date
      dates = @body.scan(/(?:(#{MONTHS_RE}[^\w]+\d{1,2}(?:th|st|nd|rd)?[^\w]+(?:\d{4}|\d{2})?)[^\w]) |
        (?:(\d{1,2}(?:th|st|nd|rd)?\s#{MONTHS_RE}[^\w]+(?:\d{4}|\d{2})?)[^\w]) |
        (?:(\d{1,2}\-\d{1,2}\-\d{4})[^\w]) | (?:(\d{1,2}\.\d{1,2}\.\d{4})[^\w]) | (?:(\d{1,2}\/\d{1,2}\/\d{4})[^\w]) |
        (?:(\d{4}\-\d{1,2}\-\d{1,2})[^\w]) | (?:(\d{4}\.\d{1,2}\.\d{1,2})[^\w]) | (?:(\d{4}\/\d{1,2}\/\d{1,2})[^\w])
        /ix).flatten.compact

      dates.delete_if { |d| is_not_date(d) } if dates
      begin
        Date.parse(dates[0])
      rescue
        # no parseable date found on the page; fall back to today
        Date.today
      end
    end

    # First element whose id/class/href mentions author|byline|auth.
    # Returns '' when nothing matches.
    def author
      writers = @body_dom.xpath(".//*[regex(.,'.*author|byline|auth.*','id|class|href')]", Chomchom::RegexPath.new).map { |n| n.inner_text }
      writers = writers.flatten.compact
      (writers and writers[0])? writers[0].gsub(/^\s+/,'') : ''
    end

    def fulltext
      @fulltext
    end

    #return time in minutes, rounded up
    #factor in other embedded media duration
    def consume_duration
      # WPM.to_f is required: with integer division the subsequent .ceil was
      # a no-op and every duration was silently rounded DOWN.
      (@fulltext)? (@fulltext.gsub(/<.*?>/,'').split(/[\s\n]/).size/WPM.to_f).ceil : 0
    end

    private
    #eliminate things that aren't dates: 0/0/2001 1/1/7493
    def is_not_date(date)
      if date.match(/(\d{4}[\-\.\/]\d{1,2}[\-\.\/]\d{1,2})/)
        # year-first format (yyyy-mm-dd etc.)
        tmp = date.split(/[\-\.\/]/)
        y = tmp[0].to_i
        m = tmp[1].to_i
        d = tmp[2].to_i
        y > Date.today.year or m <= 0 or m > 12 or d <= 0 or d > 31
      elsif date.match(/(\d{1,2}[\-\.\/]\d{1,2}[\-\.\/]\d{4})/)
        # day-first (or month-first — ambiguous) format with trailing year
        tmp = date.split(/[\-\.\/]/)
        y = tmp[2].to_i
        m = tmp[1].to_i
        d = tmp[0].to_i
        y > Date.today.year or m <= 0 or d <= 0
      else
        false
      end
    end

  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
#custom xpath function for regexp matching of element attributes
|
2
|
+
#atts is a string delimited by "|" for or matching attributes ("id|class")
|
3
|
+
#custom xpath function for regexp matching of element attributes
#atts is a string delimited by "|" for or matching attributes ("id|class")
module Chomchom
  # Nokogiri custom-XPath handler: lets queries such as
  #   .//*[regex(., '.*title|head.*', 'id|class')]
  # keep only nodes whose listed attributes match the pattern.
  class RegexPath
    # node_set: enumerable of nodes, re: regexp source string,
    # atts: '|'-separated attribute names. Returns the matching nodes.
    def regex(node_set, re, atts)
      pattern = /#{re}/
      names = atts.split('|')
      node_set.select do |node|
        names.any? { |name| node[name] =~ pattern }
      end
    end
  end
end
|
@@ -0,0 +1,187 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'lingua/stemmer' #https://github.com/aurelian/ruby-stemmer
|
3
|
+
|
4
|
+
module Chomchom
  # Scores a human/machine-written summary against the source text:
  # stems both, finds which text sentences the summary "covers", and
  # returns a 0-100-ish coverage percentage minus a progressive length tax.
  class Scorer
    # text: source article, summary: candidate summary,
    # topics: array of (unstemmed) topic words.
    def score(text, summary, topics)
      # Scrub invalid UTF-8 via a UTF-16 round trip. The original used Iconv
      # without requiring it (iconv is gone from the stdlib), which raised
      # NameError at runtime.
      text = text.encode('UTF-16BE', :invalid => :replace, :undef => :replace, :replace => '').encode('UTF-8')
      summary = summary.encode('UTF-16BE', :invalid => :replace, :undef => :replace, :replace => '').encode('UTF-8')

      #step 1: prep the texts for analysis
      stemmer = Lingua::Stemmer.new(:language => 'en')

      stem_topics = topics.map { |t| stemmer.stem(t) }

      # each text sentence becomes an array of stemmed, uncommon words
      text_sentences = text.downcase.split_sentences
      tss = text_sentences.map { |ts|
        words = ts.downcase.split(/[^\p{Word}]/).map { |w| stemmer.stem(w) if w and w.size>1 and !w.is_common?}.compact
        words if ts.match(/\p{Word}+/) and words.size > 0
      }.compact

      #rudimentary sentences scoring (number of non-common words)
      #another scoring approach is to manually go throu each sentence and mark important ones
      tss_scores = tss.map { |ts| ts.uniq.size }

      #step 2: coverage analysis by performing exact word match (with stemming)
      #separating by sentences has the effect of designating each sentence to a section
      coverages = []
      summary.downcase.split_sentences.each { |s|
        coverages.push(find_coverages(s, tss, stem_topics)) if s.match(/\p{Word}+/)
      }

      #step 3: synonym analysis and domain specific fusion on words that didn't match
      #since the matched one already established, it's less likely that a word carries double meanings in the same story

      #step 4: compute coverage score
      covered = coverages.flatten.uniq

      #this treats every uncommon word as 1 unit
      # inject(0) is essential: the no-initial-value form seeded the fold
      # with the first element, i.e. summary_score previously added the
      # first covered sentence's INDEX instead of its score.
      total_score = tss_scores.inject(0) { |sum, score| sum + score }
      summary_score = covered.inject(0) { |sum, i| sum + tss_scores[i] }

      #punish for length with the idea of length_tax, no tax below 100 and then progressively increase
      #algo weaknesses:
      #extracted passage from text often scores higher (b/c of exact word matches)
      #people listing most occurred words in every sentence. check for proper grammar and coherence?
      #negation: take a high scoring summary, say the same thing but negate its meaning? check for meaning?
      summary_score.to_f/total_score*100*(1-length_tax(summary.size))
    end

    private
    #progressive length tax
    #max = .025 + .05 + .1 = .175 (17.5%)
    #no punishment for short summary b/c itself won't be able to cover as much
    def length_tax(summary_size)
      if summary_size <= 100
        0
      elsif summary_size <= 200
        (summary_size-100)*0.00025
      elsif summary_size <= 300
        0.025 + (summary_size-200)*0.0005
      else
        0.025 + 0.05 + (summary_size-300)*0.001
      end
    end

    # For one summary sentence, return the indexes of text sentences it
    # covers: stems each uncommon summary word, records which text
    # sentences contain it, then keeps only "stretches" of consecutive
    # sentences with enough distinct term hits.
    def find_coverages(summary, text_sentences, topics)
      terms = []
      hits = [] #array of indexes of sentences with matched terms
      stemmer = Lingua::Stemmer.new(:language => 'en')
      summary.split(/[^\p{Word}]/).each do |w|
        positions = []
        if !w.is_common?
          #take word stemming, synonym, AND domain specific fusion into consideration
          word = stemmer.stem(w)
          text_sentences.each_with_index { |sentence, i| positions.push(i) if sentence.index(word) }
          if positions.size > 0 and word != "" and word.size > 1
            terms.push(word)
            hits.push(positions)
          end
        end
        #need to get word position, not just sentence position
        #to see word chain into phrase (2 uncommon consec words is very good)
      end
      terms.uniq!

      stretches = possible_stretches(hits.flatten)
      stretches = stretches.map do |stretch|
        stretch_text = " " + text_sentences[stretch.first..stretch.last].flatten.join(" ") + " "
        count = 0
        nontopics = 0
        terms.each { |term|
          if stretch_text.match(/\b#{Regexp.quote(term)}\b/)
            count += 1
            nontopics += 1 if !topics.index(term)
          end
        }

        #some long stretch doesn't make the count < stretch.size requirement
        #but some sentence in there might be valuable so break it up and analyze each sentence
        second_chances = []
        if(count < stretch.size and stretch.size > 3)
          stretch.each do |i|
            sentence = text_sentences[i].join(" ")
            topic_count = 0
            nontopic_count = 0
            second_chances.push(i) if terms.detect { |term|
              topic_count += 1 if sentence.index(term) and topics.index(term)
              nontopic_count += 1 if sentence.index(term) and !topics.index(term)
              (topic_count >= 1 and nontopic_count > 1) or nontopic_count > 1
            }
          end
        end

        if second_chances.size > 0
          second_chances
        elsif (count >= stretch.size) and count > 1 and nontopics > 0
          stretch
        else
          nil
        end
      end
      stretches.compact

      #right now it's either covered or not
      #think up a strategy to compute confidence in covering (1 as very confident, less than one gradually)
      #it's possible, though unlikely to have 1 sentence covering 2 different sections apart
    end

    #find coverage stretch by number of consec sentences covered
    #give more probability for the stretch with sentences with most repetitions
    #give more weight for summary sentence corresponding to equivalent text section
    #(first S is more probable for first few sentences of article)
    def possible_stretches(hits)
      coverages = hits.uniq.map { |hit| [hit, hits.count(hit)] }.sort { |a,b| a[0] <=> b[0] }
      stretches = []
      last = -1
      count = 0
      stretch = []
      coverages.each { |cover|
        if(last == cover[0] - 1) #stitch continuous stretch together
          stretch.push(cover[0])
          count += cover[1]
        else
          stretches.push([stretch, count]) #first element in stretches will be junk
          stretch = [cover[0]]
          count = cover[1]
        end
        last = cover[0]
      }
      #remove insignificant short stretches with low hit count
      stretches = stretches.delete_if { |s| s[1] <= s[0].size and s[0].size < 3 }
      stretches.map { |s| s[0] }
    end
  end
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'json'
|
3
|
+
require 'digest/md5'
|
4
|
+
|
5
|
+
module Chomchom
  # Fetches per-service share/click counts for a single URL via the public
  # JSON endpoints of each social network (circa 2011).
  # NOTE(review): most of these endpoints are long since retired — verify
  # before relying on any of them.
  class SocialAnalytics

    #http://news.ycombinator.com/item?id=2347428
    #http://sharedcount.com/documentation.php
    # url: the page whose social metrics are queried.
    def initialize(url)
      @url = url
      @agent = Mechanize.new
      @agent.user_agent = "chomchom social analytics"
    end

    # Facebook share count; returns 0 on any network or parse error.
    def facebook
      begin
        @agent.get("http://graph.facebook.com/#{@url}")
        facebook = JSON.parse(@agent.page.body)['shares']
      rescue
        0
      end
    end

    #this breaks down the counts and possibly allow multiple urls
    # NOTE(review): unlike the other methods this has no rescue, so network
    # failures raise; also only click_count (the last expression) is
    # returned — the other locals are discarded.
    def facebook_more
      @agent.get("http://api.ak.facebook.com/restserver.php?v=1.0&method=links.getStats&urls=#{@url}&format=json")
      json = JSON.parse(@agent.page.body)
      share_count = json['share_count']
      like_count = json['like_count']
      comment_count = json['comment_count']
      total_count = json['total_count'] #same as above
      click_count = json['click_count']
    end

    # Tweet count for the URL; 0 on error.
    def twitter
      begin
        @agent.get("http://urls.api.twitter.com/1/urls/count.json?url=#{@url}")
        twitter = JSON.parse(@agent.page.body)['count'].to_i
      rescue
        0
      end
    end

    # Digg count; the endpoint wraps JSON in a JS callback, hence the
    # match(/\{.*?\}/) to pull out the bare object. 0 on error.
    def digg
      begin
        @agent.get("http://widgets.digg.com/buttons/count?url=#{@url}")
        digg = JSON.parse(@agent.page.body.match(/\{.*?\}/)[0])['diggs']
      rescue
        0
      end
    end

    # Delicious bookmark count (API keys URLs by MD5 hash); 0 on error.
    def delicious
      begin
        @agent.get("http://feeds.delicious.com/v2/json/urlinfo/#{Digest::MD5.hexdigest(@url)}")
        json = JSON.parse(@agent.page.body)[0]
        #puts tags = json['top_tags']
        delicious = json['total_posts']
      rescue
        0
      end
    end

    # StumbleUpon view count; 0 on error.
    def stumbleupon
      begin
        @agent.get("http://www.stumbleupon.com/services/1.01/badge.getinfo?url=#{@url}")
        stumbleupon = JSON.parse(@agent.page.body)['result']['views']
      rescue
        0
      end
    end

    # Google Buzz activity count; 0 on error.
    def google_buzz
      begin
        @agent.get("http://www.googleapis.com/buzz/v1/activities/count?alt=json&url=#{@url}")
        google_buzz = JSON.parse(@agent.page.body)['data']['counts'][@url][0]['count']
        #check multiple sites
        #"https://www.googleapis.com/buzz/v1/activities/count?alt=json&url=http://news.ycombinator.com&url=http://www.techcrunch.com&url=http://www.cnn.com"
      rescue
        0
      end
    end

    # LinkedIn share count (JSONP response, same brace-extraction trick as
    # digg); 0 on error.
    def linkedin
      begin
        @agent.get("http://www.linkedin.com/cws/share-count?url=#{@url}")
        linkedin = JSON.parse(@agent.page.body.match(/\{.*?\}/)[0])['count']
      rescue
        0
      end
    end

    #http://code.google.com/p/bitly-api/wiki/ApiDocumentation#/v3/clicks
    #need the shorten url and a key (free signup)
    # Unimplemented placeholder: returns nil.
    def bitly

    end

    #they need key, no commercial use
    #http://www.backtype.com/developers
    # Unimplemented: just returns the example endpoint string.
    def backtweets
      "http://backtweets.com/search.json?q=http://news.ycombinator.com/&key=key"
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
class String
  #split text into sentences, take into account Mr.|Ms. endings are not end of sentence
  # Returns an Array of sentence strings. Sentences are delimited by runs of
  # !/? or by a period followed by a non-word character; paragraph breaks
  # (\n+) always end a sentence.
  def split_sentences
    #break text first by paragraph then into chunks delimited by a period
    #but these are not quite sentences yet
    chunks = (self.split(/\n+/).map { |p| "#{p}\n".split(/[!?]+|(?:\.+(?:[^\p{Word}]))/) }).flatten.compact

    #if a sentence is split at Mr.|Ms.|Dr.|Mrs.
    #then recombine it with its remaining part and nil it to delete later
    # tmp carries the dangling "... Mr" fragment into the next iteration,
    # where it is re-joined as "<fragment>. <next chunk>"; the fragment's
    # own slot is nil'ed and dropped by the final compact.
    tmp=''
    sentences = chunks.map { |c|
      ss = (tmp != '')? "#{tmp}. #{c}" : c
      if c.match(/(?:Dr|Mr|Ms|Mrs)$/)
        #what about John F. Kennedy ([A-Z])
        #I finish at 5 a.m. today.
        #At 5 p.m. I have to go to the bank
        #rule 1: every sentence starts with a Cap (what about iPhone?)
        #just check if a sentence is too short then combine with the previous or next?
        tmp = ss
        ss=nil
      else
        tmp = ''
      end
      ss
    }
    sentences.compact #delete nil elements
  end

  #constraint a string to a fixed length or less
  #discard everything after the last punctuation that occurs right before lenght limit
  #the regexp look ahead for any punctuation
  # Returns self unchanged when already within `length`; otherwise truncates
  # to `length` chars and drops the tail after the final , : ; ) / \ or |.
  def limit(length)
    (self.length > length)? self[0...length].gsub(/(?![\s\S]+?[,:;)\/\\\|])([,:;)\/\\\|].*)/,'') : self
  end

  #common dictionary built from google's top 300 1-grams
  #hand-removed some sunch as english, god, american, united, states, john
  # True when the (downcased) receiver is a stop word. Membership test works
  # by searching for " word " inside the space-padded dictionary string.
  def is_common?
    common = " the of and to in a is that was for as with be by it his which on i not he or are from at this have had but were an their they all one been we has you who so more will her him them would its no may other there when than into any only time if some can these my such out two our very up should she me made about upon what most said could also do must then those great same being after man much many now over before well between where like under us through own life men even your work did see good without people part t little day shall each found new every make long mr might three against place both because himself down never used while still too how old case given however use another world know de called right take here last general whole though water country number state large come say form year less few far order does came during small again just back among yet give hand left different having thought always fact end high go per taken often within p things course certain others off cannot means think find above therefore side since am ever known themselves once set thus seen following nothing until whom house four away second itself whose put possible either rather several best took went done d almost subject words become true head necessary young better get common whether half cases brought least nor early five later full thing already together "
    common.include?(" #{self.downcase} ")
  end
end
|
@@ -0,0 +1,158 @@
|
|
1
|
+
require "chomchom/string"
|
2
|
+
|
3
|
+
module Chomchom
|
4
|
+
class Summary
|
5
|
+
#select the stretch with highest scoring sentences, basically captures the center of gravity of article
|
6
|
+
#pros: very coherent and computationally feasible
|
7
|
+
#cons: not good with coverage, only good when capturing passage is a summarizing intro/conclusion, otherwise just the key paragraph, not the whole
|
8
|
+
def center_of_gravity(text, topics, length=500)
|
9
|
+
sentences = text.split_sentences
|
10
|
+
summary = ''
|
11
|
+
if sentences.size > 0
|
12
|
+
start_index = 0
|
13
|
+
stop_index = 0
|
14
|
+
best_score = 0
|
15
|
+
(0...sentences.size).each do |i|
|
16
|
+
j = passage_last_index(sentences, i, length) #this returns the index of last sentence
|
17
|
+
#avoid extracting passage from 2 different paragraphs
|
18
|
+
#this usually lowers the score b/c less text means less match against topics
|
19
|
+
#but if a short passage has higher score then more power to it
|
20
|
+
passage = get_passage(sentences,i,j)
|
21
|
+
|
22
|
+
#this following score computation doesn't account for diversity
|
23
|
+
#so it often gives passages where the main topics are repeated in every sentences
|
24
|
+
#current_score = scores[i..j].inject { |sum, sc| sum + sc }
|
25
|
+
|
26
|
+
#this computation here count all topics once per passage
|
27
|
+
current_score = Chomchom::Summary.compute_score(passage, topics)
|
28
|
+
if best_score < current_score
|
29
|
+
best_score = current_score
|
30
|
+
start_index = i
|
31
|
+
stop_index = j
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
#use intro if the score is too low
|
36
|
+
if best_score < 3
|
37
|
+
start_index = 0
|
38
|
+
stop_index = passage_last_index(sentences, start_index, length)
|
39
|
+
|
40
|
+
#this following avoids using intro that are too short (usually are title)
|
41
|
+
#start_index = (0...sentences.size).detect { |i| get_passage(sentences,i,stop_index=passage_last_index(sentences, i, length)).split(' ').size > 5 }
|
42
|
+
end
|
43
|
+
|
44
|
+
#the .limit(length) prevents single sentence that's longer than allowable length
|
45
|
+
#select a substring within max length by removing the last occurrence of a punctuation
|
46
|
+
summary = get_passage(sentences, start_index, stop_index).limit(length).limit(length)
|
47
|
+
end
|
48
|
+
summary
|
49
|
+
end
|
50
|
+
|
51
|
+
#a variation of topic sentences extraction, this starts with most important topic
|
52
|
+
#extract first sentence mentioning it, do the same for the next topic unless already mentioned by previous sentence
|
53
|
+
#continue until length is reached or all topics covered
|
54
|
+
#pros: ok coherent and decent coverage
|
55
|
+
#cons: irrelevant long intro mentioning main topics will throw this off
|
56
|
+
def self.first_mentions(text, topics, length=500)
|
57
|
+
sentences = text.split_sentences
|
58
|
+
summary = Chomchom::Summary.love_at_first_sight(sentences, topics, length)
|
59
|
+
end
|
60
|
+
|
61
|
+
#the result of this is almost similar to first mention, except this runs greater risk of not reaching length
|
62
|
+
#this is also a minorly more computationally expensive
|
63
|
+
def self.topic_sentences(text, topics, length=400)
|
64
|
+
topic_sentences = []
|
65
|
+
paragraphs = text.split(/\n+/).each do |p|
|
66
|
+
sentences = p.split_sentences
|
67
|
+
topic_sentences.push(sentences[0]) if sentences[0] and Chomchom::Summary.compute_score(sentences[0], topics) > topics.last[1]
|
68
|
+
end
|
69
|
+
|
70
|
+
summary = Chomchom::Summary.love_at_first_sight(topic_sentences, topics, length)
|
71
|
+
end
|
72
|
+
|
73
|
+
#select the highest scoring sentence from each paragraph, then run love_at_first_sight
|
74
|
+
def self.best_sentences(text, topics, length=400)
|
75
|
+
paragraphs = text.split(/\n+/)
|
76
|
+
best_sentences = []
|
77
|
+
paragraphs.each do |p|
|
78
|
+
sentences = p.split_sentences
|
79
|
+
best_score = 0
|
80
|
+
index = 0
|
81
|
+
sentences.each_with_index do |s, i|
|
82
|
+
current_score = Chomchom::Summary.compute_score(s, topics)
|
83
|
+
if best_score < current_score
|
84
|
+
index = i
|
85
|
+
best_score = current_score
|
86
|
+
end
|
87
|
+
end
|
88
|
+
best_sentences.push(sentences[index]) if sentences[index] and best_score > topics.last[1]
|
89
|
+
end
|
90
|
+
|
91
|
+
summary = Chomchom::Summary.love_at_first_sight(best_sentences, topics, length)
|
92
|
+
end
|
93
|
+
|
94
|
+
#add the score of each topic occurrs in text up
|
95
|
+
def self.compute_score(text, topics)
|
96
|
+
begin
|
97
|
+
sum = 0
|
98
|
+
#compute geometric sum of occurrences (1 occurrence =1/2*score, 2 occurrences=(1/2+1/4)*score)...
|
99
|
+
#SUM(score*r^k)k:0..n = a*(1-r^(n+1))/(1-r), a=score/2 and r=1/2
|
100
|
+
#this is to limit too much diversity, a mention of a topic shouldn't get all the score
|
101
|
+
#if a topic has high score that means it's important and mentioning it several times in summary should be rewarded regressively
|
102
|
+
topics.each do |t|
|
103
|
+
f = text.scan(/\b#{Regexp.quote(t[0])}\b/).size
|
104
|
+
sum += t[1]*(1-(1/2.0)**(f+1))/(1-1/2.0) if f > 0
|
105
|
+
end
|
106
|
+
sum
|
107
|
+
#rescue
|
108
|
+
# 0
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
#for each topic, select the first sentence that has the topic unless the summary already covers it
|
113
|
+
def self.love_at_first_sight(sentences, topics, length)
|
114
|
+
separator = "~@#"
|
115
|
+
summary = ''
|
116
|
+
t = 0
|
117
|
+
points = []
|
118
|
+
while summary.size < length and t < topics.size
|
119
|
+
if summary.match(/\b#{Regexp.quote(topics[t][0])}\b/)
|
120
|
+
#find the next occurrence sentence not already in the summary
|
121
|
+
#what if this sentence will be covered by next topics?
|
122
|
+
else
|
123
|
+
match_sentence = sentences.detect { |s| s.match(/\b#{Regexp.quote(topics[t][0])}\b/) }
|
124
|
+
if match_sentence and (new_summary = summary + match_sentence + separator).size < length
|
125
|
+
summary = new_summary
|
126
|
+
points.push(sentences.index(match_sentence)) #track sentence order
|
127
|
+
end
|
128
|
+
end
|
129
|
+
t += 1
|
130
|
+
end
|
131
|
+
#have a strategy to include other sentences when summary is less than half the length
|
132
|
+
#backups array which stores possible candidates, sort by score
|
133
|
+
#run a loop and add to points if summary is < length
|
134
|
+
#for low topic article like the reddit one (no candidates) just use the unused topic sentences
|
135
|
+
|
136
|
+
#or unused = points.each { |i| sentences.delete_at(i) } #must delete from highest index back
|
137
|
+
#then rerun this first_sight search
|
138
|
+
|
139
|
+
#reorder the summary
|
140
|
+
points.sort! {|a,b| a <=> b}
|
141
|
+
summary = points.map { |i| sentences[i] }.join(separator).gsub(/\n+/,"").gsub(/\s+/," ")
|
142
|
+
end
|
143
|
+
|
144
|
+
private
|
145
|
+
#start from start_index until the combine sentences exceed max_length
|
146
|
+
#return the index of last sentence in that passage
|
147
|
+
def passage_last_index(sentences, start_index, max_length=500)
|
148
|
+
stop = ((start_index+1)...sentences.size).detect { |i| (sentences[start_index..i].join('. ')).size > max_length }
|
149
|
+
(stop)? stop-1 : sentences.size
|
150
|
+
end
|
151
|
+
|
152
|
+
def get_passage(sentences, start_index, stop_index)
|
153
|
+
passages = sentences[start_index..stop_index].join('. ').split("\n")
|
154
|
+
(passages.size > 0)? passages[0].gsub(/^[^\w]+/,'') : ''
|
155
|
+
end
|
156
|
+
|
157
|
+
end
|
158
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require "chomchom/string"
|
3
|
+
|
4
|
+
module Chomchom
|
5
|
+
class Topic
|
6
|
+
MAX = 8
|
7
|
+
|
8
|
+
def initialize(text, title='', title_weight=1)
|
9
|
+
#support unicode (require ruby 1.9.x)
|
10
|
+
text = text.force_encoding("UTF-8")
|
11
|
+
title = title.force_encoding("UTF-8")
|
12
|
+
@content = title * title_weight + text.gsub(/\n+/,"\n")
|
13
|
+
@content = @content.force_encoding("UTF-8").downcase
|
14
|
+
end
|
15
|
+
|
16
|
+
def singles
|
17
|
+
words = @content.split(' ').map { |w| w.downcase.gsub(/[^\p{Word}]/, '') }.uniq.delete_if { |w| !w or w.length<2 or w.is_common? }
|
18
|
+
@singles = words.map { |w| [w, frequency(w)] }
|
19
|
+
@singles = @singles.delete_if { |g| g[1] < 3}.sort { |a,b| b[1] <=> a[1] }
|
20
|
+
@singles[0..MAX]
|
21
|
+
end
|
22
|
+
|
23
|
+
#this is not for the benefit of summary (but for db storage so move this into topic method in chomchom.rb)
|
24
|
+
#merge words before sorting (this keeps order of words as they appear)
|
25
|
+
#look at each word in single_groups and merge with the others O(n^2)(this is inefficient)
|
26
|
+
#just go through the list in order, for each combine them and switch the order, take whichever one generate more counts
|
27
|
+
#merge for 2-word, then 3-word only
|
28
|
+
#for 3 (triples) just build from the doubles, then combine with non-overlap singles
|
29
|
+
#subtract from count everytime you legally take away (combine is more than 2 and remainder is more than 2)
|
30
|
+
def multiples
|
31
|
+
|
32
|
+
end
|
33
|
+
|
34
|
+
private
|
35
|
+
def is_substring(mono, multis)
|
36
|
+
multis.detect { |m| m[0].include?(mono[0]) and mono[1]==m[1] }
|
37
|
+
end
|
38
|
+
|
39
|
+
#count no. of times a word occurs in a text
|
40
|
+
def frequency(word)
|
41
|
+
@content.scan(/\b#{Regexp.quote(word)}[^\p{Word}\.]\b/).size
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
data/tests/.DS_Store
ADDED
Binary file
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require "#{File.dirname(__FILE__)}/../lib/chomchom/social_analytics"
|
2
|
+
|
3
|
+
urls = ["http://news.ycombinator.com/", "http://news.ycombinator.com", "mydomain.local",
|
4
|
+
"http://itmanagement.earthweb.com/entdev/article.php/3930466/That-Developers-Salary-is-Bigger-than-Mine.htm"
|
5
|
+
]
|
6
|
+
|
7
|
+
urls.each do |url|
|
8
|
+
puts url
|
9
|
+
analytics = Chomchom::SocialAnalytics.new(url)
|
10
|
+
puts "facebook: #{analytics.facebook}"
|
11
|
+
puts "twitter: #{analytics.twitter}"
|
12
|
+
puts "digg: #{analytics.digg}"
|
13
|
+
puts "delicious: #{analytics.delicious}"
|
14
|
+
puts "stumbleupon: #{analytics.stumbleupon}"
|
15
|
+
puts "google buzz: #{analytics.google_buzz}"
|
16
|
+
puts "linkedin: #{analytics.linkedin}"
|
17
|
+
end
|
data/tests/benchmark.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
require "#{File.dirname(__FILE__)}/../lib/chomchom"
|
2
|
+
require 'mechanize'
|
3
|
+
|
4
|
+
#conclusion: use single center_of_gravity which is better than cap_summary in every case
|
5
|
+
#topic_sentences is only marginally better than first_mention in most case, but it's too cost
|
6
|
+
#both topic_sentences and first_mention are rarely coherent, they also often take metadata (photo of..., date and location at beginning)
|
7
|
+
begin
|
8
|
+
urls = [ "http://37signals.com/svn/posts/2800-bootstrapped-profitable-proud-braintree",
|
9
|
+
"http://www.bbc.co.uk/news/world-africa-12726032",
|
10
|
+
"http://www.nytimes.com/2011/03/14/world/asia/14japan.html?_r=1&hp",
|
11
|
+
"http://www.nytimes.com/2006/01/15/magazine/15japanese.html?_r=2&pagewanted=all",
|
12
|
+
"http://www.theatlanticwire.com/national/2011/03/who-said-it-julian-assange-james-okeefe/35759/",
|
13
|
+
"http://blog.reddit.com/2011/03/so-long-and-thanks-for-all-postcards.html",
|
14
|
+
"http://www.mediapost.com/publications/?fa=Articles.showArticle&art_aid=146801&nid=124777",
|
15
|
+
"http://petewarden.typepad.com/searchbrowser/2010/04/how-i-got-sued-by-facebook.html"
|
16
|
+
]
|
17
|
+
|
18
|
+
agent = Mechanize.new
|
19
|
+
agent.user_agent = "chomchom request client"
|
20
|
+
urls.each do |url|
|
21
|
+
agent.get(url)
|
22
|
+
begin
|
23
|
+
html = agent.page.body
|
24
|
+
rescue
|
25
|
+
html = ''
|
26
|
+
end
|
27
|
+
doc=Chomchom::Document.new(html)
|
28
|
+
puts doc.title
|
29
|
+
|
30
|
+
single_topics = doc.all_topics
|
31
|
+
puts "single topics: " + single_topics.map { |t| "[#{t[0]},#{t[1]}]" }.join(', ')
|
32
|
+
|
33
|
+
single_summary = doc.center_of_gravity
|
34
|
+
puts "score = #{Chomchom::Summary.compute_score(single_summary, single_topics)}"
|
35
|
+
puts "single topics center of gravity: #{single_summary} \n\n"
|
36
|
+
|
37
|
+
first_mentions = doc.first_mentions
|
38
|
+
puts "score = #{Chomchom::Summary.compute_score(first_mentions, single_topics)}"
|
39
|
+
puts "first mentions: #{first_mentions}\n\n"
|
40
|
+
|
41
|
+
topic_sentences = doc.topic_sentences
|
42
|
+
puts "score = #{Chomchom::Summary.compute_score(topic_sentences, single_topics)}"
|
43
|
+
puts "topic_sentences: #{topic_sentences}\n\n"
|
44
|
+
|
45
|
+
cherry_pick = doc.best_sentences
|
46
|
+
puts "score = #{Chomchom::Summary.compute_score(cherry_pick, single_topics)}"
|
47
|
+
puts "cherry pick: #{cherry_pick}\n\n"
|
48
|
+
|
49
|
+
puts "__________________"
|
50
|
+
end
|
51
|
+
end
|
Binary file
|
@@ -0,0 +1,29 @@
|
|
1
|
+
http://www.theatlantic.com/magazine/print/2011/01/hard-core/8327/|||Broadband Internet brings hardcore porn widely accessible to everyone, and thus influences our sex lives. However, online porn, which often caters to predominantly male consumers' desire, poses a threat to men and women equality as hardcore sex is legitimized and becomes the norm in everyone's bedroom
|
2
|
+
http://deborahcampbell.org/writing/politics/the-most-hated-name-in-news/|||A look into Al Jazeera's difficulties in breaking into the US market, mainly caused by major cable networks refusal to carry them as the channel was labeled as the terrorist mouthpiece post 9/11. However, there are a handful of small cable companies who decide to broadcast it, and it has been well received for its fresh and informative worldview coverage.
|
3
|
+
http://www.theatlantic.com/magazine/archive/2003/03/caring-for-your-introvert/2696/|||The self-described introvert author explains introversion and his view on extroverts treatment. He describes extroverts as people energized by other people while introverts find energy in deep thoughts and find socializing exhausted. He believes introverts are misunderstood b/c most of society's activities and expectations are dominated by extroverts.
|
4
|
+
http://www.nybooks.com/articles/archives/2010/nov/25/generation-why/?pagination=false|||The article uses the Social Network movie as a template to contemplate about Zuckerberg motives in creating Facebook and his creation's implication on human interaction, specifically privacy, human relationship and feelings, your representation as a person online. The author argues that digital profiles aren't adequate enough to foster the genuine relationship real life offers, this medium even misrepresents your image with a shallow format of trivia personal info and photos.
|
5
|
+
http://www.wired.com/wired/archive/1.04/gibson_pr.html|||Singapore has established its prosperity and cleanliness through an overly controlled government at the cost of free expression. Its population and culture are based on forced conformity that makes it look more like a bland theme park. Finally, it has a harsh attitude toward drug trafficking
|
6
|
+
http://www.newyorker.com/reporting/2008/02/11/080211fa_fact_orlean?currentPage=all|||Hollinger is a passionate inventor who simultaneously wrestles with his numerous projects and ideas. His current project is to reinvent the umbrella, which suffers many design flaws. His solution is a hip umbrella with a stylist shape that uses the principle of aerodynamics to withstand strong wind.
|
7
|
+
http://www.nytimes.com/2001/02/25/magazine/25STOCK-TRADER.html?pagewanted=all|||A 15-year-old boy made close to $1M by plugging stocks he owns on sites such as yahoo finance. He exploits the fact that most people trading stocks have no clue what they're doing but are eager to trade based on pundits and manipulators info. The SEC gets in and vilifies the kid because the fact that a kid could easily rig the system expose its flaws and newfound complexities in an Internet age where the amateurs can appear as professional as the pros.
|
8
|
+
http://www.stevenberlinjohnson.com/2010/04/the-glass-box-and-the-commonplace-book.html|||Today's Internet and search engines, much like classical texts, are collections of texts from a variety of source stitched together in ways that the original author may never imagine. The free mixing of information creates new meaning and value for a healthy information ecosystem. But new crop of apps such as iBook, NYT, WSJ apps deliberately erect glassbox that make it impossible to copy and paste text for collection. The author advocates for an open web where information can freely flow, connect, and remixed in new imaginative ways.
|
9
|
+
http://www.wired.com/wired/archive/15.02/yahoo_pr.html|||A look at the path that leads to Yahoo falling behind Google, from its failure to recognize and monetize search, to inability to buy google, then failure to integrate its two search related acquisitions Inktomi and Overture. These failures are often attributed to ex-CEO Terry Semel due to his lack of deep technical understanding coming from the media world.
|
10
|
+
http://www.gq.com/entertainment/movies-and-tv/201102/the-day-the-movies-died-mark-harris?currentPage=all|||Today's generation of summer block buster movies genre, which dominates the movies landscape, lacks creativity and risk. Movies making is reduced to that of marketing and packaging, and less about the content. But ultimately, it's about us moviegoers, movies are made for people and if people stop going then the studios have to make movies for those who do, under 25 yr-old males who always crave super hero comics and refuse to grow up.
|
11
|
+
http://www.theregister.co.uk/2011/03/01/the_rise_and_rise_of_node_dot_js/print.html|||Node.js is a server side Javascript framework built to handle dynamic real time application. It's based on JavaScript event-based architecture which is different from traditional I/O blocking used in typical web servers. The end result is developers can now code their high performance stack, both backend and frontend, using the same language.
|
12
|
+
http://www.lycaeum.org/~martins/M2/ventura.html|||Most people work at unfulfilled jobs that are more out of necessity rather than choice. The economy dictates that for businesses to run efficiently workers at the bottom should have no decision making power. They have no power, no upside, ultimately their their time and economic wellness are compromised by decisions made at the top, by those who profit mightily from actions that lead to the collapse of many industries.
|
13
|
+
http://www.esquire.com/print-this/steve-jobs-1008?page=all|||As Jobs took the stage to introduce the iPhone 3G, his mortal sign revealed by his struggle against a rare form pancreatic cancer is more apparent than ever. Yet his immortality has been in the making the last 30 years with each new Apple invention. Each invention strives to surpass its predecessor in ways that surpass everyone's expectation, each carries Jobs DNA and vision in his quest to redefine the human-machine relationship and with that his existence forever.
|
14
|
+
http://www.nytimes.com/2010/08/22/magazine/22Adulthood-t.html?_r=1&pagewanted=print|||The phenomenon of today's young generation taking forever to reach adulthood is defined as a new life stage driven by economic and social changes.Just like a century ago when psychologists must make a case for adolescence as a new developmental stage, today's 20-something stage will also lead to changes to accommodate their unique needs and profile. People at this stage tend to be more self-focused, engage in identity exploration, and optimistic about the future
|
15
|
+
http://www.theatlantic.com/technology/archive/2011/03/from-lulz-to-labor-unions-the-evolution-of-anonymous/72001/|||Anonymous used to center around Internet memes to get laughs within its forum, but recently they have been in the media for promotion of freedom of speech against powerful enemies. Its most recent action against security firm HBGary led to the company's CEO resignation. But this questions their very own agenda, is Anonymous using their Ddos power to suppress freedom of speech?
|
16
|
+
http://www.theatlantic.com/magazine/print/2011/04/north-korea-8217-s-digital-underground/8414/|||North Korea is an isolated country with no access to outside internet or media. But crops of underground reporters run by North Korean defectors and outside activists are trying to bring news into and out of the country. Their operations, while effective in giving the outside world an accurate look into NK, are often put their lives at great risk
|
17
|
+
http://ycombinator.com/atyc.html|||A look into the inner workings of Y Combinator from dinners with famous startup founders, office hours, fund raising strategy, to launch and Demo Day preps. YC also offers introduction to valuable angels, VCs, and YC alumni. Ultimately, it's the 3 months of intense focus on everything startups that fuel productivity and energy in YC founders.
|
18
|
+
http://english.caing.com/2010-07-28/100164846.html|||A group of conspirators plot elaborate plans to extort money from illegal mine owners by killing mine workers in staged accidents, most of the victims are the conspirators' relatives. In a case where the mine owner refused to pay, the group mistakenly report the dead which led to their eventual investigation and conviction
|
19
|
+
http://www.dailykos.com/story/2011/02/26/950079/-I-Dont-Want-to-be-a-Teacher-Any-More|||As a student the author already aspire to be a teacher. She passionately teaches for 30 years despite many challenges such as budget cutbacks that result bigger class size and her taking a pay cut, cleaning her classroom, and buying school supplies with her own money. When the district failed her school for not meeting their test score without considering these difficult challenges, it compound to cause her to finally stop being a teacher
|
20
|
+
http://www.xconomy.com/san-francisco/inside-googles-age-of-augmented-humanity/|||Google has the most advanced and largest infrastructure to support its mission of organizing all information around us. The scope of their operation has expanded beyond search to areas from speech recognition, language translation, to visual search. But the backbone to all their innovation are machine learning and huge amount of data mining.
|
21
|
+
http://www.time.com/time/printout/0,8816,2053595,00.html|||Germany reforms focusing on keeping manufacturing at home have helped its economy growth and unemployment low. Like China, Germany carries a trade surplus with other EU nations such as France, Spain, Ireland, which effect the health of EU economy and hence the value of the euro.
|
22
|
+
http://thestandard.org.nz/what-will-future-generations-condemn-us-for/|||Like Wife beating, slavery, and hanging of homosexuals, which used to be accepted by the mass and squarely condemned nowadays, the US prison system, industrial meat production, the institutionalization of the elderly, and polluting of our environment are destined for future condemnation.
|
23
|
+
http://www.theatlantic.com/magazine/print/2010/07/the-politically-incorrect-guide-to-ending-poverty/8134/|||Light regulation and fair laws in the medieval town of Lubeck attracted more merchants and turned it into a prosperous town with many neighbors adopting its model. Now, a Stanford economist, Paul Romer, is advocating the same model to end poverty: creating charter-cities with new laws in poor countries. Romer often cites Hong Kong as an example of a successful charter city. Like Lubeck, Hong Kong inspire other nearby Chinese cities to adopt its model which transform China into the manufacture powerhouse it is today. But his plan is often opposed by regional nationalists balking at the idea of giving lands to foreigners.
|
24
|
+
http://www.nytimes.com/2011/03/01/science/01angier.html|||As a student, Portman made it to the semifinal round of the Intel Science Talent Search, a grueling competition that requires dedicated hard work. At the same time, she manages to star in many movies, most notably as Queen Amidala in Star Wars. She went on to major in neuroscience at Harvard.
|
25
|
+
http://www.vanityfair.com/business/features/2011/04/jack-dorsey-201104?currentPage=all|||Jack Dorsey, the original Twitter creator, is now CEO of his second creation, Square - a company that allows individuals to process credit card with their phone. He holds s variety of interests, is friend with many actors like Ashton Kutcher, and aspires to become the mayor of New York city.
|
26
|
+
http://www.independent.co.uk/news/world/modern-art-was-cia-weapon-1578808.html?service=Print|||In a campaign known as "long leash" the CIA promoted Abstract Expressionist paintings, without the artists knowledge, as propaganda weapon during the Cold War. The decision was made to counter Communist propaganda, which often attracted artists and intellectuals, while American democracy was at odds with McCarthy's intolerance. With its many covert operations to bring exhibitions everywhere the CIA helped Abstract Expressionism become the prominent art movement post WW2.
|
27
|
+
http://www.theatlantic.com/magazine/print/2009/09/how-american-health-care-killed-my-father/7617/|||After the needless death of his father, the author, a business executive, began a personal exploration of a health-care industry that for years has delivered poor service and irregular quality at astonishingly high cost. It is a system, he argues, that is not worth preserving in anything like its current form. And the health-care reform now being contemplated will not fix it. Here’s a radical solution to an agonizing problem.
|
28
|
+
http://www.theatlantic.com/magazine/print/2011/03/mind-vs-machine/8386/|||In the race to build computers that can think like humans, the proving ground is the Turing Test—an annual battle between the world’s most advanced artificial-intelligence programs and ordinary people. The objective? To find out whether a computer can act “more human” than a person. In his own quest to beat the machines, the author discovers that the march of technology isn’t just changing how we live, it’s raising new questions about what it means to be human.
|
29
|
+
http://37signals.com/svn/posts/2800-bootstrapped-profitable-proud-braintree|||Bryan Johnson worked as a salesperson selling credit services to business, but he hates the job and decided to create Braintree. Braintree avoids the freemium model and charged a premium, which was a lot higher than their competitors. It helps filter out the inexperienced merchants who don't recognize the pain yet. They generated $4.5M in revenue and grew to 24 employees and doubled its customer base.
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require "#{File.dirname(__FILE__)}/../lib/chomchom"
|
2
|
+
require 'mechanize'
|
3
|
+
|
4
|
+
begin
|
5
|
+
urls = ["http://www.foundersatwork.com/blog.html",
|
6
|
+
"http://www.independent.co.uk/opinion/commentators/johann-hari/the-dark-side-of-dubai-1664368.html",
|
7
|
+
"http://37signals.com/svn/posts/2800-bootstrapped-profitable-proud-braintree",
|
8
|
+
"http://www.bbc.co.uk/news/world-africa-12726032",
|
9
|
+
"http://danluan.org/node/7967",
|
10
|
+
"http://www.nytimes.com/2011/03/14/world/asia/14japan.html?_r=1&hp",
|
11
|
+
"http://www.nytimes.com/2006/01/15/magazine/15japanese.html?_r=2&pagewanted=all",
|
12
|
+
"http://www.theatlanticwire.com/national/2011/03/who-said-it-julian-assange-james-okeefe/35759/",
|
13
|
+
"http://www.google.com/",
|
14
|
+
"http://blog.reddit.com/2011/03/so-long-and-thanks-for-all-postcards.html",
|
15
|
+
"http://www.mediapost.com/publications/?fa=Articles.showArticle&art_aid=146801&nid=124777",
|
16
|
+
"http://petewarden.typepad.com/searchbrowser/2010/04/how-i-got-sued-by-facebook.html"
|
17
|
+
]
|
18
|
+
|
19
|
+
agent = Mechanize.new
|
20
|
+
agent.user_agent = "chomchom request client"
|
21
|
+
urls.each do |url|
|
22
|
+
begin
|
23
|
+
agent.get(url)
|
24
|
+
html = agent.page.body
|
25
|
+
rescue
|
26
|
+
html = ''
|
27
|
+
end
|
28
|
+
|
29
|
+
puts url
|
30
|
+
|
31
|
+
doc=Chomchom::Document.new(html)
|
32
|
+
puts doc.title
|
33
|
+
puts "date: #{doc.publish_date}"
|
34
|
+
puts "author: " + doc.author
|
35
|
+
puts doc.fulltext
|
36
|
+
puts "duration: #{doc.consume_duration}"
|
37
|
+
puts "topics: #{doc.all_topics}"
|
38
|
+
puts "summary: " + doc.center_of_gravity
|
39
|
+
end
|
40
|
+
end
|
data/tests/scoring.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require "#{File.dirname(__FILE__)}/../lib/chomchom"
|
2
|
+
require "#{File.dirname(__FILE__)}/../lib/chomchom/scorer"
|
3
|
+
|
4
|
+
require 'mechanize'
|
5
|
+
|
6
|
+
urls = []
|
7
|
+
summaries = []
|
8
|
+
File.open('files/summaries.txt', 'r') do |file|
|
9
|
+
while line = file.gets
|
10
|
+
tmp = line.split("|||")
|
11
|
+
urls.push(tmp[0])
|
12
|
+
summaries.push(tmp[1])
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
agent = Mechanize.new
|
17
|
+
agent.user_agent = "chomchom request client"
|
18
|
+
fake_summary = "A look into Al Jazeera's difficulties in breaking into the US market, because they are the terrorist mouthpiece post 9/11. However, there are a handful of small cable companies who are unpatriotic by broadcasting it, and it hasn't been well received for its fresh and informative worldview coverage."
|
19
|
+
urls.each_with_index do |url, i|
|
20
|
+
if i==i
|
21
|
+
agent.get(url)
|
22
|
+
begin
|
23
|
+
html = agent.page.body
|
24
|
+
rescue
|
25
|
+
html = ''
|
26
|
+
end
|
27
|
+
|
28
|
+
doc = Chomchom::Document.new(html)
|
29
|
+
puts title = doc.title
|
30
|
+
topics = doc.all_topics
|
31
|
+
puts "#{topics}"
|
32
|
+
text = doc.fulltext
|
33
|
+
|
34
|
+
puts summaries[i]
|
35
|
+
|
36
|
+
topic_words = topics.map { |t| t[0] }
|
37
|
+
scorer = Chomchom::Scorer.new
|
38
|
+
puts scorer.score(text, summaries[i], topic_words)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
metadata
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: chomchom
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.3.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Quan Nguyen
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-05-01 00:00:00 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: mechanize
|
17
|
+
prerelease: false
|
18
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
19
|
+
none: false
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
type: :runtime
|
25
|
+
version_requirements: *id001
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: nokogiri
|
28
|
+
prerelease: false
|
29
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
30
|
+
none: false
|
31
|
+
requirements:
|
32
|
+
- - ">="
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: "0"
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id002
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: ruby-stemmer
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: "0"
|
46
|
+
type: :runtime
|
47
|
+
version_requirements: *id003
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: ruby-readability
|
50
|
+
prerelease: false
|
51
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: "0"
|
57
|
+
type: :runtime
|
58
|
+
version_requirements: *id004
|
59
|
+
- !ruby/object:Gem::Dependency
|
60
|
+
name: htmlentities
|
61
|
+
prerelease: false
|
62
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: "0"
|
68
|
+
type: :runtime
|
69
|
+
version_requirements: *id005
|
70
|
+
- !ruby/object:Gem::Dependency
|
71
|
+
name: json
|
72
|
+
prerelease: false
|
73
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
74
|
+
none: false
|
75
|
+
requirements:
|
76
|
+
- - ">="
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: "0"
|
79
|
+
type: :runtime
|
80
|
+
version_requirements: *id006
|
81
|
+
description: chomchom extracts article's title, published_date, author, and fulltext. It also detects videos and audio for classifying the media type of a given page
|
82
|
+
email:
|
83
|
+
- mquannie@gmail.com
|
84
|
+
executables: []
|
85
|
+
|
86
|
+
extensions: []
|
87
|
+
|
88
|
+
extra_rdoc_files: []
|
89
|
+
|
90
|
+
files:
|
91
|
+
- .DS_Store
|
92
|
+
- .gitignore
|
93
|
+
- Gemfile
|
94
|
+
- README
|
95
|
+
- Rakefile
|
96
|
+
- chomchom.gemspec
|
97
|
+
- lib/.DS_Store
|
98
|
+
- lib/chomchom.rb
|
99
|
+
- lib/chomchom/.DS_Store
|
100
|
+
- lib/chomchom/extractor.rb
|
101
|
+
- lib/chomchom/regex_path.rb
|
102
|
+
- lib/chomchom/scorer.rb
|
103
|
+
- lib/chomchom/social_analytics.rb
|
104
|
+
- lib/chomchom/string.rb
|
105
|
+
- lib/chomchom/summary.rb
|
106
|
+
- lib/chomchom/topic.rb
|
107
|
+
- lib/chomchom/version.rb
|
108
|
+
- tests/.DS_Store
|
109
|
+
- tests/analytics_test.rb
|
110
|
+
- tests/benchmark.rb
|
111
|
+
- tests/files/.DS_Store
|
112
|
+
- tests/files/summaries.txt
|
113
|
+
- tests/functionals.rb
|
114
|
+
- tests/scoring.rb
|
115
|
+
homepage: http://github.com/mquan/chomchom
|
116
|
+
licenses: []
|
117
|
+
|
118
|
+
post_install_message:
|
119
|
+
rdoc_options: []
|
120
|
+
|
121
|
+
require_paths:
|
122
|
+
- lib
|
123
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
124
|
+
none: false
|
125
|
+
requirements:
|
126
|
+
- - ">="
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
version: "0"
|
129
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
130
|
+
none: false
|
131
|
+
requirements:
|
132
|
+
- - ">="
|
133
|
+
- !ruby/object:Gem::Version
|
134
|
+
version: "0"
|
135
|
+
requirements: []
|
136
|
+
|
137
|
+
rubyforge_project: chomchom
|
138
|
+
rubygems_version: 1.7.2
|
139
|
+
signing_key:
|
140
|
+
specification_version: 3
|
141
|
+
summary: chomchom is a ruby gem that extracts key information from an html page
|
142
|
+
test_files: []
|
143
|
+
|