RubyGems - chomchom - Versions diffs - 0.3.1 → 0.4.0 - Mend

chomchom 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

data/.DS_Store CHANGED Viewed

Binary file

data/chomchom.gemspec CHANGED Viewed

@@ -25,5 +25,5 @@ Gem::Specification.new do |s|
   s.add_dependency(%q<ruby-readability>)
   s.add_dependency(%q<htmlentities>)
   s.add_dependency(%q<json>)
+  s.add_dependency(%q<grammar_police>)
 end

data/lib/chomchom/scorer.rb CHANGED Viewed

@@ -1,9 +1,10 @@
 #coding: utf-8
+require 'grammar_police'
 require 'lingua/stemmer' #https://github.com/aurelian/ruby-stemmer
 module Chomchom
   class Scorer
-    def score(text, summary, topics)
+    def score(text, summary, topics, dictionary, parse_options)
       #solve the utf-8 invalid string error
       ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
       text = ic.iconv(text + ' ')[0..-2]
@@ -38,8 +39,13 @@ module Chomchom
       #separating by sentences has the effect of designating each sentence to a section
       coverages = []
+      copy_taxes = []
+      proper_sentences = 0
       #ss = summary.downcase.split(/(?:\.+[^\p{Word}])|\n+/).each { |s|
       ss = summary.downcase.split_sentences.each { |s|
+        #take a flat 30% for every copied sentence
+        copy_taxes.push(0.3) if text.downcase.index(s.gsub(/[^\p{Word}]+$/,'').gsub(/^[^\p{Word}]+/,''))
+        proper_sentences += 1 if GrammarPolice::Sentence.count_linkages(s, dictionary, parse_options) > 0
         coverages.push(find_coverages(s, tss, stem_topics)) if s.match(/\p{Word}+/)
       }
@@ -61,8 +67,16 @@ module Chomchom
       #this treats every sentence as 1 unit (all sentences created equal)
       #puts "#{covered.size.to_f/tss.size*100}"
+      #average tax for all sentences
+      copy_tax = (copy_taxes.size > 0)? (copy_taxes.inject { |sum, t| sum + t}/ss.size) : 0.0
+      #calculate grammar tax
+      grammar_tax = grammar_tax(proper_sentences, ss.size)
       #punish for length with the idea of length_tax, no tax below 100 and then progressively increase
-      summary_score.to_f/total_score*100*(1-length_tax(summary.size))
+      tax = length_tax(summary.size) + copy_tax + grammar_tax
+      puts "total tax =#{tax}"
+      summary_score.to_f/total_score*100*(1-tax)
       #algo weaknesses:
       #extracted passage from text often scores higher (b/c of exact word matches)
@@ -86,6 +100,16 @@ module Chomchom
       end
     end
+    def grammar_tax(proper_sentences, total_sentences) #returns a fractional tax; the score method adds it to the length and copy taxes before scaling the score by (1-tax)
+      if total_sentences < 3
+        proper_sentences * 0.1 #10% for each invalid sentence -- NOTE(review): the multiplier is proper_sentences (incremented by the caller when count_linkages > 0), not a count of invalid sentences; confirm intent
+      elsif total_sentences < 5
+        proper_sentences * 0.07 #less harsh (7% per counted sentence)
+      else
+        proper_sentences.to_f/total_sentences/3 #ex 1/5 ==> 7% (one third of the counted fraction of sentences)
+      end
+    end
     def find_coverages(summary, text_sentences, topics)
       terms = []
       hits = [] #array of indexes of sentences with matched terms

data/lib/chomchom/summary.rb CHANGED Viewed

@@ -111,7 +111,7 @@ module Chomchom
     #for each topic, select the first sentence that has the topic unless the summary already covers it
     def self.love_at_first_sight(sentences, topics, length)
-      separator = "~@#"
+      separator = "\n"
       summary = ''
       t = 0
       points = []

data/lib/chomchom/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Chomchom
-  VERSION = "0.3.1"
+  VERSION = "0.4.0"
 end

data/tests/.DS_Store CHANGED Viewed

Binary file

data/tests/scoring.rb CHANGED Viewed

@@ -13,29 +13,40 @@ File.open('files/summaries.txt', 'r') do |file|
   end
 end
 agent = Mechanize.new
 agent.user_agent = "chomchom request client"
 fake_summary = "A look into Al Jazeera's difficulties in breaking into the US market, because they are the terrorist mouthpiece post 9/11. However, there are a handful of small cable companies who are unpatriotic by broadcasting it, and it hasn't been well received for its fresh and informative worldview coverage."
+dict = GrammarPolice::Dictionary.create("en")
+opts = GrammarPolice::ParseOptions.parse_options_create
+GrammarPolice::ParseOptions.parse_options_set_linkage_limit(opts, 10)
+GrammarPolice::ParseOptions.parse_options_set_short_length(opts, 5)
 urls.each_with_index do |url, i|
-if i==i
-  agent.get(url)
-  begin
-    html = agent.page.body
-  rescue
-    html = ''
-  end
+  if i==i
+    agent.get(url)
+    begin
+      html = agent.page.body
+    rescue
+      html = ''
+    end
-  doc = Chomchom::Document.new(html)
-  puts title = doc.title
-  topics = doc.all_topics
-  puts "#{topics}"
-  text = doc.fulltext
-  puts summaries[i]
+    doc = Chomchom::Document.new(html)
+    puts title = doc.title
+    topics = doc.all_topics
+    puts "#{topics}"
+    text = doc.fulltext
+    puts summaries[i]
-  topic_words = topics.map { |t| t[0] }
-  scorer = Chomchom::Scorer.new
-  puts scorer.score(text, summaries[i], topic_words)
-end
+    topic_words = topics.map { |t| t[0] }
+    scorer = Chomchom::Scorer.new
+    puts scorer.score(text, summaries[i], topic_words, dict, opts)
+    #check copy tax
+    puts passage = doc.center_of_gravity()
+    puts scorer.score(text, passage, topic_words, dict, opts)
+  end
 end
+GrammarPolice::ParseOptions.parse_options_delete(opts)
+GrammarPolice::Dictionary.destroy(dict)

metadata CHANGED Viewed

@@ -2,7 +2,7 @@
 name: chomchom
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.3.1
+  version: 0.4.0
 platform: ruby
 authors:
 - Quan Nguyen
@@ -10,7 +10,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-05-01 00:00:00 Z
+date: 2011-05-04 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -78,6 +78,17 @@ dependencies:
         version: "0"
   type: :runtime
   version_requirements: *id006
+- !ruby/object:Gem::Dependency
+  name: grammar_police
+  prerelease: false
+  requirement: &id007 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :runtime
+  version_requirements: *id007
 description: chomchom extracts article's title, published_date, author, and fulltext. It also detects videos and audio for classifying the media type of a given page
 email:
 - mquannie@gmail.com