chomchom 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.DS_Store CHANGED
Binary file
data/chomchom.gemspec CHANGED
@@ -25,5 +25,5 @@ Gem::Specification.new do |s|
25
25
  s.add_dependency(%q<ruby-readability>)
26
26
  s.add_dependency(%q<htmlentities>)
27
27
  s.add_dependency(%q<json>)
28
-
28
+ s.add_dependency(%q<grammar_police>)
29
29
  end
@@ -1,9 +1,10 @@
1
1
  #coding: utf-8
2
+ require 'grammar_police'
2
3
  require 'lingua/stemmer' #https://github.com/aurelian/ruby-stemmer
3
4
 
4
5
  module Chomchom
5
6
  class Scorer
6
- def score(text, summary, topics)
7
+ def score(text, summary, topics, dictionary, parse_options)
7
8
  #solve the utf-8 invalid string error
8
9
  ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
9
10
  text = ic.iconv(text + ' ')[0..-2]
@@ -38,8 +39,13 @@ module Chomchom
38
39
 
39
40
  #separating by sentences has the effect of designating each sentence to a section
40
41
  coverages = []
42
+ copy_taxes = []
43
+ proper_sentences = 0
41
44
  #ss = summary.downcase.split(/(?:\.+[^\p{Word}])|\n+/).each { |s|
42
45
  ss = summary.downcase.split_sentences.each { |s|
46
+ #take a flat 30% for every copied sentence
47
+ copy_taxes.push(0.3) if text.downcase.index(s.gsub(/[^\p{Word}]+$/,'').gsub(/^[^\p{Word}]+/,''))
48
+ proper_sentences += 1 if GrammarPolice::Sentence.count_linkages(s, dictionary, parse_options) > 0
43
49
  coverages.push(find_coverages(s, tss, stem_topics)) if s.match(/\p{Word}+/)
44
50
  }
45
51
 
@@ -61,8 +67,16 @@ module Chomchom
61
67
  #this treats every sentence as 1 unit (all sentences created equal)
62
68
  #puts "#{covered.size.to_f/tss.size*100}"
63
69
 
70
+ #average tax for all sentences
71
+ copy_tax = (copy_taxes.size > 0)? (copy_taxes.inject { |sum, t| sum + t}/ss.size) : 0.0
72
+
73
+ #calculate grammar tax
74
+ grammar_tax = grammar_tax(proper_sentences, ss.size)
75
+
64
76
  #punish for length with the idea of length_tax, no tax below 100 and then progressively increase
65
- summary_score.to_f/total_score*100*(1-length_tax(summary.size))
77
+ tax = length_tax(summary.size) + copy_tax + grammar_tax
78
+ puts "total tax =#{tax}"
79
+ summary_score.to_f/total_score*100*(1-tax)
66
80
 
67
81
  #algo weaknesses:
68
82
  #extracted passage from text often scores higher (b/c of exact word matches)
@@ -86,6 +100,16 @@ module Chomchom
86
100
  end
87
101
  end
88
102
 
103
+ def grammar_tax(proper_sentences, total_sentences)
104
+ if total_sentences < 3
105
+ proper_sentences * 0.1 #10% for each invalid sentence
106
+ elsif total_sentences < 5
107
+ proper_sentences * 0.07 #less harsh
108
+ else
109
+ proper_sentences.to_f/total_sentences/3 #ex 1/5 ==> 7%
110
+ end
111
+ end
112
+
89
113
  def find_coverages(summary, text_sentences, topics)
90
114
  terms = []
91
115
  hits = [] #array of indexes of sentences with matched terms
@@ -111,7 +111,7 @@ module Chomchom
111
111
 
112
112
  #for each topic, select the first sentence that has the topic unless the summary already covers it
113
113
  def self.love_at_first_sight(sentences, topics, length)
114
- separator = "~@#"
114
+ separator = "\n"
115
115
  summary = ''
116
116
  t = 0
117
117
  points = []
@@ -1,3 +1,3 @@
1
1
  module Chomchom
2
- VERSION = "0.3.1"
2
+ VERSION = "0.4.0"
3
3
  end
data/tests/.DS_Store CHANGED
Binary file
data/tests/scoring.rb CHANGED
@@ -13,29 +13,40 @@ File.open('files/summaries.txt', 'r') do |file|
13
13
  end
14
14
  end
15
15
 
16
+
16
17
  agent = Mechanize.new
17
18
  agent.user_agent = "chomchom request client"
18
19
  fake_summary = "A look into Al Jazeera's difficulties in breaking into the US market, because they are the terrorist mouthpiece post 9/11. However, there are a handful of small cable companies who are unpatriotic by broadcasting it, and it hasn't been well received for its fresh and informative worldview coverage."
20
+
21
+ dict = GrammarPolice::Dictionary.create("en")
22
+ opts = GrammarPolice::ParseOptions.parse_options_create
23
+ GrammarPolice::ParseOptions.parse_options_set_linkage_limit(opts, 10)
24
+ GrammarPolice::ParseOptions.parse_options_set_short_length(opts, 5)
19
25
  urls.each_with_index do |url, i|
20
- if i==i
21
- agent.get(url)
22
- begin
23
- html = agent.page.body
24
- rescue
25
- html = ''
26
- end
26
+ if i==i
27
+ agent.get(url)
28
+ begin
29
+ html = agent.page.body
30
+ rescue
31
+ html = ''
32
+ end
27
33
 
28
- doc = Chomchom::Document.new(html)
29
- puts title = doc.title
30
- topics = doc.all_topics
31
- puts "#{topics}"
32
- text = doc.fulltext
33
-
34
- puts summaries[i]
34
+ doc = Chomchom::Document.new(html)
35
+ puts title = doc.title
36
+ topics = doc.all_topics
37
+ puts "#{topics}"
38
+ text = doc.fulltext
39
+
40
+ puts summaries[i]
35
41
 
36
- topic_words = topics.map { |t| t[0] }
37
- scorer = Chomchom::Scorer.new
38
- puts scorer.score(text, summaries[i], topic_words)
39
- end
42
+ topic_words = topics.map { |t| t[0] }
43
+ scorer = Chomchom::Scorer.new
44
+ puts scorer.score(text, summaries[i], topic_words, dict, opts)
45
+
46
+ #check copy tax
47
+ puts passage = doc.center_of_gravity()
48
+ puts scorer.score(text, passage, topic_words, dict, opts)
49
+ end
40
50
  end
41
-
51
+ GrammarPolice::ParseOptions.parse_options_delete(opts)
52
+ GrammarPolice::Dictionary.destroy(dict)
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: chomchom
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.3.1
5
+ version: 0.4.0
6
6
  platform: ruby
7
7
  authors:
8
8
  - Quan Nguyen
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-05-01 00:00:00 Z
13
+ date: 2011-05-04 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: mechanize
@@ -78,6 +78,17 @@ dependencies:
78
78
  version: "0"
79
79
  type: :runtime
80
80
  version_requirements: *id006
81
+ - !ruby/object:Gem::Dependency
82
+ name: grammar_police
83
+ prerelease: false
84
+ requirement: &id007 !ruby/object:Gem::Requirement
85
+ none: false
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: "0"
90
+ type: :runtime
91
+ version_requirements: *id007
81
92
  description: chomchom extracts article's title, published_date, author, and fulltext. It also detects videos and audio for classifying the media type of a given page
82
93
  email:
83
94
  - mquannie@gmail.com