chomchom 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/.DS_Store CHANGED
Binary file
data/chomchom.gemspec CHANGED
@@ -25,5 +25,5 @@ Gem::Specification.new do |s|
25
25
  s.add_dependency(%q<ruby-readability>)
26
26
  s.add_dependency(%q<htmlentities>)
27
27
  s.add_dependency(%q<json>)
28
-
28
+ s.add_dependency(%q<grammar_police>)
29
29
  end
@@ -1,9 +1,10 @@
1
1
  #coding: utf-8
2
+ require 'grammar_police'
2
3
  require 'lingua/stemmer' #https://github.com/aurelian/ruby-stemmer
3
4
 
4
5
  module Chomchom
5
6
  class Scorer
6
- def score(text, summary, topics)
7
+ def score(text, summary, topics, dictionary, parse_options)
7
8
  #solve the utf-8 invalid string error
8
9
  ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
9
10
  text = ic.iconv(text + ' ')[0..-2]
@@ -38,8 +39,13 @@ module Chomchom
38
39
 
39
40
  #separating by sentences has the effect of designating each sentence to a section
40
41
  coverages = []
42
+ copy_taxes = []
43
+ proper_sentences = 0
41
44
  #ss = summary.downcase.split(/(?:\.+[^\p{Word}])|\n+/).each { |s|
42
45
  ss = summary.downcase.split_sentences.each { |s|
46
+ #take a flat 30% for every copied sentence
47
+ copy_taxes.push(0.3) if text.downcase.index(s.gsub(/[^\p{Word}]+$/,'').gsub(/^[^\p{Word}]+/,''))
48
+ proper_sentences += 1 if GrammarPolice::Sentence.count_linkages(s, dictionary, parse_options) > 0
43
49
  coverages.push(find_coverages(s, tss, stem_topics)) if s.match(/\p{Word}+/)
44
50
  }
45
51
 
@@ -61,8 +67,16 @@ module Chomchom
61
67
  #this treats every sentence as 1 unit (all sentences created equal)
62
68
  #puts "#{covered.size.to_f/tss.size*100}"
63
69
 
70
+ #average tax for all sentences
71
+ copy_tax = (copy_taxes.size > 0)? (copy_taxes.inject { |sum, t| sum + t}/ss.size) : 0.0
72
+
73
+ #calculate grammar tax
74
+ grammar_tax = grammar_tax(proper_sentences, ss.size)
75
+
64
76
  #punish for length with the idea of length_tax, no tax below 100 and then progressively increase
65
- summary_score.to_f/total_score*100*(1-length_tax(summary.size))
77
+ tax = length_tax(summary.size) + copy_tax + grammar_tax
78
+ puts "total tax =#{tax}"
79
+ summary_score.to_f/total_score*100*(1-tax)
66
80
 
67
81
  #algo weaknesses:
68
82
  #extracted passage from text often scores higher (b/c of exact word matches)
@@ -86,6 +100,16 @@ module Chomchom
86
100
  end
87
101
  end
88
102
 
103
# Penalty (as a fraction of the score) for summary sentences that fail the
# grammar check.
#
# proper_sentences - number of summary sentences that parsed with at least
#                    one linkage (i.e. the grammatically valid ones)
# total_sentences  - number of sentences in the summary
#
# Returns a Float; 0.0 when every sentence is proper.
def grammar_tax(proper_sentences, total_sentences)
  # The tax is owed on the sentences that did NOT parse. Taxing the proper
  # count would punish well-formed summaries and let broken ones off free.
  # Clamp at zero so an over-reported proper count can never turn the tax
  # into a bonus.
  invalid_sentences = [total_sentences - proper_sentences, 0].max

  if total_sentences < 3
    invalid_sentences * 0.1 # 10% for each invalid sentence
  elsif total_sentences < 5
    invalid_sentences * 0.07 # less harsh
  else
    invalid_sentences.to_f / total_sentences / 3 # ex 1/5 ==> 7%
  end
end
112
+
89
113
  def find_coverages(summary, text_sentences, topics)
90
114
  terms = []
91
115
  hits = [] #array of indexes of sentences with matched terms
@@ -111,7 +111,7 @@ module Chomchom
111
111
 
112
112
  #for each topic, select the first sentence that has the topic unless the summary already covers it
113
113
  def self.love_at_first_sight(sentences, topics, length)
114
- separator = "~@#"
114
+ separator = "\n"
115
115
  summary = ''
116
116
  t = 0
117
117
  points = []
@@ -1,3 +1,3 @@
1
1
  module Chomchom
2
- VERSION = "0.3.1"
2
+ VERSION = "0.4.0"
3
3
  end
data/tests/.DS_Store CHANGED
Binary file
data/tests/scoring.rb CHANGED
@@ -13,29 +13,40 @@ File.open('files/summaries.txt', 'r') do |file|
13
13
  end
14
14
  end
15
15
 
16
+
16
17
  agent = Mechanize.new
17
18
  agent.user_agent = "chomchom request client"
18
19
  fake_summary = "A look into Al Jazeera's difficulties in breaking into the US market, because they are the terrorist mouthpiece post 9/11. However, there are a handful of small cable companies who are unpatriotic by broadcasting it, and it hasn't been well received for its fresh and informative worldview coverage."
20
+
21
+ dict = GrammarPolice::Dictionary.create("en")
22
+ opts = GrammarPolice::ParseOptions.parse_options_create
23
+ GrammarPolice::ParseOptions.parse_options_set_linkage_limit(opts, 10)
24
+ GrammarPolice::ParseOptions.parse_options_set_short_length(opts, 5)
19
25
  urls.each_with_index do |url, i|
20
- if i==i
21
- agent.get(url)
22
- begin
23
- html = agent.page.body
24
- rescue
25
- html = ''
26
- end
26
+ if i==i
27
+ agent.get(url)
28
+ begin
29
+ html = agent.page.body
30
+ rescue
31
+ html = ''
32
+ end
27
33
 
28
- doc = Chomchom::Document.new(html)
29
- puts title = doc.title
30
- topics = doc.all_topics
31
- puts "#{topics}"
32
- text = doc.fulltext
33
-
34
- puts summaries[i]
34
+ doc = Chomchom::Document.new(html)
35
+ puts title = doc.title
36
+ topics = doc.all_topics
37
+ puts "#{topics}"
38
+ text = doc.fulltext
39
+
40
+ puts summaries[i]
35
41
 
36
- topic_words = topics.map { |t| t[0] }
37
- scorer = Chomchom::Scorer.new
38
- puts scorer.score(text, summaries[i], topic_words)
39
- end
42
+ topic_words = topics.map { |t| t[0] }
43
+ scorer = Chomchom::Scorer.new
44
+ puts scorer.score(text, summaries[i], topic_words, dict, opts)
45
+
46
+ #check copy tax
47
+ puts passage = doc.center_of_gravity()
48
+ puts scorer.score(text, passage, topic_words, dict, opts)
49
+ end
40
50
  end
41
-
51
+ GrammarPolice::ParseOptions.parse_options_delete(opts)
52
+ GrammarPolice::Dictionary.destroy(dict)
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: chomchom
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.3.1
5
+ version: 0.4.0
6
6
  platform: ruby
7
7
  authors:
8
8
  - Quan Nguyen
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-05-01 00:00:00 Z
13
+ date: 2011-05-04 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: mechanize
@@ -78,6 +78,17 @@ dependencies:
78
78
  version: "0"
79
79
  type: :runtime
80
80
  version_requirements: *id006
81
+ - !ruby/object:Gem::Dependency
82
+ name: grammar_police
83
+ prerelease: false
84
+ requirement: &id007 !ruby/object:Gem::Requirement
85
+ none: false
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: "0"
90
+ type: :runtime
91
+ version_requirements: *id007
81
92
  description: chomchom extracts article's title, published_date, author, and fulltext. It also detects videos and audio for classifying the media type of a given page
82
93
  email:
83
94
  - mquannie@gmail.com