chomchom 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.DS_Store +0 -0
- data/chomchom.gemspec +1 -1
- data/lib/chomchom/scorer.rb +26 -2
- data/lib/chomchom/summary.rb +1 -1
- data/lib/chomchom/version.rb +1 -1
- data/tests/.DS_Store +0 -0
- data/tests/scoring.rb +30 -19
- metadata +13 -2
data/.DS_Store
CHANGED
Binary file
|
data/chomchom.gemspec
CHANGED
data/lib/chomchom/scorer.rb
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
#coding: utf-8
|
2
|
+
require 'grammar_police'
|
2
3
|
require 'lingua/stemmer' #https://github.com/aurelian/ruby-stemmer
|
3
4
|
|
4
5
|
module Chomchom
|
5
6
|
class Scorer
|
6
|
-
def score(text, summary, topics)
|
7
|
+
def score(text, summary, topics, dictionary, parse_options)
|
7
8
|
#solve the utf-8 invalid string error
|
8
9
|
ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
|
9
10
|
text = ic.iconv(text + ' ')[0..-2]
|
@@ -38,8 +39,13 @@ module Chomchom
|
|
38
39
|
|
39
40
|
#separating by sentences has the effect of designating each sentence to a section
|
40
41
|
coverages = []
|
42
|
+
copy_taxes = []
|
43
|
+
proper_sentences = 0
|
41
44
|
#ss = summary.downcase.split(/(?:\.+[^\p{Word}])|\n+/).each { |s|
|
42
45
|
ss = summary.downcase.split_sentences.each { |s|
|
46
|
+
#take a flat 30% for every copied sentence
|
47
|
+
copy_taxes.push(0.3) if text.downcase.index(s.gsub(/[^\p{Word}]+$/,'').gsub(/^[^\p{Word}]+/,''))
|
48
|
+
proper_sentences += 1 if GrammarPolice::Sentence.count_linkages(s, dictionary, parse_options) > 0
|
43
49
|
coverages.push(find_coverages(s, tss, stem_topics)) if s.match(/\p{Word}+/)
|
44
50
|
}
|
45
51
|
|
@@ -61,8 +67,16 @@ module Chomchom
|
|
61
67
|
#this treats every sentence as 1 unit (all sentences created equal)
|
62
68
|
#puts "#{covered.size.to_f/tss.size*100}"
|
63
69
|
|
70
|
+
#average tax for all sentences
|
71
|
+
copy_tax = (copy_taxes.size > 0)? (copy_taxes.inject { |sum, t| sum + t}/ss.size) : 0.0
|
72
|
+
|
73
|
+
#calculate grammar tax
|
74
|
+
grammar_tax = grammar_tax(proper_sentences, ss.size)
|
75
|
+
|
64
76
|
#punish for length with the idea of length_tax, no tax below 100 and then progressively increase
|
65
|
-
|
77
|
+
tax = length_tax(summary.size) + copy_tax + grammar_tax
|
78
|
+
puts "total tax =#{tax}"
|
79
|
+
summary_score.to_f/total_score*100*(1-tax)
|
66
80
|
|
67
81
|
#algo weaknesses:
|
68
82
|
#extracted passage from text often scores higher (b/c of exact word matches)
|
@@ -86,6 +100,16 @@ module Chomchom
|
|
86
100
|
end
|
87
101
|
end
|
88
102
|
|
103
|
+
#computes the share of the score to deduct for ungrammatical summary sentences
#
#proper_sentences: number of summary sentences counted as grammatical
#total_sentences:  total number of summary sentences
#
#returns a Float tax (0.0 when every sentence is proper); the caller adds it
#to the other taxes before scaling the final score
def grammar_tax(proper_sentences, total_sentences)
  #the penalty applies to the sentences that are NOT proper; taxing the proper
  #count directly would punish well-formed summaries and reward broken ones
  #clamp at 0 so an inconsistent pair of counts can never yield a negative tax
  invalid_sentences = [total_sentences - proper_sentences, 0].max

  if total_sentences < 3
    invalid_sentences * 0.1 #10% for each invalid sentence
  elsif total_sentences < 5
    invalid_sentences * 0.07 #less harsh
  else
    invalid_sentences.to_f/total_sentences/3 #ex 1/5 ==> 7%
  end
end
|
112
|
+
|
89
113
|
def find_coverages(summary, text_sentences, topics)
|
90
114
|
terms = []
|
91
115
|
hits = [] #array of indexes of sentences with matched terms
|
data/lib/chomchom/summary.rb
CHANGED
@@ -111,7 +111,7 @@ module Chomchom
|
|
111
111
|
|
112
112
|
#for each topic, select the first sentence that has the topic unless the summary already covers it
|
113
113
|
def self.love_at_first_sight(sentences, topics, length)
|
114
|
-
separator = "
|
114
|
+
separator = "\n"
|
115
115
|
summary = ''
|
116
116
|
t = 0
|
117
117
|
points = []
|
data/lib/chomchom/version.rb
CHANGED
data/tests/.DS_Store
CHANGED
Binary file
|
data/tests/scoring.rb
CHANGED
@@ -13,29 +13,40 @@ File.open('files/summaries.txt', 'r') do |file|
|
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
16
|
+
|
16
17
|
agent = Mechanize.new
|
17
18
|
agent.user_agent = "chomchom request client"
|
18
19
|
fake_summary = "A look into Al Jazeera's difficulties in breaking into the US market, because they are the terrorist mouthpiece post 9/11. However, there are a handful of small cable companies who are unpatriotic by broadcasting it, and it hasn't been well received for its fresh and informative worldview coverage."
|
20
|
+
|
21
|
+
dict = GrammarPolice::Dictionary.create("en")
|
22
|
+
opts = GrammarPolice::ParseOptions.parse_options_create
|
23
|
+
GrammarPolice::ParseOptions.parse_options_set_linkage_limit(opts, 10)
|
24
|
+
GrammarPolice::ParseOptions.parse_options_set_short_length(opts, 5)
|
19
25
|
urls.each_with_index do |url, i|
|
20
|
-
if i==i
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
26
|
+
if i==i
|
27
|
+
agent.get(url)
|
28
|
+
begin
|
29
|
+
html = agent.page.body
|
30
|
+
rescue
|
31
|
+
html = ''
|
32
|
+
end
|
27
33
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
34
|
+
doc = Chomchom::Document.new(html)
|
35
|
+
puts title = doc.title
|
36
|
+
topics = doc.all_topics
|
37
|
+
puts "#{topics}"
|
38
|
+
text = doc.fulltext
|
39
|
+
|
40
|
+
puts summaries[i]
|
35
41
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
42
|
+
topic_words = topics.map { |t| t[0] }
|
43
|
+
scorer = Chomchom::Scorer.new
|
44
|
+
puts scorer.score(text, summaries[i], topic_words, dict, opts)
|
45
|
+
|
46
|
+
#check copy tax
|
47
|
+
puts passage = doc.center_of_gravity()
|
48
|
+
puts scorer.score(text, passage, topic_words, dict, opts)
|
49
|
+
end
|
40
50
|
end
|
41
|
-
|
51
|
+
GrammarPolice::ParseOptions.parse_options_delete(opts)
|
52
|
+
GrammarPolice::Dictionary.destroy(dict)
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: chomchom
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.3.1
|
5
|
+
version: 0.4.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Quan Nguyen
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-05-
|
13
|
+
date: 2011-05-04 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: mechanize
|
@@ -78,6 +78,17 @@ dependencies:
|
|
78
78
|
version: "0"
|
79
79
|
type: :runtime
|
80
80
|
version_requirements: *id006
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
name: grammar_police
|
83
|
+
prerelease: false
|
84
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
85
|
+
none: false
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: "0"
|
90
|
+
type: :runtime
|
91
|
+
version_requirements: *id007
|
81
92
|
description: chomchom extracts article's title, published_date, author, and fulltext. It also detects videos and audio for classifying the media type of a given page
|
82
93
|
email:
|
83
94
|
- mquannie@gmail.com
|