chomchom 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.DS_Store +0 -0
- data/chomchom.gemspec +1 -1
- data/lib/chomchom/scorer.rb +26 -2
- data/lib/chomchom/summary.rb +1 -1
- data/lib/chomchom/version.rb +1 -1
- data/tests/.DS_Store +0 -0
- data/tests/scoring.rb +30 -19
- metadata +13 -2
data/.DS_Store
CHANGED
Binary file
|
data/chomchom.gemspec
CHANGED
data/lib/chomchom/scorer.rb
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
#coding: utf-8
|
2
|
+
require 'grammar_police'
|
2
3
|
require 'lingua/stemmer' #https://github.com/aurelian/ruby-stemmer
|
3
4
|
|
4
5
|
module Chomchom
|
5
6
|
class Scorer
|
6
|
-
def score(text, summary, topics)
|
7
|
+
def score(text, summary, topics, dictionary, parse_options)
|
7
8
|
#solve the utf-8 invalid string error
|
8
9
|
ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
|
9
10
|
text = ic.iconv(text + ' ')[0..-2]
|
@@ -38,8 +39,13 @@ module Chomchom
|
|
38
39
|
|
39
40
|
#separating by sentences has the effect of designating each sentence to a section
|
40
41
|
coverages = []
|
42
|
+
copy_taxes = []
|
43
|
+
proper_sentences = 0
|
41
44
|
#ss = summary.downcase.split(/(?:\.+[^\p{Word}])|\n+/).each { |s|
|
42
45
|
ss = summary.downcase.split_sentences.each { |s|
|
46
|
+
#take a flat 30% for every copied sentence
|
47
|
+
copy_taxes.push(0.3) if text.downcase.index(s.gsub(/[^\p{Word}]+$/,'').gsub(/^[^\p{Word}]+/,''))
|
48
|
+
proper_sentences += 1 if GrammarPolice::Sentence.count_linkages(s, dictionary, parse_options) > 0
|
43
49
|
coverages.push(find_coverages(s, tss, stem_topics)) if s.match(/\p{Word}+/)
|
44
50
|
}
|
45
51
|
|
@@ -61,8 +67,16 @@ module Chomchom
|
|
61
67
|
#this treats every sentence as 1 unit (all sentences created equal)
|
62
68
|
#puts "#{covered.size.to_f/tss.size*100}"
|
63
69
|
|
70
|
+
#average tax for all sentences
|
71
|
+
copy_tax = (copy_taxes.size > 0)? (copy_taxes.inject { |sum, t| sum + t}/ss.size) : 0.0
|
72
|
+
|
73
|
+
#calculate grammar tax
|
74
|
+
grammar_tax = grammar_tax(proper_sentences, ss.size)
|
75
|
+
|
64
76
|
#punish for length with the idea of length_tax, no tax below 100 and then progressively increase
|
65
|
-
|
77
|
+
tax = length_tax(summary.size) + copy_tax + grammar_tax
|
78
|
+
puts "total tax =#{tax}"
|
79
|
+
summary_score.to_f/total_score*100*(1-tax)
|
66
80
|
|
67
81
|
#algo weaknesses:
|
68
82
|
#extracted passage from text often scores higher (b/c of exact word matches)
|
@@ -86,6 +100,16 @@ module Chomchom
|
|
86
100
|
end
|
87
101
|
end
|
88
102
|
|
103
|
+
# Computes the fraction of the score to deduct for ungrammatical sentences.
#
# The penalty is based on the number of *invalid* sentences, i.e. the
# sentences that were not counted as proper. The caller passes the count of
# proper sentences, so the invalid count is derived here.
#
# proper_sentences - number of summary sentences counted as grammatical
# total_sentences  - total number of sentences in the summary
#
# Returns a Float tax (0.0 when every sentence is proper).
def grammar_tax(proper_sentences, total_sentences)
  # Clamp at zero so an inconsistent pair of counts can never yield a
  # negative tax (which would act as a bonus).
  invalid_sentences = [total_sentences - proper_sentences, 0].max

  if total_sentences < 3
    invalid_sentences * 0.1 #10% for each invalid sentence
  elsif total_sentences < 5
    invalid_sentences * 0.07 #less harsh
  else
    invalid_sentences.to_f / total_sentences / 3 #ex 1/5 ==> 7%
  end
end
|
112
|
+
|
89
113
|
def find_coverages(summary, text_sentences, topics)
|
90
114
|
terms = []
|
91
115
|
hits = [] #array of indexes of sentences with matched terms
|
data/lib/chomchom/summary.rb
CHANGED
@@ -111,7 +111,7 @@ module Chomchom
|
|
111
111
|
|
112
112
|
#for each topic, select the first sentence that has the topic unless the summary already covers it
|
113
113
|
def self.love_at_first_sight(sentences, topics, length)
|
114
|
-
separator = "
|
114
|
+
separator = "\n"
|
115
115
|
summary = ''
|
116
116
|
t = 0
|
117
117
|
points = []
|
data/lib/chomchom/version.rb
CHANGED
data/tests/.DS_Store
CHANGED
Binary file
|
data/tests/scoring.rb
CHANGED
@@ -13,29 +13,40 @@ File.open('files/summaries.txt', 'r') do |file|
|
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
16
|
+
|
16
17
|
agent = Mechanize.new
|
17
18
|
agent.user_agent = "chomchom request client"
|
18
19
|
fake_summary = "A look into Al Jazeera's difficulties in breaking into the US market, because they are the terrorist mouthpiece post 9/11. However, there are a handful of small cable companies who are unpatriotic by broadcasting it, and it hasn't been well received for its fresh and informative worldview coverage."
|
20
|
+
|
21
|
+
dict = GrammarPolice::Dictionary.create("en")
|
22
|
+
opts = GrammarPolice::ParseOptions.parse_options_create
|
23
|
+
GrammarPolice::ParseOptions.parse_options_set_linkage_limit(opts, 10)
|
24
|
+
GrammarPolice::ParseOptions.parse_options_set_short_length(opts, 5)
|
19
25
|
urls.each_with_index do |url, i|
|
20
|
-
if i==i
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
26
|
+
if i==i
|
27
|
+
agent.get(url)
|
28
|
+
begin
|
29
|
+
html = agent.page.body
|
30
|
+
rescue
|
31
|
+
html = ''
|
32
|
+
end
|
27
33
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
34
|
+
doc = Chomchom::Document.new(html)
|
35
|
+
puts title = doc.title
|
36
|
+
topics = doc.all_topics
|
37
|
+
puts "#{topics}"
|
38
|
+
text = doc.fulltext
|
39
|
+
|
40
|
+
puts summaries[i]
|
35
41
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
42
|
+
topic_words = topics.map { |t| t[0] }
|
43
|
+
scorer = Chomchom::Scorer.new
|
44
|
+
puts scorer.score(text, summaries[i], topic_words, dict, opts)
|
45
|
+
|
46
|
+
#check copy tax
|
47
|
+
puts passage = doc.center_of_gravity()
|
48
|
+
puts scorer.score(text, passage, topic_words, dict, opts)
|
49
|
+
end
|
40
50
|
end
|
41
|
-
|
51
|
+
GrammarPolice::ParseOptions.parse_options_delete(opts)
|
52
|
+
GrammarPolice::Dictionary.destroy(dict)
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: chomchom
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.3.1
|
5
|
+
version: 0.4.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Quan Nguyen
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-05-
|
13
|
+
date: 2011-05-04 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: mechanize
|
@@ -78,6 +78,17 @@ dependencies:
|
|
78
78
|
version: "0"
|
79
79
|
type: :runtime
|
80
80
|
version_requirements: *id006
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
name: grammar_police
|
83
|
+
prerelease: false
|
84
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
85
|
+
none: false
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: "0"
|
90
|
+
type: :runtime
|
91
|
+
version_requirements: *id007
|
81
92
|
description: chomchom extracts article's title, published_date, author, and fulltext. It also detects videos and audio for classifying the media type of a given page
|
82
93
|
email:
|
83
94
|
- mquannie@gmail.com
|