chomchom 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.DS_Store +0 -0
- data/lib/.DS_Store +0 -0
- data/lib/chomchom/.DS_Store +0 -0
- data/lib/chomchom/scorer.rb +10 -4
- data/lib/chomchom/version.rb +1 -1
- data/tests/.DS_Store +0 -0
- data/tests/files/summaries.txt +1 -0
- data/tests/scoring.rb +4 -1
- metadata +2 -2
data/.DS_Store
CHANGED
Binary file
|
data/lib/.DS_Store
CHANGED
Binary file
|
data/lib/chomchom/.DS_Store
CHANGED
Binary file
|
data/lib/chomchom/scorer.rb
CHANGED
@@ -40,12 +40,14 @@ module Chomchom
|
|
40
40
|
#separating by sentences has the effect of designating each sentence to a section
|
41
41
|
coverages = []
|
42
42
|
copy_taxes = []
|
43
|
-
|
43
|
+
improper_grammar = 0 #number of chars violating the grammar rule
|
44
44
|
#ss = summary.downcase.split(/(?:\.+[^\p{Word}])|\n+/).each { |s|
|
45
45
|
ss = summary.downcase.split_sentences.each { |s|
|
46
46
|
#take a flat 30% for every copied sentence
|
47
47
|
copy_taxes.push(0.3) if text.downcase.index(s.gsub(/[^\p{Word}]+$/,'').gsub(/^[^\p{Word}]+/,''))
|
48
|
-
|
48
|
+
|
49
|
+
s_grammar = s.gsub(/[\"\|\(\)\[\]\{\}\<\>]/,',') #replace by comma b/c link-grammar is bad with sentence containing quotes
|
50
|
+
improper_grammar += s.size if GrammarCop::Sentence.count_linkages(s_grammar, dictionary, parse_options) == 0
|
49
51
|
coverages.push(find_coverages(s, tss, stem_topics)) if s.match(/\p{Word}+/)
|
50
52
|
}
|
51
53
|
|
@@ -71,11 +73,15 @@ module Chomchom
|
|
71
73
|
copy_tax = (copy_taxes.size > 0)? (copy_taxes.inject { |sum, t| sum + t}/ss.size) : 0.0
|
72
74
|
|
73
75
|
#calculate grammar tax
|
74
|
-
grammar_tax = grammar_tax(proper_sentences, ss.size)
|
76
|
+
#grammar_tax = grammar_tax(proper_sentences, ss.size)
|
77
|
+
|
78
|
+
#take 30% of the improper portion of the summary (chars in sentences with no valid linkage / total chars)
|
79
|
+
grammar_tax = improper_grammar/summary.size*0.30
|
80
|
+
#puts "grammar tax=#{grammar_tax}"
|
75
81
|
|
76
82
|
#punish for length with the idea of length_tax, no tax below 100 and then progressively increase
|
77
83
|
tax = length_tax(summary.size) + copy_tax + grammar_tax
|
78
|
-
puts "total tax =#{tax}"
|
84
|
+
#puts "total tax =#{tax}"
|
79
85
|
summary_score.to_f/total_score*100*(1-tax)
|
80
86
|
|
81
87
|
#algo weaknesses:
|
data/lib/chomchom/version.rb
CHANGED
data/tests/.DS_Store
CHANGED
Binary file
|
data/tests/files/summaries.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
http://newsroom.intel.com/community/intel_newsroom/blog/2011/05/04/intel-reinvents-transistors-using-new-3-d-structure|||in reference to operate at lower voltage with lower, cost and marks the implementation of this work for the pace of silicon technological announces a major technical breakthrough will also aid in the delivery of more highly intel announces a major to switch" back and decreasing functionality to make current devices smarter and wholly new Transistor, this time utilizing the third dimensional
|
1
2
|
http://www.theatlantic.com/magazine/print/2011/01/hard-core/8327/|||Broadband Internet brings hardcore porn widely accessible to everyone, and thus influences our sex lives. However, online porn, which often caters to predominantly male consumers' desire, poses a threat to men and women equality as hardcore sex is legitimized and becomes the norm in everyone's bedroom
|
2
3
|
http://deborahcampbell.org/writing/politics/the-most-hated-name-in-news/|||A look into Al Jazeera's difficulties in breaking into the US market, mainly caused by major cable networks refusal to carry them as the channel was labeled as the terrorist mouthpiece post 9/11. However, there are a handful of small cable companies who decide to broadcast it, and it has been well received for its fresh and informative worldview coverage.
|
3
4
|
http://www.theatlantic.com/magazine/archive/2003/03/caring-for-your-introvert/2696/|||The self-described introvert author explains introversion and his view on extroverts treatment. He describes extroverts as people energized by other people while introverts find energy in deep thoughts and find socializing exhausted. He believes introverts are misunderstood b/c most of society's activities and expectations are dominated by extroverts.
|
data/tests/scoring.rb
CHANGED
@@ -17,11 +17,13 @@ end
|
|
17
17
|
agent = Mechanize.new
|
18
18
|
agent.user_agent = "chomchom request client"
|
19
19
|
fake_summary = "A look into Al Jazeera's difficulties in breaking into the US market, because they are the terrorist mouthpiece post 9/11. However, there are a handful of small cable companies who are unpatriotic by broadcasting it, and it hasn't been well received for its fresh and informative worldview coverage."
|
20
|
+
scramble_summary = %q<NEWS HIGHly coordination of improved performance and consumer electronics. For the histor. The additional control of current is accomplished by implex with the 22nm 3-D Tri-Gate transistor. The traditional "flat" two-dimensionality and performance, functionality and software like nothing as possible chips to opening the progress of Moore's Law is a forecast for to switch very quickly between>
|
20
21
|
|
21
22
|
dict = GrammarCop::Dictionary.create("en")
|
22
23
|
opts = GrammarCop::ParseOptions.create
|
23
24
|
GrammarCop::ParseOptions.set_linkage_limit(opts, 10)
|
24
25
|
GrammarCop::ParseOptions.set_short_length(opts, 5)
|
26
|
+
GrammarCop::ParseOptions.set_disjunct_cost(opts, 1000)
|
25
27
|
urls.each_with_index do |url, i|
|
26
28
|
if i==i
|
27
29
|
agent.get(url)
|
@@ -42,7 +44,8 @@ urls.each_with_index do |url, i|
|
|
42
44
|
topic_words = topics.map { |t| t[0] }
|
43
45
|
scorer = Chomchom::Scorer.new
|
44
46
|
puts scorer.score(text, summaries[i], topic_words, dict, opts)
|
45
|
-
|
47
|
+
#puts scorer.score(text, scramble_summary, topic_words, dict, opts)
|
48
|
+
|
46
49
|
#check copy tax
|
47
50
|
puts passage = doc.center_of_gravity()
|
48
51
|
puts scorer.score(text, passage, topic_words, dict, opts)
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: chomchom
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.5.1
|
5
|
+
version: 0.5.2
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Quan Nguyen
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-05-
|
13
|
+
date: 2011-05-13 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: mechanize
|