treat 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +4 -4
- data/TODO +21 -54
- data/lib/economist/half_cocked_basel.txt +16 -0
- data/lib/economist/hose_and_dry.doc +0 -0
- data/lib/economist/hungarys_troubles.abw +70 -0
- data/lib/economist/republican_nomination.pdf +0 -0
- data/lib/economist/saving_the_euro.odt +0 -0
- data/lib/economist/to_infinity_and_beyond.txt +15 -0
- data/lib/economist/zero_sum.html +91 -0
- data/lib/treat.rb +58 -72
- data/lib/treat/buildable.rb +59 -15
- data/lib/treat/categories.rb +26 -14
- data/lib/treat/category.rb +2 -2
- data/lib/treat/delegatable.rb +65 -48
- data/lib/treat/doable.rb +44 -0
- data/lib/treat/entities.rb +34 -14
- data/lib/treat/entities/collection.rb +2 -0
- data/lib/treat/entities/document.rb +3 -2
- data/lib/treat/entities/entity.rb +105 -90
- data/lib/treat/entities/phrases.rb +17 -0
- data/lib/treat/entities/tokens.rb +28 -13
- data/lib/treat/entities/zones.rb +20 -0
- data/lib/treat/extractors.rb +49 -11
- data/lib/treat/extractors/coreferences/stanford.rb +68 -0
- data/lib/treat/extractors/date/chronic.rb +32 -0
- data/lib/treat/extractors/date/ruby.rb +25 -0
- data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
- data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
- data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
- data/lib/treat/extractors/language/what_language.rb +49 -0
- data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
- data/lib/treat/extractors/roles/naive.rb +73 -0
- data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
- data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
- data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
- data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
- data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
- data/lib/treat/extractors/time/nickel.rb +30 -12
- data/lib/treat/extractors/topic_words/lda.rb +9 -9
- data/lib/treat/extractors/topics/reuters.rb +14 -15
- data/lib/treat/extractors/topics/reuters/region.xml +1 -0
- data/lib/treat/features.rb +7 -0
- data/lib/treat/formatters/readers/abw.rb +6 -1
- data/lib/treat/formatters/readers/autoselect.rb +5 -6
- data/lib/treat/formatters/readers/doc.rb +3 -1
- data/lib/treat/formatters/readers/html.rb +1 -1
- data/lib/treat/formatters/readers/image.rb +43 -0
- data/lib/treat/formatters/readers/odt.rb +1 -2
- data/lib/treat/formatters/readers/pdf.rb +9 -1
- data/lib/treat/formatters/readers/xml.rb +40 -0
- data/lib/treat/formatters/serializers/xml.rb +50 -14
- data/lib/treat/formatters/serializers/yaml.rb +7 -2
- data/lib/treat/formatters/unserializers/xml.rb +33 -7
- data/lib/treat/formatters/visualizers/dot.rb +90 -20
- data/lib/treat/formatters/visualizers/short_value.rb +2 -2
- data/lib/treat/formatters/visualizers/standoff.rb +2 -2
- data/lib/treat/formatters/visualizers/tree.rb +1 -1
- data/lib/treat/formatters/visualizers/txt.rb +13 -4
- data/lib/treat/group.rb +16 -10
- data/lib/treat/helpers/linguistics_loader.rb +18 -0
- data/lib/treat/inflectors.rb +10 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
- data/lib/treat/inflectors/declensions/english.rb +319 -0
- data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
- data/lib/treat/install.rb +59 -0
- data/lib/treat/kernel.rb +18 -8
- data/lib/treat/languages.rb +18 -11
- data/lib/treat/languages/arabic.rb +4 -2
- data/lib/treat/languages/chinese.rb +6 -2
- data/lib/treat/languages/dutch.rb +16 -0
- data/lib/treat/languages/english.rb +47 -19
- data/lib/treat/languages/french.rb +8 -5
- data/lib/treat/languages/german.rb +9 -6
- data/lib/treat/languages/greek.rb +16 -0
- data/lib/treat/languages/italian.rb +6 -3
- data/lib/treat/languages/polish.rb +16 -0
- data/lib/treat/languages/portuguese.rb +16 -0
- data/lib/treat/languages/russian.rb +16 -0
- data/lib/treat/languages/spanish.rb +16 -0
- data/lib/treat/languages/swedish.rb +16 -0
- data/lib/treat/languages/tags.rb +377 -0
- data/lib/treat/lexicalizers.rb +34 -23
- data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
- data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
- data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
- data/lib/treat/lexicalizers/tag/brill.rb +35 -40
- data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
- data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
- data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
- data/lib/treat/processors.rb +8 -8
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +114 -99
- data/lib/treat/processors/parsers/stanford.rb +109 -41
- data/lib/treat/processors/segmenters/punkt.rb +17 -18
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
- data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
- data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
- data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
- data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
- data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
- data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
- data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
- data/lib/treat/processors/segmenters/stanford.rb +38 -37
- data/lib/treat/processors/segmenters/tactful.rb +5 -4
- data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
- data/lib/treat/processors/tokenizers/perl.rb +2 -2
- data/lib/treat/processors/tokenizers/punkt.rb +6 -2
- data/lib/treat/processors/tokenizers/stanford.rb +25 -24
- data/lib/treat/processors/tokenizers/tactful.rb +1 -2
- data/lib/treat/proxies.rb +2 -35
- data/lib/treat/registrable.rb +17 -22
- data/lib/treat/sugar.rb +11 -11
- data/lib/treat/tree.rb +27 -17
- data/lib/treat/viewable.rb +29 -0
- data/lib/treat/visitable.rb +1 -1
- data/test/tc_entity.rb +56 -49
- data/test/tc_extractors.rb +41 -18
- data/test/tc_formatters.rb +7 -8
- data/test/tc_inflectors.rb +19 -24
- data/test/tc_lexicalizers.rb +12 -19
- data/test/tc_processors.rb +26 -12
- data/test/tc_resources.rb +2 -7
- data/test/tc_treat.rb +20 -22
- data/test/tc_tree.rb +4 -4
- data/test/tests.rb +3 -5
- data/test/texts.rb +13 -14
- data/tmp/INFO +1 -0
- metadata +78 -158
- data/bin/INFO +0 -1
- data/examples/benchmark.rb +0 -81
- data/examples/keywords.rb +0 -148
- data/lib/treat/detectors.rb +0 -31
- data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
- data/lib/treat/detectors/format/file.rb +0 -36
- data/lib/treat/detectors/language/what_language.rb +0 -29
- data/lib/treat/entities/constituents.rb +0 -15
- data/lib/treat/entities/sentence.rb +0 -8
- data/lib/treat/extractors/named_entity/abner.rb +0 -20
- data/lib/treat/extractors/named_entity/stanford.rb +0 -174
- data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
- data/lib/treat/extractors/time/chronic.rb +0 -20
- data/lib/treat/extractors/time/native.rb +0 -18
- data/lib/treat/formatters/readers/gocr.rb +0 -26
- data/lib/treat/formatters/readers/ocropus.rb +0 -31
- data/lib/treat/formatters/visualizers/html.rb +0 -13
- data/lib/treat/formatters/visualizers/inspect.rb +0 -20
- data/lib/treat/inflectors/declensions/en.rb +0 -18
- data/lib/treat/languages/categories.rb +0 -5
- data/lib/treat/languages/english/categories.rb +0 -23
- data/lib/treat/languages/english/tags.rb +0 -352
- data/lib/treat/languages/xinhua.rb +0 -12
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
- data/lib/treat/string.rb +0 -5
- data/test/tc_detectors.rb +0 -26
@@ -2,20 +2,13 @@ module Treat
|
|
2
2
|
module Extractors
|
3
3
|
module Statistics
|
4
4
|
class FrequencyIn
|
5
|
-
DefaultOptions = {
|
6
|
-
|
5
|
+
DefaultOptions = { :parent => nil }
|
6
|
+
# Find the frequency of a given string value.
|
7
|
+
def self.statistics(entity, options = {})
|
7
8
|
options = DefaultOptions.merge(options)
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
0
|
12
|
-
else
|
13
|
-
entity.token_registry(options[:type])[:value][w].size
|
14
|
-
end
|
15
|
-
else
|
16
|
-
raise Treat::Exception,
|
17
|
-
'Cannot get the frequency of a non-terminal entity.'
|
18
|
-
end
|
9
|
+
tr = entity.token_registry(options[:parent])
|
10
|
+
tv = tr[:value][entity.value]
|
11
|
+
tv ? tv.size : 1
|
19
12
|
end
|
20
13
|
end
|
21
14
|
end
|
@@ -1,34 +1,102 @@
|
|
1
1
|
module Treat
|
2
2
|
module Extractors
|
3
3
|
module Statistics
|
4
|
-
# "The term count in the given document is simply the
|
5
|
-
# number of times a given term appears in that document.
|
6
|
-
# This count is usually normalized to prevent a bias
|
7
|
-
# towards longer documents (which may have a higher
|
8
|
-
# term count regardless of the actual importance of
|
9
|
-
# that term in the document) to give a measure of the
|
10
|
-
# importance of the term t within the particular document d.
|
11
|
-
# Thus we have the term frequency tf(t,d), defined in the
|
4
|
+
# "The term count in the given document is simply the
|
5
|
+
# number of times a given term appears in that document.
|
6
|
+
# This count is usually normalized to prevent a bias
|
7
|
+
# towards longer documents (which may have a higher
|
8
|
+
# term count regardless of the actual importance of
|
9
|
+
# that term in the document) to give a measure of the
|
10
|
+
# importance of the term t within the particular document d.
|
11
|
+
# Thus we have the term frequency tf(t,d), defined in the
|
12
12
|
# simplest case as the occurrence count of a term in a document.
|
13
|
-
#
|
14
|
-
# The inverse document frequency is a measure of the general
|
15
|
-
# importance of the term (obtained by dividing the total number
|
16
|
-
# of documents by the number of documents containing the term,
|
13
|
+
#
|
14
|
+
# The inverse document frequency is a measure of the general
|
15
|
+
# importance of the term (obtained by dividing the total number
|
16
|
+
# of documents by the number of documents containing the term,
|
17
17
|
# and then taking the logarithm of that quotient)."
|
18
18
|
#
|
19
19
|
# (From Wikipedia)
|
20
20
|
class TfIdf
|
21
|
-
DefaultOptions = {
|
21
|
+
DefaultOptions = {
|
22
|
+
:tf => :natural,
|
23
|
+
:idf => :logarithm,
|
24
|
+
:remove_common_words => true,
|
25
|
+
:precision => 4
|
26
|
+
}
|
27
|
+
Algorithms = {
|
28
|
+
:tf => {
|
29
|
+
:natural => lambda { |tf| tf },
|
30
|
+
:logarithm => lambda { |tf| Math.log(1 + tf) },
|
31
|
+
:sqrt =>lambda { |tf| Math.sqrt(tf) }
|
32
|
+
},
|
33
|
+
:idf => {
|
34
|
+
:logarithm => lambda { |n,df| Math.log(n/(1 + df)) },
|
35
|
+
:none => lambda { |n,idf| 1 }
|
36
|
+
}
|
37
|
+
}
|
38
|
+
# Optimization caches for tf idf.
|
39
|
+
@@n = {} # Number of documents in the collection (n).
|
40
|
+
@@df= {} # Number of documents that have a given value (document count).
|
41
|
+
@@f = {} # Number of times a word appears in a given document (term count).
|
42
|
+
@@wc = {} # Number of words in a given document (word count).
|
43
|
+
@@cw = {} # Common words to filter out.
|
22
44
|
def self.statistics(entity, options={})
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
45
|
+
l = Treat::Languages.get(entity.language)
|
46
|
+
if l.const_defined?(:CommonWords)
|
47
|
+
@@cw[entity.language] = l.const_get(:CommonWords)
|
48
|
+
return 0 if @@cw[entity.language].include?(entity.value)
|
49
|
+
end
|
50
|
+
return 0 if entity.value.length <= 2
|
51
|
+
options = DefaultOptions.merge(options)
|
52
|
+
lambdas = options.partition do |k,v|
|
53
|
+
[:tf, :idf, :normalization].include?(k)
|
54
|
+
end[0]
|
55
|
+
lambdas.each do |opt,val|
|
56
|
+
if opt.is_a?(Symbol)
|
57
|
+
if Algorithms[opt][val]
|
58
|
+
options[opt] = Algorithms[opt][val]
|
59
|
+
else
|
60
|
+
raise Treat::Exception,
|
61
|
+
"The specified algorithm '#{val}' "+
|
62
|
+
"to calculate #{opt} does not exist."
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
collection = entity.parent_collection
|
67
|
+
document = entity.parent_document
|
68
|
+
dc = collection.document_count
|
69
|
+
if !collection || !document
|
70
|
+
raise Treat::Exception,
|
71
|
+
"Tf*Idf requires a collection with documents."
|
72
|
+
end
|
73
|
+
val = entity.value.downcase
|
74
|
+
@@n[collection.id] = dc if @@n[collection.id].nil?
|
75
|
+
@@df[collection.id] ||= {}
|
76
|
+
if @@df[collection.id][val].nil?
|
77
|
+
df = 0
|
78
|
+
collection.each_document do |doc|
|
79
|
+
@@f[doc.id] ||= {}
|
80
|
+
if @@f[doc.id][val].nil?
|
81
|
+
@@f[doc.id][val] =
|
82
|
+
doc.token_registry[:value][val] ?
|
83
|
+
doc.token_registry[:value][val].size : 0
|
84
|
+
end
|
85
|
+
df += 1 if @@f[doc.id][val] > 0
|
86
|
+
end
|
87
|
+
@@df[collection.id][val] = df
|
88
|
+
end
|
89
|
+
f = @@f[document.id][entity.value].to_f
|
90
|
+
df = @@df[collection.id][entity.value].to_f
|
91
|
+
tf = options[:tf].call(f).to_f
|
92
|
+
if options[:normalize_word_count]
|
93
|
+
@@wc[document.id] ||= document.word_count
|
94
|
+
tf /= @@wc[document.id]
|
29
95
|
end
|
30
|
-
|
31
|
-
|
96
|
+
n = @@n[collection.id].to_f
|
97
|
+
idf = options[:idf].call(n, df)
|
98
|
+
tf_idf = tf * idf
|
99
|
+
tf_idf.abs.round(options[:precision])
|
32
100
|
end
|
33
101
|
end
|
34
102
|
end
|
@@ -4,11 +4,11 @@ module Treat
|
|
4
4
|
# Experimental algorithm to generate transition matrices.
|
5
5
|
class TransitionMatrix
|
6
6
|
DefaultOptions = {
|
7
|
-
normalize
|
8
|
-
features
|
9
|
-
condition
|
10
|
-
entity_types
|
11
|
-
relationships
|
7
|
+
:normalize => true,
|
8
|
+
:features => [:tag],
|
9
|
+
:condition => lambda { |e| true },
|
10
|
+
:entity_types => [:word],
|
11
|
+
:relationships => [:parent, :right, :children]
|
12
12
|
}
|
13
13
|
# Find the transition matrix.
|
14
14
|
def self.statistics(entity, options={})
|
@@ -34,7 +34,7 @@ module Treat
|
|
34
34
|
next unless options[:condition].call(target)
|
35
35
|
|
36
36
|
# Initialize the empty transition matrix.
|
37
|
-
|
37
|
+
|
38
38
|
|
39
39
|
# Calculate the transition probabilities.
|
40
40
|
options[:features].each do |f1|
|
@@ -57,16 +57,16 @@ module Treat
|
|
57
57
|
end
|
58
58
|
end
|
59
59
|
|
60
|
-
tm[f1][v1][:
|
60
|
+
tm[f1][v1][:dependency] = empty.call
|
61
61
|
|
62
|
-
target.
|
62
|
+
target.dependencies.each do |dependency|
|
63
63
|
s = target.ancestor_with_type :sentence
|
64
64
|
if s
|
65
|
-
x = s.find(
|
65
|
+
x = s.find(dependency.target)
|
66
66
|
next unless relative.has?(f2)
|
67
67
|
v2 = x.send(f2)
|
68
|
-
tm[f1][v1][:
|
69
|
-
tm[f1][v1][:
|
68
|
+
tm[f1][v1][:dependency][f2][v2] ||= 0.0
|
69
|
+
tm[f1][v1][:dependency][f2][v2] += 1.0
|
70
70
|
end
|
71
71
|
end
|
72
72
|
|
@@ -34,14 +34,14 @@ module Treat
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
-
entity.
|
37
|
+
entity.dependencies.each do |dependency|
|
38
38
|
s = entity.ancestor_with_type :sentence
|
39
39
|
if s
|
40
|
-
x = s.find(
|
40
|
+
x = s.find(dependency.target)
|
41
41
|
next unless h.has?(f2)
|
42
42
|
v2 = x.send(f2)
|
43
|
-
if tm[f1][v1][:
|
44
|
-
score += tm[f1][v1][:
|
43
|
+
if tm[f1][v1][:dependency][f2][v2]
|
44
|
+
score += tm[f1][v1][:dependency][f2][v2]
|
45
45
|
count += 1
|
46
46
|
end
|
47
47
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Treat
|
2
2
|
module Extractors
|
3
3
|
module Time
|
4
|
-
# A wrapper for the 'nickel' gem, which parses
|
4
|
+
# A wrapper for the 'nickel' gem, which parses
|
5
5
|
# times and dates and supplies additional information
|
6
6
|
# concerning these. The additional information supplied
|
7
7
|
# that this class annotates entities with is:
|
@@ -11,7 +11,7 @@ module Treat
|
|
11
11
|
# - start_time: a DateTime object representing the beginning of
|
12
12
|
# an event.
|
13
13
|
# - end_time: a DateTime object representing the end of an event.
|
14
|
-
#
|
14
|
+
#
|
15
15
|
# Examples of values for time_recurrence are:
|
16
16
|
#
|
17
17
|
# - single: "lunch with megan tomorrow at noon"
|
@@ -19,33 +19,51 @@ module Treat
|
|
19
19
|
# - weekly: "math class every wed from 8-11am"
|
20
20
|
# - daymonthly: "open bar at joes the first friday of every month"
|
21
21
|
# - datemonthly: "pay credit card bill on the 22nd of each month"
|
22
|
-
#
|
22
|
+
#
|
23
23
|
# Project website: http://naturalinputs.com/
|
24
|
-
|
24
|
+
class Nickel
|
25
25
|
require 'date'
|
26
26
|
silence_warnings { require 'nickel' }
|
27
27
|
# Extract time information from a bit of text.
|
28
28
|
def self.time(entity, options = {})
|
29
|
-
|
29
|
+
return nil if entity.to_s.strip == ''
|
30
|
+
n = nil
|
31
|
+
silence_warnings { n = ::Nickel.parse(entity.to_s.strip) }
|
30
32
|
occ = n.occurrences[0]
|
33
|
+
return nil unless occ
|
31
34
|
|
32
35
|
rec = occ.type.to_s.gsub('single', 'once').intern
|
33
|
-
|
36
|
+
time_recurrence = rec
|
34
37
|
interval = occ.interval ? occ.interval : :none
|
35
|
-
|
38
|
+
time_recurrence_interval = interval
|
36
39
|
|
37
40
|
s = [occ.start_date, occ.start_time]
|
38
41
|
ds = [s[0].year, s[0].month, s[0].day] if s[0]
|
39
|
-
|
42
|
+
ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
|
40
43
|
|
41
44
|
e = [occ.end_date, occ.end_time]
|
42
45
|
de = [e[0].year, e[0].month, e[0].day] if e[0]
|
43
|
-
|
46
|
+
te = [e[1].hour, e[1].minute, e[1].second] if e[1]
|
47
|
+
|
48
|
+
start_time = ::DateTime.civil(*ds) if ds && !ts
|
49
|
+
start_time = ::DateTime.civil(*ds, *ts) if ds && ts
|
50
|
+
end_time = ::DateTime.civil(*de) if de && !te
|
51
|
+
end_time = ::DateTime.civil(*de, *te) if de && te
|
44
52
|
|
45
|
-
|
46
|
-
|
53
|
+
time = Treat::Features::Time.new( # Fix - time message.
|
54
|
+
start_time, end_time, time_recurrence,
|
55
|
+
time_recurrence_interval
|
56
|
+
)
|
47
57
|
|
48
|
-
|
58
|
+
# Keeps the lowest-level time annotations
|
59
|
+
# that do not conflict with the highest-level
|
60
|
+
# time annotation.
|
61
|
+
entity.ancestors_with_type(:phrase).each do |a|
|
62
|
+
unless a.id == entity.id || a.children[0].size == 0
|
63
|
+
a.unset(:time)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
time
|
49
67
|
end
|
50
68
|
end
|
51
69
|
end
|
@@ -29,26 +29,26 @@ module Treat
|
|
29
29
|
end
|
30
30
|
# Default options for the LDA algorithm.
|
31
31
|
DefaultOptions = {
|
32
|
-
|
33
|
-
words_per_topic
|
34
|
-
iterations
|
32
|
+
:num_topics => 20,
|
33
|
+
:words_per_topic => 10,
|
34
|
+
:iterations => 20
|
35
35
|
}
|
36
36
|
# Retrieve the topic words of a collection.
|
37
37
|
def self.topic_words(collection, options = {})
|
38
38
|
options = DefaultOptions.merge(options)
|
39
39
|
# Create a corpus with the collection
|
40
40
|
sections = collection.sections.collect do |t|
|
41
|
-
t.to_s.
|
41
|
+
t.to_s.encode('UTF-8', :invalid => :replace,
|
42
|
+
:undef => :replace, :replace => "?") # Fix
|
42
43
|
end
|
43
44
|
corpus = Lda::TextCorpus.new(sections)
|
44
45
|
|
45
46
|
# Create an Lda object for training
|
46
47
|
lda = Lda::Lda.new(corpus)
|
47
|
-
lda.num_topics = options[:
|
48
|
+
lda.num_topics = options[:num_topics]
|
48
49
|
lda.max_iter = options[:iterations]
|
49
50
|
# Run the EM algorithm using random starting points
|
50
|
-
|
51
|
-
|
51
|
+
silence_stdout { lda.em('random') }
|
52
52
|
# Load the vocabulary.
|
53
53
|
if options[:vocabulary]
|
54
54
|
lda.load_vocabulary(options[:vocabulary])
|
@@ -57,8 +57,8 @@ module Treat
|
|
57
57
|
# Get the topic words and annotate the section.
|
58
58
|
topic_words = lda.top_words(options[:words_per_topic])
|
59
59
|
|
60
|
-
|
61
|
-
|
60
|
+
collection.each_word do |word|
|
61
|
+
topic_words.each do |i, words|
|
62
62
|
if words.include?(word)
|
63
63
|
word.set :is_topic_word?, true
|
64
64
|
word.set :topic_id, i
|
@@ -22,24 +22,27 @@ module Treat
|
|
22
22
|
def self.topics(text, options = {})
|
23
23
|
stems = []
|
24
24
|
@@reduce = 0
|
25
|
-
text.
|
25
|
+
unless text.words.size > 0
|
26
|
+
raise Treat::Exception,
|
27
|
+
"Annotator 'topics' requires processor 'tokenize'."
|
28
|
+
end
|
29
|
+
text.words.collect! do |tok|
|
26
30
|
stem = tok.stem.downcase
|
27
31
|
val = tok.value.downcase
|
28
32
|
stems << stem
|
29
33
|
unless stem == val
|
30
34
|
stems << val
|
31
|
-
@@reduce += 1
|
32
35
|
end
|
33
36
|
end
|
34
37
|
get_topics
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
+
score_words(@@industry, stems) +
|
39
|
+
score_words(@@region, stems) +
|
40
|
+
score_words(@@topics, stems)
|
38
41
|
#Treat::Feature.new(topics)
|
39
42
|
end
|
40
43
|
# Read the topics from the XML files.
|
41
44
|
def self.get_topics
|
42
|
-
return unless @@industry.
|
45
|
+
return unless @@industry.size == 0
|
43
46
|
@@industry = read_xml(Treat.lib + '/treat/extractors/topics/reuters/industry.xml')
|
44
47
|
@@region = read_xml(Treat.lib + '/treat/extractors/topics/reuters/region.xml')
|
45
48
|
@@topics = read_xml(Treat.lib + '/treat/extractors/topics/reuters/topics.xml')
|
@@ -65,21 +68,17 @@ module Treat
|
|
65
68
|
count_hash[cat_name] ||= 0
|
66
69
|
word_list.each do |word|
|
67
70
|
unless hash[cat_name][word].nil?
|
68
|
-
count_hash[cat_name]
|
69
|
-
count_hash[cat_name] +
|
71
|
+
count_hash[cat_name] +=
|
70
72
|
hash[cat_name][word]
|
71
73
|
end
|
72
74
|
end
|
73
75
|
end
|
74
|
-
count_hash = best_of_hash(count_hash
|
75
|
-
|
76
|
-
100.0 / (1 + word_list.size.to_f - @@reduce.to_f))
|
77
|
-
count_hash
|
76
|
+
count_hash = best_of_hash(count_hash)
|
77
|
+
count_hash.keys
|
78
78
|
end
|
79
|
-
def self.best_of_hash(hash, cutoff =
|
80
|
-
cutoff = 1 if cutoff == 0
|
79
|
+
def self.best_of_hash(hash, cutoff = 0.0, scale = 1.0)
|
81
80
|
ret = {}
|
82
|
-
hash.keys.each
|
81
|
+
hash.keys.each do |key|
|
83
82
|
if hash[key] > cutoff
|
84
83
|
ret[key] = hash[key] * scale
|
85
84
|
ret[key] = ret[key].round(2)
|
@@ -13437,6 +13437,7 @@
|
|
13437
13437
|
<word cat="CANADA" name="nuinsco" score="1.000000" />
|
13438
13438
|
<word cat="CANADA" name="noverco" score="1.000000" />
|
13439
13439
|
<word cat="CANADA" name="enscor" score="1.000000" />
|
13440
|
+
<word cat="CANADA" name="ottawa" score="1.000000" />
|
13440
13441
|
<word cat="CANADA" name="winnipegg" score="1.000000" />
|
13441
13442
|
<word cat="CANADA" name="mantadoc" score="1.000000" />
|
13442
13443
|
<word cat="CANADA" name="canmar" score="1.000000" />
|
@@ -18,7 +18,12 @@ module Treat
|
|
18
18
|
end
|
19
19
|
def text(s)
|
20
20
|
if s != 'AbiWord' && s != 'application/x-abiword'
|
21
|
-
|
21
|
+
s.strip!
|
22
|
+
if s.length > 0
|
23
|
+
s += ' '
|
24
|
+
s += "\n\n" if s.length < 60
|
25
|
+
end
|
26
|
+
@plain_text << s
|
22
27
|
end
|
23
28
|
end
|
24
29
|
end
|
@@ -6,10 +6,8 @@ module Treat
|
|
6
6
|
# the appropriate reader based on the file
|
7
7
|
# extension of the supplied document.
|
8
8
|
class Autoselect
|
9
|
-
# A list of image extensions that should be routed to
|
9
|
+
# A list of image extensions that should be routed to Ocropus.
|
10
10
|
ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
|
11
|
-
# Default options.
|
12
|
-
DefaultOptions = {:ocr => :ocropus}
|
13
11
|
# Select the appropriate reader based on the format
|
14
12
|
# of the filename in document.
|
15
13
|
#
|
@@ -17,19 +15,20 @@ module Treat
|
|
17
15
|
#
|
18
16
|
# - :ocr_engine => :ocropus or :gocr (the OCR engine to use).
|
19
17
|
def self.read(document, options)
|
20
|
-
options = DefaultOptions.merge(options)
|
21
18
|
ext = document.file.split('.')[-1]
|
22
|
-
reader = ImageExtensions.include?(ext) ? '
|
19
|
+
reader = ImageExtensions.include?(ext) ? 'image' : ext
|
23
20
|
reader = 'html' if reader == 'htm'
|
24
21
|
reader = 'yaml' if reader == 'yml'
|
25
22
|
begin
|
26
23
|
r = Treat::Formatters::Readers.const_get(cc(reader))
|
27
|
-
rescue NameError
|
24
|
+
rescue NameError
|
28
25
|
puts e.message
|
29
26
|
raise Treat::Exception,
|
30
27
|
"Cannot find a reader for format: '#{ext}'."
|
31
28
|
end
|
32
29
|
document = r.read(document, options)
|
30
|
+
document.set :encoding, document.to_s.encoding.to_s.downcase
|
31
|
+
document
|
33
32
|
end
|
34
33
|
end
|
35
34
|
end
|