treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
@@ -0,0 +1,55 @@
|
|
1
|
+
# A wrapper for the 'chronic' gem, which parses
|
2
|
+
# date information.
|
3
|
+
#
|
4
|
+
# Project website: http://chronic.rubyforge.org/
|
5
|
+
class Treat::Extractors::Time::Chronic
|
6
|
+
|
7
|
+
# Require the 'chronic' gem.
|
8
|
+
silence_warnings { require 'chronic' }
|
9
|
+
|
10
|
+
# Require the Ruby DateTime module
|
11
|
+
require 'date'
|
12
|
+
|
13
|
+
# Return the date information contained within
|
14
|
+
# the entity by parsing it with the 'chronic' gem.
|
15
|
+
#
|
16
|
+
# Options: none.
|
17
|
+
# Parse the entity's text with the 'chronic' gem and return the result.
#
# Returns nil without parsing when the text, ignoring surrounding
# whitespace, consists solely of digits. Also returns nil when the
# entity has a parent and remove_time_from_ancestors reports a conflict
# with an ancestor's annotation. Otherwise returns whatever
# ::Chronic.parse produced for the text.
#
# Options: none (the options argument is accepted but not read).
def self.time(entity, options = {})
  s = entity.to_s

  # Anchor on the whole string: ^ and $ match per line, so a multi-line
  # text that merely contained a digits-only line used to be skipped.
  return if s.strip =~ /\A[0-9]+\z/

  time = nil
  silence_warnings do
    time = ::Chronic.parse(s, {:guess => true})
  end

  if entity.has_parent? && remove_time_from_ancestors(entity, time)
    nil
  else
    time
  end
end
|
34
|
+
|
35
|
+
# Keeps the lowest-level time annotations that do
|
36
|
+
# not conflict with a higher time annotation.
|
37
|
+
# Returns true if the entity conflicts with a
|
38
|
+
# higher-level time annotation.
|
39
|
+
# Walk the entity's phrase-level ancestors and reconcile their :time
# annotations with +time+.
#
# Ancestors without a :time feature are skipped. The first ancestor
# whose :time differs from +time+ ends the walk and the method returns
# true (conflict). An ancestor whose :time equals +time+ has that
# feature unset. Returns false when no conflicting ancestor was found.
#
# Ancestors visited before a conflicting one have already had their
# matching :time removed by the time true is returned.
def self.remove_time_from_ancestors(entity, time)
  entity.ancestors_with_type(:phrase).each do |a|
    next unless a.has?(:time)
    return true unless a.get(:time) == time
    a.unset(:time)
  end
  false
end
|
54
|
+
|
55
|
+
end
|
@@ -1,71 +1,95 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
#
|
23
|
-
# Project website: http://naturalinputs.com/
|
24
|
-
class Nickel
|
25
|
-
require 'date'
|
26
|
-
silence_warnings { require 'nickel' }
|
27
|
-
# Extract time information from a bit of text.
|
28
|
-
def self.time(entity, options = {})
|
29
|
-
return nil if entity.to_s.strip == ''
|
30
|
-
n = nil
|
31
|
-
silence_warnings { n = ::Nickel.parse(entity.to_s.strip) }
|
32
|
-
occ = n.occurrences[0]
|
33
|
-
return nil unless occ
|
1
|
+
# A wrapper for the 'nickel' gem, which parses
|
2
|
+
# times and dates and supplies additional information
|
3
|
+
# concerning these. The additional information supplied
|
4
|
+
# that this class annotates entities with is:
|
5
|
+
#
|
6
|
+
# - time_recurrence: frequency of recurrence in words*.
|
7
|
+
# - time_recurrence_interval: frequency of recurrence in days.
|
8
|
+
# - start_time: a DateTime object representing the beginning of
|
9
|
+
# an event.
|
10
|
+
# - end_time: a DateTime object representing the end of an event.
|
11
|
+
#
|
12
|
+
# Examples of values for time_recurrence are:
|
13
|
+
#
|
14
|
+
# - single: "lunch with megan tomorrow at noon"
|
15
|
+
# - daily: "Art exhibit until March 1st"
|
16
|
+
# - weekly: "math class every wed from 8-11am"
|
17
|
+
# - daymonthly: "open bar at joes the first friday of every month"
|
18
|
+
# - datemonthly: "pay credit card bill on the 22nd of each month"
|
19
|
+
#
|
20
|
+
# Project website: http://naturalinputs.com/
|
21
|
+
class Treat::Extractors::Time::Nickel
|
34
22
|
|
35
|
-
|
36
|
-
time_recurrence = rec
|
37
|
-
interval = occ.interval ? occ.interval : :none
|
38
|
-
time_recurrence_interval = interval
|
23
|
+
require 'date'
|
39
24
|
|
40
|
-
|
41
|
-
ds = [s[0].year, s[0].month, s[0].day] if s[0]
|
42
|
-
ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
|
25
|
+
silence_warnings { require 'nickel' }
|
43
26
|
|
44
|
-
|
45
|
-
|
46
|
-
|
27
|
+
# Extract time information from a bit of text.
|
28
|
+
# Extract time information from the entity's text with the 'nickel' gem.
#
# Returns the start of the first occurrence Nickel finds as a DateTime,
# and annotates the entity with :time_recurrence,
# :time_recurrence_interval and, when available, :end_time.
#
# Returns nil (and sets nothing) when:
# - the text, ignoring surrounding whitespace, is only digits;
# - Nickel raises while parsing, or reports no occurrence;
# - the first occurrence has no start date;
# - the entity has a parent and remove_time_from_ancestors returns true.
#
# Options: none (the options argument is accepted but not read).
def self.time(entity, options = {})
  text = entity.to_s.strip

  # Anchor on the whole string: ^ and $ match per line, so a multi-line
  # text that merely contained a digits-only line used to be skipped.
  return if text =~ /\A[0-9]+\z/

  n = nil
  begin
    silence_warnings { n = ::Nickel.parse(text) }
  rescue
    # Best effort: text Nickel cannot handle simply has no time.
    return
  end

  occ = n.occurrences[0]
  return unless occ

  # Nickel calls a non-recurring event 'single'; it is exposed as :once.
  time_recurrence = occ.type.to_s.gsub('single', 'once').intern
  time_recurrence_interval = occ.interval || :none

  start_time = datetime_from(occ.start_date, occ.start_time)
  end_time = datetime_from(occ.end_date, occ.end_time)

  return unless start_time

  if entity.has_parent? &&
     remove_time_from_ancestors(entity, start_time)
    nil
  else
    entity.set :time_recurrence, time_recurrence
    entity.set :time_recurrence_interval, time_recurrence_interval
    entity.set :end_time, end_time if end_time
    start_time
  end
end

# Build a DateTime from a date part (responding to year/month/day) and
# an optional time part (responding to hour/minute/second). Returns nil
# when there is no date part; a missing time part yields midnight.
def self.datetime_from(date, time)
  return unless date
  parts = [date.year, date.month, date.day]
  parts += [time.hour, time.minute, time.second] if time
  ::DateTime.civil(*parts)
end
|
80
|
+
|
81
|
+
# Keeps the lowest-level time annotations that do
|
82
|
+
# not conflict with a higher time annotation.
|
83
|
+
# Returns true if the entity conflicts with a
|
84
|
+
# higher-level time annotation.
|
85
|
+
# Walk the entity's phrase-level ancestors and reconcile their time
# annotations with +time+ (compared through to_s).
#
# Ancestors without a :time feature are skipped. The first ancestor
# whose :time differs from +time+ ends the walk and the method returns
# true (conflict), so the caller leaves the entity unannotated. An
# ancestor whose :time matches has :time, :time_recurrence,
# :time_recurrence_interval and :end_time unset. Returns false when no
# conflicting ancestor was found.
#
# The return values used to be the other way round, contradicting both
# the contract above and the caller in time, which treats a true result
# as "do not annotate".
def self.remove_time_from_ancestors(entity, time)
  entity.ancestors_with_type(:phrase).each do |a|
    next unless a.has?(:time)
    return true unless a.get(:time).to_s == time.to_s
    a.unset(:time, :time_recurrence,
            :time_recurrence_interval, :end_time)
  end
  false
end
|
94
|
+
|
71
95
|
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# A wrapper for Ruby's native date/time parsing.
|
2
|
+
class Treat::Extractors::Time::Ruby
|
3
|
+
|
4
|
+
# Require Ruby's date module.
|
5
|
+
require 'date'
|
6
|
+
|
7
|
+
# Return a DateTime object representing the date/time
|
8
|
+
# contained within the entity, using Ruby's native
|
9
|
+
# date/time parser. This extractor is suitable for the
|
10
|
+
# detection of well-structured dates and times, such as
|
11
|
+
# 2011/02/03 5:00.
|
12
|
+
#
|
13
|
+
# Options: none.
|
14
|
+
# Parse the entity's text with ::DateTime.parse and return the result.
#
# Returns nil without parsing when the text, ignoring surrounding
# whitespace, consists solely of digits. Also returns nil when parsing
# (or the ancestor check) raises, and when the entity has a parent and
# remove_time_from_ancestors reports a conflict.
#
# Options: none (the options argument is accepted but not read).
def self.time(entity, options = {})
  s = entity.to_s

  # Anchor on the whole string: ^ and $ match per line, so a multi-line
  # text that merely contained a digits-only line used to be skipped.
  return if s.strip =~ /\A[0-9]+\z/

  begin
    time = ::DateTime.parse(s)
    if entity.has_parent? &&
       remove_time_from_ancestors(entity, time)
      nil
    else
      time
    end
  rescue
    # Text that is not a recognisable date/time has no time.
    nil
  end
end
|
29
|
+
|
30
|
+
|
31
|
+
# Keeps the lowest-level time annotations that do
|
32
|
+
# not conflict with a higher time annotation.
|
33
|
+
# Returns true if the entity conflicts with a
|
34
|
+
# higher-level time annotation.
|
35
|
+
# Walk the entity's phrase-level ancestors and reconcile their :time
# annotations with +time+.
#
# Ancestors without a :time feature are skipped. The first ancestor
# whose :time differs from +time+ ends the walk and the method returns
# true (conflict). An ancestor whose :time equals +time+ has that
# feature unset. Returns false when no conflicting ancestor was found.
#
# Ancestors visited before a conflicting one have already had their
# matching :time removed by the time true is returned.
def self.remove_time_from_ancestors(entity, time)
  entity.ancestors_with_type(:phrase).each do |a|
    next unless a.has?(:time)
    return true unless a.get(:time) == time
    a.unset(:time)
  end
  false
end
|
52
|
+
|
53
|
+
end
|
@@ -1,63 +1,72 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
#
|
13
|
-
# Project website: https://github.com/ealdent/lda-ruby
|
14
|
-
class LDA
|
15
|
-
# Require the lda-ruby gem.
|
16
|
-
silence_warnings { require 'lda-ruby' }
|
17
|
-
# Monkey patch the TextCorpus class to call it without
|
18
|
-
# having to create any files.
|
19
|
-
Lda::TextCorpus.class_eval do
|
20
|
-
# Ruby, Y U NO SHUT UP!
|
21
|
-
silence_warnings { undef :initialize }
|
22
|
-
# Redefine initialize to take in an array of sections
|
23
|
-
def initialize(sections)
|
24
|
-
super(nil)
|
25
|
-
sections.each do |section|
|
26
|
-
add_document(Lda::TextDocument.new(self, section))
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
30
|
-
# Default options for the LDA algorithm.
|
31
|
-
DefaultOptions = {
|
32
|
-
:num_topics => 20,
|
33
|
-
:words_per_topic => 10,
|
34
|
-
:iterations => 20
|
35
|
-
}
|
36
|
-
# Retrieve the topic words of a collection.
|
37
|
-
def self.topic_words(collection, options = {})
|
38
|
-
options = DefaultOptions.merge(options)
|
39
|
-
# Create a corpus with the collection
|
40
|
-
sections = collection.sections.collect do |t|
|
41
|
-
t.to_s.encode('UTF-8', :invalid => :replace,
|
42
|
-
:undef => :replace, :replace => "?") # Fix
|
43
|
-
end
|
44
|
-
corpus = Lda::TextCorpus.new(sections)
|
1
|
+
# An adapter for the 'lda-ruby' gem, which clusters
|
2
|
+
# documents into topics based on Latent Dirichlet
|
3
|
+
# Allocation.
|
4
|
+
#
|
5
|
+
# Original paper:
|
6
|
+
# Blei, David M., Ng, Andrew Y., and Jordan, Michael
|
7
|
+
# I. 2003. Latent dirichlet allocation. Journal of
|
8
|
+
# Machine Learning Research. 3 (Mar. 2003), 993-1022.
|
9
|
+
#
|
10
|
+
# Project website: https://github.com/ealdent/lda-ruby
|
11
|
+
module Treat::Extractors::TopicWords::LDA
|
45
12
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
13
|
+
# Require the lda-ruby gem.
|
14
|
+
silence_warnings { require 'lda-ruby' }
|
15
|
+
|
16
|
+
# Monkey patch the TextCorpus class to
|
17
|
+
# call it without having to create any files.
|
18
|
+
# Reopen Lda::TextCorpus and replace its constructor with one that is
# handed the section texts directly.
Lda::TextCorpus.class_eval do
|
19
|
+
# Drop the existing initialize inside silence_warnings (presumably to
# suppress the warning Ruby prints when :initialize is undefined).
|
20
|
+
silence_warnings { undef :initialize }
|
21
|
+
# Redefine initialize to take in an
|
22
|
+
# array of sections.
|
23
|
+
def initialize(sections)
|
24
|
+
# Call the parent constructor with nil in place of its usual
# argument (presumably a file name; TODO confirm against lda-ruby).
super(nil)
|
25
|
+
sections.each do |section|
|
26
|
+
# Wrap each section in a TextDocument tied to this corpus.
add_document(
|
27
|
+
Lda::TextDocument.new(self, section))
|
60
28
|
end
|
61
29
|
end
|
62
30
|
end
|
31
|
+
|
32
|
+
# Default options for the LDA algorithm.
|
33
|
+
# Defaults that topic_words merges the caller's options over:
#   :num_topics      - assigned to the Lda object's num_topics.
#   :words_per_topic - argument passed to top_words.
#   :iterations      - assigned to the Lda object's max_iter.
#   :vocabulary      - passed to load_vocabulary when non-nil.
DefaultOptions = {
|
34
|
+
:num_topics => 20,
|
35
|
+
:words_per_topic => 10,
|
36
|
+
:iterations => 20,
|
37
|
+
:vocabulary => nil
|
38
|
+
}
|
39
|
+
|
40
|
+
# Retrieve the topic words of a collection.
|
41
|
+
# Retrieve the topic words of a collection.
#
# Every document of the collection is converted to a string and fed to
# an Lda::TextCorpus, an Lda::Lda model is trained on that corpus with
# em('random'), and the values of top_words are returned.
#
# Options (merged over DefaultOptions):
#   :num_topics      - assigned to lda.num_topics.
#   :iterations      - assigned to lda.max_iter.
#   :words_per_topic - argument passed to lda.top_words.
#   :vocabulary      - when set, passed to lda.load_vocabulary before
#                      the top words are read.
def self.topic_words(collection, options = {})
  options = DefaultOptions.merge(options)

  # Create a corpus with the collection.
  docs = collection.documents.map(&:to_s)
  corpus = Lda::TextCorpus.new(docs)

  # Create an Lda object for training.
  lda = Lda::Lda.new(corpus)
  lda.num_topics = options[:num_topics]
  lda.max_iter = options[:iterations]

  # Run the EM algorithm using random starting points; the call is
  # wrapped in silence_stdout.
  silence_stdout { lda.em('random') }

  # Load the vocabulary.
  lda.load_vocabulary(options[:vocabulary]) if options[:vocabulary]

  # Get the topic words.
  lda.top_words(options[:words_per_topic]).values
end
|
71
|
+
|
63
72
|
end
|
@@ -1,92 +1,105 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
end
|
86
|
-
end
|
87
|
-
ret
|
1
|
+
# A Ruby text categorizer that was trained using
|
2
|
+
# the Reuters news story corpus.
|
3
|
+
#
|
4
|
+
# Copyright 2005 Mark Watson. All rights reserved.
|
5
|
+
# Rewrite for inclusion in Treat by Louis Mullie (2011).
|
6
|
+
#
|
7
|
+
# Original project website:
|
8
|
+
# http://www.markwatson.com/opensource/
|
9
|
+
module Treat::Extractors::Topics::Reuters
|
10
|
+
|
11
|
+
# Require the Nokogiri XML parser.
|
12
|
+
require 'nokogiri'
|
13
|
+
|
14
|
+
# Hashes to hold the topics.
|
15
|
+
@@industry = {}
|
16
|
+
@@region = {}
|
17
|
+
@@topics = {}
|
18
|
+
|
19
|
+
# Get the general topic of the text using
|
20
|
+
# a Reuters-trained model.
|
21
|
+
#
|
22
|
+
# Options: none.
|
23
|
+
# Get the general topics of the text using the Reuters-trained model.
#
# Collects the lowercased stem of every word, plus the lowercased word
# itself when it differs from its stem, then scores that list against
# the industry, region and topics tables. Returns the concatenation of
# the three score_words results, in that order.
#
# Raises Treat::Exception when the text has no words.
#
# Options: none (the options argument is accepted but not read).
def self.topics(text, options = {})
  stems = []
  # NOTE(review): @@reduce is not read anywhere in this module; the
  # assignment is kept in case other code depends on it.
  @@reduce = 0

  words = text.words
  unless words.size > 0
    raise Treat::Exception,
          "Annotator 'topics' requires " +
          "processor 'tokenize'."
  end

  # Plain iteration: collect! used to overwrite the word array in place
  # with the block's results although only the side effect was wanted.
  words.each do |tok|
    stem = tok.stem.downcase
    val = tok.value.downcase
    stems << stem
    stems << val unless stem == val
  end

  get_topics
  score_words(@@industry, stems) +
    score_words(@@region, stems) +
    score_words(@@topics, stems)
end
|
45
|
+
|
46
|
+
# Read the topics from the XML files.
|
47
|
+
# Load the three topic tables from the XML files under Treat.models.
#
# Does nothing when @@industry already has entries, so the files are
# read at most once as long as the industry table is non-empty.
def self.get_topics
  return unless @@industry.empty?
  @@industry = read_xml(Treat.models +
                        'reuters/industry.xml')
  @@region = read_xml(Treat.models +
                      'reuters/region.xml')
  @@topics = read_xml(Treat.models +
                      'reuters/topics.xml')
end
|
56
|
+
|
57
|
+
# Read an XML file and populate a
|
58
|
+
# hash of topics.
|
59
|
+
# Read an XML file and populate a hash of topics.
#
# Each child of the document root that carries a "cat" attribute
# contributes one entry: hash[cat.downcase][name] = score.to_f, where
# name and score are the child's "name" and "score" attributes.
# Children without a "cat" attribute are skipped.
#
# Returns the populated hash, or an empty hash when the document has no
# root element (e.g. an empty file), which used to raise NoMethodError.
def self.read_xml(file_name)
  hash = {}
  doc = Nokogiri::XML(File.read(file_name))
  root = doc.root
  return hash if root.nil?

  root.children.each do |category|
    cat = category["cat"]
    next if cat.nil?
    cat = cat.downcase
    hash[cat] ||= {}
    hash[cat][category["name"]] = category["score"].to_f
  end
  hash
end
|
72
|
+
|
73
|
+
# Score the words by adding the scores
|
74
|
+
# of each word occurrence.
|
75
|
+
# Score each category by adding up the scores of every word occurrence
# in word_list, then keep the categories that best_of_hash retains.
#
# hash maps a category name to a { word => score } table. Totals are
# accumulated under the lowercased category name. Returns the array of
# (lowercased) category names that best_of_hash kept.
#
# The word table is taken from the pair being iterated rather than
# looked up again under the lowercased name, so a category key that is
# not already lowercase no longer raises NoMethodError.
def self.score_words(hash, word_list)
  count_hash = {}
  hash.each do |cat_name, scores|
    key = cat_name.downcase
    count_hash[key] ||= 0
    word_list.each do |word|
      score = scores[word]
      count_hash[key] += score unless score.nil?
    end
  end
  best_of_hash(count_hash).keys
end
|
91
|
+
|
92
|
+
# Retrieve the words with the scores above
|
93
|
+
# cutoff inside the hash of scored words.
|
94
|
+
# Retrieve the entries whose score is strictly above cutoff.
#
# Returns a new hash holding, for every retained key, its score
# multiplied by scale and rounded to two decimal places. The input hash
# is not modified.
def self.best_of_hash(hash, cutoff = 0.0, scale = 1.0)
  hash.each_with_object({}) do |(key, score), ret|
    ret[key] = (score * scale).round(2) if score > cutoff
  end
end
|
104
|
+
|
92
105
|
end
|