treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
@@ -0,0 +1,17 @@
|
|
1
|
+
# This class is a wrapper for the functions included
|
2
|
+
# in the 'linguistics' gem that describe a
|
3
|
+
# number in words in ordinal form.
|
4
|
+
#
|
5
|
+
# Project website: http://deveiate.org/projects/Linguistics/
|
6
|
+
class Treat::Inflectors::Ordinalizers::Linguistics

  require 'treat/loaders/linguistics'

  # Describe a number in words in ordinal form, using the
  # 'linguistics' gem.
  #
  # The language-specific helper is obtained from
  # Treat::Loaders::Linguistics for the number's language,
  # and the number's string form is handed to its
  # 'ordinate' method.
  #
  # Options: none (the options hash is accepted but unused).
  def self.ordinal(number, options = {})
    Treat::Loaders::Linguistics.
    load(number.language).
    ordinate(number.to_s)
  end

end
|
@@ -0,0 +1,160 @@
|
|
1
|
+
# Stem a word using a native Ruby implementation of the
|
2
|
+
# Porter stemming algorithm, ported to Ruby from a
|
3
|
+
# version coded up in Perl. This is a simplified
|
4
|
+
# implementation; for a true and fast Porter stemmer,
|
5
|
+
# see Treat::Inflectors::Stemmers::PorterC.
|
6
|
+
#
|
7
|
+
# Authored by Ray Pereda (raypereda@hotmail.com).
|
8
|
+
# Unknown license.
|
9
|
+
#
|
10
|
+
# Original paper: Porter, 1980. An algorithm for suffix stripping,
|
11
|
+
# Program, Vol. 14, no. 3, pp 130-137,
|
12
|
+
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
|
13
|
+
class Treat::Inflectors::Stemmers::Porter

  # Returns the stem of a word using a native Porter stemmer.
  #
  # The word is converted with to_s and stemmed on a private
  # copy, so the argument is never modified (and frozen strings
  # are accepted). Words shorter than three characters are
  # returned unchanged.
  #
  # Options: none (the options hash is accepted but unused).
  def self.stem(word, options = {})
    # Copy the word and convert it to a string. The explicit
    # dup matters: String#to_s returns the receiver itself, and
    # the steps below modify the string in place (w[0]=, chop!).
    w = word.to_s.dup
    return w if w.length < 3
    # Map initial y to Y so that the patterns
    # never treat it as vowel.
    w[0] = 'Y' if w[0] == ?y
    # Step 1a
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end
    # Step 1b
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/ then w << "e"
        when /([^aeiouylsz])\1$/ then w.chop!
        when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
        end
      end
    end
    # Step 1c: a final y becomes i when the stem has a vowel.
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end
    # Step 2
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_2_LIST[suffix]
      end
    end
    # Step 3
    if w =~
      /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_3_LIST[suffix]
      end
    end
    # Step 4
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      if stem =~ MGR1
        w = stem
      end
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      if stem =~ MGR1
        w = stem
      end
    end
    # Step 5
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
        (stem =~ MEQ1 && stem !~
        /^#{CC}#{V}[^aeiouwxy]$/o)
        w = stem
      end
    end
    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end
    # and turn initial Y back to y
    w[0] = 'y' if w[0] == ?Y
    w
  end

  # Step 2 rewrites: suffix => replacement. Every key must also
  # appear as an alternative in SUFFIX_1_REGEXP below.
  # 'aliti' and 'biliti' are the suffixes defined by the Porter
  # algorithm (e.g. sensibiliti -> sensible).
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }

  # Step 3 rewrites: suffix => replacement.
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }

  # Suffixes handled in step 2 (extended syntax: the
  # whitespace inside the pattern is ignored).
  SUFFIX_1_REGEXP = /(
  ational |
  tional |
  enci |
  anci |
  izer |
  bli |
  alli |
  entli |
  eli |
  ousli |
  ization |
  ation |
  ator |
  alism |
  iveness |
  fulness |
  ousness |
  aliti |
  iviti |
  biliti |
  logi)$/x

  # Suffixes removed in step 4.
  SUFFIX_2_REGEXP = /(
  al |
  ance |
  ence |
  er |
  ic |
  able |
  ible |
  ant |
  ement |
  ment |
  ent |
  ou |
  ism |
  ate |
  iti |
  ous |
  ive |
  ize)$/x

  C = "[^aeiou]" # consonant
  V = "[aeiouy]" # vowel
  CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
  VV = "#{V}(?>[aeiou]*)" # vowel sequence
  MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem

end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Stems words using the 'ruby-stemmer' gem, which
|
2
|
+
# wraps a C version of the Porter stemming algorithm.
|
3
|
+
#
|
4
|
+
# Project website: https://github.com/aurelian/ruby-stemmer
|
5
|
+
# Original paper: Porter, 1980. An algorithm for suffix stripping,
|
6
|
+
# Program, Vol. 14, no. 3, pp 130-137,
|
7
|
+
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
|
8
|
+
module Treat::Inflectors::Stemmers::PorterC

  # Load the 'ruby-stemmer' gem with warnings silenced.
  silence_warnings do
    require 'lingua/stemmer'
  end

  # The 'engtagger' gem conflicts with this gem over the
  # top-level Lingua constant: keep a reference to it under
  # another name, then remove the original constant.
  ::LinguaStemmer = ::Lingua
  Object.send(:remove_const, :Lingua)

  # Stem the word using a full-blown Porter stemmer in C.
  # The word is converted to a string before being stemmed.
  #
  # Options: none (the options hash is accepted but unused).
  def self.stem(word, options = {})
    text = word.to_s
    ::LinguaStemmer.stemmer(text)
  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# Stems a word using the UEA algorithm, implemented
|
2
|
+
# by the 'uea-stemmer' gem.
|
3
|
+
#
|
4
|
+
# "Similar to other stemmers, UEA-Lite operates on a
|
5
|
+
# set of rules which are used as steps. There are two
|
6
|
+
# groups of rules: the first to clean the tokens, and
|
7
|
+
# the second to alter suffixes."
|
8
|
+
#
|
9
|
+
# Project website: https://github.com/ealdent/uea-stemmer
|
10
|
+
# Original paper: Jenkins, Marie-Claire, Smith, Dan,
|
11
|
+
# Conservative stemming for search and indexing, 2005.
|
12
|
+
# http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
|
13
|
+
class Treat::Inflectors::Stemmers::UEA

  # Load the 'uea-stemmer' gem with warnings silenced.
  silence_warnings do
    require 'uea-stemmer'
  end

  # Single shared stemmer instance, created on first use.
  @@stemmer = nil

  # Stems a word using the UEA algorithm, implemented
  # by the 'uea-stemmer' gem. The entity is converted to
  # a string first, and the result is stripped of
  # surrounding whitespace.
  #
  # Options: none (the options hash is accepted but unused).
  def self.stem(entity, options = {})
    @@stemmer = ::UEAStemmer.new unless @@stemmer
    stemmed = @@stemmer.stem(entity.to_s)
    stemmed.strip
  end

end
|
@@ -0,0 +1,308 @@
|
|
1
|
+
# Installer is a dependency manager for languages.
|
2
|
+
#
|
3
|
+
# It can be called by using Treat.install(language).
|
4
|
+
module Treat::Installer

  # Require the Rubygem dependency installer.
  silence_warnings do
    require 'rubygems/dependency_installer'
  end
  require 'treat/downloader'
  require 'treat/dependencies'

  # Package manager command for each platform.
  PackageManagers = {
    :mac => 'port',
    :linux => 'apt-get',
    :windows => 'win-get'
  }

  # Address of the server with the files.
  Server = 'www.louismullie.com'

  # Filenames for the Stanford packages.
  StanfordPackages = {
    :minimal => "stanford-core-nlp-minimal.zip",
    :english => "stanford-core-nlp-english.zip",
    :all => "stanford-core-nlp-all.zip"
  }

  # Absolute paths required for cp and mkdir.
  Paths = {
    :tmp => File.absolute_path(Treat.tmp),
    :bin => File.absolute_path(Treat.bin),
    :models => File.absolute_path(Treat.models)
  }

  # Install required dependencies and optional
  # dependencies for a specific language.
  #
  # Walks the user through four prompted steps: the
  # language-independent gems, the gems for the language,
  # model downloads (only for the gems that are installed),
  # and external binaries. Passing :travis runs the
  # unattended install instead.
  #
  # Raises Treat::Exception when a file cannot be written
  # because of insufficient permissions.
  def self.install(language = :english)

    @@installer = Gem::DependencyInstaller.new

    if language == :travis
      install_travis; return
    end

    lang_class = Treat::Languages.get(language.to_s)
    l = "#{language.to_s.capitalize} language"

    puts
    puts "Treat Installer, v. #{Treat::VERSION.to_s}\n"
    puts

    begin

      title "Install language-independent gem dependencies."

      case prompt(
        "1 - Install all default language-independent dependencies\n" +
        "2 - Select dependencies to install manually\n" +
        "3 - Skip this step", ['1', '2', '3'])
      when '1' then install_dependencies(false)
      when '2' then install_dependencies(true)
      when '3' then puts 'Skipping this step.'
      end

      title "Install gem dependencies for the #{l}.\n"

      dflt = lang_class::RequiredDependencies
      all = dflt + lang_class::OptionalDependencies
      case prompt("1 - Install default dependencies.\n" +
        "2 - Select dependencies to install manually.\n" +
        "3 - Skip this step.", ['1', '2', '3'])
      when '1' then install_language_dependencies(dflt, false)
      when '2' then install_language_dependencies(all, true)
      when '3' then puts 'Skipping this step.'
      end

      Treat::Downloader.show_progress = true

      # If gem is installed only, download models.
      # (find_by_name raises Gem::LoadError otherwise.)
      begin
        Gem::Specification.find_by_name('punkt-segmenter')
        title "Downloading model for the Punkt segmenter for the #{l}."
        download_punkt_models(language)
      rescue Gem::LoadError; end

      # If stanford is installed, download models.
      begin
        Gem::Specification.find_by_name('stanford-core-nlp')
        title "Download Stanford Core NLP JARs and " +
        "model files for the the #{l}.\n\n"
        package = (language == :english) ? :english : :all
        download_stanford(package)
      rescue Gem::LoadError; end

      title "Install external binary libraries " +
      "(requires port, apt-get or win-get).\n"
      puts "Warning: this may take a long amount of time."

      case prompt("1 - Select binaries to install manually.\n" +
        "2 - Skip this step.", ['1', '2'])
      when '1' then install_binaries
      when '2' then puts 'Skipping this step.'
      end

      puts
      puts "-----\nDone!"

    rescue Errno::EACCES => e

      raise Treat::Exception,
      "Couldn't write to file - permission denied (#{e.message}). " +
      "You may need to run Ruby or Rake on sudo."

    end

  end

  # Automated install for Travis CI: installs every gem
  # dependency (language-independent and English) without
  # prompting, then downloads the minimal Stanford package
  # and the English Punkt model.
  def self.install_travis
    dep = (Treat::Languages::English::RequiredDependencies +
    Treat::Languages::English::OptionalDependencies)
    install_dependencies(false)
    install_language_dependencies(dep, false)
    download_stanford(:minimal)
    download_punkt_models(:english)
  end

  # Install the language-independent gems listed in
  # Treat::Dependencies::Gem. Each entry is splatted into
  # (name, version, purpose). When 'optionally' is true,
  # the user is asked before each gem is installed.
  def self.install_dependencies(optionally)

    Treat::Dependencies::Gem.each do |d|
      dep, ver, pur = *d
      install_gem(dep, ver, pur, optionally)
    end

  end

  # Install the supplied list of gem names, without a
  # version or a purpose. When 'optionally' is true, the
  # user is asked before each gem is installed.
  def self.install_language_dependencies(dependencies, optionally)

    puts "No dependencies to install.\n" if dependencies.empty?
    dependencies.each do |dependency|
      install_gem(dependency, nil, nil, optionally)
    end

  end

  # Install the external binaries listed in
  # Treat::Dependencies::Binary through the platform's
  # package manager, asking before each one. When no usable
  # package manager is found, the binaries that are being
  # skipped are listed instead.
  def self.install_binaries

    puts "Warning: this will require authentification."

    platform = detect_platform
    man = PackageManagers[platform]

    if !man
      puts "Cannot find a download manager "+
      "for the #{platform} platform.\n\n"
    else
      # 'hash <cmd>' prints nothing when the command exists.
      unless `hash #{man} 2>&1` == ''
        puts "The '#{man}' command is required "+
        "to install binaries on #{platform}.\n\n"
        man = nil
      end
    end

    unless man
      puts "Skipping installation of the "+
      "following binaries:\n\n"
      # Same list as the install loop below; the bare
      # constant 'Binaries' is not defined in this module.
      Treat::Dependencies::Binary.each do |binary, purpose|
        puts "- #{binary} to #{purpose}"
      end
      return
    end

    Treat::Dependencies::Binary.each do |binary, purpose|
      if prompt("install #{binary} to " +
        "#{purpose} (y/n)", ['y', 'n']) == 'y'
        `sudo #{man} install #{binary}`
      end
    end

  end

  # Download one of the Stanford Core NLP packages (see
  # StanfordPackages), unzip it into tmp/stanford, then copy
  # the JAR files to bin/stanford and the directories to
  # models/stanford, and finally remove tmp/stanford.
  def self.download_stanford(package = :minimal)

    f = StanfordPackages[package]
    loc = Treat::Downloader.download(
    'http', Server, 'treat', f, Treat.tmp)
    puts "- Unzipping package ..."
    dest = File.join(Treat.tmp, 'stanford')
    unzip_stanford(loc, dest)

    model_dir = File.join(Paths[:models], 'stanford')
    bin_dir = File.join(Paths[:bin], 'stanford')
    origin = File.join(Paths[:tmp], 'stanford')

    # Mac hidden files fix.
    mac_remove = File.join(dest, '__MACOSX')
    if File.readable?(mac_remove)
      FileUtils.rm_rf(mac_remove)
    end

    unless File.readable?(bin_dir)
      puts "- Creating directory bin/stanford ..."
      FileUtils.mkdir_p(bin_dir)
    end

    unless File.readable?(model_dir)
      puts "- Creating directory models/stanford ..."
      FileUtils.mkdir_p(model_dir)
    end

    puts "- Copying JAR files to bin/stanford " +
    "and model files to models/stanford ..."
    Dir.glob(File.join(origin, '*')) do |path|
      next if ['.', '..'].include?(path)
      if path.index('jar')
        FileUtils.cp(path, File.join(Paths[:bin],
        'stanford', File.basename(path)))
      elsif FileTest.directory?(path)
        FileUtils.cp_r(path, model_dir)
      end
    end

    puts "- Cleaning up..."
    FileUtils.rm_rf(origin)

  end

  # Download the Punkt segmenter model for a language
  # ("<language>.yaml") and copy it to models/punkt,
  # creating that directory if needed.
  def self.download_punkt_models(language)

    f = "#{language}.yaml"
    dest = "#{Treat.models}punkt/"

    loc = Treat::Downloader.download(
    'http', Server, 'treat/punkt', f, Treat.tmp)

    unless File.readable?(dest)
      puts "- Creating directory models/punkt ..."
      FileUtils.mkdir_p(File.absolute_path(dest))
    end

    puts "- Copying model file to models/punkt ..."
    FileUtils.cp(loc, File.join(Paths[:models], 'punkt', f))

    puts "- Cleaning up..."
    # Paths[:tmp] has no trailing separator (absolute_path
    # strips it), so the server directory must be joined,
    # not concatenated.
    # NOTE(review): presumably the downloader stores its
    # files under tmp/<server> - confirm in Treat::Downloader.
    FileUtils.rm_rf(File.join(Paths[:tmp], Server))

  end

  # NOTE(review): 'private' has no effect on methods
  # defined with 'def self.', so the methods below remain
  # publicly callable; private_class_method would be needed
  # to actually hide them.
  private

  # Number of the next title to print.
  @@n = 1

  # Print out a numbered title.
  def self.title(string)
    puts
    puts "#{@@n}. #{string}"
    puts
    @@n += 1
  end

  # Install a dependency with a supplied purpose
  # but ask the user if she wishes to do so first.
  #
  # The user is only asked when 'optionally' is true;
  # otherwise the gem is installed right away. When the
  # installation fails, the failure is reported and the
  # error is re-raised to the caller.
  def self.install_gem(dependency, version = nil,
    purpose = nil, optionally = false)

    install = false

    begin
      purpose = purpose ? " to #{purpose}" : ''
      if optionally
        if prompt("install #{dependency}#{purpose}",
          ['y', 'n']) == 'y'
          install = true
        end
      else
        puts "\n- Installing #{dependency}#{purpose}."
        install = true
      end
      silence_warnings do
        @@installer.install(dependency, version)
      end if install
    rescue StandardError => error
      # Report first, then re-raise: the caller still sees
      # the error (install relies on Errno::EACCES).
      puts "Couldn't install gem '#{dependency}' " +
      "(#{error.message})."
      raise
    end

  end

  # Unzip a file to the destination path. Parent
  # directories are created as needed, and entries that
  # already exist at the destination are left untouched.
  def self.unzip_stanford(file, destination)

    require 'zip/zip'
    f_path = ''

    Zip::ZipFile.open(file) do |zip_file|
      zip_file.each do |f|
        f_path = File.join(destination, f.name)
        FileUtils.mkdir_p(File.absolute_path(File.dirname(f_path)))
        zip_file.extract(f, f_path) unless File.exist?(f_path)
      end
    end

  end

end
|