treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -0,0 +1,17 @@
1
# Wrapper around the functions of the 'linguistics' gem
# that describe a number in words, in ordinal form.
#
# Project website: http://deveiate.org/projects/Linguistics/
class Treat::Inflectors::Ordinalizers::Linguistics

  require 'treat/loaders/linguistics'

  # Describe a number in words in ordinal form, using the
  # 'linguistics' gem.
  #
  # number  - object responding to #language and #to_s.
  # options - accepted but not read by this method.
  #
  # Returns whatever `ordinate` returns on the class loaded
  # for the number's language.
  def self.ordinal(number, options = {})
    Treat::Loaders::Linguistics
      .load(number.language)
      .ordinate(number.to_s)
  end

end
@@ -0,0 +1,160 @@
1
+ # Stem a word using a native Ruby implementation of the
2
+ # Porter stemming algorithm, ported to Ruby from a
3
+ # version coded up in Perl. This is a simplified
4
+ # implementation; for a true and fast Porter stemmer,
5
+ # see Treat::Inflectors::Stemmers::PorterC.
6
+ #
7
+ # Authored by Ray Pereda (raypereda@hotmail.com).
8
+ # Unknown license.
9
+ #
10
+ # Original paper: Porter, 1980. An algorithm for suffix stripping,
11
+ # Program, Vol. 14, no. 3, pp 130-137,
12
+ # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
13
+ class Treat::Inflectors::Stemmers::Porter
14
+
15
# Returns the stem of a word using a native Porter stemmer.
#
# word    - the word to stem; anything responding to #to_s.
# options - accepted for interface compatibility; none are read.
#
# Returns a new String holding the stem. The argument itself is
# never modified, even when it is a String (frozen or not).
def self.stem(word, options = {})
  # String#to_s returns the receiver itself, so take a copy:
  # the in-place edits below (w[0]=, chop!, <<) must not leak
  # into the caller's string.
  w = word.to_s.dup
  return w if w.length < 3

  # Map initial y to Y so that the patterns
  # never treat it as vowel.
  w[0] = 'Y' if w[0, 1] == 'y'

  # Step 1a
  if w =~ /(ss|i)es$/
    w = $` + $1
  elsif w =~ /([^s])s$/
    w = $` + $1
  end

  # Step 1b
  if w =~ /eed$/
    w.chop! if $` =~ MGR0
  elsif w =~ /(ed|ing)$/
    stem = $`
    if stem =~ VOWEL_IN_STEM
      w = stem
      case w
      when /(at|bl|iz)$/ then w << "e"
      when /([^aeiouylsz])\1$/ then w.chop!
      when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
      end
    end
  end

  # A final y becomes i when the rest of the word has a vowel.
  if w =~ /y$/
    stem = $`
    w = stem + "i" if stem =~ VOWEL_IN_STEM
  end

  # Step 2
  if w =~ SUFFIX_1_REGEXP
    stem = $`
    suffix = $1
    w = stem + STEP_2_LIST[suffix] if stem =~ MGR0
  end

  # Step 3
  if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
    stem = $`
    suffix = $1
    w = stem + STEP_3_LIST[suffix] if stem =~ MGR0
  end

  # Step 4
  if w =~ SUFFIX_2_REGEXP
    stem = $`
    w = stem if stem =~ MGR1
  elsif w =~ /(s|t)(ion)$/
    stem = $` + $1
    w = stem if stem =~ MGR1
  end

  # Step 5
  if w =~ /e$/
    stem = $`
    if (stem =~ MGR1) ||
       (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
      w = stem
    end
  end
  w.chop! if w =~ /ll$/ && w =~ MGR1

  # and turn initial Y back to y
  w[0] = 'y' if w[0, 1] == 'Y'
  w
end
94
+
95
# Step 2 rewrites: maps a matched suffix to its replacement.
# The keys must stay in sync with SUFFIX_1_REGEXP below.
# 'aliti' and 'biliti' are the Porter step 2 suffixes
# (e.g. formaliti -> formal, sensibiliti -> sensible).
STEP_2_LIST = {
  'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
  'izer'=>'ize', 'bli'=>'ble',
  'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
  'ization'=>'ize', 'ation'=>'ate',
  'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
  'ousness'=>'ous', 'aliti'=>'al',
  'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
}

# Step 3 rewrites: maps a matched suffix to its replacement.
STEP_3_LIST = {
  'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
  'ical'=>'ic', 'ful'=>'', 'ness'=>''
}

# Suffixes handled in step 2; the capture is the key
# looked up in STEP_2_LIST.
SUFFIX_1_REGEXP = /(
  ational |
  tional |
  enci |
  anci |
  izer |
  bli |
  alli |
  entli |
  eli |
  ousli |
  ization |
  ation |
  ator |
  alism |
  iveness |
  fulness |
  ousness |
  aliti |
  iviti |
  biliti |
  logi)$/x

# Suffixes removed in step 4.
SUFFIX_2_REGEXP = /(
  al |
  ance |
  ence |
  er |
  ic |
  able |
  ible |
  ant |
  ement |
  ment |
  ent |
  ou |
  ism |
  ate |
  iti |
  ous |
  ive |
  ize)$/x

C = "[^aeiou]" # consonant
V = "[aeiouy]" # vowel
CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
VV = "#{V}(?>[aeiou]*)" # vowel sequence
MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
159
+
160
+ end
@@ -0,0 +1,24 @@
1
# Stemmer backed by the 'ruby-stemmer' gem, which wraps
# a C version of the Porter stemming algorithm.
#
# Project website: https://github.com/aurelian/ruby-stemmer
# Original paper: Porter, 1980. An algorithm for suffix stripping,
# Program, Vol. 14, no. 3, pp 130-137,
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
module Treat::Inflectors::Stemmers::PorterC

  # Load the 'ruby-stemmer' gem with warnings silenced.
  silence_warnings do
    require 'lingua/stemmer'
  end

  # Remove a conflict between this gem and the 'engtagger' gem:
  # keep the stemmer reachable as ::LinguaStemmer, then drop the
  # top-level Lingua constant.
  ::LinguaStemmer = ::Lingua
  Object.send(:remove_const, :Lingua)

  # Stem the word using a full-blown Porter stemmer in C.
  #
  # word    - anything responding to #to_s.
  # options - accepted but not read by this method.
  def self.stem(word, options = {})
    text = word.to_s
    ::LinguaStemmer.stemmer(text)
  end

end
@@ -0,0 +1,28 @@
1
# Stemmer backed by the UEA algorithm, as implemented
# by the 'uea-stemmer' gem.
#
# "Similar to other stemmers, UEA-Lite operates on a
# set of rules which are used as steps. There are two
# groups of rules: the first to clean the tokens, and
# the second to alter suffixes."
#
# Project website: https://github.com/ealdent/uea-stemmer
# Original paper: Jenkins, Marie-Claire, Smith, Dan,
# Conservative stemming for search and indexing, 2005.
# http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
class Treat::Inflectors::Stemmers::UEA

  # Load the 'uea-stemmer' gem with warnings silenced.
  silence_warnings do
    require 'uea-stemmer'
  end

  # Single stemmer instance, created on first use and then reused.
  @@stemmer = nil

  # Stem the string form of an entity with the UEA stemmer.
  #
  # entity  - anything responding to #to_s.
  # options - accepted but not read by this method.
  #
  # Returns the stemmer's result with surrounding whitespace stripped.
  def self.stem(entity, options = {})
    @@stemmer = ::UEAStemmer.new unless @@stemmer
    stemmed = @@stemmer.stem(entity.to_s)
    stemmed.strip
  end

end
@@ -0,0 +1,308 @@
1
+ # Installer is a dependency manager for languages.
2
+ #
3
+ # It can be called by using Treat.install(language).
4
+ module Treat::Installer
5
+
6
+ # Require the Rubygem dependency installer.
7
+ silence_warnings do
8
+ require 'rubygems/dependency_installer'
9
+ end
10
+ require 'treat/downloader'
11
+ require 'treat/dependencies'
12
+
13
# Package manager command used on each platform.
PackageManagers = {
  mac: 'port',
  linux: 'apt-get',
  windows: 'win-get'
}

# Address of the server with the files.
Server = 'www.louismullie.com'

# Archive file name for each Stanford package variant.
StanfordPackages = {
  minimal: 'stanford-core-nlp-minimal.zip',
  english: 'stanford-core-nlp-english.zip',
  all: 'stanford-core-nlp-all.zip'
}
29
+
30
# Absolute versions of the Treat tmp, bin and models paths,
# as required by cp and mkdir. Resolved once, when this
# constant is evaluated.
Paths = {
  tmp: File.absolute_path(Treat.tmp),
  bin: File.absolute_path(Treat.bin),
  models: File.absolute_path(Treat.models)
}
36
+
37
# Install required dependencies and optional
# dependencies for a specific language.
#
# language - name of the language as a Symbol or String
#            (default :english). The special value :travis
#            runs the non-interactive install_travis instead.
#
# Walks the user through numbered, prompted steps: gems shared
# by all languages, gems for the language, Punkt and Stanford
# model downloads (each only when its gem is already installed),
# then external binaries.
#
# Raises Treat::Exception when a step fails with Errno::EACCES.
def self.install(language = :english)

  # The comparisons below are against symbols, so normalise
  # first; otherwise 'english' or 'travis' passed as a String
  # would silently take the wrong branch.
  language = language.to_s.to_sym

  @@installer = Gem::DependencyInstaller.new

  if language == :travis
    install_travis
    return
  end

  lang_class = Treat::Languages.get(language.to_s)
  l = "#{language.to_s.capitalize} language"

  puts
  puts "Treat Installer, v. #{Treat::VERSION.to_s}\n"
  puts

  begin

    title "Install language-independent gem dependencies."

    case prompt(
      "1 - Install all default language-independent dependencies\n" +
      "2 - Select dependencies to install manually\n" +
      "3 - Skip this step", ['1', '2', '3'])
    when '1' then install_dependencies(false)
    when '2' then install_dependencies(true)
    when '3' then puts 'Skipping this step.'
    end

    title "Install gem dependencies for the #{l}.\n"

    dflt = lang_class::RequiredDependencies
    all = dflt + lang_class::OptionalDependencies
    case prompt("1 - Install default dependencies.\n" +
      "2 - Select dependencies to install manually.\n" +
      "3 - Skip this step.", ['1', '2', '3'])
    when '1' then install_language_dependencies(dflt, false)
    when '2' then install_language_dependencies(all, true)
    when '3' then puts 'Skipping this step.'
    end

    Treat::Downloader.show_progress = true

    # Download the Punkt model only if the gem is installed;
    # find_by_name raises Gem::LoadError otherwise.
    begin
      Gem::Specification.find_by_name('punkt-segmenter')
      title "Downloading model for the Punkt segmenter for the #{l}."
      download_punkt_models(language)
    rescue Gem::LoadError; end

    # Same for the Stanford JARs and models.
    begin
      Gem::Specification.find_by_name('stanford-core-nlp')
      title "Download Stanford Core NLP JARs and " +
            "model files for the #{l}.\n\n"
      package = (language == :english) ? :english : :all
      download_stanford(package)
    rescue Gem::LoadError; end

    title "Install external binary libraries " +
          "(requires port, apt-get or win-get).\n"
    puts "Warning: this may take a long amount of time."

    case prompt("1 - Select binaries to install manually.\n" +
      "2 - Skip this step.", ['1', '2'])
    when '1' then install_binaries
    when '2' then puts 'Skipping this step.'
    end

    puts
    puts "-----\nDone!"

  rescue Errno::EACCES => e

    raise Treat::Exception,
    "Couldn't write to file - permission denied (#{e.message}). " +
    "You may need to run Ruby or Rake on sudo."

  end

end
119
+
120
# Automated, prompt-free install for Travis CI: all gem
# dependencies, the required and optional English gems, the
# minimal Stanford package and the English Punkt model.
def self.install_travis
  english = Treat::Languages::English
  english_gems = english::RequiredDependencies +
                 english::OptionalDependencies

  install_dependencies(false)
  install_language_dependencies(english_gems, false)
  download_stanford(:minimal)
  download_punkt_models(:english)
end
129
+
130
# Install every entry of Treat::Dependencies::Gem.
#
# Each entry is unpacked as (name, version, purpose) and handed
# to install_gem.
#
# optionally - passed through to install_gem; when true the
#              user is asked before each gem.
def self.install_dependencies(optionally)
  Treat::Dependencies::Gem.each do |entry|
    gem_name, version, purpose = *entry
    install_gem(gem_name, version, purpose, optionally)
  end
end
138
+
139
# Install a list of gems by name, with no version or purpose.
#
# dependencies - collection of gem names; a notice is printed
#                when it is empty.
# optionally   - passed through to install_gem; when true the
#                user is asked before each gem.
def self.install_language_dependencies(dependencies, optionally)
  if dependencies.empty?
    puts "No dependencies to install.\n"
  end

  dependencies.each { |gem_name| install_gem(gem_name, nil, nil, optionally) }
end
147
+
148
# Interactively install the external binaries listed in
# Treat::Dependencies::Binary through the platform's package
# manager (see PackageManagers).
#
# When the platform has no known package manager, or the
# manager's command is not available, the binaries that would
# have been installed are listed and nothing is installed.
def self.install_binaries

  puts "Warning: this will require authentification."

  platform = detect_platform
  manager = PackageManagers[platform]

  if !manager
    puts "Cannot find a download manager "+
         "for the #{platform} platform.\n\n"
  elsif `hash #{manager} 2>&1` != ''
    # `hash` prints nothing when the command can be found.
    puts "The '#{manager}' command is required "+
         "to install binaries on #{platform}.\n\n"
    manager = nil
  end

  unless manager
    puts "Skipping installation of the "+
         "following binaries:\n\n"
    # The list lives in Treat::Dependencies; there is no
    # Binaries constant in this module.
    Treat::Dependencies::Binary.each do |binary, purpose|
      puts "- #{binary} to #{purpose}"
    end
    return
  end

  Treat::Dependencies::Binary.each do |binary, purpose|
    if prompt("install #{binary} to " +
      "#{purpose} (y/n)", ['y', 'n']) == 'y'
      `sudo #{manager} install #{binary}`
    end
  end

end
183
+
184
# Download a Stanford Core NLP package and install its contents.
#
# package - key of StanfordPackages (default :minimal).
#
# The archive is downloaded into Treat.tmp and unzipped to
# tmp/stanford. Every *.jar found at the top level is copied to
# bin/stanford, every directory is copied to models/stanford,
# and the unzipped tree is then removed.
def self.download_stanford(package = :minimal)

  archive = StanfordPackages[package]
  loc = Treat::Downloader.download(
    'http', Server, 'treat', archive, Treat.tmp)
  puts "- Unzipping package ..."
  dest = File.join(Treat.tmp, 'stanford')
  unzip_stanford(loc, dest)

  model_dir = File.join(Paths[:models], 'stanford')
  bin_dir = File.join(Paths[:bin], 'stanford')
  origin = File.join(Paths[:tmp], 'stanford')

  # Mac hidden files fix.
  mac_remove = File.join(dest, '__MACOSX')
  FileUtils.rm_rf(mac_remove) if File.readable?(mac_remove)

  unless File.readable?(bin_dir)
    puts "- Creating directory bin/stanford ..."
    FileUtils.mkdir_p(bin_dir)
  end

  unless File.readable?(model_dir)
    puts "- Creating directory models/stanford ..."
    FileUtils.mkdir_p(model_dir)
  end

  puts "- Copying JAR files to bin/stanford " +
       "and model files to models/stanford ..."
  Dir.glob(File.join(origin, '*')) do |path|
    # Decide on the extension only: searching the whole absolute
    # path for 'jar' misfires whenever a parent directory
    # happens to contain those letters.
    if File.extname(path) == '.jar'
      FileUtils.cp(path, File.join(bin_dir, File.basename(path)))
    elsif FileTest.directory?(path)
      FileUtils.cp_r(path, model_dir)
    end
  end

  puts "- Cleaning up..."
  FileUtils.rm_rf(origin)

end
229
+
230
# Download the Punkt segmenter model for a language and copy
# it to models/punkt.
#
# language - language name; "<language>.yaml" is the file
#            requested from the server.
def self.download_punkt_models(language)

  file_name = "#{language}.yaml"
  # Use one directory both for creation and as copy target.
  punkt_dir = File.join(Paths[:models], 'punkt')

  loc = Treat::Downloader.download(
    'http', Server, 'treat/punkt', file_name, Treat.tmp)

  unless File.readable?(punkt_dir)
    puts "- Creating directory models/punkt ..."
    FileUtils.mkdir_p(punkt_dir)
  end

  puts "- Copying model file to models/punkt ..."
  FileUtils.cp(loc, File.join(punkt_dir, file_name))

  puts "- Cleaning up..."
  # Paths[:tmp] comes from File.absolute_path and so has no
  # trailing separator; join instead of concatenating.
  # NOTE(review): assumes the downloader stores files under
  # <tmp>/<server> -- confirm against Treat::Downloader.
  FileUtils.rm_rf(File.join(Paths[:tmp], Server))

end
250
+
251
# NOTE(review): a bare `private` only applies to instance
# methods defined after it; the `def self.` methods below
# remain publicly callable.
private

# Number given to the next title printed by `title`.
@@n = 1

# Print a numbered title surrounded by blank lines, then
# advance the counter.
def self.title(string)
  puts "", "#{@@n}. #{string}", ""
  @@n += 1
end
262
+
263
# Install a gem, optionally asking the user first.
#
# dependency - name of the gem.
# version    - version handed to the installer (default nil).
# purpose    - short text appended to the prompt or progress
#              line as " to <purpose>" (default nil).
# optionally - when true, the user is asked (y/n) and the gem
#              is installed only on 'y'; when false it is
#              installed right away.
#
# Errors are reported on screen and then re-raised, so callers
# (for instance the Errno::EACCES handling in install) still
# see them.
def self.install_gem(dependency, version = nil,
                     purpose = nil, optionally = false)

  reason = purpose ? " to #{purpose}" : ''

  if optionally
    wanted = prompt("install #{dependency}#{reason}",
                    ['y', 'n']) == 'y'
  else
    puts "\n- Installing #{dependency}#{reason}."
    wanted = true
  end
  return unless wanted

  begin
    silence_warnings do
      @@installer.install(dependency, version)
    end
  rescue StandardError => error
    # Report first, then propagate: with the bare `raise` placed
    # ahead of it, this message could never be printed.
    puts "Couldn't install gem '#{dependency}' " +
         "(#{error.message})."
    raise
  end

end
291
+
292
# Unzip a file to the destination path.
#
# file        - path of the zip archive.
# destination - directory the entries are extracted into.
#
# Entries that already exist on disk are left alone. Entries
# whose name would resolve outside of destination (for example
# through "../") are skipped: the archive is fetched over plain
# HTTP, so its entry names are not trusted.
def self.unzip_stanford(file, destination)

  # Loaded here so the zip library is only needed when an
  # archive is actually unpacked.
  require 'zip/zip'
  root = File.absolute_path(destination)

  Zip::ZipFile.open(file) do |zip_file|
    zip_file.each do |entry|
      target = File.join(destination, entry.name)
      resolved = File.absolute_path(target)
      inside = resolved == root ||
               resolved.start_with?(root + File::SEPARATOR)
      unless inside
        puts "- Skipping unsafe archive entry '#{entry.name}'."
        next
      end
      FileUtils.mkdir_p(File.dirname(resolved))
      zip_file.extract(entry, target) unless File.exist?(target)
    end
  end

end
307
+
308
+ end