treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -1,42 +0,0 @@
1
- module Treat
2
- module Inflectors
3
- module Declensions
4
- # This class is a wrapper for the functions included
5
- # in the 'linguistics' gem that allow to obtain the
6
- # declensions of a word.
7
- #
8
- # Project website: http://deveiate.org/projects/Linguistics/
9
- class Linguistics
10
- require 'treat/helpers/linguistics_loader'
11
- # Retrieve a declension of a word using the 'linguistics' gem.
12
- #
13
- # Options:
14
- #
15
- # - (Identifier) :count => :singular, :plural
16
- def self.declensions(entity, options = {})
17
- unless options[:count]
18
- raise Treat::Exception,
19
- "Must supply option count (:singular or :plural)."
20
- end
21
- klass = Treat::Helpers::LinguisticsLoader.load(entity.language)
22
- string = entity.to_s
23
- if entity.category == :verb
24
- raise Treat::Exception,
25
- "Cannot retrieve the declensions of a verb. " +
26
- "Use #singular_verb and #plural_verb instead."
27
- end
28
- if options[:count] == :plural
29
- if entity.has?(:category) &&
30
- [:noun, :adjective, :verb].include?(entity.category)
31
- silence_warnings do
32
- klass.send(:"plural_#{entity.category}", string)
33
- end
34
- else
35
- silence_warnings { klass.plural(string) }
36
- end
37
- end
38
- end
39
- end
40
- end
41
- end
42
- end
@@ -1,20 +0,0 @@
1
- module Treat
2
- module Inflectors
3
- module OrdinalWords
4
- # This class is a wrapper for the functions included
5
- # in the 'linguistics' gem that allow to describe a
6
- # number in words in ordinal form.
7
- #
8
- # Project website: http://deveiate.org/projects/Linguistics/
9
- class Linguistics
10
- require 'treat/helpers/linguistics_loader'
11
- # Desribe a number in words in ordinal form, using the
12
- # 'linguistics' gem.
13
- def self.ordinal_words(number, options = {})
14
- klass = Treat::Helpers::LinguisticsLoader.load(number.language)
15
- klass.ordinate(number.to_s)
16
- end
17
- end
18
- end
19
- end
20
- end
@@ -1,162 +0,0 @@
1
- module Treat
2
- module Inflectors
3
- module Stem
4
- # Stem a word using a native Ruby implementation of the
5
- # Porter stemming algorithm, ported to Ruby from a
6
- # version coded up in Perl. This is a simplified
7
- # implementation; for a true and fast Porter stemmer,
8
- # see Treat::Inflectors::Stem::PorterC.
9
- #
10
- # Authored by Ray Pereda (raypereda@hotmail.com).
11
- # Unknown license.
12
- #
13
- # Original paper: Porter, 1980. An algorithm for suffix stripping,
14
- # Program, Vol. 14, no. 3, pp 130-137,
15
- # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
16
- class Porter
17
- # Returns the stem of a word using a native Porter stemmer.
18
- #
19
- # Options: none.
20
- def self.stem(word, options = {})
21
- # Copy the word and convert it to a string.
22
- w = word.to_s
23
- return w if w.length < 3
24
- # Map initial y to Y so that the patterns
25
- # never treat it as vowel.
26
- w[0] = 'Y' if w[0] == ?y
27
- # Step 1a
28
- if w =~ /(ss|i)es$/
29
- w = $` + $1
30
- elsif w =~ /([^s])s$/
31
- w = $` + $1
32
- end
33
- # Step 1b
34
- if w =~ /eed$/
35
- w.chop! if $` =~ MGR0
36
- elsif w =~ /(ed|ing)$/
37
- stem = $`
38
- if stem =~ VOWEL_IN_STEM
39
- w = stem
40
- case w
41
- when /(at|bl|iz)$/ then w << "e"
42
- when /([^aeiouylsz])\1$/ then w.chop!
43
- when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
44
- end
45
- end
46
- end
47
- if w =~ /y$/
48
- stem = $`
49
- w = stem + "i" if stem =~ VOWEL_IN_STEM
50
- end
51
- # Step 2
52
- if w =~ SUFFIX_1_REGEXP
53
- stem = $`
54
- suffix = $1
55
- if stem =~ MGR0
56
- w = stem + STEP_2_LIST[suffix]
57
- end
58
- end
59
- # Step 3
60
- if w =~
61
- /(icate|ative|alize|iciti|ical|ful|ness)$/
62
- stem = $`
63
- suffix = $1
64
- if stem =~ MGR0
65
- w = stem + STEP_3_LIST[suffix]
66
- end
67
- end
68
- # Step 4
69
- if w =~ SUFFIX_2_REGEXP
70
- stem = $`
71
- if stem =~ MGR1
72
- w = stem
73
- end
74
- elsif w =~ /(s|t)(ion)$/
75
- stem = $` + $1
76
- if stem =~ MGR1
77
- w = stem
78
- end
79
- end
80
- # Step 5
81
- if w =~ /e$/
82
- stem = $`
83
- if (stem =~ MGR1) ||
84
- (stem =~ MEQ1 && stem !~
85
- /^#{CC}#{V}[^aeiouwxy]$/o)
86
- w = stem
87
- end
88
- end
89
- if w =~ /ll$/ && w =~ MGR1
90
- w.chop!
91
- end
92
- # and turn initial Y back to y
93
- w[0] = 'y' if w[0] == ?Y
94
- w
95
- end
96
-
97
- STEP_2_LIST = {
98
- 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
99
- 'izer'=>'ize', 'bli'=>'ble',
100
- 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
101
- 'ization'=>'ize', 'ation'=>'ate',
102
- 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
103
- 'ousness'=>'ous', 'anati'=>'al',
104
- 'iviti'=>'ive', 'binati'=>'ble', 'logi'=>'log'
105
- }
106
- STEP_3_LIST = {
107
- 'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
108
- 'ical'=>'ic', 'ful'=>'', 'ness'=>''
109
- }
110
- SUFFIX_1_REGEXP = /(
111
- ational |
112
- tional |
113
- enci |
114
- anci |
115
- izer |
116
- bli |
117
- alli |
118
- entli |
119
- eli |
120
- ousli |
121
- ization |
122
- ation |
123
- ator |
124
- alism |
125
- iveness |
126
- fulness |
127
- ousness |
128
- anati |
129
- iviti |
130
- binati |
131
- logi)$/x
132
- SUFFIX_2_REGEXP = /(
133
- al |
134
- ance |
135
- ence |
136
- er |
137
- ic |
138
- able |
139
- ible |
140
- ant |
141
- ement |
142
- ment |
143
- ent |
144
- ou |
145
- ism |
146
- ate |
147
- iti |
148
- ous |
149
- ive |
150
- ize)$/x
151
- C = "[^aeiou]" # consonant
152
- V = "[aeiouy]" # vowel
153
- CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
154
- VV = "#{V}(?>[aeiou]*)" # vowel sequence
155
- MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
156
- MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
157
- MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
158
- VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
159
- end
160
- end
161
- end
162
- end
@@ -1,26 +0,0 @@
1
- module Treat
2
- module Inflectors
3
- module Stem
4
- # Stems words using the 'ruby-stemmer' gem, which
5
- # wraps a C version of the Porter stemming algorithm.
6
- #
7
- # Project website: https://github.com/aurelian/ruby-stemmer
8
- # Original paper: Porter, 1980. An algorithm for suffix stripping,
9
- # Program, Vol. 14, no. 3, pp 130-137,
10
- # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
11
- class PorterC
12
- # Require the 'ruby-stemmer' gem.
13
- silence_warnings { require 'lingua/stemmer' }
14
- # Remove a conflict between this gem and the 'engtagger' gem.
15
- ::LinguaStemmer = ::Lingua
16
- Object.instance_eval { remove_const :Lingua }
17
- # Stem the word using a full-blown Porter stemmer in C.
18
- #
19
- # Options: none.
20
- def self.stem(word, options = {})
21
- silence_warnings { ::LinguaStemmer.stemmer(word.to_s) }
22
- end
23
- end
24
- end
25
- end
26
- end
@@ -1,30 +0,0 @@
1
- module Treat
2
- module Inflectors
3
- module Stem
4
- # Stems a word using the UEA algorithm, implemented
5
- # by the 'uea-stemmer' gem.
6
- #
7
- # "Similar to other stemmers, UEA-Lite operates on a
8
- # set of rules which are used as steps. There are two
9
- # groups of rules: the first to clean the tokens, and
10
- # the second to alter suffixes."
11
- #
12
- # Project website: https://github.com/ealdent/uea-stemmer
13
- # Original paper: Jenkins, Marie-Claire, Smith, Dan,
14
- # Conservative stemming for search and indexing, 2005.
15
- # http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
16
- class UEA
17
- # Require the 'uea-stemmer' gem.
18
- silence_warnings { require 'uea-stemmer' }
19
- # Keep only one copy of the stemmer.
20
- @@stemmer = nil
21
- # Stems a word using the UEA algorithm, implemented
22
- # by the 'uea-stemmer' gem.
23
- def self.stem(entity, options = {})
24
- @@stemmer ||= silence_warnings { ::UEAStemmer.new }
25
- @@stemmer.stem(entity.to_s).strip
26
- end
27
- end
28
- end
29
- end
30
- end
data/lib/treat/install.rb DELETED
@@ -1,59 +0,0 @@
1
- module Treat
2
- class Installer
3
- require 'rubygems/dependency_installer'
4
- # Install required dependencies and optional dependencies
5
- # for a specific language.
6
- def self.install(language = :english)
7
-
8
- lang = Treat::Languages.get(language)
9
- required = lang::RequiredDependencies
10
- optional = lang::OptionalDependencies
11
-
12
- puts "Treat Installer\n\n"
13
- puts "Installing dependencies for the #{language.to_s.capitalize} language.\n\n"
14
-
15
- flag = false
16
-
17
- inst = Gem::DependencyInstaller.new
18
-
19
- required.each do |dependency|
20
- puts "Installing required dependency '#{dependency}'..."
21
- begin
22
- silence_warnings { inst.install(dependency) }
23
- rescue
24
- flag = true
25
- puts "Couldn't install '#{dependency}'. " +
26
- "You need install this dependency manually by running: " +
27
- "'gem install #{dependency}' or use 'sudo' to run this script."
28
- end
29
- end
30
-
31
- optional.each do |dependency|
32
- begin
33
- puts "Install optional dependency '#{dependency}' (yes/no, <enter> = skip) ?"
34
- answer = gets.strip
35
- raise Treat::Exception unless ['yes', 'no', ''].include?(answer)
36
- if answer == 'yes'
37
- silence_warnings { inst.install(dependency) }
38
- else
39
- puts "Skipped installing '#{dependency}'."
40
- next
41
- end
42
- rescue Treat::Exception
43
- puts "Invalid input - valid options are 'yes' or 'no'."
44
- retry
45
- rescue
46
- flag = true
47
- puts "Couldn't install '#{dependency}'. " +
48
- "You can install this dependency manually by running: " +
49
- "'gem install #{dependency}' or use 'sudo' to run this script."
50
- end
51
- end
52
-
53
- w = flag ? 'incompletely' : 'normally'
54
- puts "\nInstall proceeded #{w}."
55
- puts
56
-
57
- end
58
- end
59
- end
@@ -1,377 +0,0 @@
1
- module Treat
2
- module Languages
3
-
4
- module Tags
5
- ClawsC5 = 0
6
- Brown = 1
7
- Penn = 2
8
- Negra = 3
9
- PennChinese = 4
10
- Simple = 5
11
-
12
- PTBClauseTagDescription = [
13
- ['S', 'Simple declarative clause'],
14
- ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
15
- ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
16
- ['SINV', 'Inverted declarative sentence'],
17
- ['SQ', 'Inverted yes/no question']
18
- ]
19
-
20
- AlignedPhraseTags =
21
- [
22
- 'Adjective phrase', ['', '', 'ADJP'],
23
- 'Adverb phrase', ['', '', 'ADVP'],
24
- 'Conjunction phrase', ['', '', 'CONJP'],
25
- 'Fragment', ['', '', 'FRAG'],
26
- 'Interjection', ['', '', 'INTJ'],
27
- 'List marker', ['', '', 'LST'],
28
- 'Not a phrase', ['', '', 'NAC'],
29
- 'Noun phrase', ['', '', 'NP'],
30
- 'Head of NP', ['', '', 'NX'],
31
- 'Prepositional phrase', ['', '', 'PP'],
32
- 'Parenthetical', ['', '', 'PRN'],
33
- 'Particle', ['', '', 'PRT'],
34
- 'Quantifier phrase', ['', '', 'QP'],
35
- 'Reduced relative clause', ['', '', 'RRC'],
36
- 'Unlike coordinated phrase', ['', '', 'UCP'],
37
- 'Verb phrase', ['', '', 'VP'],
38
- 'Wh adjective phrase', ['', '', 'WHADJP'],
39
- 'Wh adverb phrase', ['', '', 'WHAVP'],
40
- 'Wh noun phrase', ['', '', 'WHNP'],
41
- 'Wh prepositional phrase', ['', '', 'WHPP'],
42
- 'Unknown', ['', '', 'X'],
43
- 'Phrase', ['', '', 'P'],
44
- 'Sentence', ['', '', 'S'],
45
- 'Phrase', ['', '', 'SBAR'] # Fix
46
- ]
47
-
48
- # A description of Enju categories.
49
- EnjuCatDescription = [
50
- ['ADJ', 'Adjective'],
51
- ['ADV', 'Adverb'],
52
- ['CONJ', 'Coordination conjunction'],
53
- ['C', 'Complementizer'],
54
- ['D', 'Determiner'],
55
- ['N', 'Noun'],
56
- ['P', 'Preposition'],
57
- ['SC', 'Subordination conjunction'],
58
- ['V', 'Verb'],
59
- ['COOD', 'Part of coordination'],
60
- ['PN', 'Punctuation'],
61
- ['PRT', 'Particle'],
62
- ['S', 'Sentence']
63
- ]
64
-
65
- # Maps Enju categories to Treat categories.
66
- EnjuCatToCategory = {
67
- 'ADJ' => :adjective,
68
- 'ADV' => :adverb,
69
- 'CONJ' => :conjunction,
70
- 'COOD' => :conjunction,
71
- 'C' => :complementizer,
72
- 'D' => :determiner,
73
- 'N' => :noun,
74
- 'P' => :preposition,
75
- 'PN' => :punctuation,
76
- 'SC' => :conjunction,
77
- 'V' => :verb,
78
- 'PRT' => :particle
79
- }
80
-
81
- # Description of the xcat in the Enju output specification.
82
- EnjuXCatDescription = [
83
- ['COOD', 'Coordinated phrase/clause'],
84
- ['IMP', 'Imperative sentence'],
85
- ['INV', 'Subject-verb inversion'],
86
- ['Q', 'Interrogative sentence with subject-verb inversion'],
87
- ['REL', 'A relativizer included'],
88
- ['FREL', 'A free relative included'],
89
- ['TRACE', 'A trace included'],
90
- ['WH', 'A wh-question word included']
91
- ]
92
-
93
- EnjuCatXcatToPTB = [
94
- ['ADJP', '', 'ADJP'],
95
- ['ADJP', 'REL', 'WHADJP'],
96
- ['ADJP', 'FREL', 'WHADJP'],
97
- ['ADJP', 'WH', 'WHADJP'],
98
- ['ADVP', '', 'ADVP'],
99
- ['ADVP', 'REL', 'WHADVP'],
100
- ['ADVP', 'FREL', 'WHADVP'],
101
- ['ADVP', 'WH', 'WHADVP'],
102
- ['CONJP', '', 'CONJP'],
103
- ['CP', '', 'SBAR'],
104
- ['DP', '', 'NP'],
105
- ['NP', '', 'NP'],
106
- ['NX', 'NX', 'NAC'],
107
- ['NP' 'REL' 'WHNP'],
108
- ['NP' 'FREL' 'WHNP'],
109
- ['NP' 'WH' 'WHNP'],
110
- ['PP', '', 'PP'],
111
- ['PP', 'REL', 'WHPP'],
112
- ['PP', 'WH', 'WHPP'],
113
- ['PRT', '', 'PRT'],
114
- ['S', '', 'S'],
115
- ['S', 'INV', 'SINV'],
116
- ['S', 'Q', 'SQ'],
117
- ['S', 'REL', 'SBAR'],
118
- ['S', 'FREL', 'SBAR'],
119
- ['S', 'WH', 'SBARQ'],
120
- ['SCP', '', 'SBAR'],
121
- ['VP', '', 'VP'],
122
- ['VP', '', 'VP'],
123
- ['', '', 'UK']
124
- ]
125
-
126
- # Aligned tags for the Claws C5, Brown and Penn tag sets.
127
- # Adapted from Manning, Christopher and Schütze, Hinrich,
128
- # 1999. Foundations of Statistical Natural Language
129
- # Processing. MIT Press, p. 141-142;
130
- # http://www.isocat.org/rest/dcs/376;
131
- #
132
- # JRS?
133
-
134
-
135
- SimpleWordTagToCategory = {
136
- 'C' => :complementizer,
137
- 'PN' => :punctuation,
138
- 'SC' => :conjunction
139
- }
140
-
141
- AlignedWordTags = [
142
-
143
- 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
144
- 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
145
- 'Ajective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
146
- 'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
147
- 'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
148
- 'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
149
- 'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
150
- 'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
151
- 'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
152
- 'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
153
-
154
- 'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
155
- 'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
156
- 'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
157
- 'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
158
- 'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
159
- 'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
160
- 'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', 'ADV'],
161
- 'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
162
- 'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
163
- 'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
164
- 'Adverb, pronominal', ['', '', '', '', 'PROP', '', 'ADV'],
165
-
166
- 'Clitic', ['', '', 'POS', '', '', ''],
167
-
168
- 'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
169
- 'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
170
- 'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
171
- 'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
172
- 'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],
173
-
174
- 'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
175
- 'Determiner', ['DT0', 'DT', 'DET', '', 'DT', 'D'],
176
- 'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
177
- 'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
178
- 'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
179
- 'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
180
- 'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
181
- 'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
182
- 'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
183
- 'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
184
- 'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
185
- 'Determiner, possessive, second', ['DPS', 'PPSS', 'PRPS', '', '', 'D'],
186
- 'Determiner, possessive, second', ['DPS', 'PP$$', 'PRP', '', '', 'D'],
187
- 'Determiner, possessive, second', ['DPS', 'PPSS', 'PRP', '', '', 'D'],
188
- 'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
189
- 'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
190
- 'Determiner, possessive & question', ['DTQ', 'WPS', 'WPS', '', '', 'D'],
191
-
192
- 'Localizer', ['', '', '', '', 'LC'],
193
-
194
- 'Measure word', ['', '', '', '', 'M'],
195
-
196
- 'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
197
- 'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
198
- 'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
199
- 'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
200
- 'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
201
- 'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
202
- 'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', 'N'],
203
- 'Noun, temporal', ['', '', '', '', 'NT', 'N'],
204
- 'Noun, verbal', ['', '', '', '', 'NN', 'N'],
205
-
206
- 'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
207
- 'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
208
- 'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
209
- 'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
210
- 'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
211
- 'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
212
- 'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
213
- 'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
214
- 'Pronoun, existential there', ['EX0', 'EX', 'EX'],
215
- 'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
216
- 'Prounoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
217
- 'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
218
- 'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
219
- 'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
220
- 'Prounoun, substituting indefinite', ['', '', '', 'PIS'],
221
- 'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
222
- 'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
223
- 'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
224
- 'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],
225
-
226
- 'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
227
- 'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
228
- 'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
229
- 'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
230
- 'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
231
- 'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
232
- 'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
233
- 'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
234
- 'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
235
- 'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
236
- 'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
237
- 'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
238
- 'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
239
- 'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
240
- 'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
241
- 'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
242
- 'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
243
- 'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
244
- 'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
245
- 'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
246
- 'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
247
- 'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
248
- 'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
249
- 'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
250
- 'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
251
- 'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
252
- 'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
253
- 'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
254
- 'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
255
- 'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
256
- 'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
257
- 'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
258
- 'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
259
- 'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
260
- 'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
261
- 'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
262
- 'Verb, modal, infinite', ['', '', '', '', 'VMINF', 'V'],
263
- 'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],
264
-
265
- 'Particle', ['', '', '', '', '', 'PRT'],
266
- 'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
267
- 'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
268
- 'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
269
- 'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
270
- 'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],
271
-
272
- 'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
273
- 'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
274
- 'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
275
- 'Preposition, with aritcle', ['', '', '', 'APPART', '', 'P'],
276
- 'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],
277
-
278
- 'Possessive', ['POS', '$', 'POS'],
279
-
280
- 'Postposition', ['', '', '', 'APPO'],
281
-
282
- 'Circumposition, right', ['', '', '', 'APZR', ''],
283
-
284
- 'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],
285
-
286
- 'Onomatopoeia', ['', '', '', '', 'ON'],
287
-
288
- 'Punctuation', ['', '', '', '', 'PU', 'PN'],
289
- 'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
290
- 'Punctuation, sentence ender', ['PUN', '.', 'PP', '$.', '', 'PN'],
291
- 'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
292
- 'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
293
- 'Punctuationm, comma', ['PUN', ',', ',', '$,'],
294
- 'Punctuation, dash', ['PUN', '-', '-'],
295
- 'Punctuation, dollar sign', ['PUN', '', '$'],
296
- 'Punctuation, left bracket', ['PUL', '(', '(', '$('],
297
- 'Punctuation, right bracket', ['PUR', ')', ')'],
298
- 'Punctuation, quotation mark, left', ['PUQ', '', '``'],
299
- 'Punctuation, quotation mark, right', ['PUQ', '', '"'],
300
-
301
- 'Word, truncated, left', ['', '', '', 'TRUNC'],
302
-
303
- 'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],
304
-
305
- 'Symbol', ['', '', 'SYM', 'XY'],
306
- 'Symbol, alphabetical', ['ZZ0', '', ''],
307
- 'Symbol, list item', ['', '', 'LS'],
308
-
309
- # Not sure about these tags from the Chinese PTB.
310
- 'Aspect marker', ['', '', '', '', 'AS'], # ?
311
- 'Ba-construction', ['', '', '', '', 'BA'], # ?
312
- 'In relative', ['', '', '', '', 'DEC'], # ?
313
- 'Associative', ['', '', '', '', 'DER'], # ?
314
- 'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
315
- 'For words ? ', ['', '', '', '', 'ETC'], # ?
316
- 'In long bei-construct', ['', '', '', '', 'LB'], # ?
317
- 'In short bei-construct', ['', '', '', '', 'SB'], # ?
318
- 'Sentence-nal particle', ['', '', '', '', 'SB'], # ?
319
- 'Particle, other', ['', '', '', '', 'MSP'], # ?
320
- 'Before VP', ['', '', '', '', 'DEV'], # ?
321
- 'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
322
- 'Verb, ????', ['', '', '', '', 'VC'] # ?
323
- ]
324
-
325
# Index of word tag => { tag set key => word category }.
word_tag_index = {}

# One row per column of the aligned tag table:
# [column index constant, tag set key, skip assignment when the slot is nil].
tag_columns = [
  [ClawsC5,     :claws_5,      false],
  [Brown,       :brown,        false],
  [Penn,        :penn,         false],
  [Negra,       :negra,        true],
  [PennChinese, :penn_chinese, true],
  [Simple,      :simple,       true]
]

Treat::Languages::Tags::AlignedWordTags.each_slice(2) do |description, aligned|
  # The category is the first word of the description, e.g.
  # 'Noun, common' yields :noun.
  word_category = description.gsub(',', ' ,').split(' ').first.downcase.intern

  # First pass: make sure every column's tag has an entry.
  tag_columns.each do |column, _set_key, _skip_nil|
    word_tag_index[aligned[column]] ||= {}
  end

  # Second pass: record the category under each tag set key.
  tag_columns.each do |column, set_key, skip_nil|
    tag = aligned[column]
    next if skip_nil && !tag
    word_tag_index[tag][set_key] = word_category
  end
end

# A hash converting word tags to word categories.
WordTagToCategory = word_tag_index
349
-
350
# Index of phrase tag => { tag set key => phrase category }.
phrase_tag_index = {}

Treat::Languages::Tags::AlignedPhraseTags.each_slice(2) do |description, aligned|
  # The category is the whole description, with spaces (and the space
  # inserted before each comma) turned into underscores, downcased.
  phrase_category = description.gsub(',', ' ,').gsub(' ', '_').downcase.intern

  # Only the Penn column is indexed; the other tag sets are not
  # recorded for phrases yet.
  (phrase_tag_index[aligned[Penn]] ||= {})[:penn] = phrase_category
end

# A hash converting phrase tags to phrase categories.
PhraseTagToCategory = phrase_tag_index
363
-
364
# Tells whether +tag+ is a known phrase tag within the tag set +tag_set+.
#
# tag     - the phrase tag to look up (a key of PhraseTagToCategory).
# tag_set - the tag set key recorded under that tag (e.g. :penn).
#
# Returns true when PhraseTagToCategory has an entry for +tag+ that
# contains +tag_set+, and false otherwise.
def self.has_phrase_tag?(tag, tag_set)
  # The table is keyed by tag first and by tag set second, so the tag
  # set has to be looked up inside the tag's entry, not at the top level.
  sets = PhraseTagToCategory[tag]
  sets ? sets.key?(tag_set) : false
end
368
-
369
# Tells whether +tag+ is a known word tag within the tag set +tag_set+.
#
# tag     - the word tag to look up (a key of WordTagToCategory).
# tag_set - the tag set key recorded under that tag
#           (e.g. :claws_5, :brown or :penn).
#
# Returns true when WordTagToCategory has an entry for +tag+ that
# contains +tag_set+, and false otherwise.
def self.has_word_tag?(tag, tag_set)
  # The table is keyed by tag first and by tag set second, so the tag
  # set has to be looked up inside the tag's entry, not at the top level.
  sets = WordTagToCategory[tag]
  sets ? sets.key?(tag_set) : false
end
373
-
374
-
375
- end
376
- end
377
- end