treat 0.2.5 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -1,42 +0,0 @@
1
- module Treat
2
- module Inflectors
3
- module Declensions
4
- # This class is a wrapper for the functions included
5
- # in the 'linguistics' gem that allow obtaining the
6
- # declensions of a word.
7
- #
8
- # Project website: http://deveiate.org/projects/Linguistics/
9
- class Linguistics
10
- require 'treat/helpers/linguistics_loader'
11
- # Retrieve a declension of a word using the 'linguistics' gem.
12
- #
13
- # Options:
14
- #
15
- # - (Identifier) :count => :singular, :plural
16
- def self.declensions(entity, options = {})
17
- unless options[:count]
18
- raise Treat::Exception,
19
- "Must supply option count (:singular or :plural)."
20
- end
21
- klass = Treat::Helpers::LinguisticsLoader.load(entity.language)
22
- string = entity.to_s
23
- if entity.category == :verb
24
- raise Treat::Exception,
25
- "Cannot retrieve the declensions of a verb. " +
26
- "Use #singular_verb and #plural_verb instead."
27
- end
28
- if options[:count] == :plural
29
- if entity.has?(:category) &&
30
- [:noun, :adjective, :verb].include?(entity.category)
31
- silence_warnings do
32
- klass.send(:"plural_#{entity.category}", string)
33
- end
34
- else
35
- silence_warnings { klass.plural(string) }
36
- end
37
- end
38
- end
39
- end
40
- end
41
- end
42
- end
@@ -1,20 +0,0 @@
1
- module Treat
2
- module Inflectors
3
- module OrdinalWords
4
- # This class is a wrapper for the functions included
5
- # in the 'linguistics' gem that allow describing a
6
- # number in words in ordinal form.
7
- #
8
- # Project website: http://deveiate.org/projects/Linguistics/
9
- class Linguistics
10
- require 'treat/helpers/linguistics_loader'
11
- # Describe a number in words in ordinal form, using the
12
- # 'linguistics' gem.
13
- def self.ordinal_words(number, options = {})
14
- klass = Treat::Helpers::LinguisticsLoader.load(number.language)
15
- klass.ordinate(number.to_s)
16
- end
17
- end
18
- end
19
- end
20
- end
@@ -1,162 +0,0 @@
1
- module Treat
2
- module Inflectors
3
- module Stem
4
- # Stem a word using a native Ruby implementation of the
5
- # Porter stemming algorithm, ported to Ruby from a
6
- # version coded up in Perl. This is a simplified
7
- # implementation; for a true and fast Porter stemmer,
8
- # see Treat::Inflectors::Stem::PorterC.
9
- #
10
- # Authored by Ray Pereda (raypereda@hotmail.com).
11
- # Unknown license.
12
- #
13
- # Original paper: Porter, 1980. An algorithm for suffix stripping,
14
- # Program, Vol. 14, no. 3, pp 130-137,
15
- # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
16
- class Porter
17
- # Returns the stem of a word using a native Porter stemmer.
18
- #
19
- # Options: none.
20
- def self.stem(word, options = {})
21
- # Copy the word and convert it to a string.
22
- w = word.to_s
23
- return w if w.length < 3
24
- # Map initial y to Y so that the patterns
25
- # never treat it as vowel.
26
- w[0] = 'Y' if w[0] == ?y
27
- # Step 1a
28
- if w =~ /(ss|i)es$/
29
- w = $` + $1
30
- elsif w =~ /([^s])s$/
31
- w = $` + $1
32
- end
33
- # Step 1b
34
- if w =~ /eed$/
35
- w.chop! if $` =~ MGR0
36
- elsif w =~ /(ed|ing)$/
37
- stem = $`
38
- if stem =~ VOWEL_IN_STEM
39
- w = stem
40
- case w
41
- when /(at|bl|iz)$/ then w << "e"
42
- when /([^aeiouylsz])\1$/ then w.chop!
43
- when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
44
- end
45
- end
46
- end
47
- if w =~ /y$/
48
- stem = $`
49
- w = stem + "i" if stem =~ VOWEL_IN_STEM
50
- end
51
- # Step 2
52
- if w =~ SUFFIX_1_REGEXP
53
- stem = $`
54
- suffix = $1
55
- if stem =~ MGR0
56
- w = stem + STEP_2_LIST[suffix]
57
- end
58
- end
59
- # Step 3
60
- if w =~
61
- /(icate|ative|alize|iciti|ical|ful|ness)$/
62
- stem = $`
63
- suffix = $1
64
- if stem =~ MGR0
65
- w = stem + STEP_3_LIST[suffix]
66
- end
67
- end
68
- # Step 4
69
- if w =~ SUFFIX_2_REGEXP
70
- stem = $`
71
- if stem =~ MGR1
72
- w = stem
73
- end
74
- elsif w =~ /(s|t)(ion)$/
75
- stem = $` + $1
76
- if stem =~ MGR1
77
- w = stem
78
- end
79
- end
80
- # Step 5
81
- if w =~ /e$/
82
- stem = $`
83
- if (stem =~ MGR1) ||
84
- (stem =~ MEQ1 && stem !~
85
- /^#{CC}#{V}[^aeiouwxy]$/o)
86
- w = stem
87
- end
88
- end
89
- if w =~ /ll$/ && w =~ MGR1
90
- w.chop!
91
- end
92
- # and turn initial Y back to y
93
- w[0] = 'y' if w[0] == ?Y
94
- w
95
- end
96
-
97
- STEP_2_LIST = {
98
- 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
99
- 'izer'=>'ize', 'bli'=>'ble',
100
- 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
101
- 'ization'=>'ize', 'ation'=>'ate',
102
- 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
103
- 'ousness'=>'ous', 'anati'=>'al',
104
- 'iviti'=>'ive', 'binati'=>'ble', 'logi'=>'log'
105
- }
106
- STEP_3_LIST = {
107
- 'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
108
- 'ical'=>'ic', 'ful'=>'', 'ness'=>''
109
- }
110
- SUFFIX_1_REGEXP = /(
111
- ational |
112
- tional |
113
- enci |
114
- anci |
115
- izer |
116
- bli |
117
- alli |
118
- entli |
119
- eli |
120
- ousli |
121
- ization |
122
- ation |
123
- ator |
124
- alism |
125
- iveness |
126
- fulness |
127
- ousness |
128
- anati |
129
- iviti |
130
- binati |
131
- logi)$/x
132
- SUFFIX_2_REGEXP = /(
133
- al |
134
- ance |
135
- ence |
136
- er |
137
- ic |
138
- able |
139
- ible |
140
- ant |
141
- ement |
142
- ment |
143
- ent |
144
- ou |
145
- ism |
146
- ate |
147
- iti |
148
- ous |
149
- ive |
150
- ize)$/x
151
- C = "[^aeiou]" # consonant
152
- V = "[aeiouy]" # vowel
153
- CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
154
- VV = "#{V}(?>[aeiou]*)" # vowel sequence
155
- MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
156
- MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
157
- MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
158
- VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
159
- end
160
- end
161
- end
162
- end
@@ -1,26 +0,0 @@
1
- module Treat
2
- module Inflectors
3
- module Stem
4
- # Stems words using the 'ruby-stemmer' gem, which
5
- # wraps a C version of the Porter stemming algorithm.
6
- #
7
- # Project website: https://github.com/aurelian/ruby-stemmer
8
- # Original paper: Porter, 1980. An algorithm for suffix stripping,
9
- # Program, Vol. 14, no. 3, pp 130-137,
10
- # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
11
- class PorterC
12
- # Require the 'ruby-stemmer' gem.
13
- silence_warnings { require 'lingua/stemmer' }
14
- # Remove a conflict between this gem and the 'engtagger' gem.
15
- ::LinguaStemmer = ::Lingua
16
- Object.instance_eval { remove_const :Lingua }
17
- # Stem the word using a full-blown Porter stemmer in C.
18
- #
19
- # Options: none.
20
- def self.stem(word, options = {})
21
- silence_warnings { ::LinguaStemmer.stemmer(word.to_s) }
22
- end
23
- end
24
- end
25
- end
26
- end
@@ -1,30 +0,0 @@
1
- module Treat
2
- module Inflectors
3
- module Stem
4
- # Stems a word using the UEA algorithm, implemented
5
- # by the 'uea-stemmer' gem.
6
- #
7
- # "Similar to other stemmers, UEA-Lite operates on a
8
- # set of rules which are used as steps. There are two
9
- # groups of rules: the first to clean the tokens, and
10
- # the second to alter suffixes."
11
- #
12
- # Project website: https://github.com/ealdent/uea-stemmer
13
- # Original paper: Jenkins, Marie-Claire, Smith, Dan,
14
- # Conservative stemming for search and indexing, 2005.
15
- # http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
16
- class UEA
17
- # Require the 'uea-stemmer' gem.
18
- silence_warnings { require 'uea-stemmer' }
19
- # Keep only one copy of the stemmer.
20
- @@stemmer = nil
21
- # Stems a word using the UEA algorithm, implemented
22
- # by the 'uea-stemmer' gem.
23
- def self.stem(entity, options = {})
24
- @@stemmer ||= silence_warnings { ::UEAStemmer.new }
25
- @@stemmer.stem(entity.to_s).strip
26
- end
27
- end
28
- end
29
- end
30
- end
data/lib/treat/install.rb DELETED
@@ -1,59 +0,0 @@
1
- module Treat
2
- class Installer
3
- require 'rubygems/dependency_installer'
4
- # Install required dependencies and optional dependencies
5
- # for a specific language.
6
- def self.install(language = :english)
7
-
8
- lang = Treat::Languages.get(language)
9
- required = lang::RequiredDependencies
10
- optional = lang::OptionalDependencies
11
-
12
- puts "Treat Installer\n\n"
13
- puts "Installing dependencies for the #{language.to_s.capitalize} language.\n\n"
14
-
15
- flag = false
16
-
17
- inst = Gem::DependencyInstaller.new
18
-
19
- required.each do |dependency|
20
- puts "Installing required dependency '#{dependency}'..."
21
- begin
22
- silence_warnings { inst.install(dependency) }
23
- rescue
24
- flag = true
25
- puts "Couldn't install '#{dependency}'. " +
26
- "You need install this dependency manually by running: " +
27
- "'gem install #{dependency}' or use 'sudo' to run this script."
28
- end
29
- end
30
-
31
- optional.each do |dependency|
32
- begin
33
- puts "Install optional dependency '#{dependency}' (yes/no, <enter> = skip) ?"
34
- answer = gets.strip
35
- raise Treat::Exception unless ['yes', 'no', ''].include?(answer)
36
- if answer == 'yes'
37
- silence_warnings { inst.install(dependency) }
38
- else
39
- puts "Skipped installing '#{dependency}'."
40
- next
41
- end
42
- rescue Treat::Exception
43
- puts "Invalid input - valid options are 'yes' or 'no'."
44
- retry
45
- rescue
46
- flag = true
47
- puts "Couldn't install '#{dependency}'. " +
48
- "You can install this dependency manually by running: " +
49
- "'gem install #{dependency}' or use 'sudo' to run this script."
50
- end
51
- end
52
-
53
- w = flag ? 'incompletely' : 'normally'
54
- puts "\nInstall proceeded #{w}."
55
- puts
56
-
57
- end
58
- end
59
- end
@@ -1,377 +0,0 @@
1
- module Treat
2
- module Languages
3
-
4
- module Tags
5
- ClawsC5 = 0
6
- Brown = 1
7
- Penn = 2
8
- Negra = 3
9
- PennChinese = 4
10
- Simple = 5
11
-
12
- PTBClauseTagDescription = [
13
- ['S', 'Simple declarative clause'],
14
- ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
15
- ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
16
- ['SINV', 'Inverted declarative sentence'],
17
- ['SQ', 'Inverted yes/no question']
18
- ]
19
-
20
- AlignedPhraseTags =
21
- [
22
- 'Adjective phrase', ['', '', 'ADJP'],
23
- 'Adverb phrase', ['', '', 'ADVP'],
24
- 'Conjunction phrase', ['', '', 'CONJP'],
25
- 'Fragment', ['', '', 'FRAG'],
26
- 'Interjection', ['', '', 'INTJ'],
27
- 'List marker', ['', '', 'LST'],
28
- 'Not a phrase', ['', '', 'NAC'],
29
- 'Noun phrase', ['', '', 'NP'],
30
- 'Head of NP', ['', '', 'NX'],
31
- 'Prepositional phrase', ['', '', 'PP'],
32
- 'Parenthetical', ['', '', 'PRN'],
33
- 'Particle', ['', '', 'PRT'],
34
- 'Quantifier phrase', ['', '', 'QP'],
35
- 'Reduced relative clause', ['', '', 'RRC'],
36
- 'Unlike coordinated phrase', ['', '', 'UCP'],
37
- 'Verb phrase', ['', '', 'VP'],
38
- 'Wh adjective phrase', ['', '', 'WHADJP'],
39
- 'Wh adverb phrase', ['', '', 'WHAVP'],
40
- 'Wh noun phrase', ['', '', 'WHNP'],
41
- 'Wh prepositional phrase', ['', '', 'WHPP'],
42
- 'Unknown', ['', '', 'X'],
43
- 'Phrase', ['', '', 'P'],
44
- 'Sentence', ['', '', 'S'],
45
- 'Phrase', ['', '', 'SBAR'] # Fix
46
- ]
47
-
48
- # A description of Enju categories.
49
- EnjuCatDescription = [
50
- ['ADJ', 'Adjective'],
51
- ['ADV', 'Adverb'],
52
- ['CONJ', 'Coordination conjunction'],
53
- ['C', 'Complementizer'],
54
- ['D', 'Determiner'],
55
- ['N', 'Noun'],
56
- ['P', 'Preposition'],
57
- ['SC', 'Subordination conjunction'],
58
- ['V', 'Verb'],
59
- ['COOD', 'Part of coordination'],
60
- ['PN', 'Punctuation'],
61
- ['PRT', 'Particle'],
62
- ['S', 'Sentence']
63
- ]
64
-
65
- # Maps Enju categories to Treat categories.
66
- EnjuCatToCategory = {
67
- 'ADJ' => :adjective,
68
- 'ADV' => :adverb,
69
- 'CONJ' => :conjunction,
70
- 'COOD' => :conjunction,
71
- 'C' => :complementizer,
72
- 'D' => :determiner,
73
- 'N' => :noun,
74
- 'P' => :preposition,
75
- 'PN' => :punctuation,
76
- 'SC' => :conjunction,
77
- 'V' => :verb,
78
- 'PRT' => :particle
79
- }
80
-
81
- # Description of the xcat in the Enju output specification.
82
- EnjuXCatDescription = [
83
- ['COOD', 'Coordinated phrase/clause'],
84
- ['IMP', 'Imperative sentence'],
85
- ['INV', 'Subject-verb inversion'],
86
- ['Q', 'Interrogative sentence with subject-verb inversion'],
87
- ['REL', 'A relativizer included'],
88
- ['FREL', 'A free relative included'],
89
- ['TRACE', 'A trace included'],
90
- ['WH', 'A wh-question word included']
91
- ]
92
-
93
- EnjuCatXcatToPTB = [
94
- ['ADJP', '', 'ADJP'],
95
- ['ADJP', 'REL', 'WHADJP'],
96
- ['ADJP', 'FREL', 'WHADJP'],
97
- ['ADJP', 'WH', 'WHADJP'],
98
- ['ADVP', '', 'ADVP'],
99
- ['ADVP', 'REL', 'WHADVP'],
100
- ['ADVP', 'FREL', 'WHADVP'],
101
- ['ADVP', 'WH', 'WHADVP'],
102
- ['CONJP', '', 'CONJP'],
103
- ['CP', '', 'SBAR'],
104
- ['DP', '', 'NP'],
105
- ['NP', '', 'NP'],
106
- ['NX', 'NX', 'NAC'],
107
- ['NP' 'REL' 'WHNP'],
108
- ['NP' 'FREL' 'WHNP'],
109
- ['NP' 'WH' 'WHNP'],
110
- ['PP', '', 'PP'],
111
- ['PP', 'REL', 'WHPP'],
112
- ['PP', 'WH', 'WHPP'],
113
- ['PRT', '', 'PRT'],
114
- ['S', '', 'S'],
115
- ['S', 'INV', 'SINV'],
116
- ['S', 'Q', 'SQ'],
117
- ['S', 'REL', 'SBAR'],
118
- ['S', 'FREL', 'SBAR'],
119
- ['S', 'WH', 'SBARQ'],
120
- ['SCP', '', 'SBAR'],
121
- ['VP', '', 'VP'],
122
- ['VP', '', 'VP'],
123
- ['', '', 'UK']
124
- ]
125
-
126
- # Aligned tags for the Claws C5, Brown and Penn tag sets.
127
- # Adapted from Manning, Christopher and Schütze, Hinrich,
128
- # 1999. Foundations of Statistical Natural Language
129
- # Processing. MIT Press, p. 141-142;
130
- # http://www.isocat.org/rest/dcs/376;
131
- #
132
- # JRS?
133
-
134
-
135
- SimpleWordTagToCategory = {
136
- 'C' => :complementizer,
137
- 'PN' => :punctuation,
138
- 'SC' => :conjunction
139
- }
140
-
141
- AlignedWordTags = [
142
-
143
- 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
144
- 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
145
- 'Ajective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
146
- 'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
147
- 'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
148
- 'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
149
- 'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
150
- 'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
151
- 'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
152
- 'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
153
-
154
- 'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
155
- 'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
156
- 'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
157
- 'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
158
- 'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
159
- 'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
160
- 'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', 'ADV'],
161
- 'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
162
- 'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
163
- 'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
164
- 'Adverb, pronominal', ['', '', '', '', 'PROP', '', 'ADV'],
165
-
166
- 'Clitic', ['', '', 'POS', '', '', ''],
167
-
168
- 'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
169
- 'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
170
- 'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
171
- 'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
172
- 'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],
173
-
174
- 'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
175
- 'Determiner', ['DT0', 'DT', 'DET', '', 'DT', 'D'],
176
- 'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
177
- 'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
178
- 'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
179
- 'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
180
- 'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
181
- 'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
182
- 'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
183
- 'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
184
- 'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
185
- 'Determiner, possessive, second', ['DPS', 'PPSS', 'PRPS', '', '', 'D'],
186
- 'Determiner, possessive, second', ['DPS', 'PP$$', 'PRP', '', '', 'D'],
187
- 'Determiner, possessive, second', ['DPS', 'PPSS', 'PRP', '', '', 'D'],
188
- 'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
189
- 'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
190
- 'Determiner, possessive & question', ['DTQ', 'WPS', 'WPS', '', '', 'D'],
191
-
192
- 'Localizer', ['', '', '', '', 'LC'],
193
-
194
- 'Measure word', ['', '', '', '', 'M'],
195
-
196
- 'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
197
- 'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
198
- 'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
199
- 'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
200
- 'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
201
- 'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
202
- 'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', 'N'],
203
- 'Noun, temporal', ['', '', '', '', 'NT', 'N'],
204
- 'Noun, verbal', ['', '', '', '', 'NN', 'N'],
205
-
206
- 'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
207
- 'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
208
- 'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
209
- 'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
210
- 'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
211
- 'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
212
- 'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
213
- 'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
214
- 'Pronoun, existential there', ['EX0', 'EX', 'EX'],
215
- 'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
216
- 'Prounoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
217
- 'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
218
- 'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
219
- 'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
220
- 'Prounoun, substituting indefinite', ['', '', '', 'PIS'],
221
- 'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
222
- 'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
223
- 'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
224
- 'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],
225
-
226
- 'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
227
- 'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
228
- 'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
229
- 'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
230
- 'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
231
- 'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
232
- 'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
233
- 'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
234
- 'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
235
- 'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
236
- 'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
237
- 'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
238
- 'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
239
- 'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
240
- 'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
241
- 'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
242
- 'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
243
- 'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
244
- 'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
245
- 'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
246
- 'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
247
- 'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
248
- 'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
249
- 'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
250
- 'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
251
- 'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
252
- 'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
253
- 'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
254
- 'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
255
- 'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
256
- 'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
257
- 'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
258
- 'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
259
- 'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
260
- 'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
261
- 'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
262
- 'Verb, modal, infinite', ['', '', '', '', 'VMINF', 'V'],
263
- 'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],
264
-
265
- 'Particle', ['', '', '', '', '', 'PRT'],
266
- 'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
267
- 'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
268
- 'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
269
- 'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
270
- 'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],
271
-
272
- 'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
273
- 'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
274
- 'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
275
- 'Preposition, with aritcle', ['', '', '', 'APPART', '', 'P'],
276
- 'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],
277
-
278
- 'Possessive', ['POS', '$', 'POS'],
279
-
280
- 'Postposition', ['', '', '', 'APPO'],
281
-
282
- 'Circumposition, right', ['', '', '', 'APZR', ''],
283
-
284
- 'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],
285
-
286
- 'Onomatopoeia', ['', '', '', '', 'ON'],
287
-
288
- 'Punctuation', ['', '', '', '', 'PU', 'PN'],
289
- 'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
290
- 'Punctuation, sentence ender', ['PUN', '.', 'PP', '$.', '', 'PN'],
291
- 'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
292
- 'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
293
- 'Punctuationm, comma', ['PUN', ',', ',', '$,'],
294
- 'Punctuation, dash', ['PUN', '-', '-'],
295
- 'Punctuation, dollar sign', ['PUN', '', '$'],
296
- 'Punctuation, left bracket', ['PUL', '(', '(', '$('],
297
- 'Punctuation, right bracket', ['PUR', ')', ')'],
298
- 'Punctuation, quotation mark, left', ['PUQ', '', '``'],
299
- 'Punctuation, quotation mark, right', ['PUQ', '', '"'],
300
-
301
- 'Word, truncated, left', ['', '', '', 'TRUNC'],
302
-
303
- 'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],
304
-
305
- 'Symbol', ['', '', 'SYM', 'XY'],
306
- 'Symbol, alphabetical', ['ZZ0', '', ''],
307
- 'Symbol, list item', ['', '', 'LS'],
308
-
309
- # Not sure about these tags from the Chinese PTB.
310
- 'Aspect marker', ['', '', '', '', 'AS'], # ?
311
- 'Ba-construction', ['', '', '', '', 'BA'], # ?
312
- 'In relative', ['', '', '', '', 'DEC'], # ?
313
- 'Associative', ['', '', '', '', 'DER'], # ?
314
- 'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
315
- 'For words ? ', ['', '', '', '', 'ETC'], # ?
316
- 'In long bei-construct', ['', '', '', '', 'LB'], # ?
317
- 'In short bei-construct', ['', '', '', '', 'SB'], # ?
318
- 'Sentence-nal particle', ['', '', '', '', 'SB'], # ?
319
- 'Particle, other', ['', '', '', '', 'MSP'], # ?
320
- 'Before VP', ['', '', '', '', 'DEV'], # ?
321
- 'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
322
- 'Verb, ????', ['', '', '', '', 'VC'] # ?
323
- ]
324
-
325
# Build the lookup table word tag => { tag set => category } from the
# aligned word tag list, which alternates a description string with an
# array holding the equivalent tag for each tag set.
wttc = {}
Treat::Languages::Tags::AlignedWordTags.each_slice(2) do |desc, tags|

  # The category is the first word of the description, lowercased,
  # e.g. 'Noun, proper, singular' => :noun.
  category = desc.gsub(',', ' ,').split(' ')[0].downcase.intern

  # Each pair is [position of the tag set inside `tags`, tag set name].
  # NOTE(review): ClawsC5, Brown, Penn, Negra, PennChinese and Simple are
  # defined outside this excerpt; presumably integer indices - confirm.
  [
    [ClawsC5,     :claws_5],
    [Brown,       :brown],
    [Penn,        :penn],
    [Negra,       :negra],
    [PennChinese, :penn_chinese],
    [Simple,      :simple]
  ].each do |index, tag_set|
    tag = tags[index]
    # Rows use '' as a "no equivalent tag" placeholder and may be shorter
    # than the number of tag sets (giving nil). Neither is a real tag.
    # An empty string is truthy in Ruby, so a bare `if tag` guard would
    # let the placeholders through and register '' as a word tag.
    next if tag.nil? || tag.empty?
    (wttc[tag] ||= {})[tag_set] = category
  end

end
# A hash converting word tags to word categories.
WordTagToCategory = wttc
349
-
350
# Build the lookup table phrase tag => { tag set => category } from the
# aligned phrase tag list, which alternates a description string with an
# array of tags.
pttc = {}
Treat::Languages::Tags::AlignedPhraseTags.each_slice(2) do |description, tag_row|
  # The whole description becomes the category name: commas get a leading
  # space, every space turns into an underscore, and the result is
  # lowercased (a description 'Noun phrase' would yield :noun_phrase).
  label = description.gsub(',', ' ,').gsub(' ', '_').downcase
  pttc[tag_row[Penn]] ||= {}
  # Only the Penn tag set is mapped; not yet done for other tag sets.
  #pttc[tags[0]][:claws_5] = category
  #pttc[tags[1]][:brown] = category
  pttc[tag_row[Penn]][:penn] = label.intern
end

# A hash converting phrase tags to phrase categories.
PhraseTagToCategory = pttc
363
-
364
# Tell whether a phrase tag is known for a given tag set.
#
# PhraseTagToCategory is nested as tag => { tag set => category }, so the
# tag set has to be looked up inside the entry for the tag, not as a
# top-level key of the table.
#
# @param tag the phrase tag to look up.
# @param tag_set the tag set key to look for under that tag (e.g. :penn).
# @return [Boolean] true if the tag has an entry for tag_set.
def self.has_phrase_tag?(tag, tag_set)
  tag_sets = PhraseTagToCategory[tag]
  !tag_sets.nil? && tag_sets.key?(tag_set)
end
368
-
369
# Tell whether a word tag is known for a given tag set.
#
# WordTagToCategory is nested as tag => { tag set => category }, so the
# tag set has to be looked up inside the entry for the tag, not as a
# top-level key of the table.
#
# @param tag the word tag to look up.
# @param tag_set the tag set key to look for under that tag (e.g. :penn).
# @return [Boolean] true if the tag has an entry for tag_set.
def self.has_word_tag?(tag, tag_set)
  tag_sets = WordTagToCategory[tag]
  !tag_sets.nil? && tag_sets.key?(tag_set)
end
373
-
374
-
375
- end
376
- end
377
- end