treat 0.2.5 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
# Wrapper for the Stanford POS tagger.
class Treat::Lexicalizers::Taggers::Stanford

  require 'treat/loaders/stanford'

  # Hold one tagger per language.
  @@taggers = {}

  # Hold the default options.
  DefaultOptions = {
    :tagger_model => nil
  }

  # Tag the entity using one of the Stanford taggers
  # and return the tag (for an isolated token) or a
  # phrase-structure tag ('S'/'P') for sentences/phrases.
  def self.tag(entity, options = {})

    # Tokenize the sentence/phrase first, since the
    # tagger operates on a list of tokens.
    if !entity.has_children? &&
      !entity.is_a?(Treat::Entities::Token)
      entity.tokenize(:stanford, options)
    end

    # Handle options and initialize the tagger.
    lang = entity.language
    options = get_options(options, lang)
    tokens, list = get_token_list(entity)
    init_tagger(lang)

    # Do the tagging.
    i = 0
    isolated_token = entity.is_a?(Treat::Entities::Token)
    @@taggers[lang].apply(list).each do |tok|
      tokens[i].set :tag, tok.tag
      # For an isolated token, record the tag set on the
      # token itself and return its tag immediately;
      # otherwise the tag set is stored on the parent.
      if isolated_token
        tokens[i].set :tag_set, options[:tag_set]
        return tok.tag
      end
      i += 1
    end

    # Handle tags for sentences and phrases: the tag
    # set is recorded on the topmost sentence/phrase.
    if entity.is_a?(Treat::Entities::Sentence) ||
      (entity.is_a?(Treat::Entities::Phrase) &&
      !entity.parent_sentence)
      entity.set :tag_set, :penn
    end

    return 'S' if entity.is_a?(Treat::Entities::Sentence)
    return 'P' if entity.is_a?(Treat::Entities::Phrase)

  end

  # Initialize the tagger for a language. The model
  # path is only resolved when no cached tagger
  # exists (was previously recomputed on every call).
  def self.init_tagger(lang)
    @@taggers[lang] ||= begin
      language = Treat::Languages.describe(lang)
      model = StanfordCoreNLP::Config::Models[:pos][language]
      model = Treat.models + 'stanford/' +
      StanfordCoreNLP::Config::ModelFolders[:pos] + model
      StanfordCoreNLP::MaxentTagger.new(model)
    end
  end

  # Handle the options for the tagger, merging in the
  # defaults and resolving the tag set for the language.
  def self.get_options(options, lang)
    language = Treat::Languages.describe(lang)
    options = DefaultOptions.merge(options)
    if options[:tagger_model]
      ::StanfordCoreNLP.set_model('pos.model',
      options[:tagger_model])
    end
    # NOTE: this assignment appeared twice in the
    # original; once is sufficient.
    options[:tag_set] =
      StanfordCoreNLP::Config::TagSets[language]
    options
  end

  # Retrieve a Java ArrayList of StanfordCoreNLP::Word
  # objects, along with the corresponding Treat tokens.
  def self.get_token_list(entity)
    list = StanfordCoreNLP::ArrayList.new
    tokens = entity.is_a?(Treat::Entities::Token) ?
      [entity] : entity.tokens
    tokens.each do |token|
      list.add(StanfordCoreNLP::Word.new(token.to_s))
    end
    return tokens, list
  end

end
module Treat::Linguistics

  # Load every Ruby file found in the
  # treat/linguistics folder of the library.
  Dir[Treat.lib + 'treat/linguistics/*.rb'].each do |file|
    require file
  end

end
module Treat::Linguistics

  # All word categories recognized by Treat.
  WordCategories = [
    :adjective, :adverb, :noun, :verb,
    :interjection, :clitic, :coverb,
    :conjunction, :determiner, :particle,
    :preposition, :pronoun, :number,
    :symbol, :punctuation, :complementizer
  ]

end
@@ -0,0 +1,422 @@
1
+ module Treat::Linguistics::Tags
2
+
3
+ ClawsC5 = 0
4
+ Brown = 1
5
+ Penn = 2
6
+ Negra = 3
7
+ PennChinese = 4
8
+ Simple = 5
9
+
10
+ PTBClauseTagDescription = [
11
+ ['S', 'Simple declarative clause'],
12
+ ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
13
+ ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
14
+ ['SINV', 'Inverted declarative sentence'],
15
+ ['SQ', 'Inverted yes/no question']
16
+ ]
17
+
18
+ PTBEscapeCharacters = {
19
+ '(' => '-LRB-',
20
+ ')' => '-RRB-',
21
+ '[' => '-LSB-',
22
+ ']' => '-RSB-',
23
+ '{' => '-LCB-',
24
+ '}' => '-RCB-'
25
+ }
26
+
27
+ AlignedPhraseTags =
28
+ [
29
+ 'Adjective phrase', ['', '', 'ADJP'],
30
+ 'Adverb phrase', ['', '', 'ADVP'],
31
+ 'Conjunction phrase', ['', '', 'CONJP'],
32
+ 'Fragment', ['', '', 'FRAG'],
33
+ 'Interjection', ['', '', 'INTJ'],
34
+ 'List marker', ['', '', 'LST'],
35
+ 'Not a phrase', ['', '', 'NAC'],
36
+ 'Noun phrase', ['', '', 'NP'],
37
+ 'Head of NP', ['', '', 'NX'],
38
+ 'Prepositional phrase', ['', '', 'PP'],
39
+ 'Parenthetical', ['', '', 'PRN'],
40
+ 'Particle', ['', '', 'PRT'],
41
+ 'Quantifier phrase', ['', '', 'QP'],
42
+ 'Reduced relative clause', ['', '', 'RRC'],
43
+ 'Unlike coordinated phrase', ['', '', 'UCP'],
44
+ 'Verb phrase', ['', '', 'VP'],
45
+ 'Wh adjective phrase', ['', '', 'WHADJP'],
46
+ 'Wh adverb phrase', ['', '', 'WHAVP'],
47
+ 'Wh noun phrase', ['', '', 'WHNP'],
48
+ 'Wh prepositional phrase', ['', '', 'WHPP'],
49
+ 'Unknown', ['', '', 'X'],
50
+ 'Phrase', ['', '', 'P'],
51
+ 'Sentence', ['', '', 'S'],
52
+ 'Phrase', ['', '', 'SBAR'] # Fix
53
+ ]
54
+
55
+ # A description of Enju categories.
56
+ EnjuCatDescription = [
57
+ ['ADJ', 'Adjective'],
58
+ ['ADV', 'Adverb'],
59
+ ['CONJ', 'Coordination conjunction'],
60
+ ['C', 'Complementizer'],
61
+ ['D', 'Determiner'],
62
+ ['N', 'Noun'],
63
+ ['P', 'Preposition'],
64
+ ['SC', 'Subordination conjunction'],
65
+ ['V', 'Verb'],
66
+ ['COOD', 'Part of coordination'],
67
+ ['PN', 'Punctuation'],
68
+ ['PRT', 'Particle'],
69
+ ['S', 'Sentence']
70
+ ]
71
+
72
+ # Maps Enju categories to Treat categories.
73
+ EnjuCatToCategory = {
74
+ 'ADJ' => :adjective,
75
+ 'ADV' => :adverb,
76
+ 'CONJ' => :conjunction,
77
+ 'COOD' => :conjunction,
78
+ 'C' => :complementizer,
79
+ 'D' => :determiner,
80
+ 'N' => :noun,
81
+ 'P' => :preposition,
82
+ 'PN' => :punctuation,
83
+ 'SC' => :conjunction,
84
+ 'V' => :verb,
85
+ 'PRT' => :particle
86
+ }
87
+
88
+ # Description of the xcat in the Enju output specification.
89
+ EnjuXCatDescription = [
90
+ ['COOD', 'Coordinated phrase/clause'],
91
+ ['IMP', 'Imperative sentence'],
92
+ ['INV', 'Subject-verb inversion'],
93
+ ['Q', 'Interrogative sentence with subject-verb inversion'],
94
+ ['REL', 'A relativizer included'],
95
+ ['FREL', 'A free relative included'],
96
+ ['TRACE', 'A trace included'],
97
+ ['WH', 'A wh-question word included']
98
+ ]
99
+
100
+ EnjuCatXcatToPTB = [
101
+ ['ADJP', '', 'ADJP'],
102
+ ['ADJP', 'REL', 'WHADJP'],
103
+ ['ADJP', 'FREL', 'WHADJP'],
104
+ ['ADJP', 'WH', 'WHADJP'],
105
+ ['ADVP', '', 'ADVP'],
106
+ ['ADVP', 'REL', 'WHADVP'],
107
+ ['ADVP', 'FREL', 'WHADVP'],
108
+ ['ADVP', 'WH', 'WHADVP'],
109
+ ['CONJP', '', 'CONJP'],
110
+ ['CP', '', 'SBAR'],
111
+ ['DP', '', 'NP'],
112
+ ['NP', '', 'NP'],
113
+ ['NX', 'NX', 'NAC'],
114
+ ['NP' 'REL' 'WHNP'],
115
+ ['NP' 'FREL' 'WHNP'],
116
+ ['NP' 'WH' 'WHNP'],
117
+ ['PP', '', 'PP'],
118
+ ['PP', 'REL', 'WHPP'],
119
+ ['PP', 'WH', 'WHPP'],
120
+ ['PRT', '', 'PRT'],
121
+ ['S', '', 'S'],
122
+ ['S', 'INV', 'SINV'],
123
+ ['S', 'Q', 'SQ'],
124
+ ['S', 'REL', 'SBAR'],
125
+ ['S', 'FREL', 'SBAR'],
126
+ ['S', 'WH', 'SBARQ'],
127
+ ['SCP', '', 'SBAR'],
128
+ ['VP', '', 'VP'],
129
+ ['VP', '', 'VP'],
130
+ ['', '', 'UK']
131
+ ]
132
+
133
+ # Aligned tags for the Claws C5, Brown and Penn tag sets.
134
+ # Adapted from Manning, Christopher and Schütze, Hinrich,
135
+ # 1999. Foundations of Statistical Natural Language
136
+ # Processing. MIT Press, p. 141-142;
137
+ # http://www.isocat.org/rest/dcs/376;
138
+ #
139
+ # JRS?
140
+
141
+
142
+ SimpleWordTagToCategory = {
143
+ 'C' => :complementizer,
144
+ 'PN' => :punctuation,
145
+ 'SC' => :conjunction
146
+ }
147
+
148
+ PunctuationToCategory = {
149
+ '.' => :period,
150
+ ',' => :comma,
151
+ ';' => :semicolon,
152
+ ':' => :colon,
153
+ '!' => :exclamation,
154
+ '?' => :interrogation,
155
+ '"' => :quote,
156
+ "'" => :quote,
157
+
158
+ '$' => :dollar,
159
+ '%' => :percent,
160
+ '#' => :hash,
161
+ '*' => :asterisk,
162
+ '&' => :ampersand,
163
+ '+' => :plus,
164
+ '-' => :dash,
165
+
166
+ '/' => :slash,
167
+ '\\' => :backslash,
168
+ '^' => :caret,
169
+ '_' => :underscore,
170
+ '`' => :tick,
171
+ '|' => :pipe,
172
+ '~' => :tilde,
173
+ '@' => :at,
174
+
175
+ '[' => :bracket,
176
+ ']' => :bracket,
177
+ '{' => :brace,
178
+ '}' => :brace,
179
+ '(' => :parenthesis,
180
+ ')' => :parenthesis,
181
+
182
+ '<' => :tag,
183
+ '>' => :tag
184
+ }
185
+
186
+ AlignedWordTags = [
187
+
188
+ 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
189
+ 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
190
+ 'Ajective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
191
+ 'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
192
+ 'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
193
+ 'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
194
+ 'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
195
+ 'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
196
+ 'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
197
+ 'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
198
+
199
+ 'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
200
+ 'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
201
+ 'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
202
+ 'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
203
+ 'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
204
+ 'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
205
+ 'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', 'ADV'],
206
+ 'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
207
+ 'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
208
+ 'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
209
+ 'Adverb, pronominal', ['', '', '', '', 'PROP', '', 'ADV'],
210
+
211
+ 'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
212
+ 'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
213
+ 'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
214
+ 'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
215
+ 'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],
216
+
217
+ 'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
218
+ 'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
219
+ 'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
220
+ 'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
221
+ 'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
222
+ 'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
223
+ 'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
224
+ 'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
225
+ 'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
226
+ 'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
227
+ 'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
228
+ 'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
229
+ 'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
230
+
231
+ 'Localizer', ['', '', '', '', 'LC'],
232
+
233
+ 'Measure word', ['', '', '', '', 'M'],
234
+
235
+ 'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
236
+ 'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
237
+ 'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
238
+ 'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
239
+ 'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
240
+ 'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
241
+ 'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', 'N'],
242
+ 'Noun, temporal', ['', '', '', '', 'NT', 'N'],
243
+ 'Noun, verbal', ['', '', '', '', 'NN', 'N'],
244
+
245
+ 'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
246
+ 'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
247
+ 'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
248
+ 'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
249
+ 'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
250
+ 'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
251
+ 'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
252
+ 'Pronoun, question, subject', ['PNQ', 'WPS', 'WPS', 'PWAV'], # Hack
253
+ 'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
254
+ 'Pronoun, existential there', ['EX0', 'EX', 'EX'],
255
+ 'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
256
+ 'Prounoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
257
+ 'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
258
+ 'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
259
+ 'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
260
+ 'Prounoun, substituting indefinite', ['', '', '', 'PIS'],
261
+ 'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
262
+ 'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
263
+ 'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
264
+ 'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],
265
+
266
+ 'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
267
+ 'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
268
+ 'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
269
+ 'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
270
+ 'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
271
+ 'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
272
+ 'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
273
+ 'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
274
+ 'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
275
+ 'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
276
+ 'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
277
+ 'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
278
+ 'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
279
+ 'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
280
+ 'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
281
+ 'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
282
+ 'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
283
+ 'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
284
+ 'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
285
+ 'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
286
+ 'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
287
+ 'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
288
+ 'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
289
+ 'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
290
+ 'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
291
+ 'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
292
+ 'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
293
+ 'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
294
+ 'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
295
+ 'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
296
+ 'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
297
+ 'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
298
+ 'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
299
+ 'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
300
+ 'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
301
+ 'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
302
+ 'Verb, modal, infinite', ['', '', '', '', 'VMINF', 'V'],
303
+ 'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],
304
+
305
+ 'Particle', ['', '', '', '', '', 'PRT'],
306
+ 'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
307
+ 'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
308
+ 'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
309
+ 'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
310
+ 'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],
311
+
312
+ 'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
313
+ 'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
314
+ 'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
315
+ 'Preposition, with aritcle', ['', '', '', 'APPART', '', 'P'],
316
+ 'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],
317
+
318
+ 'Possessive', ['POS', '$', 'POS'],
319
+
320
+ 'Postposition', ['', '', '', 'APPO'],
321
+
322
+ 'Circumposition, right', ['', '', '', 'APZR', ''],
323
+
324
+ 'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],
325
+
326
+ 'Onomatopoeia', ['', '', '', '', 'ON'],
327
+
328
+ 'Punctuation', ['', '', '', '', 'PU', 'PN'],
329
+ 'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
330
+
331
+ 'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
332
+ 'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
333
+ 'Punctuationm, comma', ['PUN', ',', ',', '$,'],
334
+ 'Punctuation, dash', ['PUN', '-', '-'],
335
+ 'Punctuation, dollar sign', ['PUN', '', '$'],
336
+ 'Punctuation, left bracket', ['PUL', '(', '(', '$('],
337
+ 'Punctuation, right bracket', ['PUR', ')', ')'],
338
+ 'Punctuation, quotation mark, left', ['PUQ', '', '``'],
339
+ 'Punctuation, quotation mark, right', ['PUQ', '', '"'],
340
+
341
+ 'Punctuation, left bracket', ['PUL', '(', 'PPL'],
342
+ 'Punctuation, right bracket', ['PUR', ')', 'PPR'],
343
+ 'Punctuation, left square bracket', ['PUL', '(', 'LSB'],
344
+ 'Punctuation, right square bracket', ['PUR', ')', 'RSB'],
345
+ 'Punctuation, left curly bracket', ['PUL', '(', 'LCB'],
346
+ 'Punctuation, right curly bracket', ['PUR', ')', 'RCB'],
347
+
348
+ 'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],
349
+
350
+ 'Symbol', ['', '', 'SYM', 'XY'],
351
+ 'Symbol, alphabetical', ['ZZ0', '', ''],
352
+ 'Symbol, list item', ['', '', 'LS'],
353
+
354
+ # Not sure about these tags from the Chinese PTB.
355
+ 'Aspect marker', ['', '', '', '', 'AS'], # ?
356
+ 'Ba-construction', ['', '', '', '', 'BA'], # ?
357
+ 'In relative', ['', '', '', '', 'DEC'], # ?
358
+ 'Associative', ['', '', '', '', 'DER'], # ?
359
+ 'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
360
+ 'For words ? ', ['', '', '', '', 'ETC'], # ?
361
+ 'In long bei-construct', ['', '', '', '', 'LB'], # ?
362
+ 'In short bei-construct', ['', '', '', '', 'SB'], # ?
363
+ 'Sentence-nal particle', ['', '', '', '', 'SB'], # ?
364
+ 'Particle, other', ['', '', '', '', 'MSP'], # ?
365
+ 'Before VP', ['', '', '', '', 'DEV'], # ?
366
+ 'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
367
+ 'Verb, ????', ['', '', '', '', 'VC'] # ?
368
+ ]
369
+
370
+ wttc = {
371
+
372
+ }
373
+ Treat::Linguistics::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
374
+
375
+ category = desc.gsub(',', ' ,').
376
+ split(' ')[0].downcase.intern
377
+
378
+ wttc[tags[ClawsC5]] ||= {}
379
+ wttc[tags[Brown]] ||= {}
380
+ wttc[tags[Penn]] ||= {}
381
+ wttc[tags[Negra]] ||= {}
382
+ wttc[tags[PennChinese]] ||= {}
383
+ wttc[tags[Simple]] ||= {}
384
+
385
+ wttc[tags[ClawsC5]][:claws_5] = category
386
+ wttc[tags[Brown]][:brown] = category
387
+ wttc[tags[Penn]][:penn] = category
388
+ wttc[tags[Negra]][:negra] = category if tags[Negra]
389
+ wttc[tags[PennChinese]][:penn_chinese] = category if tags[PennChinese]
390
+ wttc[tags[Simple]][:simple] = category if tags[Simple]
391
+
392
+ end
393
+ # A hash converting word tags to word categories.
394
+ WordTagToCategory = wttc
395
+
396
+ # A hash converting phrase tag to categories.
397
+ pttc = {}
398
+ Treat::Linguistics::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
399
+ category = desc.gsub(',', ' ,').gsub(' ', '_').downcase.intern
400
+ pttc[tags[Penn]] ||= {};
401
+ # Not yet for other tag sts.
402
+ #pttc[tags[0]][:claws_5] = category
403
+ #pttc[tags[1]][:brown] = category
404
+ pttc[tags[Penn]][:penn] = category
405
+ end
406
+
407
+ # A hash converting word tags to word categories.
408
+ PhraseTagToCategory = pttc
409
+
410
+ def self.describe(tag, tag_set)
411
+ if PhraseTagToCategory[tag] &&
412
+ PhraseTagToCategory[tag_set] &&
413
+ WordTagToCategory[tag] &&
414
+ WordTagToCategory[tag_set]
415
+ end
416
+ end
417
+
418
+ def self.convert(tag, from, to)
419
+
420
+ end
421
+
422
+ end