treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
data/lib/treat/lexicalizers/taggers/stanford.rb
@@ -0,0 +1,97 @@
+ # Wrapper for the Stanford POS tagger.
+ class Treat::Lexicalizers::Taggers::Stanford
+
+ require 'treat/loaders/stanford'
+
+ # Hold one tagger per language.
+ @@taggers = {}
+
+ # Hold the default options.
+ DefaultOptions = {
+ :tagger_model => nil
+ }
+
+ # Tag the word using one of the Stanford taggers.
+ def self.tag(entity, options = {})
+
+ # Tokenize the sentence/phrase.
+ if !entity.has_children? &&
+ !entity.is_a?(Treat::Entities::Token)
+ entity.tokenize(:stanford, options)
+ end
+
+ # Handle options and initialize the tagger.
+ lang = entity.language
+ options = get_options(options, lang)
+ tokens, list = get_token_list(entity)
+ init_tagger(lang)
+
+ # Do the tagging.
+ i = 0
+ isolated_token = entity.is_a?(Treat::Entities::Token)
+ @@taggers[lang].apply(list).each do |tok|
+ tokens[i].set :tag, tok.tag
+ tokens[i].set :tag_set,
+ options[:tag_set] if isolated_token
+ return tok.tag if isolated_token
+ i += 1
+ end
+
+ # Handle tags for sentences and phrases.
+
+ if entity.is_a?(Treat::Entities::Sentence) ||
+ (entity.is_a?(Treat::Entities::Phrase) &&
+ !entity.parent_sentence)
+ entity.set :tag_set, :penn
+ end
+
+ if entity.is_a?(Treat::Entities::Sentence)
+ return 'S'
+ elsif entity.is_a?(Treat::Entities::Phrase)
+ return 'P'
+ end
+
+ end
+
+ # Initialize the tagger for a language.
+ def self.init_tagger(lang)
+
+ language = Treat::Languages.describe(lang)
+ model = StanfordCoreNLP::Config::Models[:pos][language]
+ model = Treat.models + 'stanford/' +
+ StanfordCoreNLP::Config::ModelFolders[:pos] + model
+ @@taggers[lang] ||=
+ StanfordCoreNLP::MaxentTagger.new(model)
+
+ end
+
+ # Handle the options for the tagger.
+ def self.get_options(options, lang)
+ language = Treat::Languages.describe(lang)
+ options = DefaultOptions.merge(options)
+ options[:tag_set] =
+ StanfordCoreNLP::Config::TagSets[language]
+ if options[:tagger_model]
+ ::StanfordCoreNLP.set_model('pos.model',
+ options[:tagger_model])
+ end
+ options[:tag_set] =
+ StanfordCoreNLP::Config::TagSets[language]
+ options
+ end
+
+ # Retrieve a Java ArrayList object.
+ def self.get_token_list(entity)
+ list = StanfordCoreNLP::ArrayList.new
+ if entity.is_a?(Treat::Entities::Token)
+ tokens = [entity]
+ else
+ tokens = entity.tokens
+ end
+ tokens.each do |token|
+ list.add(StanfordCoreNLP::Word.new(token.to_s))
+ end
+ return tokens, list
+ end
+
+ end
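The new wrapper above takes a Treat entity: for an isolated token it returns the tag string directly, otherwise it tags the child tokens and returns 'S' or 'P'. The following usage sketch is illustrative only and is not part of the diff; the Word constructor call and the assumption that the Stanford JARs and POS models are already installed (see data/lib/treat/installer.rb) are editorial, not the package's documented API.

    require 'treat'

    # Illustrative sketch, not part of the diff. Assumes the Stanford
    # dependencies are installed and that entity constructors accept a
    # string value.
    word = Treat::Entities::Word.new('running')
    tag  = Treat::Lexicalizers::Taggers::Stanford.tag(word)
    # For an isolated token the method returns the tag itself, e.g. "VBG".
    puts tag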
data/lib/treat/linguistics.rb
@@ -0,0 +1,9 @@
+ module Treat::Linguistics
+
+ p = 'treat/linguistics/*.rb'
+
+ Dir[Treat.lib + p].each do |f|
+ require f
+ end
+
+ end
data/lib/treat/linguistics/categories.rb
@@ -0,0 +1,11 @@
+ module Treat::Linguistics
+
+ # A list of all possible word categories.
+ WordCategories = [
+ :adjective, :adverb, :noun, :verb, :interjection,
+ :clitic, :coverb, :conjunction, :determiner, :particle,
+ :preposition, :pronoun, :number, :symbol, :punctuation,
+ :complementizer
+ ]
+
+ end
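WordCategories is a plain array of symbols, so callers can test category membership directly. A minimal sketch, not part of the diff:

    require 'treat'

    # Membership checks against the constant defined above.
    Treat::Linguistics::WordCategories.include?(:noun)   #=> true
    Treat::Linguistics::WordCategories.include?(:gerund) #=> false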
data/lib/treat/linguistics/tags.rb
@@ -0,0 +1,422 @@
+ module Treat::Linguistics::Tags
+
+ ClawsC5 = 0
+ Brown = 1
+ Penn = 2
+ Negra = 3
+ PennChinese = 4
+ Simple = 5
+
+ PTBClauseTagDescription = [
+ ['S', 'Simple declarative clause'],
+ ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
+ ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
+ ['SINV', 'Inverted declarative sentence'],
+ ['SQ', 'Inverted yes/no question']
+ ]
+
+ PTBEscapeCharacters = {
+ '(' => '-LRB-',
+ ')' => '-RRB-',
+ '[' => '-LSB-',
+ ']' => '-RSB-',
+ '{' => '-LCB-',
+ '}' => '-RCB-'
+ }
+
+ AlignedPhraseTags =
+ [
+ 'Adjective phrase', ['', '', 'ADJP'],
+ 'Adverb phrase', ['', '', 'ADVP'],
+ 'Conjunction phrase', ['', '', 'CONJP'],
+ 'Fragment', ['', '', 'FRAG'],
+ 'Interjection', ['', '', 'INTJ'],
+ 'List marker', ['', '', 'LST'],
+ 'Not a phrase', ['', '', 'NAC'],
+ 'Noun phrase', ['', '', 'NP'],
+ 'Head of NP', ['', '', 'NX'],
+ 'Prepositional phrase', ['', '', 'PP'],
+ 'Parenthetical', ['', '', 'PRN'],
+ 'Particle', ['', '', 'PRT'],
+ 'Quantifier phrase', ['', '', 'QP'],
+ 'Reduced relative clause', ['', '', 'RRC'],
+ 'Unlike coordinated phrase', ['', '', 'UCP'],
+ 'Verb phrase', ['', '', 'VP'],
+ 'Wh adjective phrase', ['', '', 'WHADJP'],
+ 'Wh adverb phrase', ['', '', 'WHAVP'],
+ 'Wh noun phrase', ['', '', 'WHNP'],
+ 'Wh prepositional phrase', ['', '', 'WHPP'],
+ 'Unknown', ['', '', 'X'],
+ 'Phrase', ['', '', 'P'],
+ 'Sentence', ['', '', 'S'],
+ 'Phrase', ['', '', 'SBAR'] # Fix
+ ]
+
+ # A description of Enju categories.
+ EnjuCatDescription = [
+ ['ADJ', 'Adjective'],
+ ['ADV', 'Adverb'],
+ ['CONJ', 'Coordination conjunction'],
+ ['C', 'Complementizer'],
+ ['D', 'Determiner'],
+ ['N', 'Noun'],
+ ['P', 'Preposition'],
+ ['SC', 'Subordination conjunction'],
+ ['V', 'Verb'],
+ ['COOD', 'Part of coordination'],
+ ['PN', 'Punctuation'],
+ ['PRT', 'Particle'],
+ ['S', 'Sentence']
+ ]
+
+ # Maps Enju categories to Treat categories.
+ EnjuCatToCategory = {
+ 'ADJ' => :adjective,
+ 'ADV' => :adverb,
+ 'CONJ' => :conjunction,
+ 'COOD' => :conjunction,
+ 'C' => :complementizer,
+ 'D' => :determiner,
+ 'N' => :noun,
+ 'P' => :preposition,
+ 'PN' => :punctuation,
+ 'SC' => :conjunction,
+ 'V' => :verb,
+ 'PRT' => :particle
+ }
+
+ # Description of the xcat in the Enju output specification.
+ EnjuXCatDescription = [
+ ['COOD', 'Coordinated phrase/clause'],
+ ['IMP', 'Imperative sentence'],
+ ['INV', 'Subject-verb inversion'],
+ ['Q', 'Interrogative sentence with subject-verb inversion'],
+ ['REL', 'A relativizer included'],
+ ['FREL', 'A free relative included'],
+ ['TRACE', 'A trace included'],
+ ['WH', 'A wh-question word included']
+ ]
+
+ EnjuCatXcatToPTB = [
+ ['ADJP', '', 'ADJP'],
+ ['ADJP', 'REL', 'WHADJP'],
+ ['ADJP', 'FREL', 'WHADJP'],
+ ['ADJP', 'WH', 'WHADJP'],
+ ['ADVP', '', 'ADVP'],
+ ['ADVP', 'REL', 'WHADVP'],
+ ['ADVP', 'FREL', 'WHADVP'],
+ ['ADVP', 'WH', 'WHADVP'],
+ ['CONJP', '', 'CONJP'],
+ ['CP', '', 'SBAR'],
+ ['DP', '', 'NP'],
+ ['NP', '', 'NP'],
+ ['NX', 'NX', 'NAC'],
+ ['NP' 'REL' 'WHNP'],
+ ['NP' 'FREL' 'WHNP'],
+ ['NP' 'WH' 'WHNP'],
+ ['PP', '', 'PP'],
+ ['PP', 'REL', 'WHPP'],
+ ['PP', 'WH', 'WHPP'],
+ ['PRT', '', 'PRT'],
+ ['S', '', 'S'],
+ ['S', 'INV', 'SINV'],
+ ['S', 'Q', 'SQ'],
+ ['S', 'REL', 'SBAR'],
+ ['S', 'FREL', 'SBAR'],
+ ['S', 'WH', 'SBARQ'],
+ ['SCP', '', 'SBAR'],
+ ['VP', '', 'VP'],
+ ['VP', '', 'VP'],
+ ['', '', 'UK']
+ ]
+
+ # Aligned tags for the Claws C5, Brown and Penn tag sets.
+ # Adapted from Manning, Christopher and Schütze, Hinrich,
+ # 1999. Foundations of Statistical Natural Language
+ # Processing. MIT Press, p. 141-142;
+ # http://www.isocat.org/rest/dcs/376;
+ #
+ # JRS?
+
+
+ SimpleWordTagToCategory = {
+ 'C' => :complementizer,
+ 'PN' => :punctuation,
+ 'SC' => :conjunction
+ }
+
+ PunctuationToCategory = {
+ '.' => :period,
+ ',' => :comma,
+ ';' => :semicolon,
+ ':' => :colon,
+ '!' => :exclamation,
+ '?' => :interrogation,
+ '"' => :quote,
+ "'" => :quote,
+
+ '$' => :dollar,
+ '%' => :percent,
+ '#' => :hash,
+ '*' => :asterisk,
+ '&' => :ampersand,
+ '+' => :plus,
+ '-' => :dash,
+
+ '/' => :slash,
+ '\\' => :backslash,
+ '^' => :caret,
+ '_' => :underscore,
+ '`' => :tick,
+ '|' => :pipe,
+ '~' => :tilde,
+ '@' => :at,
+
+ '[' => :bracket,
+ ']' => :bracket,
+ '{' => :brace,
+ '}' => :brace,
+ '(' => :parenthesis,
+ ')' => :parenthesis,
+
+ '<' => :tag,
+ '>' => :tag
+ }
+
+ AlignedWordTags = [
+
+ 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
+ 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
+ 'Ajective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
+ 'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
+ 'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
+ 'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
+ 'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
+ 'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
+ 'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
+ 'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
+
+ 'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
+ 'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
+ 'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
+ 'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
+ 'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
+ 'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
+ 'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', 'ADV'],
+ 'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
+ 'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
+ 'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
+ 'Adverb, pronominal', ['', '', '', '', 'PROP', '', 'ADV'],
+
+ 'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
+ 'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
+ 'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
+ 'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
+ 'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],
+
+ 'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
+ 'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
+ 'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
+ 'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
+ 'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
+ 'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
+ 'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
+ 'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
+ 'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
+ 'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
+ 'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
+ 'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
+ 'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
+
+ 'Localizer', ['', '', '', '', 'LC'],
+
+ 'Measure word', ['', '', '', '', 'M'],
+
+ 'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
+ 'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
+ 'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
+ 'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
+ 'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
+ 'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
+ 'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', 'N'],
+ 'Noun, temporal', ['', '', '', '', 'NT', 'N'],
+ 'Noun, verbal', ['', '', '', '', 'NN', 'N'],
+
+ 'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
+ 'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
+ 'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
+ 'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
+ 'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
+ 'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
+ 'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
+ 'Pronoun, question, subject', ['PNQ', 'WPS', 'WPS', 'PWAV'], # Hack
+ 'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
+ 'Pronoun, existential there', ['EX0', 'EX', 'EX'],
+ 'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
+ 'Prounoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
+ 'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
+ 'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
+ 'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
+ 'Prounoun, substituting indefinite', ['', '', '', 'PIS'],
+ 'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
+ 'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
+ 'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
+ 'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],
+
+ 'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
+ 'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
+ 'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
+ 'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
+ 'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
+ 'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
+ 'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
+ 'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
+ 'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
+ 'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
+ 'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
+ 'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
+ 'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
+ 'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
+ 'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
+ 'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
+ 'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
+ 'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
+ 'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
+ 'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
+ 'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
+ 'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
+ 'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
+ 'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
+ 'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
+ 'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
+ 'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
+ 'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
+ 'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
+ 'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
+ 'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
+ 'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
+ 'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
+ 'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
+ 'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
+ 'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
+ 'Verb, modal, infinite', ['', '', '', '', 'VMINF', 'V'],
+ 'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],
+
+ 'Particle', ['', '', '', '', '', 'PRT'],
+ 'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
+ 'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
+ 'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
+ 'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
+ 'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],
+
+ 'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
+ 'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
+ 'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
+ 'Preposition, with aritcle', ['', '', '', 'APPART', '', 'P'],
+ 'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],
+
+ 'Possessive', ['POS', '$', 'POS'],
+
+ 'Postposition', ['', '', '', 'APPO'],
+
+ 'Circumposition, right', ['', '', '', 'APZR', ''],
+
+ 'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],
+
+ 'Onomatopoeia', ['', '', '', '', 'ON'],
+
+ 'Punctuation', ['', '', '', '', 'PU', 'PN'],
+ 'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
+
+ 'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
+ 'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
+ 'Punctuationm, comma', ['PUN', ',', ',', '$,'],
+ 'Punctuation, dash', ['PUN', '-', '-'],
+ 'Punctuation, dollar sign', ['PUN', '', '$'],
+ 'Punctuation, left bracket', ['PUL', '(', '(', '$('],
+ 'Punctuation, right bracket', ['PUR', ')', ')'],
+ 'Punctuation, quotation mark, left', ['PUQ', '', '``'],
+ 'Punctuation, quotation mark, right', ['PUQ', '', '"'],
+
+ 'Punctuation, left bracket', ['PUL', '(', 'PPL'],
+ 'Punctuation, right bracket', ['PUR', ')', 'PPR'],
+ 'Punctuation, left square bracket', ['PUL', '(', 'LSB'],
+ 'Punctuation, right square bracket', ['PUR', ')', 'RSB'],
+ 'Punctuation, left curly bracket', ['PUL', '(', 'LCB'],
+ 'Punctuation, right curly bracket', ['PUR', ')', 'RCB'],
+
+ 'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],
+
+ 'Symbol', ['', '', 'SYM', 'XY'],
+ 'Symbol, alphabetical', ['ZZ0', '', ''],
+ 'Symbol, list item', ['', '', 'LS'],
+
+ # Not sure about these tags from the Chinese PTB.
+ 'Aspect marker', ['', '', '', '', 'AS'], # ?
+ 'Ba-construction', ['', '', '', '', 'BA'], # ?
+ 'In relative', ['', '', '', '', 'DEC'], # ?
+ 'Associative', ['', '', '', '', 'DER'], # ?
+ 'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
+ 'For words ? ', ['', '', '', '', 'ETC'], # ?
+ 'In long bei-construct', ['', '', '', '', 'LB'], # ?
+ 'In short bei-construct', ['', '', '', '', 'SB'], # ?
+ 'Sentence-nal particle', ['', '', '', '', 'SB'], # ?
+ 'Particle, other', ['', '', '', '', 'MSP'], # ?
+ 'Before VP', ['', '', '', '', 'DEV'], # ?
+ 'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
+ 'Verb, ????', ['', '', '', '', 'VC'] # ?
+ ]
+
+ wttc = {
+
+ }
+ Treat::Linguistics::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
+
+ category = desc.gsub(',', ' ,').
+ split(' ')[0].downcase.intern
+
+ wttc[tags[ClawsC5]] ||= {}
+ wttc[tags[Brown]] ||= {}
+ wttc[tags[Penn]] ||= {}
+ wttc[tags[Negra]] ||= {}
+ wttc[tags[PennChinese]] ||= {}
+ wttc[tags[Simple]] ||= {}
+
+ wttc[tags[ClawsC5]][:claws_5] = category
+ wttc[tags[Brown]][:brown] = category
+ wttc[tags[Penn]][:penn] = category
+ wttc[tags[Negra]][:negra] = category if tags[Negra]
+ wttc[tags[PennChinese]][:penn_chinese] = category if tags[PennChinese]
+ wttc[tags[Simple]][:simple] = category if tags[Simple]
+
+ end
+ # A hash converting word tags to word categories.
+ WordTagToCategory = wttc
+
+ # A hash converting phrase tag to categories.
+ pttc = {}
+ Treat::Linguistics::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
+ category = desc.gsub(',', ' ,').gsub(' ', '_').downcase.intern
+ pttc[tags[Penn]] ||= {};
+ # Not yet for other tag sts.
+ #pttc[tags[0]][:claws_5] = category
+ #pttc[tags[1]][:brown] = category
+ pttc[tags[Penn]][:penn] = category
+ end
+
+ # A hash converting word tags to word categories.
+ PhraseTagToCategory = pttc
+
+ def self.describe(tag, tag_set)
+ if PhraseTagToCategory[tag] &&
+ PhraseTagToCategory[tag_set] &&
+ WordTagToCategory[tag] &&
+ WordTagToCategory[tag_set]
+ end
+ end
+
+ def self.convert(tag, from, to)
+
+ end
+
+ end
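The WordTagToCategory and PhraseTagToCategory hashes built at the bottom of this file map a tag, then a tag-set key, to a Treat category symbol. A minimal lookup sketch (not part of the diff), with expected values derived from the aligned tag tables above:

    require 'treat'

    Treat::Linguistics::Tags::WordTagToCategory['JJ'][:penn]      #=> :adjective
    Treat::Linguistics::Tags::WordTagToCategory['NN1'][:claws_5]  #=> :noun
    Treat::Linguistics::Tags::PhraseTagToCategory['VP'][:penn]    #=> :verb_phrase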