treat 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160) hide show
  1. data/LICENSE +4 -4
  2. data/TODO +21 -54
  3. data/lib/economist/half_cocked_basel.txt +16 -0
  4. data/lib/economist/hose_and_dry.doc +0 -0
  5. data/lib/economist/hungarys_troubles.abw +70 -0
  6. data/lib/economist/republican_nomination.pdf +0 -0
  7. data/lib/economist/saving_the_euro.odt +0 -0
  8. data/lib/economist/to_infinity_and_beyond.txt +15 -0
  9. data/lib/economist/zero_sum.html +91 -0
  10. data/lib/treat.rb +58 -72
  11. data/lib/treat/buildable.rb +59 -15
  12. data/lib/treat/categories.rb +26 -14
  13. data/lib/treat/category.rb +2 -2
  14. data/lib/treat/delegatable.rb +65 -48
  15. data/lib/treat/doable.rb +44 -0
  16. data/lib/treat/entities.rb +34 -14
  17. data/lib/treat/entities/collection.rb +2 -0
  18. data/lib/treat/entities/document.rb +3 -2
  19. data/lib/treat/entities/entity.rb +105 -90
  20. data/lib/treat/entities/phrases.rb +17 -0
  21. data/lib/treat/entities/tokens.rb +28 -13
  22. data/lib/treat/entities/zones.rb +20 -0
  23. data/lib/treat/extractors.rb +49 -11
  24. data/lib/treat/extractors/coreferences/stanford.rb +68 -0
  25. data/lib/treat/extractors/date/chronic.rb +32 -0
  26. data/lib/treat/extractors/date/ruby.rb +25 -0
  27. data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
  28. data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
  29. data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
  30. data/lib/treat/extractors/language/what_language.rb +49 -0
  31. data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
  32. data/lib/treat/extractors/roles/naive.rb +73 -0
  33. data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
  34. data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
  35. data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
  36. data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
  37. data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
  38. data/lib/treat/extractors/time/nickel.rb +30 -12
  39. data/lib/treat/extractors/topic_words/lda.rb +9 -9
  40. data/lib/treat/extractors/topics/reuters.rb +14 -15
  41. data/lib/treat/extractors/topics/reuters/region.xml +1 -0
  42. data/lib/treat/features.rb +7 -0
  43. data/lib/treat/formatters/readers/abw.rb +6 -1
  44. data/lib/treat/formatters/readers/autoselect.rb +5 -6
  45. data/lib/treat/formatters/readers/doc.rb +3 -1
  46. data/lib/treat/formatters/readers/html.rb +1 -1
  47. data/lib/treat/formatters/readers/image.rb +43 -0
  48. data/lib/treat/formatters/readers/odt.rb +1 -2
  49. data/lib/treat/formatters/readers/pdf.rb +9 -1
  50. data/lib/treat/formatters/readers/xml.rb +40 -0
  51. data/lib/treat/formatters/serializers/xml.rb +50 -14
  52. data/lib/treat/formatters/serializers/yaml.rb +7 -2
  53. data/lib/treat/formatters/unserializers/xml.rb +33 -7
  54. data/lib/treat/formatters/visualizers/dot.rb +90 -20
  55. data/lib/treat/formatters/visualizers/short_value.rb +2 -2
  56. data/lib/treat/formatters/visualizers/standoff.rb +2 -2
  57. data/lib/treat/formatters/visualizers/tree.rb +1 -1
  58. data/lib/treat/formatters/visualizers/txt.rb +13 -4
  59. data/lib/treat/group.rb +16 -10
  60. data/lib/treat/helpers/linguistics_loader.rb +18 -0
  61. data/lib/treat/inflectors.rb +10 -0
  62. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  63. data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
  64. data/lib/treat/inflectors/declensions/english.rb +319 -0
  65. data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
  66. data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
  67. data/lib/treat/install.rb +59 -0
  68. data/lib/treat/kernel.rb +18 -8
  69. data/lib/treat/languages.rb +18 -11
  70. data/lib/treat/languages/arabic.rb +4 -2
  71. data/lib/treat/languages/chinese.rb +6 -2
  72. data/lib/treat/languages/dutch.rb +16 -0
  73. data/lib/treat/languages/english.rb +47 -19
  74. data/lib/treat/languages/french.rb +8 -5
  75. data/lib/treat/languages/german.rb +9 -6
  76. data/lib/treat/languages/greek.rb +16 -0
  77. data/lib/treat/languages/italian.rb +6 -3
  78. data/lib/treat/languages/polish.rb +16 -0
  79. data/lib/treat/languages/portuguese.rb +16 -0
  80. data/lib/treat/languages/russian.rb +16 -0
  81. data/lib/treat/languages/spanish.rb +16 -0
  82. data/lib/treat/languages/swedish.rb +16 -0
  83. data/lib/treat/languages/tags.rb +377 -0
  84. data/lib/treat/lexicalizers.rb +34 -23
  85. data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
  86. data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
  87. data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
  88. data/lib/treat/lexicalizers/tag/brill.rb +35 -40
  89. data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
  90. data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
  91. data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
  92. data/lib/treat/processors.rb +8 -8
  93. data/lib/treat/processors/chunkers/txt.rb +4 -4
  94. data/lib/treat/processors/parsers/enju.rb +114 -99
  95. data/lib/treat/processors/parsers/stanford.rb +109 -41
  96. data/lib/treat/processors/segmenters/punkt.rb +17 -18
  97. data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
  98. data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
  99. data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
  100. data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
  101. data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
  102. data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
  103. data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
  104. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
  105. data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
  106. data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
  107. data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
  108. data/lib/treat/processors/segmenters/stanford.rb +38 -37
  109. data/lib/treat/processors/segmenters/tactful.rb +5 -4
  110. data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
  111. data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
  112. data/lib/treat/processors/tokenizers/perl.rb +2 -2
  113. data/lib/treat/processors/tokenizers/punkt.rb +6 -2
  114. data/lib/treat/processors/tokenizers/stanford.rb +25 -24
  115. data/lib/treat/processors/tokenizers/tactful.rb +1 -2
  116. data/lib/treat/proxies.rb +2 -35
  117. data/lib/treat/registrable.rb +17 -22
  118. data/lib/treat/sugar.rb +11 -11
  119. data/lib/treat/tree.rb +27 -17
  120. data/lib/treat/viewable.rb +29 -0
  121. data/lib/treat/visitable.rb +1 -1
  122. data/test/tc_entity.rb +56 -49
  123. data/test/tc_extractors.rb +41 -18
  124. data/test/tc_formatters.rb +7 -8
  125. data/test/tc_inflectors.rb +19 -24
  126. data/test/tc_lexicalizers.rb +12 -19
  127. data/test/tc_processors.rb +26 -12
  128. data/test/tc_resources.rb +2 -7
  129. data/test/tc_treat.rb +20 -22
  130. data/test/tc_tree.rb +4 -4
  131. data/test/tests.rb +3 -5
  132. data/test/texts.rb +13 -14
  133. data/tmp/INFO +1 -0
  134. metadata +78 -158
  135. data/bin/INFO +0 -1
  136. data/examples/benchmark.rb +0 -81
  137. data/examples/keywords.rb +0 -148
  138. data/lib/treat/detectors.rb +0 -31
  139. data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
  140. data/lib/treat/detectors/format/file.rb +0 -36
  141. data/lib/treat/detectors/language/what_language.rb +0 -29
  142. data/lib/treat/entities/constituents.rb +0 -15
  143. data/lib/treat/entities/sentence.rb +0 -8
  144. data/lib/treat/extractors/named_entity/abner.rb +0 -20
  145. data/lib/treat/extractors/named_entity/stanford.rb +0 -174
  146. data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
  147. data/lib/treat/extractors/time/chronic.rb +0 -20
  148. data/lib/treat/extractors/time/native.rb +0 -18
  149. data/lib/treat/formatters/readers/gocr.rb +0 -26
  150. data/lib/treat/formatters/readers/ocropus.rb +0 -31
  151. data/lib/treat/formatters/visualizers/html.rb +0 -13
  152. data/lib/treat/formatters/visualizers/inspect.rb +0 -20
  153. data/lib/treat/inflectors/declensions/en.rb +0 -18
  154. data/lib/treat/languages/categories.rb +0 -5
  155. data/lib/treat/languages/english/categories.rb +0 -23
  156. data/lib/treat/languages/english/tags.rb +0 -352
  157. data/lib/treat/languages/xinhua.rb +0 -12
  158. data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
  159. data/lib/treat/string.rb +0 -5
  160. data/test/tc_detectors.rb +0 -26
@@ -1,13 +1,16 @@
1
1
  module Treat
2
2
  module Languages
3
3
  class Italian
4
+ RequiredDependencies = []
5
+ OptionalDependencies = []
4
6
  Extractors = {}
5
7
  Inflectors = {}
6
8
  Lexicalizers = {}
7
9
  Processors = {
8
- chunkers: [:txt],
9
- segmenters: [:tactful, :punkt, :stanford],
10
- tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
10
+ :chunkers => [:txt],
11
+ :parsers => [:stanford],
12
+ :segmenters => [:punkt],
13
+ :tokenizers => [:tactful]
11
14
  }
12
15
  end
13
16
  end
@@ -0,0 +1,16 @@
1
+ module Treat
2
+ module Languages
3
+ class Polish
4
+ RequiredDependencies = []
5
+ OptionalDependencies = []
6
+ Processors = {
7
+ :chunkers => [:txt],
8
+ :segmenters => [:punkt],
9
+ :tokenizers => [:tactful]
10
+ }
11
+ Extractors = {}
12
+ Inflectors = {}
13
+ Lexicalizers = {}
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,16 @@
1
+ module Treat
2
+ module Languages
3
+ class Portuguese
4
+ RequiredDependencies = []
5
+ OptionalDependencies = []
6
+ Processors = {
7
+ :chunkers => [:txt],
8
+ :segmenters => [:punkt],
9
+ :tokenizers => [:tactful]
10
+ }
11
+ Extractors = {}
12
+ Inflectors = {}
13
+ Lexicalizers = {}
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,16 @@
1
+ module Treat
2
+ module Languages
3
+ class Russian
4
+ RequiredDependencies = []
5
+ OptionalDependencies = []
6
+ Processors = {
7
+ :chunkers => [:txt],
8
+ :segmenters => [:punkt],
9
+ :tokenizers => [:tactful]
10
+ }
11
+ Extractors = {}
12
+ Inflectors = {}
13
+ Lexicalizers = {}
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,16 @@
1
+ module Treat
2
+ module Languages
3
+ class Spanish
4
+ RequiredDependencies = []
5
+ OptionalDependencies = []
6
+ Processors = {
7
+ :chunkers => [:txt],
8
+ :segmenters => [:punkt],
9
+ :tokenizers => [:tactful]
10
+ }
11
+ Extractors = {}
12
+ Inflectors = {}
13
+ Lexicalizers = {}
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,16 @@
1
+ module Treat
2
+ module Languages
3
+ class Swedish
4
+ RequiredDependencies = []
5
+ OptionalDependencies = []
6
+ Processors = {
7
+ :chunkers => [:txt],
8
+ :segmenters => [:punkt],
9
+ :tokenizers => [:tactful]
10
+ }
11
+ Extractors = {}
12
+ Inflectors = {}
13
+ Lexicalizers = {}
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,377 @@
1
+ module Treat
2
+ module Languages
3
+
4
+ module Tags
5
+ ClawsC5 = 0
6
+ Brown = 1
7
+ Penn = 2
8
+ Negra = 3
9
+ PennChinese = 4
10
+ Simple = 5
11
+
12
+ PTBClauseTagDescription = [
13
+ ['S', 'Simple declarative clause'],
14
+ ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
15
+ ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
16
+ ['SINV', 'Inverted declarative sentence'],
17
+ ['SQ', 'Inverted yes/no question']
18
+ ]
19
+
20
# Phrase-level tags aligned across tag sets. The array is flat: a
# description string followed by its tag row, meant to be walked with
# each_slice(2). Rows are indexed by the tag-set constants; only the
# Penn column (index 2) is filled in at the moment.
AlignedPhraseTags = [
  'Adjective phrase', ['', '', 'ADJP'],
  'Adverb phrase', ['', '', 'ADVP'],
  'Conjunction phrase', ['', '', 'CONJP'],
  'Fragment', ['', '', 'FRAG'],
  'Interjection', ['', '', 'INTJ'],
  'List marker', ['', '', 'LST'],
  'Not a phrase', ['', '', 'NAC'],
  'Noun phrase', ['', '', 'NP'],
  'Head of NP', ['', '', 'NX'],
  'Prepositional phrase', ['', '', 'PP'],
  'Parenthetical', ['', '', 'PRN'],
  'Particle', ['', '', 'PRT'],
  'Quantifier phrase', ['', '', 'QP'],
  'Reduced relative clause', ['', '', 'RRC'],
  'Unlike coordinated phrase', ['', '', 'UCP'],
  'Verb phrase', ['', '', 'VP'],
  'Wh adjective phrase', ['', '', 'WHADJP'],
  # Penn spells this tag WHADVP (it was 'WHAVP' here, which no parser emits).
  'Wh adverb phrase', ['', '', 'WHADVP'],
  'Wh noun phrase', ['', '', 'WHNP'],
  'Wh prepositional phrase', ['', '', 'WHPP'],
  'Unknown', ['', '', 'X'],
  'Phrase', ['', '', 'P'],
  'Sentence', ['', '', 'S'],
  'Phrase', ['', '', 'SBAR'] # Fix
]
47
+
48
+ # A description of Enju categories.
49
+ EnjuCatDescription = [
50
+ ['ADJ', 'Adjective'],
51
+ ['ADV', 'Adverb'],
52
+ ['CONJ', 'Coordination conjunction'],
53
+ ['C', 'Complementizer'],
54
+ ['D', 'Determiner'],
55
+ ['N', 'Noun'],
56
+ ['P', 'Preposition'],
57
+ ['SC', 'Subordination conjunction'],
58
+ ['V', 'Verb'],
59
+ ['COOD', 'Part of coordination'],
60
+ ['PN', 'Punctuation'],
61
+ ['PRT', 'Particle'],
62
+ ['S', 'Sentence']
63
+ ]
64
+
65
+ # Maps Enju categories to Treat categories.
66
+ EnjuCatToCategory = {
67
+ 'ADJ' => :adjective,
68
+ 'ADV' => :adverb,
69
+ 'CONJ' => :conjunction,
70
+ 'COOD' => :conjunction,
71
+ 'C' => :complementizer,
72
+ 'D' => :determiner,
73
+ 'N' => :noun,
74
+ 'P' => :preposition,
75
+ 'PN' => :punctuation,
76
+ 'SC' => :conjunction,
77
+ 'V' => :verb,
78
+ 'PRT' => :particle
79
+ }
80
+
81
+ # Description of the xcat in the Enju output specification.
82
+ EnjuXCatDescription = [
83
+ ['COOD', 'Coordinated phrase/clause'],
84
+ ['IMP', 'Imperative sentence'],
85
+ ['INV', 'Subject-verb inversion'],
86
+ ['Q', 'Interrogative sentence with subject-verb inversion'],
87
+ ['REL', 'A relativizer included'],
88
+ ['FREL', 'A free relative included'],
89
+ ['TRACE', 'A trace included'],
90
+ ['WH', 'A wh-question word included']
91
+ ]
92
+
93
# Conversion table from Enju output to Penn Treebank phrase tags.
# Each row is [enju_cat, enju_xcat, ptb_tag]; an empty xcat means the
# category carries no extra marker.
EnjuCatXcatToPTB = [
  ['ADJP', '', 'ADJP'],
  ['ADJP', 'REL', 'WHADJP'],
  ['ADJP', 'FREL', 'WHADJP'],
  ['ADJP', 'WH', 'WHADJP'],
  ['ADVP', '', 'ADVP'],
  ['ADVP', 'REL', 'WHADVP'],
  ['ADVP', 'FREL', 'WHADVP'],
  ['ADVP', 'WH', 'WHADVP'],
  ['CONJP', '', 'CONJP'],
  ['CP', '', 'SBAR'],
  ['DP', '', 'NP'],
  ['NP', '', 'NP'],
  ['NX', 'NX', 'NAC'],
  # These three rows were missing their commas, so the adjacent string
  # literals were concatenated into a single element ('NPRELWHNP', ...).
  ['NP', 'REL', 'WHNP'],
  ['NP', 'FREL', 'WHNP'],
  ['NP', 'WH', 'WHNP'],
  ['PP', '', 'PP'],
  ['PP', 'REL', 'WHPP'],
  ['PP', 'WH', 'WHPP'],
  ['PRT', '', 'PRT'],
  ['S', '', 'S'],
  ['S', 'INV', 'SINV'],
  ['S', 'Q', 'SQ'],
  ['S', 'REL', 'SBAR'],
  ['S', 'FREL', 'SBAR'],
  ['S', 'WH', 'SBARQ'],
  ['SCP', '', 'SBAR'],
  ['VP', '', 'VP'],
  ['VP', '', 'VP'], # duplicate of the row above, kept so the row count is unchanged
  ['', '', 'UK']
]
125
+
126
+ # Aligned tags for the Claws C5, Brown and Penn tag sets.
127
+ # Adapted from Manning, Christopher and Schütze, Hinrich,
128
+ # 1999. Foundations of Statistical Natural Language
129
+ # Processing. MIT Press, p. 141-142;
130
+ # http://www.isocat.org/rest/dcs/376;
131
+ #
132
+ # JRS?
133
+
134
+
135
+ SimpleWordTagToCategory = {
136
+ 'C' => :complementizer,
137
+ 'PN' => :punctuation,
138
+ 'SC' => :conjunction
139
+ }
140
+
141
# Word-level tags aligned across tag sets. The array is flat: a description
# string followed by its tag row, meant to be walked with each_slice(2).
# A row is indexed by the tag-set constants (ClawsC5, Brown, Penn, Negra,
# PennChinese, Simple); '' or a missing trailing column means the tag set
# has no tag for that entry.
#
# The first word of the description becomes the word category, so its
# spelling matters; later rows override earlier ones for the same tag,
# so the row order matters as well.
AlignedWordTags = [

  'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
  'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
  'Adjective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
  'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
  'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
  'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
  'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
  'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
  'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
  'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],

  'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
  'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
  'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
  'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
  'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
  'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
  # Was a five-column row, which put 'ADV' in the PennChinese column.
  'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', '', 'ADV'],
  'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
  'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
  'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
  # Was a seven-column row; presumably one leading '' too many, since the
  # row above lists PROP in the Negra column -- TODO confirm.
  'Adverb, pronominal', ['', '', '', 'PROP', '', 'ADV'],

  'Clitic', ['', '', 'POS', '', '', ''],

  'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
  'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
  'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
  'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
  'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],

  'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
  'Determiner', ['DT0', 'DT', 'DET', '', 'DT', 'D'],
  'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
  'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
  'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
  'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
  'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
  'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
  'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
  'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
  'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
  'Determiner, possessive, second', ['DPS', 'PPSS', 'PRPS', '', '', 'D'],
  'Determiner, possessive, second', ['DPS', 'PP$$', 'PRP', '', '', 'D'],
  'Determiner, possessive, second', ['DPS', 'PPSS', 'PRP', '', '', 'D'],
  'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
  'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
  'Determiner, possessive & question', ['DTQ', 'WPS', 'WPS', '', '', 'D'],

  'Localizer', ['', '', '', '', 'LC'],

  'Measure word', ['', '', '', '', 'M'],

  'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
  'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
  'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
  'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
  'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
  'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
  # Was a five-column row, which put 'N' in the PennChinese column.
  'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', '', 'N'],
  'Noun, temporal', ['', '', '', '', 'NT', 'N'],
  'Noun, verbal', ['', '', '', '', 'NN', 'N'],

  'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
  'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
  'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
  'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
  'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
  'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
  'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
  'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
  'Pronoun, existential there', ['EX0', 'EX', 'EX'],
  'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
  'Pronoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
  'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
  'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
  'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
  'Pronoun, substituting indefinite', ['', '', '', 'PIS'],
  'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
  'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
  'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
  'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],

  'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
  'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
  'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
  'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
  'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
  'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
  'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
  'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
  'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
  'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
  'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
  'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
  'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
  'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
  'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
  'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
  'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
  'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
  'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
  'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
  'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
  'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
  'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
  'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
  'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
  'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
  'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
  'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
  'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
  'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
  'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
  'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
  'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
  'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
  'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
  'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
  'Verb, modal, infinite', ['', '', '', '', 'VMINF', 'V'],
  'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],

  'Particle', ['', '', '', '', '', 'PRT'],
  'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
  'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
  'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
  'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
  'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],

  'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
  'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
  'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
  'Preposition, with aritcle', ['', '', '', 'APPART', '', 'P'],
  'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],

  'Possessive', ['POS', '$', 'POS'],

  'Postposition', ['', '', '', 'APPO'],

  'Circumposition, right', ['', '', '', 'APZR', ''],

  'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],

  'Onomatopoeia', ['', '', '', '', 'ON'],

  'Punctuation', ['', '', '', '', 'PU', 'PN'],
  'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
  'Punctuation, sentence ender', ['PUN', '.', 'PP', '$.', '', 'PN'],
  'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
  'Punctuation, colon or ellipsis', ['PUN', ':', ':'],
  'Punctuation, comma', ['PUN', ',', ',', '$,'],
  'Punctuation, dash', ['PUN', '-', '-'],
  'Punctuation, dollar sign', ['PUN', '', '$'],
  'Punctuation, left bracket', ['PUL', '(', '(', '$('],
  'Punctuation, right bracket', ['PUR', ')', ')'],
  'Punctuation, quotation mark, left', ['PUQ', '', '``'],
  'Punctuation, quotation mark, right', ['PUQ', '', '"'],

  'Word, truncated, left', ['', '', '', 'TRUNC'],

  'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],

  'Symbol', ['', '', 'SYM', 'XY'],
  'Symbol, alphabetical', ['ZZ0', '', ''],
  'Symbol, list item', ['', '', 'LS'],

  # Not sure about these tags from the Chinese PTB.
  'Aspect marker', ['', '', '', '', 'AS'], # ?
  'Ba-construction', ['', '', '', '', 'BA'], # ?
  'In relative', ['', '', '', '', 'DEC'], # ?
  'Associative', ['', '', '', '', 'DER'], # ?
  'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
  'For words ? ', ['', '', '', '', 'ETC'], # ?
  'In long bei-construct', ['', '', '', '', 'LB'], # ?
  'In short bei-construct', ['', '', '', '', 'SB'], # ?
  'Sentence-nal particle', ['', '', '', '', 'SB'], # ?
  'Particle, other', ['', '', '', '', 'MSP'], # ?
  'Before VP', ['', '', '', '', 'DEV'], # ?
  'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
  'Verb, ????', ['', '', '', '', 'VC'] # ?
]
324
+
325
# Build a lookup of the form { tag => { tag_set => category } } from the
# aligned word tags. The category is the first word of the description,
# lower-cased (e.g. 'Adjective, comparative' gives :adjective).
wttc = {}

# Column index in a tag row, paired with the key used for that tag set.
word_tag_sets = [
  [ClawsC5, :claws_5],
  [Brown, :brown],
  [Penn, :penn],
  [Negra, :negra],
  [PennChinese, :penn_chinese],
  [Simple, :simple]
]

Treat::Languages::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
  category = desc.gsub(',', ' ,').split(' ')[0].downcase.intern

  word_tag_sets.each do |index, tag_set|
    tag = tags[index]
    # Rows can be shorter than six columns (nil) or leave a column
    # blank (''); neither is a real tag, so it is not registered as a key.
    next if tag.nil? || tag.empty?
    (wttc[tag] ||= {})[tag_set] = category
  end
end

# A hash converting word tags to word categories.
WordTagToCategory = wttc
349
+
350
# Build a lookup of the form { phrase_tag => { tag_set => category } }
# from the aligned phrase tags. The category is the whole description,
# lower-cased with spaces turned into underscores
# (e.g. 'Noun phrase' gives :noun_phrase).
pttc = {}
Treat::Languages::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
  category = desc.gsub(',', ' ,').gsub(' ', '_').downcase.intern
  penn_tag = tags[Penn]
  # Not yet for other tag sets.
  #pttc[tags[0]][:claws_5] = category
  #pttc[tags[1]][:brown] = category
  (pttc[penn_tag] ||= {})[:penn] = category
end

# A hash converting phrase tags to phrase categories.
PhraseTagToCategory = pttc
363
+
364
# Tell whether +tag+ is a known phrase tag in the given tag set.
#
# PhraseTagToCategory is keyed by tag first and by tag set second, so
# the tag set has to be looked up inside the entry for the tag (the
# previous version looked it up in the outer hash and never matched).
#
# tag     - the phrase tag, e.g. 'NP'.
# tag_set - the tag set key, e.g. :penn.
#
# Returns true or false.
def self.has_phrase_tag?(tag, tag_set)
  by_tag_set = PhraseTagToCategory[tag]
  !by_tag_set.nil? && by_tag_set.key?(tag_set)
end
368
+
369
# Tell whether +tag+ is a known word tag in the given tag set.
#
# WordTagToCategory is keyed by tag first and by tag set second, so
# the tag set has to be looked up inside the entry for the tag (the
# previous version looked it up in the outer hash and never matched).
#
# tag     - the word tag, e.g. 'NNS'.
# tag_set - the tag set key, e.g. :penn or :brown.
#
# Returns true or false.
def self.has_word_tag?(tag, tag_set)
  by_tag_set = WordTagToCategory[tag]
  !by_tag_set.nil? && by_tag_set.key?(tag_set)
end
373
+
374
+
375
+ end
376
+ end
377
+ end