treat 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160)
  1. data/LICENSE +4 -4
  2. data/TODO +21 -54
  3. data/lib/economist/half_cocked_basel.txt +16 -0
  4. data/lib/economist/hose_and_dry.doc +0 -0
  5. data/lib/economist/hungarys_troubles.abw +70 -0
  6. data/lib/economist/republican_nomination.pdf +0 -0
  7. data/lib/economist/saving_the_euro.odt +0 -0
  8. data/lib/economist/to_infinity_and_beyond.txt +15 -0
  9. data/lib/economist/zero_sum.html +91 -0
  10. data/lib/treat.rb +58 -72
  11. data/lib/treat/buildable.rb +59 -15
  12. data/lib/treat/categories.rb +26 -14
  13. data/lib/treat/category.rb +2 -2
  14. data/lib/treat/delegatable.rb +65 -48
  15. data/lib/treat/doable.rb +44 -0
  16. data/lib/treat/entities.rb +34 -14
  17. data/lib/treat/entities/collection.rb +2 -0
  18. data/lib/treat/entities/document.rb +3 -2
  19. data/lib/treat/entities/entity.rb +105 -90
  20. data/lib/treat/entities/phrases.rb +17 -0
  21. data/lib/treat/entities/tokens.rb +28 -13
  22. data/lib/treat/entities/zones.rb +20 -0
  23. data/lib/treat/extractors.rb +49 -11
  24. data/lib/treat/extractors/coreferences/stanford.rb +68 -0
  25. data/lib/treat/extractors/date/chronic.rb +32 -0
  26. data/lib/treat/extractors/date/ruby.rb +25 -0
  27. data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
  28. data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
  29. data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
  30. data/lib/treat/extractors/language/what_language.rb +49 -0
  31. data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
  32. data/lib/treat/extractors/roles/naive.rb +73 -0
  33. data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
  34. data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
  35. data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
  36. data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
  37. data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
  38. data/lib/treat/extractors/time/nickel.rb +30 -12
  39. data/lib/treat/extractors/topic_words/lda.rb +9 -9
  40. data/lib/treat/extractors/topics/reuters.rb +14 -15
  41. data/lib/treat/extractors/topics/reuters/region.xml +1 -0
  42. data/lib/treat/features.rb +7 -0
  43. data/lib/treat/formatters/readers/abw.rb +6 -1
  44. data/lib/treat/formatters/readers/autoselect.rb +5 -6
  45. data/lib/treat/formatters/readers/doc.rb +3 -1
  46. data/lib/treat/formatters/readers/html.rb +1 -1
  47. data/lib/treat/formatters/readers/image.rb +43 -0
  48. data/lib/treat/formatters/readers/odt.rb +1 -2
  49. data/lib/treat/formatters/readers/pdf.rb +9 -1
  50. data/lib/treat/formatters/readers/xml.rb +40 -0
  51. data/lib/treat/formatters/serializers/xml.rb +50 -14
  52. data/lib/treat/formatters/serializers/yaml.rb +7 -2
  53. data/lib/treat/formatters/unserializers/xml.rb +33 -7
  54. data/lib/treat/formatters/visualizers/dot.rb +90 -20
  55. data/lib/treat/formatters/visualizers/short_value.rb +2 -2
  56. data/lib/treat/formatters/visualizers/standoff.rb +2 -2
  57. data/lib/treat/formatters/visualizers/tree.rb +1 -1
  58. data/lib/treat/formatters/visualizers/txt.rb +13 -4
  59. data/lib/treat/group.rb +16 -10
  60. data/lib/treat/helpers/linguistics_loader.rb +18 -0
  61. data/lib/treat/inflectors.rb +10 -0
  62. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  63. data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
  64. data/lib/treat/inflectors/declensions/english.rb +319 -0
  65. data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
  66. data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
  67. data/lib/treat/install.rb +59 -0
  68. data/lib/treat/kernel.rb +18 -8
  69. data/lib/treat/languages.rb +18 -11
  70. data/lib/treat/languages/arabic.rb +4 -2
  71. data/lib/treat/languages/chinese.rb +6 -2
  72. data/lib/treat/languages/dutch.rb +16 -0
  73. data/lib/treat/languages/english.rb +47 -19
  74. data/lib/treat/languages/french.rb +8 -5
  75. data/lib/treat/languages/german.rb +9 -6
  76. data/lib/treat/languages/greek.rb +16 -0
  77. data/lib/treat/languages/italian.rb +6 -3
  78. data/lib/treat/languages/polish.rb +16 -0
  79. data/lib/treat/languages/portuguese.rb +16 -0
  80. data/lib/treat/languages/russian.rb +16 -0
  81. data/lib/treat/languages/spanish.rb +16 -0
  82. data/lib/treat/languages/swedish.rb +16 -0
  83. data/lib/treat/languages/tags.rb +377 -0
  84. data/lib/treat/lexicalizers.rb +34 -23
  85. data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
  86. data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
  87. data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
  88. data/lib/treat/lexicalizers/tag/brill.rb +35 -40
  89. data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
  90. data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
  91. data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
  92. data/lib/treat/processors.rb +8 -8
  93. data/lib/treat/processors/chunkers/txt.rb +4 -4
  94. data/lib/treat/processors/parsers/enju.rb +114 -99
  95. data/lib/treat/processors/parsers/stanford.rb +109 -41
  96. data/lib/treat/processors/segmenters/punkt.rb +17 -18
  97. data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
  98. data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
  99. data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
  100. data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
  101. data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
  102. data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
  103. data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
  104. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
  105. data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
  106. data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
  107. data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
  108. data/lib/treat/processors/segmenters/stanford.rb +38 -37
  109. data/lib/treat/processors/segmenters/tactful.rb +5 -4
  110. data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
  111. data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
  112. data/lib/treat/processors/tokenizers/perl.rb +2 -2
  113. data/lib/treat/processors/tokenizers/punkt.rb +6 -2
  114. data/lib/treat/processors/tokenizers/stanford.rb +25 -24
  115. data/lib/treat/processors/tokenizers/tactful.rb +1 -2
  116. data/lib/treat/proxies.rb +2 -35
  117. data/lib/treat/registrable.rb +17 -22
  118. data/lib/treat/sugar.rb +11 -11
  119. data/lib/treat/tree.rb +27 -17
  120. data/lib/treat/viewable.rb +29 -0
  121. data/lib/treat/visitable.rb +1 -1
  122. data/test/tc_entity.rb +56 -49
  123. data/test/tc_extractors.rb +41 -18
  124. data/test/tc_formatters.rb +7 -8
  125. data/test/tc_inflectors.rb +19 -24
  126. data/test/tc_lexicalizers.rb +12 -19
  127. data/test/tc_processors.rb +26 -12
  128. data/test/tc_resources.rb +2 -7
  129. data/test/tc_treat.rb +20 -22
  130. data/test/tc_tree.rb +4 -4
  131. data/test/tests.rb +3 -5
  132. data/test/texts.rb +13 -14
  133. data/tmp/INFO +1 -0
  134. metadata +78 -158
  135. data/bin/INFO +0 -1
  136. data/examples/benchmark.rb +0 -81
  137. data/examples/keywords.rb +0 -148
  138. data/lib/treat/detectors.rb +0 -31
  139. data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
  140. data/lib/treat/detectors/format/file.rb +0 -36
  141. data/lib/treat/detectors/language/what_language.rb +0 -29
  142. data/lib/treat/entities/constituents.rb +0 -15
  143. data/lib/treat/entities/sentence.rb +0 -8
  144. data/lib/treat/extractors/named_entity/abner.rb +0 -20
  145. data/lib/treat/extractors/named_entity/stanford.rb +0 -174
  146. data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
  147. data/lib/treat/extractors/time/chronic.rb +0 -20
  148. data/lib/treat/extractors/time/native.rb +0 -18
  149. data/lib/treat/formatters/readers/gocr.rb +0 -26
  150. data/lib/treat/formatters/readers/ocropus.rb +0 -31
  151. data/lib/treat/formatters/visualizers/html.rb +0 -13
  152. data/lib/treat/formatters/visualizers/inspect.rb +0 -20
  153. data/lib/treat/inflectors/declensions/en.rb +0 -18
  154. data/lib/treat/languages/categories.rb +0 -5
  155. data/lib/treat/languages/english/categories.rb +0 -23
  156. data/lib/treat/languages/english/tags.rb +0 -352
  157. data/lib/treat/languages/xinhua.rb +0 -12
  158. data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
  159. data/lib/treat/string.rb +0 -5
  160. data/test/tc_detectors.rb +0 -26
@@ -1,15 +0,0 @@
1
- module Treat
2
- module Extractors
3
- module Statistics
4
- class FrequencyOf
5
- # Find the frequency of a given string value.
6
- def self.statistics(entity, options = {})
7
- w = options[:value]
8
- raise Treat::Exception, "Must supply a non-nil value." unless w
9
- entity.token_registry[:value][w].nil? ? 0 :
10
- entity.token_registry[:value][w].size
11
- end
12
- end
13
- end
14
- end
15
- end
@@ -1,20 +0,0 @@
1
- module Treat
2
- module Extractors
3
- module Time
4
- # A wrapper for the 'chronic' gem, which parses
5
- # time and date information.
6
- #
7
- # Project website: http://chronic.rubyforge.org/
8
- class Chronic
9
- silence_warnings { require 'chronic' }
10
- # Return the time information contained within the entity
11
- # by parsing it with the 'chronic' gem.
12
- #
13
- # Options: none.
14
- def self.time(entity, options = {})
15
- silence_warnings { ::Chronic.parse(entity.to_s, {:guess => true}) }
16
- end
17
- end
18
- end
19
- end
20
- end
@@ -1,18 +0,0 @@
1
- module Treat
2
- module Extractors
3
- module Time
4
- # A wrapper for Ruby's native date/time parsing.
5
- module Native
6
- require 'date'
7
- # Return a DateTime object representing the date/time
8
- # contained within the entity, using Ruby's native
9
- # date/time parser.
10
- #
11
- # Options: none.
12
- def self.time(entity, options = {})
13
- ::DateTime.parse(entity.to_s)
14
- end
15
- end
16
- end
17
- end
18
- end
@@ -1,26 +0,0 @@
1
- module Treat
2
- module Formatters
3
- module Readers
4
- # A wrapper class for the GOCR engine.
5
- #
6
- # "GOCR is an OCR (Optical Character Recognition)
7
- # program, developed under the GNU Public License.
8
- # It converts scanned images of text back to text files."
9
- #
10
- # Project site: http://jocr.sourceforge.net
11
- class GOCR
12
- # Read a file using the GOCR reader.
13
- #
14
- # Options: none.
15
- def self.read(document, options = {})
16
- create_temp_file(:pgm) do |tmp|
17
- `convert #{document.file} #{tmp}`
18
- f = `gocr #{tmp}`.strip
19
- document << Treat::Entities::Entity.from_string(f)
20
- end
21
- document
22
- end
23
- end
24
- end
25
- end
26
- end
@@ -1,31 +0,0 @@
1
- module Treat
2
- module Formatters
3
- module Readers
4
- # This class is a wrapper for the Google Ocropus
5
- # optical character recognition (OCR) engine.
6
- #
7
- # "OCRopus(tm) is a state-of-the-art document
8
- # analysis and OCR system, featuring pluggable
9
- # layout analysis, pluggable character recognition,
10
- # statistical natural language modeling, and multi-
11
- # lingual capabilities."
12
- #
13
- # Original paper:
14
- # Breuel, Thomas M. The Ocropus Open Source OCR System.
15
- # DFKI and U. Kaiserslautern, Germany.
16
- class Ocropus
17
- # Read a file using the Google Ocropus reader.
18
- #
19
- # Options: none.
20
- def self.read(document, options = {})
21
- create_temp_file(:txt) do |tmp|
22
- `ocropus page #{document.file} > #{tmp} -STDIO 2>/dev/null`
23
- f = File.read(tmp)
24
- document << Treat::Entities::Entity.from_string(f)
25
- end
26
- document
27
- end
28
- end
29
- end
30
- end
31
- end
@@ -1,13 +0,0 @@
1
- module Treat
2
- module Formatters
3
- module Visualizers
4
- # This class is not implemented yet.
5
- class HTML
6
- # Not implemented yet.
7
- def self.visualize(entity, options = {})
8
- raise 'Not implemented yet.'
9
- end
10
- end
11
- end
12
- end
13
- end
@@ -1,20 +0,0 @@
1
- module Treat
2
- module Formatters
3
- module Visualizers
4
- # Handles the call to inspect.
5
- class Inspect
6
- # Return a terminal-friendly visualization of an entity.
7
- #
8
- # Options: none.
9
- def self.visualize(entity, options = {})
10
- s = "#{entity.class.to_s.split('::')[-1]} (#{entity.id.to_s})"
11
- unless caller_method == :inspect
12
- s += " | #{entity.short_value.inspect} | #{entity.features.inspect}" +
13
- " | #{entity.edges.inspect}"
14
- end
15
- s
16
- end
17
- end
18
- end
19
- end
20
- end
@@ -1,18 +0,0 @@
1
- silence_warnings { require 'english' }
2
-
3
- module Treat
4
- module Inflectors
5
- module Declensions
6
- module En
7
- def self.declense(entity, options)
8
- string = entity.to_s
9
- if options[:count] == :plural
10
- ::English.plural(string)
11
- elsif options[:count] == :singular
12
- ::English.singular(string)
13
- end
14
- end
15
- end
16
- end
17
- end
18
- end
@@ -1,5 +0,0 @@
1
- module Treat
2
- module Languages
3
-
4
- end
5
- end
@@ -1,23 +0,0 @@
1
- module Treat
2
- module Languages
3
- class English
4
- # A list of all possible word categories.
5
- Categories = [
6
- :adjective, :adverb, :noun, :verb, :interjection,
7
- :clitic, :coverb, :conjunction, :determiner, :particle,
8
- :preposition, :pronoun, :number, :symbol, :punctuation,
9
- :complementizer
10
- ]
11
- wttc = {}
12
- Treat::Languages::English::AlignedWordTags.each_slice(2) do |desc, tags|
13
- category = desc.gsub(',', ' ,').split(' ')[0].downcase.intern
14
- wttc[tags[0]] ||= {}; wttc[tags[1]] ||= {} ;wttc[tags[2]] ||= {}
15
- wttc[tags[0]][:claws_5] = category
16
- wttc[tags[1]][:brown] = category
17
- wttc[tags[2]][:penn] = category
18
- end
19
- # A hash converting word tags to word categories.
20
- WordTagToCategory = wttc
21
- end
22
- end
23
- end
@@ -1,352 +0,0 @@
1
- module Treat
2
- module Languages
3
- class English
4
-
5
- ClawsC5 = 0
6
- Brown = 1
7
- Penn = 2
8
-
9
- PTBClauseTagDescription = [
10
- ['S', 'Simple declarative clause'],
11
- ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
12
- ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
13
- ['SINV', 'Inverted declarative sentence'],
14
- ['SQ', 'Inverted yes/no question']
15
- ]
16
-
17
- PTBPhraseTagDescription = [
18
- ['ADJP', 'Adjective phrase'],
19
- ['ADVP', 'Adverb phrase'],
20
- ['CONJP', 'Conjunction phrase'],
21
- ['FRAG', 'Fragment'],
22
- ['INTJ', 'Interjection'],
23
- ['LST', 'List marker'],
24
- ['NAC', 'Not a constituent'],
25
- ['NP', 'Noun phrase'],
26
- ['NX', 'Head of an NP'],
27
- ['PP', 'Prepositional phrase'],
28
- ['PRN', 'Parenthetical'],
29
- ['PRT', 'Particle'],
30
- ['QP', 'Quantifier phrase'],
31
- ['RRC', 'Reduced relative clause'],
32
- ['UCP', 'Unlike coordinated phrase'],
33
- ['VP', 'Verb phrase'],
34
- ['WHADJP', 'Wh-adjective phrase'],
35
- ['WHAVP', 'Wh-adverb phrase'],
36
- ['WHNP', 'Wh-noun phrase'],
37
- ['WHPP', 'Wh-prepositional phrase'],
38
- ['X', 'Unknown, uncertain, or unbracketable']
39
- ]
40
-
41
- PTBWordTagDescription = [
42
- ['CC', 'Coordinating conjunction'],
43
- ['CD', 'Cardinal number'],
44
- ['DT', 'Determiner'],
45
- ['EX', 'Existential there'],
46
- ['FW', 'Foreign word'],
47
- ['IN', 'Preposition or subordinating conjunction'],
48
- ['JJ', 'Adjective'],
49
- ['JJR', 'Adjective, comparative'],
50
- ['JJS', 'Adjective, superlative'],
51
- ['LS', 'List item marker'],
52
- ['MD', 'Modal'],
53
- ['NN', 'Noun, singular or mass'],
54
- ['NNS', 'Noun, plural'],
55
- ['NNP', 'Proper noun, singular'],
56
- ['NNPS', 'Proper noun, plural'],
57
- ['PDT', 'Predeterminer'],
58
- ['POS', 'Possessive ending'],
59
- ['PRP', 'Personal pronoun'],
60
- ['PRP$', 'Possessive pronoun (prolog version PRP-S)'],
61
- ['RB', 'Adverb'],
62
- ['RBR', 'Adverb, comparative'],
63
- ['RBS', 'Adverb, superlative'],
64
- ['RP', 'Particle'],
65
- ['SYM', 'Symbol'],
66
- ['TO', 'to'],
67
- ['UH', 'Interjection'],
68
- ['VB', 'Verb, base form'],
69
- ['VBD', 'Verb, past tense'],
70
- ['VBG', 'Verb, gerund or present participle'],
71
- ['VBN', 'Verb, past participle'],
72
- ['VBP', 'Verb, non 3rd person singular present'],
73
- ['VBZ', 'Verb, 3rd person singular present'],
74
- ['WDT', 'Wh-determiner'],
75
- ['WP', 'Wh-pronoun'],
76
- ['WP$', 'Possessive wh-pronoun (prolog version WP-S)'],
77
- ['WRB', 'Wh-adverb']
78
- ]
79
-
80
- BrownWordTagDescription = [
81
-
82
- ['.', 'sentence closer . ; ? !'],
83
- ['(', 'left parent'] ,
84
- [')', 'right parent'],
85
- ['*', 'not'],
86
- ['--', 'dash'],
87
- [',', 'comma'],
88
- [':', 'colon'],
89
- ['ABL', 'pre-qualifier quite, rather'],
90
- ['ABN', 'pre-quantifier half, all'],
91
- ['ABX', 'pre-quantifier both'],
92
- ['AP', 'post-determiner many, several, next'],
93
- ['AT', 'article a, the, no'],
94
- ['BE', 'be '],
95
- ['BED', 'were '],
96
- ['BEDZ', 'was '],
97
- ['BEG', 'being '],
98
- ['BEM', 'am '],
99
- ['BEN', 'been '],
100
- ['BER', 'are, art '],
101
- ['BEZ', 'is '],
102
- ['CC', 'coordinating conjunction and, or'],
103
- ['CD', 'cardinal numeral one, two, 2, etc.'],
104
- ['CS', 'subordinating conjunction if, although'],
105
- ['DO', 'do '],
106
- ['DOD', 'did '],
107
- ['DOZ', 'does '],
108
- ['DT', 'singular determiner this, that'],
109
- ['DTI', 'singular or plural determiner/quantifier some, any'],
110
- ['DTS', 'plural determiner these, those'],
111
- ['DTX', 'determiner/double conjunction either'],
112
- ['EX', 'existentil there '],
113
- ['FW', 'foreign word (hyphenated before regular tag) '],
114
- ['HL', 'word occurring in headline (hyphenated after regular tag) '],
115
- ['HV', 'have '],
116
- ['HVD', 'had (past tense) '],
117
- ['HVG', 'having '],
118
- ['HVN', 'had (past participle) '],
119
- ['HVZ', 'has '],
120
- ['IN', 'preposition '],
121
- ['JJ', 'adjective '],
122
- ['JJR', 'comparative adjective '],
123
- ['JJS', 'semantically superlative adjective chief, top'],
124
- ['JJT', 'morphologically superlative adjective biggest'],
125
- ['MD', 'modal auxiliary can, should, will'],
126
- ['NC', 'cited word (hyphenated after regular tag) '],
127
- ['NN', 'singular or mass noun '],
128
- ['NN$', 'possessive singular noun '],
129
- ['NNS', 'plural noun '],
130
- ['NNS$', 'possessive plural noun '],
131
- ['NP', 'proper noun or part of name phrase '],
132
- ['NP$', 'possessive proper noun '],
133
- ['NPS', 'plural proper noun '],
134
- ['NPS$', 'possessive plural proper noun '],
135
- ['NR', 'adverbial noun home, today, west'],
136
- ['NRS', 'plural adverbial noun'],
137
- ['OD', 'ordinal numeral first, 2nd'],
138
- ['PN', 'nominal pronoun everybody, nothing'],
139
- ['PN$', 'possessive nominal pronoun '],
140
- ['PP$', 'possessive personal pronoun my, our'],
141
- ['PP$$', 'second (nominal) possessive pronoun mine, ours'],
142
- ['PPL', 'singular reflexive/intensive personal pronoun myself'],
143
- ['PPLS', 'plural reflexive/intensive personal pronoun ourselves'],
144
- ['PPO', 'objective personal pronoun me, him, it, them'],
145
- ['PPS', '3rd. singular nominative pronoun he, she, it, one'],
146
- ['PPSS', 'other nominative personal pronoun I, we, they, you'],
147
- ['QL', 'qualifier very, fairly'],
148
- ['QLP', 'post-qualifier enough, indeed'],
149
- ['RB', 'adverb '],
150
- ['RBR', 'comparative adverb '],
151
- ['RBT', 'superlative adverb '],
152
- ['RN', 'nominal adverb here then, indoors '],
153
- ['RP', 'adverb/particle about, off, up'],
154
- ['TL', 'word occurring in title (hyphenated after regular tag)'],
155
- ['TO', 'infinitive marker to '],
156
- ['UH', 'interjection, exclamation '],
157
- ['VB', 'verb, base form '],
158
- ['VBD', 'verb, past tense '],
159
- ['VBG', 'verb, present participle/gerund '],
160
- ['VBN', 'verb, past participle '],
161
- ['VBZ', 'verb, 3rd. singular present '],
162
- ['WDT', 'wh- determiner what, which'],
163
- ['WP$', 'possessive wh- pronoun whose'],
164
- ['WPO', 'objective wh- pronoun whom, which, that'],
165
- ['WPS', 'nominative wh- pronoun who, which, that'],
166
- ['WQL', 'wh- qualifier how'],
167
- ['WRB', 'wh- adverb how, where, when']
168
-
169
- ]
170
- # A description of Enju categories.
171
- EnjuCatDescription = [
172
- ['ADJ', 'Adjective'],
173
- ['ADV', 'Adverb'],
174
- ['CONJ', 'Coordination conjunction'],
175
- ['C', 'Complementizer'],
176
- ['D', 'Determiner'],
177
- ['N', 'Noun'],
178
- ['P', 'Preposition'],
179
- ['SC', 'Subordination conjunction'],
180
- ['V', 'Verb'],
181
- ['COOD', 'Part of coordination'],
182
- ['PN', 'Punctuation'],
183
- ['PRT', 'Particle'],
184
- ['S', 'Sentence']
185
- ]
186
-
187
- # Maps Enju categories to Treat categories.
188
- EnjuCatToCategory = {
189
- 'ADJ' => :adjective,
190
- 'ADV' => :adverb,
191
- 'CONJ' => :conjunction,
192
- 'COOD' => :conjunction,
193
- 'C' => :complementizer,
194
- 'D' => :determiner,
195
- 'N' => :noun,
196
- 'P' => :preposition,
197
- 'PN' => :punctuation,
198
- 'SC' => :conjunction,
199
- 'V' => :verb,
200
- 'PRT' => :particle
201
- }
202
-
203
- # Description of the xcat in the Enju output specification.
204
- EnjuXCatDescription = [
205
- ['COOD', 'Coordinated phrase/clause'],
206
- ['IMP', 'Imperative sentence'],
207
- ['INV', 'Subject-verb inversion'],
208
- ['Q', 'Interrogative sentence with subject-verb inversion'],
209
- ['REL', 'A relativizer included'],
210
- ['FREL', 'A free relative included'],
211
- ['TRACE', 'A trace included'],
212
- ['WH', 'A wh-question word included']
213
- ]
214
-
215
- EnjuCatXcatToPTB = [
216
- ['ADJP', '', 'ADJP'],
217
- ['ADJP', 'REL', 'WHADJP'],
218
- ['ADJP', 'FREL', 'WHADJP'],
219
- ['ADJP', 'WH', 'WHADJP'],
220
- ['ADVP', '', 'ADVP'],
221
- ['ADVP', 'REL', 'WHADVP'],
222
- ['ADVP', 'FREL', 'WHADVP'],
223
- ['ADVP', 'WH', 'WHADVP'],
224
- ['CONJP', '', 'CONJP'],
225
- ['CP', '', 'SBAR'],
226
- ['DP', '', 'NP'],
227
- ['NP', '', 'NP'],
228
- ['NX', 'NX', 'NAC'],
229
- ['NP' 'REL' 'WHNP'],
230
- ['NP' 'FREL' 'WHNP'],
231
- ['NP' 'WH' 'WHNP'],
232
- ['PP', '', 'PP'],
233
- ['PP', 'REL', 'WHPP'],
234
- ['PP', 'WH', 'WHPP'],
235
- ['PRT', '', 'PRT'],
236
- ['S', '', 'S'],
237
- ['S', 'INV', 'SINV'],
238
- ['S', 'Q', 'SQ'],
239
- ['S', 'REL', 'SBAR'],
240
- ['S', 'FREL', 'SBAR'],
241
- ['S', 'WH', 'SBARQ'],
242
- ['SCP', '', 'SBAR'],
243
- ['VP', '', 'VP'],
244
- ['VP', '', 'VP'],
245
- ['', '', 'UK']
246
- ]
247
-
248
- # Aligned tags for the Claws C5, Brown and Penn tag sets.
249
- # Adapted from Manning, Christopher and Schütze, Hinrich,
250
- # 1999. Foundations of Statistical Natural Language
251
- # Processing. MIT Press, p. 141-142.
252
- AlignedWordTags = [
253
- 'Adjective', ['AJ0', 'JJ', 'JJ'],
254
- 'Adjective, ordinal number', ['ORD', 'OD', 'JJ'],
255
- 'Adjective, comparative', ['AJC', 'JJR', 'JJR'],
256
- 'Adjective, superlative', ['AJS', 'JJT', 'JJS'],
257
- 'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ'],
258
- 'Adjective, cardinal number', ['CRD', 'CD', 'CD'],
259
- 'Adjective, cardinal number, one', ['PNI', 'CD', 'CD'],
260
- 'Adverb', ['AV0', 'RB', 'RB'],
261
- 'Adverb, negative', ['XX0', '*', 'RB'],
262
- 'Adverb, comparative', ['AV0', 'RBR', 'RBR'],
263
- 'Adverb, superlative', ['AV0', 'RBT', 'RBS'],
264
- 'Adverb, particle', ['AVP', 'RP', 'RP'],
265
- 'Adverb, question', ['AVQ', 'WRB', 'WRB'],
266
- 'Adverb, degree & question', ['AVQ', 'WQL', 'WRB'],
267
- 'Adverb, degree', ['AV0', 'QL', 'RB'],
268
- 'Adverb, degree, postposed', ['AV0', 'QLP', 'RB'],
269
- 'Adverb, nominal', ['AV0', 'RN', 'RB'],
270
- 'Conjunction, coordination', ['CJC', 'CC', 'CC'],
271
- 'Conjunction, subordination', ['CJS', 'CS', 'IN'],
272
- 'Conjunction, complementizer, that', ['CJT', 'CS', 'IN'],
273
- 'Determiner', ['DT0', 'DT', 'DT'],
274
- 'Determiner, pronoun', ['DT0', 'DTI', 'DT'],
275
- 'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT'],
276
- 'Determiner, prequalifier', ['DT0', 'ABL', 'DT'],
277
- 'Determiner, prequantifier', ['DT0', 'ABN', 'PDT'],
278
- 'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT'],
279
- 'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT'],
280
- 'Determiner, article', ['AT0', 'AT', 'DT'],
281
- 'Determiner, postdeterminer', ['DT0', 'AP', 'JJ'],
282
- 'Determiner, possessive', ['DPS', 'PP$', 'PRP$'],
283
- 'Determiner, possessive, second', ['DPS', 'PP$$', 'PRP'],
284
- 'Determiner, question', ['DTQ', 'WDT', 'WDT'],
285
- 'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$'],
286
- 'Noun', ['NN0', 'NN', 'NN'],
287
- 'Noun, singular', ['NN1', 'NN', 'NN'],
288
- 'Noun, plural', ['NN2', 'NNS', 'NNS'],
289
- 'Noun, proper, singular', ['NP0', 'NP', 'NNP'],
290
- 'Noun, proper, plural', ['NP0', 'NPS', 'NNPS'],
291
- 'Noun, adverbial', ['NN0', 'NR', 'NN'],
292
- 'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS'],
293
- 'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP'],
294
- 'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP'],
295
- 'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP'],
296
- 'Pronoun, personal, object', ['PNP', 'PPO', 'PRP'],
297
- 'Pronoun, reflexive', ['PNX', 'PPL', 'PRP'],
298
- 'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP'],
299
- 'Pronoun, question, subject', ['PNQ', 'WPS', 'WP'],
300
- 'Pronoun, question, object', ['PNQ', 'WPO', 'WP'],
301
- 'Pronoun, existential there', ['EX0', 'EX', 'EX'],
302
- 'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP'],
303
- 'Verb, infinitive', ['VVI', 'VB', 'VB'],
304
- 'Verb, past tense', ['VVD', 'VBD', 'VBD'],
305
- 'Verb, present participle', ['VVG', 'VBG', 'VBG'],
306
- 'Verb, past/passive participle', ['VVN', 'VBN', 'VBN'],
307
- 'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ'],
308
- 'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP'],
309
- 'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB'],
310
- 'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD'],
311
- 'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG'],
312
- 'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN'],
313
- 'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ'],
314
- 'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP'],
315
- 'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB'],
316
- 'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD'],
317
- 'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG'],
318
- 'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN'],
319
- 'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ'],
320
- 'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB'],
321
- 'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD'],
322
- 'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD'],
323
- 'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG'],
324
- 'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN'],
325
- 'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ'],
326
- 'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP'],
327
- 'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP'],
328
- 'Verb, modal', ['VM0', 'MD', 'MD'],
329
- 'Preposition, to as infinitive marker', ['TO0', 'TO', 'TO'],
330
- 'Preposition, to', ['PRP', 'IN', 'TO'],
331
- 'Preposition', ['PRP', 'IN', 'IN'],
332
- 'Preposition, of', ['PRF', 'IN', 'IN'],
333
- 'Possessive', ['POS', '$', 'POS'],
334
- 'Interjection (or other isolate)', ['ITJ', 'UH', 'UH'],
335
- 'Punctuation, sentence ender', ['PUN', '.', '.'],
336
- 'Punctuation, semicolon', ['PUN', '.', '.'],
337
- 'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
338
- 'Punctuationm, comma', ['PUN', ',', ','],
339
- 'Punctuation, dash', ['PUN', '-', '-'],
340
- 'Punctuation, dollar sign', ['PUN', '', '$'],
341
- 'Punctuation, left bracket', ['PUL', '(', '('],
342
- 'Punctuation, right bracket', ['PUR', ')', ')'],
343
- 'Punctuation, quotation mark, left', ['PUQ', '', '``'],
344
- 'Punctuation, quotation mark, right', ['PUQ', '', '"'],
345
- 'Unknown, foreign words (not in English lexicon)', ['UNZ', '(FW-)', 'FW'],
346
- 'Symbol', ['', '', 'SYM'],
347
- 'Symbol, alphabetical', ['ZZ0', '', ''],
348
- 'Symbol, list item', ['', '', 'LS']
349
- ]
350
- end
351
- end
352
- end