treat 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160) hide show
  1. data/LICENSE +4 -4
  2. data/TODO +21 -54
  3. data/lib/economist/half_cocked_basel.txt +16 -0
  4. data/lib/economist/hose_and_dry.doc +0 -0
  5. data/lib/economist/hungarys_troubles.abw +70 -0
  6. data/lib/economist/republican_nomination.pdf +0 -0
  7. data/lib/economist/saving_the_euro.odt +0 -0
  8. data/lib/economist/to_infinity_and_beyond.txt +15 -0
  9. data/lib/economist/zero_sum.html +91 -0
  10. data/lib/treat.rb +58 -72
  11. data/lib/treat/buildable.rb +59 -15
  12. data/lib/treat/categories.rb +26 -14
  13. data/lib/treat/category.rb +2 -2
  14. data/lib/treat/delegatable.rb +65 -48
  15. data/lib/treat/doable.rb +44 -0
  16. data/lib/treat/entities.rb +34 -14
  17. data/lib/treat/entities/collection.rb +2 -0
  18. data/lib/treat/entities/document.rb +3 -2
  19. data/lib/treat/entities/entity.rb +105 -90
  20. data/lib/treat/entities/phrases.rb +17 -0
  21. data/lib/treat/entities/tokens.rb +28 -13
  22. data/lib/treat/entities/zones.rb +20 -0
  23. data/lib/treat/extractors.rb +49 -11
  24. data/lib/treat/extractors/coreferences/stanford.rb +68 -0
  25. data/lib/treat/extractors/date/chronic.rb +32 -0
  26. data/lib/treat/extractors/date/ruby.rb +25 -0
  27. data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
  28. data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
  29. data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
  30. data/lib/treat/extractors/language/what_language.rb +49 -0
  31. data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
  32. data/lib/treat/extractors/roles/naive.rb +73 -0
  33. data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
  34. data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
  35. data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
  36. data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
  37. data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
  38. data/lib/treat/extractors/time/nickel.rb +30 -12
  39. data/lib/treat/extractors/topic_words/lda.rb +9 -9
  40. data/lib/treat/extractors/topics/reuters.rb +14 -15
  41. data/lib/treat/extractors/topics/reuters/region.xml +1 -0
  42. data/lib/treat/features.rb +7 -0
  43. data/lib/treat/formatters/readers/abw.rb +6 -1
  44. data/lib/treat/formatters/readers/autoselect.rb +5 -6
  45. data/lib/treat/formatters/readers/doc.rb +3 -1
  46. data/lib/treat/formatters/readers/html.rb +1 -1
  47. data/lib/treat/formatters/readers/image.rb +43 -0
  48. data/lib/treat/formatters/readers/odt.rb +1 -2
  49. data/lib/treat/formatters/readers/pdf.rb +9 -1
  50. data/lib/treat/formatters/readers/xml.rb +40 -0
  51. data/lib/treat/formatters/serializers/xml.rb +50 -14
  52. data/lib/treat/formatters/serializers/yaml.rb +7 -2
  53. data/lib/treat/formatters/unserializers/xml.rb +33 -7
  54. data/lib/treat/formatters/visualizers/dot.rb +90 -20
  55. data/lib/treat/formatters/visualizers/short_value.rb +2 -2
  56. data/lib/treat/formatters/visualizers/standoff.rb +2 -2
  57. data/lib/treat/formatters/visualizers/tree.rb +1 -1
  58. data/lib/treat/formatters/visualizers/txt.rb +13 -4
  59. data/lib/treat/group.rb +16 -10
  60. data/lib/treat/helpers/linguistics_loader.rb +18 -0
  61. data/lib/treat/inflectors.rb +10 -0
  62. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  63. data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
  64. data/lib/treat/inflectors/declensions/english.rb +319 -0
  65. data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
  66. data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
  67. data/lib/treat/install.rb +59 -0
  68. data/lib/treat/kernel.rb +18 -8
  69. data/lib/treat/languages.rb +18 -11
  70. data/lib/treat/languages/arabic.rb +4 -2
  71. data/lib/treat/languages/chinese.rb +6 -2
  72. data/lib/treat/languages/dutch.rb +16 -0
  73. data/lib/treat/languages/english.rb +47 -19
  74. data/lib/treat/languages/french.rb +8 -5
  75. data/lib/treat/languages/german.rb +9 -6
  76. data/lib/treat/languages/greek.rb +16 -0
  77. data/lib/treat/languages/italian.rb +6 -3
  78. data/lib/treat/languages/polish.rb +16 -0
  79. data/lib/treat/languages/portuguese.rb +16 -0
  80. data/lib/treat/languages/russian.rb +16 -0
  81. data/lib/treat/languages/spanish.rb +16 -0
  82. data/lib/treat/languages/swedish.rb +16 -0
  83. data/lib/treat/languages/tags.rb +377 -0
  84. data/lib/treat/lexicalizers.rb +34 -23
  85. data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
  86. data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
  87. data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
  88. data/lib/treat/lexicalizers/tag/brill.rb +35 -40
  89. data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
  90. data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
  91. data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
  92. data/lib/treat/processors.rb +8 -8
  93. data/lib/treat/processors/chunkers/txt.rb +4 -4
  94. data/lib/treat/processors/parsers/enju.rb +114 -99
  95. data/lib/treat/processors/parsers/stanford.rb +109 -41
  96. data/lib/treat/processors/segmenters/punkt.rb +17 -18
  97. data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
  98. data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
  99. data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
  100. data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
  101. data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
  102. data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
  103. data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
  104. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
  105. data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
  106. data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
  107. data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
  108. data/lib/treat/processors/segmenters/stanford.rb +38 -37
  109. data/lib/treat/processors/segmenters/tactful.rb +5 -4
  110. data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
  111. data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
  112. data/lib/treat/processors/tokenizers/perl.rb +2 -2
  113. data/lib/treat/processors/tokenizers/punkt.rb +6 -2
  114. data/lib/treat/processors/tokenizers/stanford.rb +25 -24
  115. data/lib/treat/processors/tokenizers/tactful.rb +1 -2
  116. data/lib/treat/proxies.rb +2 -35
  117. data/lib/treat/registrable.rb +17 -22
  118. data/lib/treat/sugar.rb +11 -11
  119. data/lib/treat/tree.rb +27 -17
  120. data/lib/treat/viewable.rb +29 -0
  121. data/lib/treat/visitable.rb +1 -1
  122. data/test/tc_entity.rb +56 -49
  123. data/test/tc_extractors.rb +41 -18
  124. data/test/tc_formatters.rb +7 -8
  125. data/test/tc_inflectors.rb +19 -24
  126. data/test/tc_lexicalizers.rb +12 -19
  127. data/test/tc_processors.rb +26 -12
  128. data/test/tc_resources.rb +2 -7
  129. data/test/tc_treat.rb +20 -22
  130. data/test/tc_tree.rb +4 -4
  131. data/test/tests.rb +3 -5
  132. data/test/texts.rb +13 -14
  133. data/tmp/INFO +1 -0
  134. metadata +78 -158
  135. data/bin/INFO +0 -1
  136. data/examples/benchmark.rb +0 -81
  137. data/examples/keywords.rb +0 -148
  138. data/lib/treat/detectors.rb +0 -31
  139. data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
  140. data/lib/treat/detectors/format/file.rb +0 -36
  141. data/lib/treat/detectors/language/what_language.rb +0 -29
  142. data/lib/treat/entities/constituents.rb +0 -15
  143. data/lib/treat/entities/sentence.rb +0 -8
  144. data/lib/treat/extractors/named_entity/abner.rb +0 -20
  145. data/lib/treat/extractors/named_entity/stanford.rb +0 -174
  146. data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
  147. data/lib/treat/extractors/time/chronic.rb +0 -20
  148. data/lib/treat/extractors/time/native.rb +0 -18
  149. data/lib/treat/formatters/readers/gocr.rb +0 -26
  150. data/lib/treat/formatters/readers/ocropus.rb +0 -31
  151. data/lib/treat/formatters/visualizers/html.rb +0 -13
  152. data/lib/treat/formatters/visualizers/inspect.rb +0 -20
  153. data/lib/treat/inflectors/declensions/en.rb +0 -18
  154. data/lib/treat/languages/categories.rb +0 -5
  155. data/lib/treat/languages/english/categories.rb +0 -23
  156. data/lib/treat/languages/english/tags.rb +0 -352
  157. data/lib/treat/languages/xinhua.rb +0 -12
  158. data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
  159. data/lib/treat/string.rb +0 -5
  160. data/test/tc_detectors.rb +0 -26
@@ -0,0 +1,29 @@
1
+ module Treat
2
+ module Lexicalizers
3
+ module Tag
4
+ class Tagger
5
+ def self.tag(entity, options = {})
6
+ if (entity.is_a?(Treat::Entities::Sentence) ||
7
+ entity.is_a?(Treat::Entities::Phrase)) &&
8
+ !entity.has_children?
9
+ raise Treat::Exception,
10
+ "Annotator 'tag' requires processor 'tokenize'."
11
+ elsif entity.is_a?(Treat::Entities::Word)
12
+ if entity.has_parent?
13
+ ps = entity.parent_sentence
14
+ pp = entity.parent_phrase
15
+ if ps
16
+ self.tag(ps, options)
17
+ elsif pp
18
+ self.tag(pp, options)
19
+ end
20
+ return entity.features[:tag]
21
+ else
22
+ return :isolated_word
23
+ end
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
@@ -9,15 +9,15 @@ module Treat
9
9
  # - Chunkers : split a text into zone objects.
10
10
  # - Segmenters : split a text or zone into sentence objects.
11
11
  # - Tokenizers : split a sentence into Token objects.
12
- # - Parsers: split a sentence into a tree of constituents
13
- # containing other constituents and Token objects, representing
12
+ # - Parsers: split a sentence into a tree of phrases
13
+ # containing other phrases and Token objects, representing
14
14
  # the syntactic structure.
15
15
  module Processors
16
16
  # Chunkers split a text into zones.
17
17
  module Chunkers
18
18
  extend Group
19
19
  self.type = :transformer
20
- self.targets = [:document, :zone]
20
+ self.targets = [:document, :section]
21
21
  end
22
22
  # Segmenters split a text or zone into sentences.
23
23
  module Segmenters
@@ -29,17 +29,17 @@ module Treat
29
29
  module Tokenizers
30
30
  extend Group
31
31
  self.type = :transformer
32
- self.targets = [:document, :zone, :sentence, :constituent]
32
+ self.targets = [:document, :zone, :phrase]
33
33
  end
34
- # Parsers split a sentence into constituent objects
34
+ # Parsers split a sentence into phrase objects
35
35
  # representing its syntactic structure, with the
36
- # Token objects as children of the constituents.
36
+ # Token objects as children of the phrases.
37
37
  module Parsers
38
38
  extend Group
39
39
  self.type = :transformer
40
- self.targets = [:document, :zone, :sentence, :constituent]
40
+ self.targets = [:document, :zone, :phrase]
41
41
  end
42
- # Makes all the groups autoloadable and creates the delegators.
42
+ # Makes all the groups autoloadable and creates the workers.
43
43
  extend Treat::Category
44
44
  end
45
45
  end
@@ -2,14 +2,15 @@ module Treat
2
2
  module Processors
3
3
  module Chunkers
4
4
  # This class separates a plain text file into
5
- # zones based on a very naive analysis of the
6
- # file.
5
+ # zones based on an extremely naive analysis of the
6
+ # file. Surprisingly, this works pretty well.
7
7
  class Txt
8
8
  # Split a document into Zone objects.
9
9
  def self.chunk(text, options = {})
10
10
  zones = text.to_s.split("\n")
11
11
  zones.each do |zone|
12
- next if zone.strip == ''
12
+ zone.strip!
13
+ next if zone == ''
13
14
  if false # fix
14
15
  text << Treat::Entities::List.new(zone)
15
16
  end
@@ -19,7 +20,6 @@ module Treat
19
20
  text << Treat::Entities::Paragraph.new(zone)
20
21
  end
21
22
  end
22
- text
23
23
  end
24
24
  end
25
25
  end
@@ -6,7 +6,7 @@ module Treat
6
6
  # the parser formats it, runs it through Enju, and
7
7
  # parses the XML output by Enju using the Nokogiri
8
8
  # XML reader. It creates wrappers for the sentences,
9
- # syntactical constituents and tokens that Enju identified.
9
+ # syntactical phrases and tokens that Enju identified.
10
10
  #
11
11
  # Original paper:
12
12
  # Takuya Matsuzaki, Yusuke Miyao, and Jun'ichi Tsujii.
@@ -29,20 +29,15 @@ module Treat
29
29
  @@i = 0 if @@i == @@parsers.size
30
30
  @@parsers[@@i-1]
31
31
  end
32
- # Parse the entity into its syntactical constituents
33
- # using Enju
32
+ # Parse the entity into its syntactical phrases using Enju.
33
+ # Calls #build to initiate XML parsing.
34
34
  def self.parse(entity, options = {})
35
35
  options[:processes] ||= 1
36
36
  @@options = options
37
+ @@id_table = {}
38
+ @@dependencies_table = {}
37
39
  stdin, stdout = proc
38
- if entity.to_s.count('.') == 0
39
- remove_last = true
40
- text = entity.to_s + '.'
41
- else
42
- remove_last = false
43
- text = entity.to_s.gsub('.', '')
44
- text += '.' unless ['!', '?'].include?(text[-1])
45
- end
40
+ text, remove_last = valid_text(entity)
46
41
  stdin.puts(text + "\n")
47
42
  parsed = build(stdout.gets, remove_last)
48
43
  if not parsed.nil?
@@ -50,10 +45,16 @@ module Treat
50
45
  parsed.children.each do |child|
51
46
  entity << child
52
47
  end
48
+ # Remove the period we added at the end.
49
+ if remove_last
50
+ last = entity.punctuations[-1]
51
+ entity.remove!(last)
52
+ end
53
53
  else
54
54
  warn "Couldn't parse the text '#{entity.to_s}'."
55
55
  end
56
- entity
56
+ link_heads(entity)
57
+ add_dependencies(entity)
57
58
  end
58
59
  # Parses an Enju XML output file using the Nokogiri
59
60
  # XML reader and converts that structure into a tree
@@ -63,8 +64,6 @@ module Treat
63
64
  xml_reader = Nokogiri::XML::Reader.from_memory(xml)
64
65
  current_element = nil
65
66
  previous_depth = 0
66
- id_table = {}
67
- edges_table = {}
68
67
  # Read the XML file entity by entity.
69
68
  while xml_reader.read
70
69
  # The depth in the XML tree.
@@ -81,119 +80,135 @@ module Treat
81
80
  previous_depth = current_depth
82
81
  next
83
82
  end
83
+ # Get and format attributes and dependencies.
84
84
  attributes = xml_reader.attributes
85
- prefix = ['schema', 'lexentry', 'type']
86
- # If the entity has entributes, add them.
87
- unless attributes.empty?
88
- new_attributes = {}
89
- edges = {}
90
- id = attributes.delete('id')
91
- pred = attributes.delete('pred')
92
- attributes.each_pair do |attribute, value|
93
- if ['arg1', 'arg2'].include?(attribute)
94
- edges[value] = pred
95
- else
96
- if attribute == 'cat'
97
- if xml_reader.name == 'tok'
98
- if value.length > 1 && ['P', 'X'].include?(value[-1]) &&
99
- value != 'PN'
100
- new_attributes[:saturated] = (value[-1] == 'P')
101
- value = value[0..-2]
102
- end
103
- cat = Treat::Languages::English::EnjuCatToCategory[value]
104
- new_attributes[:cat] = cat
105
- else
106
- new_attributes[:enju_cat] = value
107
- xcat = attributes['xcat'].split(' ')[0]
108
- xcat ||= ''
109
- tags = Treat::Languages::English::EnjuCatXcatToPTB.select do |m|
110
- m[0] == value && m[1] == xcat
111
- end
112
- if tags.empty?
113
- tag = 'UK'
114
- else
115
- tag = tags[0][2]
116
- end
117
- new_attributes[:enju_xcat] = xcat
118
- attributes.delete('xcat')
119
- new_attributes[:tag] = tag
120
- end
121
- else
122
- pre = prefix.include?(attribute) ? 'enju_' : ''
123
- new_attributes[:"#{pre+attribute}"] = value
124
- end
125
- end
126
- end
127
- attributes.delete('arg1')
128
- attributes.delete('arg2')
129
- end
130
- # Handle naming conventions.
131
- if attributes.has_key?('pos')
132
- new_attributes[:tag] = new_attributes[:pos]
133
- new_attributes[:tag_set] = :penn
134
- new_attributes.delete :pos
85
+ id = attributes.delete('id')
86
+ new_attributes = {}; dependencies = {}
87
+ unless attributes.size == 0
88
+ new_attributes, dependencies =
89
+ cleanup_attributes(xml_reader.name, attributes)
135
90
  end
136
91
  # Create the appropriate entity for the
137
92
  # element.
138
93
  current_value = ''
139
- attributes = new_attributes
140
94
  case xml_reader.name
141
95
  when 'sentence'
142
96
  current_element = Treat::Entities::Sentence.new('')
143
- id_table[id] = current_element.id
144
- edges_table[current_element.id] = edges
145
- current_element.features = attributes
97
+ @@id_table[id] = current_element.id
98
+ @@dependencies_table[current_element.id] = dependencies
99
+ current_element.features = new_attributes
146
100
  when 'cons'
147
101
  current_element = current_element <<
148
102
  Treat::Entities::Phrase.new('')
149
- id_table[id] = current_element.id
150
- edges_table[current_element.id] = edges
151
- current_element.features = attributes
103
+ @@id_table[id] = current_element.id
104
+ @@dependencies_table[current_element.id] = dependencies
105
+ current_element.features = new_attributes
152
106
  when 'tok'
153
- tmp_attributes = attributes
154
- tmp_edges = edges
107
+ tmp_attributes = new_attributes
108
+ tmp_dependencies = dependencies
155
109
  else
156
110
  current_value = xml_reader.value.gsub(/\s+/, "")
157
- if !current_value.empty?
111
+ unless current_value.size == 0
158
112
  current_element = current_element <<
159
- Treat::Entities::Entity.from_string(current_value)
113
+ Treat::Entities::Token.from_string(current_value)
160
114
  if current_element.is_a?(Treat::Entities::Word)
161
115
  current_element.features = tmp_attributes
162
- id_table[id] = current_element.id
163
- edges_table[current_element.id] = tmp_edges
116
+ @@id_table[id] = current_element.id
117
+ @@dependencies_table[current_element.id] = tmp_dependencies
164
118
  end
165
119
  end
166
120
  end
167
121
  previous_depth = current_depth
168
122
  end
169
- # Add the edges to the entity.
170
- unless current_element.nil?
171
- root = current_element.root
172
- edges_table.each_pair do |id2, edges2|
173
- # Next if there are no edges.
174
- next if edges2.nil?
175
- entity = root.find(id2)
176
- edges2.each_pair do |argument, type|
177
- # Skip this argument if we don't know
178
- # the target node.
123
+ current_element
124
+ end
125
+ # Validate a text - Enju wants a period to parse a sentence.
126
+ def self.valid_text(entity)
127
+ if entity.to_s.count('.') == 0
128
+ remove_last = true
129
+ text = entity.to_s + '.'
130
+ else
131
+ remove_last = false
132
+ text = entity.to_s.gsub('.', '')
133
+ text += '.' unless ['!', '?'].include?(text[-1])
134
+ end
135
+ return text, remove_last
136
+ end
137
+ # Link the head and sem_head to their entities.
138
+ def self.link_heads(entity)
139
+ entity.each_phrase do |phrase|
140
+ if phrase.has?(:head)
141
+ phrase.link(@@id_table[phrase.head], 'head', true, -1)
142
+ phrase.unset(:head)
143
+ end
144
+ if phrase.has?(:sem_head)
145
+ phrase.link(@@id_table[phrase.sem_head], 'sem_head', true, -1)
146
+ phrase.unset(:sem_head)
147
+ end
148
+ end
149
+ end
150
+ # Add dependencies a posteriori to a parsed entity.
151
+ def self.add_dependencies(entity2)
152
+ entity2.each_entity(:word, :phrase) do |entity|
153
+ @@dependencies_table.each_pair do |id2, dependencies2|
154
+ # Next if there are no dependencies.
155
+ next if dependencies2.nil?
156
+ entity = entity2.root.find(id2)
157
+ next if entity.nil?
158
+ dependencies2.each_pair do |argument, type|
159
+ # Skip this argument if we don't know the target node.
179
160
  next if argument == 'unk'
180
- entity.associate(id_table[argument], type)
161
+ entity.link(@@id_table[argument], type.intern)
181
162
  end
182
163
  end
183
- # Link the head and sem_head to their entities.
184
- root.each_constituent do |constituent|
185
- constituent.set :head,
186
- root.find(id_table[constituent.head])
187
- constituent.set :sem_head,
188
- root.find(id_table[constituent.sem_head])
164
+ end
165
+ end
166
+ # Helper function to convert Enju attributes to Treat attributes.
167
+ def self.cleanup_attributes(name, attributes)
168
+ new_attributes = {}
169
+ dependencies = {}
170
+ pred = attributes.delete('pred')
171
+ attributes.each_pair do |attribute2, value|
172
+ attribute = attribute2.strip
173
+ if attribute == 'arg1' || attribute == 'arg2'
174
+ dependencies[value] = pred
175
+ next
176
+ end
177
+ if attribute == 'cat'
178
+ new_attributes[:cat] = value
179
+ if name == 'tok'
180
+ if value.length > 1 && ['P', 'X'].include?(value[-1]) &&
181
+ value != 'PN'
182
+ new_attributes[:saturated] = (value[-1] == 'P')
183
+ value = value[0..-2]
184
+ end
185
+ new_attributes[:category] =
186
+ Treat::Languages::Tags::EnjuCatToCategory[value]
187
+ else
188
+ tags = Treat::Languages::Tags::EnjuCatXcatToPTB.select do |m|
189
+ m[0] == value && m[1] == attributes['xcat']
190
+ end
191
+ tag = (tags.size == 0) ? 'FW' : tags[0][2]
192
+ new_attributes[:tag] = tag
193
+ end
194
+ else
195
+ new_attributes[:"#{attribute}"] = value
189
196
  end
190
197
  end
191
- # Remove the period we added at the end.
192
- if remove_last
193
- last = current_element.punctuations[-1]
194
- current_element.remove!(last)
198
+ # Delete after iteration.
199
+ attributes.delete('arg1')
200
+ attributes.delete('arg2')
201
+ # Handle naming conventions.
202
+ if attributes.has_key?('pos')
203
+ new_attributes[:tag] = new_attributes[:pos]
204
+ new_attributes[:tag_set] = :penn
205
+ new_attributes.delete :pos
195
206
  end
196
- current_element
207
+ if attributes.has_key?('base')
208
+ new_attributes[:lemma] = new_attributes[:base]
209
+ new_attributes.delete :base
210
+ end
211
+ return new_attributes, dependencies
197
212
  end
198
213
  end
199
214
  end
@@ -3,60 +3,128 @@ module Treat
3
3
  module Parsers
4
4
  # A wrapper class for the Stanford parser.
5
5
  class Stanford
6
- # Require the Ruby-Java bridge.
7
- silence_warnings { require 'rjb' }
8
- jar = "#{Treat.bin}/stanford-parser*/stanford-parser*.jar"
9
- jars = Dir.glob(jar)
10
- if jars.empty? || !File.readable?(jars[0])
11
- raise "Could not find stanford parser JAR file (looking in #{jar})"+
12
- " You may need to manually download the JAR files and/or set Treat.bin."
13
- end
14
- Rjb::load(jars[0], ['-Xms256M', '-Xmx512M'])
15
- LexicalizedParser = ::Rjb::import('edu.stanford.nlp.parser.lexparser.LexicalizedParser')
16
- @@parsers = {}
6
+ require 'stanford-core-nlp'
7
+ @@parser = {}
8
+ DefaultOptions = {
9
+ :silence => false,
10
+ :log_to_file => nil,
11
+ :parser_model => nil,
12
+ :tagger_model => nil
13
+ }
17
14
  # Parse the entity using the Stanford parser.
15
+ #
16
+ # Options:
17
+ # - (String) :log_to_file => a filename to log output to
18
+ # instead of displaying it.
18
19
  def self.parse(entity, options = {})
19
- lang = Treat::Languages.describe(entity.language).to_s.upcase
20
- pcfg = "#{Treat.bin}/stanford-parser*/grammar/#{lang.upcase}PCFG.ser.gz"
21
- pcfgs = Dir.glob(pcfg)
22
- if pcfgs.empty? || !File.readable?(pcfgs[0])
23
- raise "Could not find a language model for #{lang.downcase} (looking in #{pcfg})."
20
+ options = DefaultOptions.merge(options)
21
+ lang = entity.language
22
+ StanfordCoreNLP.use(lang)
23
+ if options[:tagger_model]
24
+ ::StanfordCoreNLP.set_model(
25
+ 'pos.model', options[:tagger_model]
26
+ )
27
+ end
28
+ if options[:parser_model]
29
+ ::StanfordCoreNLP.set_model(
30
+ 'parser.model', options[:parser_model]
31
+ )
32
+ end
33
+ if options[:silence]
34
+ options[:log_to_file] = '/dev/null'
35
+ end
36
+ if options[:log_to_file]
37
+ ::StanfordCoreNLP.log_file =
38
+ options[:log_to_file]
39
+ end
40
+ @@parser[lang] ||=
41
+ ::StanfordCoreNLP.load(
42
+ :tokenize, :ssplit, :pos, :lemma, :parse
43
+ )
44
+ text = ::StanfordCoreNLP::Text.new(entity.to_s)
45
+ @@parser[lang].annotate(text)
46
+
47
+ text.get(:sentences).each do |s|
48
+ if entity.is_a?(Treat::Entities::Sentence) ||
49
+ entity.is_a?(Treat::Entities::Phrase)
50
+ tag = s.get(:category).to_s
51
+ tag_s, tag_opt = *tag.split('-')
52
+ tag_s ||= 'S'
53
+ entity.set :tag_set, :penn
54
+ entity.set :tag, tag_s
55
+ entity.set :tag_opt, tag_opt if tag_opt
56
+ recurse(s.get(:tree), entity)
57
+ break
58
+ else
59
+ recurse(s.get(:tree), entity)
60
+ end
24
61
  end
25
- @@parsers[lang] ||= LexicalizedParser.new(pcfgs[0])
26
- parse = @@parsers[lang].apply(entity.to_s)
27
- entity.remove_all!
28
- recurse(parse, entity)
29
- entity
30
62
  end
31
- # Helper method which recurses the tree supplied by
63
+
64
+ # Helper method which recurses the tree supplied by
32
65
  # the Stanford parser.
33
- def self.recurse(java_node, ruby_node)
66
+ def self.recurse(java_node, ruby_node, additional_tags = [])
34
67
  # Leaf
35
68
  if java_node.num_children == 0
36
- ruby_child = Treat::Entities::Entity.from_string(java_node.value)
37
- labels = java_node.labels.iterator
38
- while labels.has_next
39
- label = labels.next
40
- ruby_child.set :begin_char, label.begin_position
41
- ruby_child.set :end_char, label.end_position
42
- ruby_child.set :tag, ruby_node.tag
69
+ label = java_node.label
70
+ tag = label.get(:part_of_speech).to_s
71
+ tag_s, tag_opt = *tag.split('-')
72
+ tag_s ||= ''
73
+ ruby_node.value = java_node.value.to_s.strip
74
+ ruby_node.set :tag_set, :penn
75
+ ruby_node.set :tag, tag_s
76
+ ruby_node.set :tag_opt, tag_opt if tag_opt
77
+ ruby_node.set :tag_set, :penn
78
+ ruby_node.set :lemma, label.get(:lemma).to_s
79
+
80
+ ruby_node.set :character_offset_begin,
81
+ label.get(:character_offset_begin).to_s
82
+
83
+ ruby_node.set :character_offset_end,
84
+ label.get(:character_offset_end).to_s
85
+
86
+ ruby_node.set :begin_index,
87
+ label.get(:begin_index).to_s
88
+
89
+ ruby_node.set :end_index,
90
+ label.get(:end_index).to_s
91
+
92
+ additional_tags.each do |t|
93
+ lt = label.get(t)
94
+ ruby_node.set t, lt.to_s if lt
43
95
  end
44
- ruby_node << ruby_child
96
+ return ruby_node
45
97
  else
46
- if java_node.num_children == 1
47
- return recurse(java_node.children[0], ruby_node)
98
+
99
+ if java_node.num_children == 1 &&
100
+ java_node.children[0].num_children == 0
101
+ recurse(java_node.children[0], ruby_node, additional_tags)
102
+ return
48
103
  end
104
+
49
105
  java_node.children.each do |java_child|
50
- # dependencies = java_child.dependencies.iterator
51
- # while dependencies.has_next
52
- #dependency = dependencies.next
53
- # end
54
- ruby_child = Treat::Entities::Phrase.new
55
- ruby_child.set :tag, java_child.value
106
+ label = java_child.label
107
+ tag = label.get(:category).to_s
108
+ tag_s, tag_opt = *tag.split('-')
109
+ tag_s ||= ''
110
+
111
+ if Treat::Languages::Tags::PhraseTagToCategory[tag_s]
112
+ ruby_child = Treat::Entities::Phrase.new
113
+ else
114
+ l = java_child.children[0].to_s
115
+ v = java_child.children[0].value.to_s.strip
116
+ # Mhmhmhmhmhm
117
+ val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
118
+ ruby_child = Treat::Entities::Token.from_string(val)
119
+ end
120
+
56
121
  ruby_child.set :tag_set, :penn
122
+ ruby_child.set :tag, tag_s
123
+ ruby_child.set :tag_opt, tag_opt if tag_opt
57
124
  ruby_node << ruby_child
125
+
58
126
  unless java_child.children.empty?
59
- recurse(java_child, ruby_child)
127
+ recurse(java_child, ruby_child, additional_tags)
60
128
  end
61
129
  end
62
130
  end
@@ -64,4 +132,4 @@ module Treat
64
132
  end
65
133
  end
66
134
  end
67
- end
135
+ end