shalmaneser-frappe 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/frappe/Ampersand.rb +41 -0
  7. data/lib/frappe/file_parser.rb +126 -0
  8. data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
  9. data/lib/frappe/frappe.rb +217 -0
  10. data/lib/frappe/frappe_flat_syntax.rb +89 -0
  11. data/lib/frappe/frappe_read_stxml.rb +48 -0
  12. data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
  13. data/lib/frappe/interfaces/collins_interface.rb +340 -0
  14. data/lib/frappe/interfaces/counter.rb +19 -0
  15. data/lib/frappe/interfaces/stanford_interface.rb +353 -0
  16. data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
  17. data/lib/frappe/interfaces/treetagger_module.rb +111 -0
  18. data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
  19. data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
  20. data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
  21. data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
  22. data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
  23. data/lib/frappe/interpreters/headz.rb +265 -0
  24. data/lib/frappe/interpreters/headz_helpers.rb +54 -0
  25. data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
  26. data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
  27. data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
  28. data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
  29. data/lib/frappe/one_parsed_file.rb +31 -0
  30. data/lib/frappe/opt_parser.rb +92 -0
  31. data/lib/frappe/path.rb +199 -0
  32. data/lib/frappe/plain_converter.rb +59 -0
  33. data/lib/frappe/salsa_tab_converter.rb +154 -0
  34. data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
  35. data/lib/frappe/stxml_converter.rb +666 -0
  36. data/lib/frappe/syn_interface.rb +76 -0
  37. data/lib/frappe/syn_interface_stxml.rb +173 -0
  38. data/lib/frappe/syn_interface_tab.rb +39 -0
  39. data/lib/frappe/utf_iso.rb +27 -0
  40. data/lib/shalmaneser/frappe.rb +1 -0
  41. metadata +130 -0
require_relative 'treetagger_module'

require 'frappe/syn_interface_tab'
require 'frappe/utf_iso'

require 'tempfile'

module Shalmaneser
  module Frappe
    # Interface to TreeTagger used as a lemmatizer.
    # The actual TreeTagger invocation lives in TreetaggerModule;
    # this class extracts the lemma column from its output.
    class TreetaggerInterface < SynInterfaceTab
      include TreetaggerModule

      TreetaggerInterface.announce_me

      ###
      # @return [String] the name of the wrapped system
      def self.system
        'treetagger'
      end

      ###
      # @return [String] the service this interface provides
      def self.service
        'lemmatizer'
      end

      ###
      # Adapt tokens to the conventions expected by the Berkeley parser:
      # bracket tokens and double-quote characters.
      # @todo AB: Generalize this method to work with different parsers.
      # @param line [String]
      # @return [String]
      def convert_to_berkeley(line)
        line.chomp.gsub(/\(/, "-LRB-").gsub(/\)/, "-RRB-").gsub(/''/, "\"").gsub(/\`\`/, "\"")
      end

      ###
      # Lemmatize a file: run TreeTagger (via
      # TreetaggerModule#really_process_file), extract the lemma column,
      # recode ISO-8859-1 back to UTF-8 and write the result.
      #
      # @param [String] infilename The name of the input file.
      # @param [String] outfilename The name of the output file.
      def process_file(infilename, outfilename)
        ttfilename = really_process_file(infilename, outfilename)

        # write all output to tempfile2 first, then
        # change ISO to UTF-8 into outputfile
        tempfile2 = Tempfile.new("treetagger")
        tempfile2.close

        # 2. use cut to get the actual lemmatisation (column 3)
        # BUGFIX: check the exit status instead of silently ignoring
        # a failed shell pipeline.
        unless Kernel.system("cat " + ttfilename +
                             ' | sed -e\'s/<EOS>//\' | cut -f3 > ' + tempfile2.path)
          raise "Could not extract lemmas from #{ttfilename}"
        end

        # transform ISO-8859-1 back to UTF-8,
        # write to 'outfilename'
        begin
          outfile = File.new(outfilename, "w")
        rescue
          raise "Could not write to #{outfilename}"
        end
        tempfile2.open

        # AB: Internally all the flow is an utf-8 encoded stream.
        # TreeTagger consumes one byte encodings (but we should provide a
        # utf-8 model for German). So we convert utf-8 to latin1, then
        # process the text and convert it back to utf-8.
        while (line = tempfile2.gets)
          utf8line = UtfIso.from_iso_8859_1(line)
          outfile.puts convert_to_berkeley(utf8line)
        end

        # remove second tempfile, finalize output file
        tempfile2.close(true)
        outfile.close
      end
    end
  end
end
###########
# KE dec 7, 06
# common mixin for both Treetagger modules, doing the actual processing

require 'tempfile'
require 'pathname'
require 'logging'

module Shalmaneser
  module Frappe
    module TreetaggerModule
      ###
      # Treetagger does both lemmatization and POS-tagging.
      # However, the way the SynInterface system is set up in Shalmaneser,
      # each SynInterface can offer only _one_ service.
      # This means that we cannot do a SynInterface that writes
      # both a POS file and a lemma file.
      # Instead, both will include this module, which does the
      # actual TreeTagger call and then stores the result in a file
      # of its own, similar to the 'outfilename' given to TreetaggerInterface.process_file
      # but with a separate extension.
      # really_process_file checks for existence of this file because,
      # if the TreeTagger lemmatization and POS-tagging classes are called separately,
      # one of them will go first, and the 2nd one will not need to do the
      # TreeTagger call anymore
      #
      # really_process_file returns a filename, the name of the file containing
      # the TreeTagger output with both POS tags and lemma information
      #
      # WARNING: this method assumes that outfilename contains a suffix
      # that can be replaced by .TreeTagger
      #
      # @param infilename [String] name of input file
      # @param outfilename [String] name of file that the caller is to produce
      # @param make_new_outfile_anyway [Boolean] run TreeTagger in any case?
      # @return [String] name of the file holding raw TreeTagger output
      def really_process_file(infilename, outfilename, make_new_outfile_anyway = false)
        # fabricate the filename in which the
        # actual TreeTagger output will be placed:
        # <directory> + <outfilename minus last suffix> + ".TreeTagger"
        current_suffix = outfilename[outfilename.rindex(".")..-1]
        my_outfilename = File.dirname(outfilename) + "/" +
                         File.basename(outfilename, current_suffix) +
                         ".TreeTagger"

        ##
        # does it exist? then just return it
        # BUGFIX: 'return' was missing here, so the cached file was never
        # used and TreeTagger was re-run on every call.
        if !make_new_outfile_anyway && File.exist?(my_outfilename)
          return my_outfilename
        end

        ##
        # else construct it, then return it
        tempfile = Tempfile.new("Treetagger")
        TreetaggerInterface.fntab_words_to_file(infilename, tempfile, "<EOS>", "iso")
        tempfile.close

        # @todo AB: Remove it by my shame :(
        # AB: A very dirty hack of mine:
        # We need the language attribute, but we don't have the FrappeConfigData,
        # then we'll try to find it in the ObjectSpace since we should have only one.
        lang = ''
        ObjectSpace.each_object(::Shalmaneser::Configuration::FrappeConfigData) do |o|
          lang = o.get('language')
        end

        case lang
        when 'en'
          tt_model = Pathname.new(@program_path).join('lib').join(ENV['SHALM_TREETAGGER_MODEL'] || 'english.par')
          tt_filter = ''
        when 'de'
          tt_model = Pathname.new(@program_path).join('lib').join(ENV['SHALM_TREETAGGER_MODEL'] || 'german.par')
          tt_filter = "#{Pathname.new(@program_path).join('cmd').join('filter-german-tags')}"
        end

        # call TreeTagger
        tt_binary = Pathname.new(@program_path).join('bin').join(ENV.fetch('SHALM_TREETAGGER_BIN', 'tree-tagger'))

        invocation_str = "#{tt_binary} -lemma -token -sgml #{tt_model} "\
                         "#{tempfile.path} 2>/dev/null | #{tt_filter} > #{my_outfilename}"

        LOGGER.info "Tagging and lemmatizing #{tempfile.path} with TreeTagger."
        LOGGER.debug invocation_str

        Kernel.system(invocation_str)
        tempfile.close(true) # delete first tempfile

        # external problem: sometimes, the treetagger keeps the last <EOS> for itself,
        # resulting on a .tagged file missing the last (blank) line
        original_length = File.readlines(infilename).size
        # BUGFIX: this previously re-read infilename, so both lengths were
        # always equal and the sanity check below could never trigger.
        lemmatised_length = File.readlines(my_outfilename).size

        case original_length - lemmatised_length
        when 0
          # everything ok, don't do anything
        when 1
          # @todo Add here a Logger Warning.
          # add one more newline to the .tagged file
          `echo "" >> #{my_outfilename}`
        else
          # this is a "real" error
          LOGGER.fatal "Original length: #{original_length}\tLemmatised length: #{lemmatised_length}.\n"\
                       "Error: lemmatiser/tagger output for #{File.basename(infilename)} "\
                       "has different line number from corpus file!"
          raise
        end

        my_outfilename
      end
    end
  end
end
# sp 30 11 06
#
# using TreeTagger for POS tagging of English text
#
# copy-and-paste from lemmatisation
#
# differences:
# 1. use field 2 and not 3 from the output
# 2. convert tags from what Treetagger thinks is the Penn Tagset to what TnT and Collins think is the Penn Tagset
#
# KE 7 12 06
# change interface such that TreeTagger is called only once
# and both POS tags and lemma are read from the same files,
# rather than calling the tagger twice
require_relative 'treetagger_module'

require 'frappe/utf_iso'

require 'tempfile'

module Shalmaneser
  module Frappe
    # Interface to TreeTagger used as a POS tagger.
    class TreetaggerPOSInterface < SynInterfaceTab
      include TreetaggerModule

      TreetaggerPOSInterface.announce_me

      ###
      # @return [String] the name of the wrapped system
      def self.system
        "treetagger"
      end

      ###
      # @return [String] the service this interface provides
      def self.service
        "pos_tagger"
      end

      ###
      # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
      # @param line [String]
      # @return [String]
      def convert_to_collins(line)
        line.chomp.gsub(/^PP/, "PRP").gsub(/^NP/, "NNP").gsub(/^VV/, "VB").gsub(/^VH/, "VB").gsub(/^SENT/, ".")
      end

      ###
      # POS-tag a file: run TreeTagger (once, via
      # TreetaggerModule#really_process_file), extract the POS column,
      # recode ISO-8859-1 back to UTF-8 and write the result.
      #
      # @param [String] infilename Name of input file.
      # @param [String] outfilename Name of output file.
      def process_file(infilename, outfilename)
        # KE change here
        tt_filename = really_process_file(infilename, outfilename, true)

        # write all output to tempfile2 first, then
        # change ISO to UTF-8 into outputfile
        tempfile2 = Tempfile.new("treetagger")
        tempfile2.close

        # 2. use cut to get the POS tags (column 2; the lemmatizer uses column 3)
        # BUGFIX: check the exit status instead of silently ignoring
        # a failed shell pipeline.
        unless Kernel.system("cat " + tt_filename +
                             ' | sed -e\'s/<EOS>//\' | cut -f2 > ' + tempfile2.path)
          raise "Could not extract POS tags from #{tt_filename}"
        end

        # transform ISO-8859-1 back to UTF-8,
        # write to 'outfilename'
        begin
          outfile = File.new(outfilename, "w")
        rescue
          raise "Could not write to #{outfilename}"
        end
        tempfile2.open
        while (line = tempfile2.gets)
          outfile.puts UtfIso.from_iso_8859_1(convert_to_collins(line))
        end

        # remove second tempfile, finalize output file
        tempfile2.close(true)
        outfile.close
      end
    end
  end
end
# AB: 2013-12-25
require_relative 'tiger_interpreter'

module Shalmaneser
  module Frappe
    # Interpreter for the output of the Berkeley parser.
    # All interpretation logic is inherited from TigerInterpreter;
    # this subclass only declares which systems it is responsible for.
    class BerkeleyInterpreter < TigerInterpreter
      BerkeleyInterpreter.announce_me

      ###
      # Names of the systems interpreted by this class.
      #
      # @return [Hash] service (String) -> system name (String),
      #   e.g. { "parser" => "collins", "lemmatizer" => "treetagger" }
      def self.systems
        { "parser" => "berkeley" }
      end

      ###
      # Names of additional systems that may be interpreted by this class.
      # Same format as {.systems}.
      #
      # @return [Hash] service (String) -> system name (String)
      def self.optional_systems
        { "lemmatizer" => "treetagger", 'pos_tagger' => 'treetagger' }
      end
    end
  end
end
@@ -0,0 +1,807 @@
1
+ # require 'salsa_tiger_xml/salsa_tiger_sentence'
2
+ # require 'salsa_tiger_xml/syn_node'
3
+
4
+ require_relative 'syn_interpreter'
5
+ ################################################
6
+ # Interpreter class
7
+ module Shalmaneser
8
+ module Frappe
9
+ class CollinsTntInterpreter < SynInterpreter
10
+ CollinsTntInterpreter.announce_me
11
+
12
+ ###
13
+ # names of the systems interpreted by this class:
14
+ # returns a hash service(string) -> system name (string),
15
+ # e.g.
16
+ # { "parser" => "collins", "lemmatizer" => "treetagger" }
17
+ def self.systems
18
+ {"pos_tagger" => "treetagger", "parser" => "collins"}
19
+ end
20
+
21
+ ###
22
+ # names of additional systems that may be interpreted by this class
23
+ # returns a hash service(string) -> system name(string)
24
+ # same as names
25
+ def self.optional_systems
26
+ {"lemmatizer" => "treetagger"}
27
+ end
28
+
29
+ ###
30
+ # generalize over POS tags.
31
+ #
32
+ # returns one of:
33
+ #
34
+ # adj: adjective (phrase)
35
+ # adv: adverb (phrase)
36
+ # card: numbers, quantity phrases
37
+ # con: conjunction
38
+ # det: determiner, including possessive/demonstrative pronouns etc.
39
+ # for: foreign material
40
+ # noun: noun (phrase), including personal pronouns, proper names, expletives
41
+ # part: particles, truncated words (German compound parts)
42
+ # prep: preposition (phrase)
43
+ # pun: punctuation, brackets, etc.
44
+ # sent: sentence
45
+ # top: top node of a sentence
46
+ # verb: verb (phrase)
47
+ # nil: something went wrong
48
+ #
49
+ # returns: string, or nil
50
+ def self.category(node) # SynNode
51
+ pt = CollinsTntInterpreter.simplified_pt(node)
52
+ if pt.nil?
53
+ # phrase type could not be determined
54
+ return nil
55
+ end
56
+
57
+ pt.to_s.strip =~ /^([^-]*)/
58
+ case $1
59
+ when /^JJ/, /(WH)?ADJP/, /^PDT/ then return "adj"
60
+ when /^RB/, /(WH)?ADVP/, /^UH/ then return "adv"
61
+ when /^CD/, /^QP/ then return "card"
62
+ when /^CC/, /^WRB/, /^CONJP/ then return "con"
63
+ when /^DT/, /^POS/ then return "det"
64
+ when /^FW/, /^SYM/ then return "for"
65
+ when /^N/, "WHAD", "WDT", /^PRP/ , /^WHNP/, /^EX/, /^WP/ then return "noun"
66
+ when /^IN/ , /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/ then return "prep"
67
+ when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/ then return "pun"
68
+ when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/ then return "sent"
69
+ when /^TOP/ then return "top"
70
+ when /^TRACE/ then return "trace"
71
+ when /^V/ , /^MD/ then return "verb"
72
+ else
73
+ # $stderr.puts "WARNING: Unknown category/POS "+c.to_s + " (English data)"
74
+ return nil
75
+ end
76
+ end
77
+
78
+
79
+ ###
80
+ # is relative pronoun?
81
+ #
82
+ def self.relative_pronoun?(node) # SynNode
83
+ pt = CollinsTntInterpreter.simplified_pt(node)
84
+ if pt.nil?
85
+ # phrase type could not be determined
86
+ return nil
87
+ end
88
+
89
+ pt.to_s.strip =~ /^([^-]*)/
90
+ case $1
91
+ when /^WDT/, /^WHAD/, /^WHNP/, /^WP/
92
+ return true
93
+ else
94
+ return false
95
+ end
96
+ end
97
+
98
+ ###
99
+ # lemma_backoff:
100
+ #
101
+ # if we have lemma information, return that,
102
+ # and failing that, return the word
103
+ #
104
+ # returns: string, or nil
105
+ def CollinsTntInterpreter.lemma_backoff(node)
106
+ lemma = super(node)
107
+ # lemmatizer has returned more than one possible lemma form:
108
+ # just accept the first
109
+ if lemma =~ /^([^|]+)|/
110
+ return $1
111
+ else
112
+ return lemma
113
+ end
114
+ end
115
+
116
+
117
+ ###
118
+ # simplified phrase type:
119
+ # like phrase type, but may simplify
120
+ # the constituent label
121
+ #
122
+ # returns: string
123
+ def self.simplified_pt(node)
124
+ CollinsTntInterpreter.pt(node) =~ /^(\w+)(-\w)*/
125
+ return $1
126
+ end
127
+
128
+ ###
129
+ # verb_with_particle:
130
+ #
131
+ # given a node and a nodelist,
132
+ # if the node represents a verb:
133
+ # see if the verb has a particle among the nodes in nodelist
134
+ # if so, return it
135
+ #
136
+ # returns: SynNode object if successful, else nil
137
+ def self.particle_of_verb(node,
138
+ node_list)
139
+
140
+ # must be verb
141
+ unless CollinsTntInterpreter.category(node) == "verb"
142
+ return nil
143
+ end
144
+
145
+ # must have parent
146
+ unless node.parent
147
+ return nil
148
+ end
149
+
150
+ # look for sisters of the verb node that have the particle category
151
+ particles = node.parent.children.select { |sister|
152
+ CollinsTntInterpreter.category(sister) == "part"
153
+ }.map { |n| n.children}.flatten.select { |niece|
154
+ # now look for children of those nodes that are particles and are in the nodelist
155
+ nodelist.include? niece and
156
+ CollinsTntInterpreter.category(niece) == "part"
157
+ }
158
+
159
+ if particles.length == 0
160
+ return nil
161
+ else
162
+ return particles.first
163
+ end
164
+ end
165
+
166
+ ###
167
+ # auxiliary?
168
+ #
169
+ # returns true if the given node is an auxiliary
170
+ # else false
171
+ def self.auxiliary?(node)
172
+
173
+ # look for
174
+ # ---VP---
175
+ # | |
176
+ # the given node VP-A
177
+ # |
178
+ # verb node
179
+ # verb?
180
+ unless CollinsTntInterpreter.category(node) == "verb"
181
+ return false
182
+ end
183
+
184
+ unless (parent = node.parent) and
185
+ parent.category == "VP"
186
+ return false
187
+ end
188
+ unless (vpa_node = parent.children.detect { |other_child| other_child.category == "VP-A" })
189
+ return false
190
+ end
191
+ unless vpa_node.children.detect { |other_node| CollinsTntInterpreter.category(other_node) == "verb" }
192
+ return false
193
+ end
194
+
195
+ return true
196
+
197
+ end
198
+
199
+ ###
200
+ # modal?
201
+ #
202
+ # returns true if the given node is a modal verb,
203
+ # else false
204
+ def self.modal?(node)
205
+ if node.part_of_speech =~ /^MD/
206
+ return true
207
+ else
208
+ return false
209
+ end
210
+ end
211
+
212
+ ###
213
+ # voice
214
+ #
215
+ # given a constituent, return
216
+ # - "active"/"passive" if it is a verb
217
+ # - nil, else
218
+ def self.voice(node) # SynNode
219
+
220
+ tobe = ["be","am","is","are","was","were"]
221
+
222
+ unless CollinsTntInterpreter.category(node) == "verb"
223
+ return nil
224
+ end
225
+
226
+ # if we have a gerund, a present tense, or an infitive
227
+ # then we are sure that we have an active form
228
+ case CollinsTntInterpreter.pt(node)
229
+ when "VBG", "VBP", "VBZ", "VB"
230
+ "active"
231
+ end
232
+
233
+ # There is an ambiguity for many word forms between VBN (past participle - passive)
234
+ # and VBD (past tense - active)
235
+
236
+ # so for these, we only say something if we can exclude one possibility,
237
+ # this is the case
238
+ # (a) when there is a c-commanding "to be" somewhere. -> passive
239
+ # (b) when there is no "to be", but a "to have" somewhere. -> active
240
+
241
+ # collect lemmas of c-commanding verbs.
242
+
243
+ parent = node.parent
244
+ if parent.nil?
245
+ return nil
246
+ end
247
+ gp = parent.parent
248
+ if gp.nil?
249
+ return nil
250
+ end
251
+
252
+ # other_verbs = []
253
+ #
254
+ # current_node = node
255
+ # while current_node = current_node.parent
256
+ # pt = CollinsTntInterpreter.category(current_node)
257
+ # unless ["verb","sentence"].include? pt
258
+ # break
259
+ # end
260
+ # current_node.children.each {|child|
261
+ # if CollinsTntInterpreter.category(child) == "verb"
262
+ # other_verbs << CollinsTntInterpreter.lemma_backoff(nephew)
263
+ # end
264
+ # }
265
+ # end
266
+ #
267
+ # unless (tobe & other_verbs).empty?
268
+ # puts "passive "+node.id
269
+ # return "passive"
270
+ # end
271
+ # unless (tohave & other_verbs).empty?
272
+ # return "active"
273
+ # end
274
+
275
+ if CollinsTntInterpreter.category(gp) == "verb" or CollinsTntInterpreter.category(gp) == "sent"
276
+
277
+ current_node = node
278
+
279
+ while current_node = current_node.parent
280
+ pt = CollinsTntInterpreter.category(current_node)
281
+ unless ["verb","sent"].include? pt
282
+ break
283
+ end
284
+ if current_node.children.detect {|nephew| tobe.include? CollinsTntInterpreter.lemma_backoff(nephew)}
285
+ return "passive"
286
+ end
287
+ end
288
+ # if no "to be" has been found...
289
+ return "active"
290
+ end
291
+
292
+ # case 2: The grandfather is something else (e.g. a noun phrase)
293
+ # here, simple past forms are often mis-tagged as passives
294
+ #
295
+
296
+ # if we were cautious, we would return "dontknow" here;
297
+ # however, these cases are so rare that it is unlikely that
298
+ # assignments would be more reliable; so we rely on the
299
+ # POS tag anyway.
300
+
301
+
302
+ case CollinsTntInterpreter.pt(node)
303
+ when "VBN","VBD"
304
+ return "passive"
305
+ # this must be some kind of error...
306
+ else
307
+ return nil
308
+ end
309
+ end
310
+
311
+ ###
312
+ # gfs
313
+ #
314
+ # grammatical functions of a constituent:
315
+ #
316
+ # returns: a list of pairs [relation(string), node(SynNode)]
317
+ # where <node> stands in the relation <relation> to the parameter
318
+ # that the method was called with
319
+ def CollinsTntInterpreter.gfs(anchor_node, # SynNode
320
+ sent) # SalsaTigerSentence
321
+
322
+ return sent.syn_nodes.map { |gf_node|
323
+
324
+ case CollinsTntInterpreter.category(anchor_node)
325
+ when "adj"
326
+ rel = CollinsTntInterpreter.gf_adj(anchor_node, gf_node)
327
+ when "verb"
328
+ rel = CollinsTntInterpreter.gf_verb(anchor_node, gf_node)
329
+ when "noun"
330
+ rel = CollinsTntInterpreter.gf_noun(anchor_node, gf_node)
331
+ end
332
+
333
+ if rel
334
+ [rel, gf_node]
335
+ else
336
+ nil
337
+ end
338
+ }.compact
339
+ end
340
+
341
+ ###
342
+ # informative_content_node
343
+ #
344
+ # for most constituents: nil
345
+ # for a PP, the NP
346
+ # for an SBAR, the VP
347
+ # for a VP, the embedded VP
348
+ def CollinsTntInterpreter.informative_content_node(node)
349
+ this_pt = CollinsTntInterpreter.simplified_pt(node)
350
+
351
+ unless ["SBAR", "VP", "PP"].include? this_pt
352
+ return nil
353
+ end
354
+
355
+ nh = CollinsTntInterpreter.head_terminal(node)
356
+ unless nh
357
+ return nil
358
+ end
359
+ headlemma = CollinsTntInterpreter.lemma_backoff(nh)
360
+
361
+ nonhead_children = node.children.reject { |n|
362
+ nnh = CollinsTntInterpreter.head_terminal(n)
363
+ not(nnh) or
364
+ CollinsTntInterpreter.lemma_backoff(nnh) == headlemma
365
+ }
366
+ if nonhead_children.length == 1
367
+ return nonhead_children.first
368
+ end
369
+
370
+ # more than one child:
371
+ # for SBAR and VP take child with head POS starting in VB,
372
+ # for PP child with head POS starting in NN
373
+ case this_pt
374
+ when "SBAR", "VP"
375
+ icont_child = nonhead_children.detect { |n|
376
+ h = CollinsTntInterpreter.head_terminal(n)
377
+ h and h.part_of_speech =~ /^VB/
378
+ }
379
+ when "PP"
380
+ icont_child = nonhead_children.detect { |n|
381
+ h = CollinsTntInterpreter.head_terminal(n)
382
+ h and h.part_of_speech =~ /^NN/
383
+ }
384
+ else
385
+ raise "Shouldn't be here"
386
+ end
387
+
388
+ if icont_child
389
+ return icont_child
390
+ else
391
+ return nonhead_children.first
392
+ end
393
+ end
394
+
395
+
396
+
397
+
398
+ ########
399
+ # prune?
400
+ # given a target node t and another node n of the syntactic structure,
401
+ # decide whether n is likely to instantiate a semantic role
402
+ # of t. If not, recommend n for pruning.
403
+ #
404
+ # This method implements a slight variant of Xue and Palmer (EMNLP 2004).
405
+ # Pruning according to Xue & Palmer, EMNLP 2004:
406
+ # "Step 1: Designate the predicate as the current node and
407
+ # collect its sisters (constituents attached at the same level
408
+ # as the predicate) unless its sisters are coordinated with the
409
+ # predicate. If a sister is a PP, also collect its immediate
410
+ # children.
411
+ # Step 2: Reset the current node to its parent and repeat Step 1
412
+ # till it reaches the top level node.
413
+ #
414
+ # Modifications made here:
415
+ # - paths of length 0 accepted in any case
416
+ #
417
+ # returns: false to recommend n for pruning, else true
418
+ def CollinsTntInterpreter.prune?(node, # SynNode
419
+ paths_to_target, # hash: node ID -> Path object: paths from target to node
420
+ terminal_index) # hash: terminal node -> word index in sentence
421
+
422
+ path_to_target = paths_to_target[node.id]
423
+
424
+ if not path_to_target
425
+ # no path from target to node: suggest for pruning
426
+
427
+ return 0
428
+
429
+ elsif path_to_target.length == 0
430
+ # target may be its own role: definite accept
431
+
432
+ return 1
433
+
434
+ else
435
+ # consider path from target to node.
436
+ # (1) If the path to the current node includes at least one Up
437
+ # and exactly one Down, keep.
438
+ # (2) Else, if the path includes at least one Up and exactly two Down,
439
+ # and the current node's parent is a PP, keep
440
+ # (3) else discard
441
+
442
+ # count number of up and down steps in path to target
443
+ num_up = 0
444
+ num_down = 0
445
+ path_to_target.each_step { |direction, edgelabel, nodelabel, endnode|
446
+ case direction
447
+ when /U/
448
+ num_up += 1
449
+ when /D/
450
+ num_down += 1
451
+ end
452
+ }
453
+
454
+ # coordination sister between node and target?
455
+ conj_sister_between = CollinsTntInterpreter.conj_sister_between?(node, paths_to_target,
456
+ terminal_index)
457
+
458
+
459
+ if conj_sister_between
460
+ # coordination between me and the target -- drop
461
+ return 0
462
+
463
+ elsif num_up >= 1 and num_down == 1
464
+ # case (1)
465
+ return 1
466
+
467
+ elsif num_up >= 1 and num_down == 2 and
468
+ (p = node.parent) and CollinsTntInterpreter.category(p) == "prep"
469
+
470
+ # case (2)
471
+ return 1
472
+
473
+ else
474
+ # case (3)
475
+ return 0
476
+ end
477
+ end
478
+ end
479
+
480
+
481
+ ###
482
+ private
483
+
484
+
485
+ ###
486
+ # given an anchor node and another node that may be some
487
+ # grammatical function of the anchor node:
488
+ # return the grammatical function (string) if found,
489
+ # else nil.
490
+ #
491
+ # here: anchor node is verb.
492
+ def CollinsTntInterpreter.gf_verb(anchor_node, # SynNode
493
+ gf_node) # SynNode
494
+
495
+ # first classification: according to constituent type
496
+ cat = CollinsTntInterpreter.category(gf_node)
497
+ if cat.nil?
498
+ return nil
499
+ end
500
+
501
+ # second classification: according to path
502
+ path = CollinsTntInterpreter.path_between(anchor_node, gf_node)
503
+ if path.nil?
504
+ # no path between anchor node and gf node
505
+ return nil
506
+ end
507
+
508
+ path.set_cutoff_last_pt_on_printing(true)
509
+ path_string = path.print(true,false,true)
510
+
511
+ case path_string
512
+ when "U VP D ", "U SG D "
513
+ categ2 = "inside"
514
+ when /^U (VP U )*S(BAR)? D $/
515
+ categ2 = "external"
516
+ when /^U (VP U )*VP D ADVP D $/
517
+ categ2 = "external"
518
+ else
519
+ categ2 = ""
520
+ end
521
+
522
+ # now evaluate based on both
523
+ case cat+ "+" + categ2
524
+ when "noun+inside"
525
+ # direct object
526
+ return "OA"
527
+
528
+ when "noun+external"
529
+ unless CollinsTntInterpreter.relative_position(gf_node, anchor_node) == "LEFT"
530
+ return nil
531
+ end
532
+
533
+ if CollinsTntInterpreter.voice(anchor_node) == "passive"
534
+ return "OA"
535
+ else
536
+ return "SB"
537
+ end
538
+
539
+ when "prep+inside"
540
+ if CollinsTntInterpreter.voice(anchor_node) == "passive" and
541
+ CollinsTntInterpreter.preposition(gf_node) == "by"
542
+ return "SB"
543
+ else
544
+ return "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
545
+ end
546
+
547
+ when "sent+inside"
548
+ return "OC"
549
+
550
+ when "sent+external"
551
+ return "OC"
552
+
553
+ else
554
+ return nil
555
+ end
556
+ end
557
+
558
+ ###
559
+ # given an anchor node and another node that may be some
560
+ # grammatical function of the anchor node:
561
+ # return the grammatical function (string) if found,
562
+ # else nil.
563
+ #
564
+ # here: anchor node is noun.
565
+ def CollinsTntInterpreter.gf_noun(anchor_node, # SynNode
566
+ gf_node) # SynNode
567
+
568
+ # first classification: according to constituent type
569
+ cat = CollinsTntInterpreter.category(gf_node)
570
+ if cat.nil?
571
+ return nil
572
+ end
573
+
574
+ # second classification: according to path
575
+ path = CollinsTntInterpreter.path_between(anchor_node, gf_node)
576
+ if path.nil?
577
+ # no path between anchor node and gf node
578
+ return nil
579
+ end
580
+
581
+ path.set_cutoff_last_pt_on_printing(true)
582
+ path_string = path.print(true,false,true)
583
+
584
+ case path_string
585
+ when "U NPB D "
586
+ categ2 = "np-neighbor"
587
+ when "U NPB U NP D "
588
+ categ2 = "np-parent"
589
+ when "U NP D "
590
+ categ2 = "np-a"
591
+ when /^U NPB (U NP )?(U NP )?U S(BAR)? D( VP D)? $/
592
+ categ2 = "beyond-s"
593
+ when /^U NP(B)? (U NP )?U VP D $/
594
+ categ2 = "beyond-vp"
595
+ when /^U NPB (U NP )?(U NP)?U PP U VP(-A)? D $/
596
+ categ2 = "beyond-pp-vp"
597
+ else
598
+ categ2 = ""
599
+ end
600
+
601
+ # now evaluate based on both
602
+ case cat + "+" + categ2
603
+ when "noun+np-neighbor"
604
+ return "AG"
605
+
606
+ when "sent+np-parent"
607
+ return "OC"
608
+
609
+ when "prep+np-parent", "prep+np-a"
610
+ return "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
611
+ # relation of anchor noun to governing verb not covered by "gfs" method
612
+ # when "verb+beyond-s"
613
+ # return "SB-of"
614
+
615
+ # when "verb+beyond-vp"
616
+ # return "OA-of"
617
+
618
+ # when "verb+beyond-pp-vp"
619
+ # return "MO-of"
620
+ else
621
+ return nil
622
+ end
623
+ end
624
+
625
+
626
+ ###
627
+ # given an anchor node and another node that may be some
628
+ # grammatical function of the anchor node:
629
+ # return the grammatical function (string) if found,
630
+ # else nil.
631
+ #
632
+ # here: anchor node is adjective.
633
+ def CollinsTntInterpreter.gf_adj(anchor_node, # SynNode
634
+ gf_node) # SynNode
635
+
636
+ # first classification: according to constituent type
637
+ cat = CollinsTntInterpreter.category(gf_node)
638
+ if cat.nil?
639
+ return nil
640
+ end
641
+
642
+ # second classification: according to path
643
+ path = CollinsTntInterpreter.path_between(anchor_node, gf_node)
644
+ if path.nil?
645
+ # no path between anchor node and gf node
646
+ return nil
647
+ end
648
+
649
+ path.set_cutoff_last_pt_on_printing(true)
650
+ path_string = path.print(true,false,true)
651
+
652
+ case path_string
653
+ when /^(U ADJP )?U NPB D $/
654
+ categ2 = "nnpath"
655
+ when "U ADJP D "
656
+ categ2 = "adjp-neighbor"
657
+ when /^(U ADJP )?U (VP U )?S(BAR)? D $/
658
+ categ2 = "s"
659
+ when /^U (ADJP U )?VP D $/
660
+ categ2 = "vp"
661
+ else
662
+ categ2 = ""
663
+ end
664
+
665
+ # now evaluate based on both
666
+ case cat + "+" + categ2
667
+ when "noun+nnpath"
668
+ return "HD"
669
+ when "verb+adjp-neighbor"
670
+ return "OC"
671
+ when "prep+vp", "prep+adjp-neighbor"
672
+ return "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
673
+ else
674
+ return nil
675
+ end
676
+ end
677
+
678
+ ####
679
+ # auxiliary of prune?:
680
+ #
681
+ # given a node and a hash mapping node IDs to paths to target:
682
+ # Does that node have a sister that is a coordination and that
683
+ # is between it and the target?
684
+ #
685
+ def CollinsTntInterpreter.conj_sister_between?(node, # SynNode
686
+ paths_to_target, # Hash: node ID -> Path obj: path from node to target
687
+ ti) # hash: terminal node -> word index in sentence
688
+
689
+ # does node have sisters that represent coordination?
690
+ unless (p = node.parent)
691
+ return false
692
+ end
693
+
694
+ unless (conj_sisters = p.children.select { |sib|
695
+ sib != node and CollinsTntInterpreter.category(sib) == "con"
696
+ } ) and
697
+ not (conj_sisters.empty?)
698
+ return false
699
+ end
700
+
701
+ # represent each coordination sister, and the node itself,
702
+ # as a triple [node, leftmost terminal index(node), rightmost terminal index(node)
703
+ conj_sisters = conj_sisters.map { |n|
704
+ [n, CollinsTntInterpreter.lti(n, ti), CollinsTntInterpreter.rti(n, ti)]
705
+ }
706
+
707
+ this_triple = [node, CollinsTntInterpreter.lti(node, ti), CollinsTntInterpreter.rti(node, ti)]
708
+
709
+ # sisters closer to the target than node:
710
+ # also map to triples
711
+ sisters_closer_to_target = p.children.select { |sib|
712
+ sib != node and
713
+ not(conj_sisters.include? sib) and
714
+ paths_to_target[sib.id] and
715
+ paths_to_target[sib.id].length < paths_to_target[node.id].length
716
+ }.map { |n|
717
+ [n, CollinsTntInterpreter.lti(n, ti), CollinsTntInterpreter.rti(n, ti)]
718
+ }
719
+
720
+ if sisters_closer_to_target.empty?
721
+ return false
722
+ end
723
+
724
+ # is there any coordination sister that is inbetween this node
725
+ # and some sister that is closer to the target?
726
+ # if so, return true
727
+ conj_sisters.each { |conj_triple|
728
+ if leftof(conj_triple, this_triple) and
729
+ sisters_closer_to_target.detect { |s| CollinsTntInterpreter.leftof(s, conj_triple) }
730
+
731
+ return true
732
+
733
+ elsif rightof(conj_triple, this_triple) and
734
+ sisters_closer_to_target.detect { |s| CollinsTntInterpreter.rightof(s, conj_triple) }
735
+
736
+ return true
737
+ end
738
+ }
739
+
740
+ # else return false
741
+ return false
742
+ end
743
+
744
+ ###
745
+ # lti, rti: terminal index of the leftmost/rightmost terminal of
746
+ # a given node (SynNode)
747
+ #
748
+ # auxiliary of conj_sister_between?
749
+ def self.lti(node, # SynNode
750
+ terminal_index) # hash: terminal node -> word index in sentence
751
+ lt = CollinsTntInterpreter.leftmost_terminal(node)
752
+ unless lt
753
+ return nil
754
+ end
755
+
756
+ return terminal_index[lt]
757
+ end
758
+
759
+ def self.rti(node, # SynNode
760
+ terminal_index) # hash: terminal node -> word index in sentence
761
+ rt = CollinsTntInterpreter.rightmost_terminal(node)
762
+ unless rt
763
+ return nil
764
+ end
765
+
766
+ return terminal_index[rt]
767
+ end
768
+
769
+ ###
770
+ # leftof, rightof: given 2 triples
771
+ # [node(SynNode), index of leftmost terminal(integer/nil), index of rightmost terminal(integer/nil),
772
+ #
773
+ # auxiliaries of conj_sister_between?
774
+ #
775
+ # return true if both leftmost and rightmost terminal indices of the first triple are
776
+ # smaller than (for leftof) / bigger than (for rightof) the
777
+ # corresponding indices of the second triple
778
+ #
779
+ # return false if some index is nil
780
+ def self.leftof(triple1, triple2)
781
+ _dummy, lm1, rm1 = triple1
782
+ _dummy, lm2, rm2 = triple2
783
+
784
+ if lm1.nil? or rm1.nil? or lm2.nil? or rm2.nil?
785
+ return false
786
+ elsif lm1 < lm2 and rm1 < rm2
787
+ return true
788
+ else
789
+ return false
790
+ end
791
+ end
792
+
793
+ def self.rightof(triple1, triple2)
794
+ _dummy, lm1, rm1 = triple1
795
+ _dummy, lm2, rm2 = triple2
796
+
797
+ if lm1.nil? or rm1.nil? or lm2.nil? or rm2.nil?
798
+ return false
799
+ elsif lm1 > lm2 and rm1 > rm2
800
+ return true
801
+ else
802
+ return false
803
+ end
804
+ end
805
+ end
806
+ end
807
+ end