shalmaneser-frappe 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/frappe/Ampersand.rb +41 -0
- data/lib/frappe/file_parser.rb +126 -0
- data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
- data/lib/frappe/frappe.rb +217 -0
- data/lib/frappe/frappe_flat_syntax.rb +89 -0
- data/lib/frappe/frappe_read_stxml.rb +48 -0
- data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
- data/lib/frappe/interfaces/collins_interface.rb +340 -0
- data/lib/frappe/interfaces/counter.rb +19 -0
- data/lib/frappe/interfaces/stanford_interface.rb +353 -0
- data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
- data/lib/frappe/interfaces/treetagger_module.rb +111 -0
- data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
- data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
- data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
- data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
- data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
- data/lib/frappe/interpreters/headz.rb +265 -0
- data/lib/frappe/interpreters/headz_helpers.rb +54 -0
- data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
- data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
- data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
- data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
- data/lib/frappe/one_parsed_file.rb +31 -0
- data/lib/frappe/opt_parser.rb +92 -0
- data/lib/frappe/path.rb +199 -0
- data/lib/frappe/plain_converter.rb +59 -0
- data/lib/frappe/salsa_tab_converter.rb +154 -0
- data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
- data/lib/frappe/stxml_converter.rb +666 -0
- data/lib/frappe/syn_interface.rb +76 -0
- data/lib/frappe/syn_interface_stxml.rb +173 -0
- data/lib/frappe/syn_interface_tab.rb +39 -0
- data/lib/frappe/utf_iso.rb +27 -0
- data/lib/shalmaneser/frappe.rb +1 -0
- metadata +130 -0
@@ -0,0 +1,154 @@
require 'fileutils'
require 'logging'
require 'external_systems'

module Shalmaneser
  module Frappe
    # Prepares SalsaTab-format corpora for parsing:
    # "splits" the tab files into the output directory (currently a plain
    # copy, see #split_dir) and optionally runs an external POS tagger
    # and/or lemmatizer over the result.
    class SalsaTabConverter
      # @param exp [Object] experiment configuration; must respond to #get
      def initialize(exp)
        @exp = exp
        # suffixes for different types of output files
        @file_suffixes = {"lemma" => ".lemma", "pos" => ".pos", "tab" => ".tab", "stxml" => ".xml"}
      end

      ###############
      # transform_pos_and_lemmatize
      #
      # transformation for Tab format files:
      #
      # - Split into parser-size chunks
      # - POS-tag, lemmatize
      #
      # @param input_dir [String] input directory
      # @param output_dir [String] output directory
      def transform_pos_and_lemmatize(input_dir, output_dir)
        ##
        # split the TabFormatFile into chunks of max_sent_num size
        split_dir(input_dir, output_dir, @file_suffixes["tab"],
                  @exp.get("parser_max_sent_num"), @exp.get("parser_max_sent_len"))

        ##
        # POS-Tagging
        if @exp.get("do_postag")
          LOGGER.info "#{PROGRAM_NAME}: Tagging."

          sys_class = ExternalSystems.get_interface("pos_tagger", @exp.get("pos_tagger"))

          # AB: TODO Remove it.
          unless sys_class
            raise "Shouldn't be here"
          end

          LOGGER.debug "POS Tagger interface: #{sys_class}."
          sys = sys_class.new(@exp.get("pos_tagger_path"), @file_suffixes["tab"], @file_suffixes["pos"])
          # tagger reads the copied tab files and writes .pos files alongside
          sys.process_dir(output_dir, output_dir)
        end

        ##
        # Lemmatization
        # AB: We're working on the <split> dir and writing there.
        if @exp.get("do_lemmatize")
          LOGGER.info "#{PROGRAM_NAME}: Lemmatizing."

          sys_class = ExternalSystems.get_interface("lemmatizer", @exp.get("lemmatizer"))
          # AB: TODO make this exception explicit.
          unless sys_class
            raise 'I got an empty interface class for the lemmatizer!'
          end

          sys = sys_class.new(@exp.get("lemmatizer_path"), @file_suffixes["tab"], @file_suffixes["lemma"])
          sys.process_dir(output_dir, output_dir)
        end
      end

      ###########
      #
      # split_dir:
      # read all files in one directory and produce chunk files with _suffix_ in outdir
      # with a certain number of sentences in them (sent_num).
      # Optionally, remove all sentences longer than sent_leng.
      #
      # produces output files 1.<suffix>, 2.<suffix>, etc.
      #
      # assumes TabFormat sentences
      #
      # example: split_dir("/tmp/in", "/tmp/out", ".tab", 2000, 80)
      #
      # @note AB: A dummy reimplementation. Not doing splitting at all,
      #   to preserve the original file names: input files matching the
      #   suffix are copied verbatim into outdir. sent_num and sent_leng
      #   are currently unused but kept for interface compatibility.
      def split_dir(indir, outdir, suffix, sent_num, sent_leng = nil)
        # normalize both paths to end in a trailing slash
        indir += "/" unless indir.end_with?("/")
        outdir += "/" unless outdir.end_with?("/")

        Dir["#{indir}*#{suffix}"].each do |file|
          FileUtils.cp file, outdir
        end

        # @note AB: Not doing splitting for now.
        #   The historical chunking implementation is preserved below.
=begin
        outfile_counter = 0
        line_stack = []
        sent_stack = []

        Dir[indir + "*#{suffix}"].each do |infilename|
          LOGGER.info "Now splitting #{infilename}."

          infile = File.new(infilename)

          while (line = infile.gets)
            line.chomp!
            case line
            when "" # end of sentence
              if !(sent_leng.nil? or line_stack.length < sent_leng) # record sentence
                # suppress multiple empty lines
                # to avoid problems with lemmatiser
                # only record sent_stack if it is not empty.

                # change (sp 15 01 07): just cut off sentence at sent_leng.

                STDERR.puts "Cutting off long sentence #{line_stack.last.split("\t").last}"
                line_stack = line_stack[0...sent_leng]
              end

              unless line_stack.empty?
                sent_stack << line_stack
                # reset line_stack
                line_stack = []
              end

              # check if we have to empty the sent stack
              if sent_stack.length == sent_num # enough sentences for new outfile?
                outfile = File.new(outdir + outfile_counter.to_s + "#{suffix}", "w")

                sent_stack.each { |l_stack|
                  outfile.puts l_stack.join("\n")
                  outfile.puts
                }

                outfile.close
                outfile_counter += 1
                sent_stack = []
              end
            else # for any other line
              line_stack << line
            end
          end
          infile.close
        end

        # the last remaining sentences
        unless sent_stack.empty?
          File.open(outdir + outfile_counter.to_s + "#{suffix}", "w") do |outfile|
            sent_stack.each { |l_stack|
              l_stack << "\n"
              outfile.puts l_stack.join("\n")
            }
          end
        end
=end
      end
    end
  end
end
@@ -0,0 +1,531 @@
require 'logging'

require 'salsa_tiger_xml/salsa_tiger_xml_helper'
require 'frappe/file_parser'
require 'external_systems'

module Shalmaneser
  module Frappe
    # Converts parsed SalsaTab-with-POS corpora to SalsaTigerXML:
    # iterates over parsed files, merging head, lemma and semantic
    # (frame/role) information into one SalsaTigerXML file per input file.
    class SalsaTabWithPOSConverter
      # @param exp [Object] experiment configuration; must respond to #get
      def initialize(exp)
        @exp = exp
        # suffixes for different types of output files
        @file_suffixes = {"lemma" => ".lemma", "pos" => ".pos", "tab" => ".tab", "stxml" => ".xml"}
      end
      ###############
      # transform_salsatab
      #
      # transformation for Tab format files:
      #
      # - parse
      # - Transform parser output to SalsaTigerXML
      # If no parsing, make flat syntactic structure.
      # @param [String] input_dir Input directory.
      # @param [String] parse_dir Output directory for parses.
      # @param [String] output_dir Global output directory.
      def transform_salsatab_dir(input_dir, parse_dir, output_dir)
        ##
        # (Parse and) transform to SalsaTigerXML
        # get interpretation class for this
        # parser/lemmatizer/POS tagger combination
        interpreter_class = ExternalSystems.get_interpreter_according_to_exp(@exp)

        unless interpreter_class
          raise "Shouldn't be here"
        end

        parse_obj = FileParser.new(@exp, @file_suffixes, parse_dir, "tab_dir" => input_dir)

        parse_obj.each_parsed_file do |parsed_file_obj|
          outfilename = output_dir + parsed_file_obj.filename + ".xml"
          LOGGER.debug "Writing #{outfilename}."

          begin
            outfile = File.new(outfilename, "w")
          rescue
            raise "Cannot write to SalsaTigerXML output file #{outfilename}"
          end

          # NOTE(review): outfile is never explicitly closed; relies on
          # process exit / GC to flush. Consider File.open with a block.
          outfile.puts STXML::SalsaTigerXMLHelper.get_header
          # work with triples
          # SalsaTigerSentence, FNTabSentence,
          # hash: tab sentence index(integer) -> array:SynNode
          parsed_file_obj.each_sentence do |st_sent, tabformat_sent, mapping|
            # parsed: add headwords using parse tree
            if @exp.get("do_parse")
              add_head_attributes(st_sent, interpreter_class)
            end

            # add lemmas, if they are there. If they are not, don't print out a warning.
            if @exp.get("do_lemmatize")
              add_lemmas_from_tab(st_sent, tabformat_sent, mapping)
            end

            # add semantics
            # we can use the method in SalsaTigerXMLHelper
            # that reads semantic information from the tab file
            # and combines all targets of a sentence into one frame
            add_semantics_from_tab(st_sent, tabformat_sent, mapping, interpreter_class, @exp)

            # remove pseudo-frames from FrameNet data
            remove_deprecated_frames(st_sent, @exp)

            # handle multiword targets
            handle_multiword_targets(st_sent, interpreter_class, @exp.get("language"))

            # handle Unknown frame names
            handle_unknown_framenames(st_sent, interpreter_class)

            outfile.puts st_sent.get
          end
          outfile.puts STXML::SalsaTigerXMLHelper.get_footer
        end
      end

      # add lemma information to each terminal in a given SalsaTigerSentence object
      # @param [SalsaTigerSentence] st_sent
      # @param [FNTabFormatSentence] tab_sent
      # @param [Hash] mapping hash: tab lineno -> array:SynNode
      def add_lemmas_from_tab(st_sent, tab_sent, mapping)
        if tab_sent.nil?
          # tab sentence not found
          return
        end

        # produce list with word, lemma pairs
        lemmat = []
        tab_sent.each_line_parsed {|line|
          word = line.get("word")
          lemma = line.get("lemma")
          lemmat << [word, lemma]
        }

        # match with st_sent terminal list and add lemma attributes
        # KE Jan 07: if word mismatch,
        # set to Lemmatizer file version,
        # but count mismatches
        word_mismatches = []

        st_sent.each_terminal_sorted { |t|
          # find the first tab line whose mapped SynNodes include this terminal
          matching_lineno = (0...lemmat.length).to_a.detect { |tab_lineno|
            mapping[tab_lineno].include? t
          }
          unless matching_lineno
            next
          end
          word, lemma = lemmat[matching_lineno]

          # transform characters to XML-friendly form
          # for comparison with st_word, which is also escaped
          word = STXML::SalsaTigerXMLHelper.escape(word)
          st_word = t.word
          if word != st_word && word != STXML::SalsaTigerXMLHelper.escape(st_word)
            # true mismatch.
            # use the Lemmatizer version of the word, remember the mismatch
            word_mismatches << [st_word, word]
            t.set_attribute("word", word)
          end

          if lemma
            # we actually do have lemma information
            lemmatised_head = STXML::SalsaTigerXMLHelper.escape(lemma)
            t.set_attribute("lemma",lemmatised_head)
          end
        } # each terminal

        # did we have mismatches? then report them
        unless word_mismatches.empty?
          $stderr.puts "Warning: Word mismatches found between Lemmatizer file and SalsaTigerXML file generalted from parser output."
          $stderr.puts "(May be due to failed reencoding of special character in the parser output.)"
          $stderr.puts "I am using the Lemmatizer version by default."
          $stderr.puts "Version used:"
          $stderr.print "\t"
          st_sent.each_terminal_sorted { |t| $stderr.print ">>#{t}<<" }
          $stderr.puts
          $stderr.print "SalsaTigerXML file had: "
          $stderr.print word_mismatches.map { |st_word, tab_word|
            "#{st_word} instead of #{tab_word}"
          }.join(", ")
          $stderr.puts
        end
      end


      ###
      # add semantics from tab:
      #
      # add information about semantics from a FN tab sentence
      # to a SalsaTigerSentence object:
      # - frames (one frame per sentence)
      # - roles
      # - FrameNet grammatical functions
      # - FrameNet POS of target
      def add_semantics_from_tab(st_sent, # SalsaTigerSentence object
                                 tab_sent, # FNTabFormatSentence object
                                 mapping, # hash: tab lineno -> array:SynNode
                                 interpreter_class, # SynInterpreter class
                                 exp) # FrprepConfigData

        if tab_sent.nil?
          # tab sentence not found
          return
        end

        # iterate through frames in the tabsent
        frame_index = 0
        tab_sent.each_frame { |tab_frame_obj|
          frame_name = tab_frame_obj.get_frame # string

          if frame_name.nil? or frame_name =~ /^-*$/
            # weird: a frame without a frame
            $stderr.puts "Warning: frame entry without a frame in tab sentence #{st_sent.id}."
            $stderr.puts "Skipping"
            next
          end

          # frame IDs are "<sent_id>_f0", "<sent_id>_f1", ...
          frame_node = st_sent.add_frame(frame_name, tab_sent.get_sent_id + "_f#{frame_index}")
          frame_index += 1

          # target
          target_nodes = []
          tab_frame_obj.get_target_indices.each {|terminal_id|
            if mapping[terminal_id]
              target_nodes.concat mapping[terminal_id]
            end
          }

          # let the interpreter class decide on how to determine the maximum constituents
          target_maxnodes = interpreter_class.max_constituents(target_nodes, st_sent)
          if target_maxnodes.empty?
            # HIEr
            STDERR.puts "Warning: no target in frame entry, sentence #{st_sent.id}."
            $stderr.puts "frame is #{frame_name}, frame no #{frame_index}"
            $stderr.puts "Skipping."
            $stderr.puts "target indices: " + tab_frame_obj.get_target_indices.join(", ")
            #tab_sent.each_line { |line|
            #  $stderr.puts line
            #  $stderr.puts "--"
            #}
            next
          end
          frame_node.add_fe("target",target_maxnodes)

          # set features on target: target lemma, target POS
          target_lemma = tab_frame_obj.get_target
          target_pos = nil
          if target_lemma
            if exp.get("origin") == "FrameNet"
              # FrameNet data: here the lemma in the tab file has the form
              # <lemma>.<POS>
              # separate the two
              if target_lemma =~ /^(.*)\.(.*)$/
                target_lemma = $1
                target_pos = $2
              end
            end
            frame_node.target.set_attribute("lemma", target_lemma)
            if target_pos
              frame_node.target.set_attribute("pos", target_pos)
            end
          end

          # roles, GF, PT
          # synnode_markable_label:
          #   hash "role" | "gf" | "pt" -> SynNode -> array: label(string)
          layer_synnode_label = {}
          ["gf", "pt", "role"].each {|layer|
            termids2labels = tab_frame_obj.markables(layer)

            unless layer_synnode_label[layer]
              layer_synnode_label[layer] = {}
            end

            termids2labels.each {|terminal_indices, label|
              terminal_indices.each { |t_i|

                if (nodes = mapping[t_i])

                  nodes.each { |node|
                    unless layer_synnode_label[layer][node]
                      layer_synnode_label[layer][node] = []
                    end

                    layer_synnode_label[layer][node] << label
                  } # each node that t_i maps to
                end # if t_i maps to anything

              } # each terminal index
            } # each mapping terminal indices -> label
          } # each layer

          # 'stuff' (Support and other things)
          layer_synnode_label["stuff"] = {}
          tab_frame_obj.each_line_parsed { |line_obj|
            if (label = line_obj.get("stuff")) != "-"
              if (nodes = mapping[line_obj.get("lineno")])
                nodes.each { |node|
                  unless layer_synnode_label["stuff"][node]
                    layer_synnode_label["stuff"][node] = []
                  end
                  layer_synnode_label["stuff"][node] << label
                }
              end
            end
          }

          # reencode:
          # hash role_label(string) -> array of tuples [synnodes, gflabels, ptlabels]
          #  synnodes: array:SynNode.  gflabels, ptlabels: array:String
          #
          # note that in this step, any gf or pt labels that have been
          # assigned to a SynNode that has not also been assigned a role
          # will be lost
          role2nodes_labels = {}
          layer_synnode_label["role"].each_pair { |synnode, labels|
            labels.each { | rolelabel|
              unless role2nodes_labels[rolelabel]
                role2nodes_labels[rolelabel] = []
              end

              role2nodes_labels[rolelabel] << [
                synnode,
                layer_synnode_label["gf"][synnode],
                layer_synnode_label["pt"][synnode]
              ]
            } # each role label
          } # each pair SynNode/role labels

          # reencode "stuff", but only the support cases
          # NOTE(review): "Support" is initialized unconditionally, so the
          # loop below may leave it empty and an FE over an empty node list
          # gets created later — confirm this is intended.
          role2nodes_labels["Support"] = []

          layer_synnode_label["stuff"].each_pair { |synnode, labels|
            labels.each { |stufflabel|
              if stufflabel =~ /Supp/
                # some sort of support
                role2nodes_labels["Support"] << [synnode, nil, nil]
              end
            }
          }

          ##
          # each role label:
          # make FeNode for the current frame
          role2nodes_labels.each_pair { |rolelabel, node_gf_pt|

            # get list of syn nodes, GF and PT labels for this role
            # shortcut for GF and PT labels: take any labels that have
            # been assigned for _some_ Synnode of this role
            synnodes = node_gf_pt.map { |ngp| ngp[0] }
            gflabels = node_gf_pt.map { |ngp| ngp[1] }.compact.flatten.uniq
            ptlabels = node_gf_pt.map { |ngp| ngp[2] }.compact.flatten.uniq


            # let the interpreter class decide on how to
            # determine the maximum constituents
            maxnodes = interpreter_class.max_constituents(synnodes, st_sent)

            fe_node = st_sent.add_fe(frame_node, rolelabel, maxnodes)
            unless gflabels.empty?
              fe_node.set_attribute("gf", gflabels.join(","))
            end
            unless ptlabels.empty?
              fe_node.set_attribute("pt", ptlabels.join(","))
            end
          } # each role label
        } # each frame
      end


      ######
      # handle multiword targets:
      # if you find a verb with a separate prefix,
      # change the verb's lemma information accordingly
      # and add an attribute "other_words" to the verb node
      # pointing to the other node
      #
      # In general, it will be assumed that "other_words" contains
      # a list of node IDs for other nodes belonging to the same
      # group, node IDs separated by spaces, and that
      # each node of a group has the "other_words" attribute.
      #
      def handle_multiword_targets(sent, # SalsaTigerSentence object
                                   interpreter, # SynInterpreter object
                                   language) # string: en, de
        ##
        # only retain the interesting words of the sentence:
        # content words and prepositions
        if sent.nil?
          return
        end

        nodes = sent.terminals.select { |node|
          [
            "adj", "adv", "card", "noun", "part", "prep", "verb"
          ].include? interpreter.category(node)
        }

        ##
        # group:
        # group verbs with their separate particles
        # (at a later point, other types of grouping can be inserted here)
        groups = group_words(nodes, interpreter)

        ##
        # record grouping information as attributes on the terminals.
        groups.each { |descr, group_of_nodes|
          case descr
          when "none"
            # no grouping
          when "part"
            # separate particle belonging to a verb

            # group_of_nodes is a pair [verb, particle]
            verb, particle = group_of_nodes

            verb.set_attribute("other_words", particle.id)
            particle.set_attribute("other_words", verb.id)

            if verb.get_attribute("lemma") and particle.get_attribute("lemma")
              case language
              when "de"
                # German: prepend SVP to get the real lemma of the verb
                verb.set_attribute("lemma",
                                   particle.get_attribute("lemma") +
                                   verb.get_attribute("lemma"))
              when "en"
                # English: append particle as separate word after the lemma of the verb
                verb.set_attribute("lemma",
                                   verb.get_attribute("lemma") + " " +
                                   particle.get_attribute("lemma"))
              else
                # default
                verb.set_attribute("lemma",
                                   verb.get_attribute("lemma") + " " +
                                   particle.get_attribute("lemma"))
              end
            end

          else
            raise "Shouldn't be here: unexpected description #{descr}"
          end
        }
      end

      ########################
      # group_words
      #
      # auxiliary of transform_multiword targets
      #
      # Group terminals:
      # At the moment, just find separate prefixes and particles
      # for verbs
      #
      # returns: list of pairs [descr, nodes]
      # descr: string, "none" (no group), "part" (separate verb particle)
      # nodes: array:SynNode
      def group_words(nodes, # array: SynNode
                      interpreter) # SynInterpreter object

        retv = [] # array of groups, array:array:SynNode
        done = [] # remember nodes already covered

        nodes.each { |terminal_node|
          if done.include? terminal_node
            # we have already included this node in one of the groups
            next
          end

          if (svp = interpreter.particle_of_verb(terminal_node, nodes))
            retv << ["part", [terminal_node, svp]]
            done << terminal_node
            done << svp
          else
            retv << ["none", [terminal_node]]
            done << terminal_node
          end

        }

        return retv
      end

      ######
      # handle unknown framenames
      #
      # For all frames with names matching Unknown\d+,
      # rename them to <lemma>_Unknown\d+
      def handle_unknown_framenames(sent, # SalsaTigerSentence
                                    interpreter) # SynInterpreter class
        if sent.nil?
          return
        end

        sent.each_frame { |frame|
          if frame.name =~ /^Unknown/
            if frame.target
              maintarget = interpreter.main_node_of_expr(frame.target.children, "no_mwe")
            else
              maintarget = nil
            end
            unless maintarget
              $stderr.puts "Warning: Unknown frame, and I could not determine the target lemma: Frame #{frame.id}"
              $stderr.puts "Cannot repair frame name, leaving it as is."
              # NOTE(review): this `return` exits the whole method, so any
              # remaining Unknown frames of this sentence are never repaired —
              # `next` may be what was intended; confirm before changing.
              return
            end

            # get lemma, if it exists, otherwise get word
            # also, if the lemmatizer has returned a disjunction of lemmas,
            # get the first disjunct
            lemma = interpreter.lemma_backoff(maintarget)
            if lemma
              # we have a lemma
              frame.set_name(lemma + "_" + frame.name)
            else
              # the main target word has no lemma attribute,
              # and somehow I couldn't even get the target word
              $stderr.puts "Warning: Salsa 'Unknown' frame."
              $stderr.puts "Trying to make its lemma-specificity explicit, but"
              $stderr.puts "I could not determine the target lemma nor the target word: frame #{frame.id}"
              $stderr.puts "Leaving 'Unknown' as it is."
            end
          end
        }
      end


      ####################
      # add head attributes to each nonterminal in each
      # SalsaTigerXML file in a directory
      # @param [SalsaTigerSentence] st_sent
      # @param [SynInterpreter] interpreter
      def add_head_attributes(st_sent, interpreter)
        st_sent.each_nonterminal do |nt_node|
          head_term = interpreter.head_terminal(nt_node)
          if head_term && head_term.word
            nt_node.set_attribute("head", head_term.word)
          else
            # fall back to a placeholder when no head word can be found
            nt_node.set_attribute("head", "--")
          end
        end # each nonterminal
      end

      ###################
      # given a SalsaTigerSentence,
      # look for FrameNet frames that are
      # test frames, and remove them
      # @param [SalsaTigerSentence] sent
      # @param [FrprepConfigData] exp
      def remove_deprecated_frames(sent, exp)
        # only FrameNet corpora contain these pseudo-frames
        unless exp.get("origin") == "FrameNet"
          return
        end

        # NOTE(review): frames are removed while iterating sent.frames —
        # safe only if #frames returns a snapshot array; confirm.
        sent.frames.each do |frame_obj|
          if frame_obj.name == "Boulder" || frame_obj.name =~ /^Test/
            sent.remove_frame(frame_obj)
          end
        end
      end
    end
  end
end