shalmaneser-frappe 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/frappe/Ampersand.rb +41 -0
- data/lib/frappe/file_parser.rb +126 -0
- data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
- data/lib/frappe/frappe.rb +217 -0
- data/lib/frappe/frappe_flat_syntax.rb +89 -0
- data/lib/frappe/frappe_read_stxml.rb +48 -0
- data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
- data/lib/frappe/interfaces/collins_interface.rb +340 -0
- data/lib/frappe/interfaces/counter.rb +19 -0
- data/lib/frappe/interfaces/stanford_interface.rb +353 -0
- data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
- data/lib/frappe/interfaces/treetagger_module.rb +111 -0
- data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
- data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
- data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
- data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
- data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
- data/lib/frappe/interpreters/headz.rb +265 -0
- data/lib/frappe/interpreters/headz_helpers.rb +54 -0
- data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
- data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
- data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
- data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
- data/lib/frappe/one_parsed_file.rb +31 -0
- data/lib/frappe/opt_parser.rb +92 -0
- data/lib/frappe/path.rb +199 -0
- data/lib/frappe/plain_converter.rb +59 -0
- data/lib/frappe/salsa_tab_converter.rb +154 -0
- data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
- data/lib/frappe/stxml_converter.rb +666 -0
- data/lib/frappe/syn_interface.rb +76 -0
- data/lib/frappe/syn_interface_stxml.rb +173 -0
- data/lib/frappe/syn_interface_tab.rb +39 -0
- data/lib/frappe/utf_iso.rb +27 -0
- data/lib/shalmaneser/frappe.rb +1 -0
- metadata +130 -0
@@ -0,0 +1,666 @@
|
|
1
|
+
require 'logging'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'external_systems'
|
4
|
+
require 'frappe/file_parser'
|
5
|
+
require 'salsa_tiger_xml/file_parts_parser'
|
6
|
+
require 'salsa_tiger_xml/salsa_tiger_sentence'
|
7
|
+
require 'salsa_tiger_xml/salsa_tiger_xml_helper'
|
8
|
+
require 'salsa_tiger_xml/corpus'
|
9
|
+
require 'salsa_tiger_xml/salsa_tiger_sentence'
|
10
|
+
require 'tabular_format/fn_tab_format_file'
|
11
|
+
require 'frappe/fix_syn_sem_mapping'
|
12
|
+
|
13
|
+
module Shalmaneser
|
14
|
+
module Frappe
|
15
|
+
class STXMLConverter
|
16
|
+
# @param exp [FrappeConfigData] experiment configuration object
def initialize(exp)
  # Experiment configuration, queried via #get throughout this class.
  @exp = exp

  # @todo Implement the logger as a mixin for all classes.
  @logger = LOGGER

  # Suffixes for the different types of intermediate/output files.
  @file_suffixes = {
    "lemma" => ".lemma",
    "pos"   => ".pos",
    "tab"   => ".tab",
    "stxml" => ".xml"
  }
end
|
23
|
+
#############################################
|
24
|
+
# transform_stxml
|
25
|
+
#
|
26
|
+
# transformation for SalsaTigerXML data
|
27
|
+
#
|
28
|
+
# - If the input format was SalsaTigerXML:
|
29
|
+
# - Tag, lemmatize and parse, if the experiment file tells you so
|
30
|
+
#
|
31
|
+
# - If the origin is the Salsa corpus:
|
32
|
+
# Change frame names from Unknown\d+ to lemma_Unknown\d+
|
33
|
+
#
|
34
|
+
# - fix multiword lemmas, or at least try
|
35
|
+
# - transform to UTF 8
|
36
|
+
# string: name of directory for parse data
|
37
|
+
# string: name of directory for split/tab data
|
38
|
+
# string: name of input directory
|
39
|
+
# string: name of final output directory
|
40
|
+
# FrappeConfigData
|
41
|
+
def transform_stxml_dir(parse_dir, tab_dir, input_dir, output_dir)
  ####
  # Data preparation

  # Data with Salsa as origin:
  # remember the target lemma as an attribute on the
  # <target> elements
  #
  # currently deactivated: encoding problems
  # if @exp.get("origin") == "SalsaTiger"
  #   $stderr.puts "Frprep: noting target lemmas"
  #   changed_input_dir = frprep_dirname("salsalemma", "new")
  #   note_salsa_targetlemmas(input_dir, changed_input_dir)

  #   # remember changed input dir as input dir
  #   input_dir = changed_input_dir
  # end

  # If data is to be parsed, split input files
  # else copy data to stxml_indir.
  # stxml_dir: directory where SalsaTiger data is situated
  if @exp.get("do_parse")
    # split data
    stxml_splitdir = frprep_dirname("stxml_split", "new")
    stxml_dir = stxml_splitdir

    LOGGER.info "#{PROGRAM_NAME}: Splitting the input data into #{stxml_dir}."

    stxml_split_dir(input_dir, stxml_splitdir, @exp.get("parser_max_sent_num"), @exp.get("parser_max_sent_len"))
  else
    # no parsing: copy data to split dir
    # NOTE(review): the copy target is parse_dir here, not a dedicated
    # split dir — presumably intended, but confirm against callers.
    stxml_dir = parse_dir

    LOGGER.info "#{PROGRAM_NAME}: Copying data to #{stxml_dir}"

    Dir[input_dir + "*.xml"].each { |f| FileUtils.cp(f, stxml_dir) }
  end

  # Some syntactic processing will take place:
  # tabify data (one .tab file per .xml input file)
  if @exp.get("do_parse") || @exp.get("do_lemmatize") || @exp.get("do_postag")
    LOGGER.info "#{PROGRAM_NAME}: Making input for syn. processing."
    Dir[stxml_dir + "*" + @file_suffixes["stxml"]].each do |stxmlfilename|
      tabfilename = tab_dir + File.basename(stxmlfilename, @file_suffixes["stxml"]) + @file_suffixes["tab"]
      stxml_to_tab_file(stxmlfilename, tabfilename)
    end
  end

  ###
  # POS-tagging: run the configured tagger over the tab files in place
  if @exp.get("do_postag")
    LOGGER.info "#{PROGRAM_NAME}: Tagging."
    sys_class = ExternalSystems.get_interface("pos_tagger", @exp.get("pos_tagger"))
    sys = sys_class.new(@exp.get("pos_tagger_path"), @file_suffixes["tab"], @file_suffixes["pos"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Lemmatization: run the configured lemmatizer over the tab files in place
  if @exp.get("do_lemmatize")
    LOGGER.info "#{PROGRAM_NAME}: Lemmatizing."
    sys_class = ExternalSystems.get_interface("lemmatizer", @exp.get("lemmatizer"))
    sys = sys_class.new(@exp.get("lemmatizer_path"), @file_suffixes["tab"], @file_suffixes["lemma"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Parsing, production of SalsaTigerXML output

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  sys_class_names = {}

  [["do_postag", "pos_tagger"], ["do_lemmatize", "lemmatizer"], ["do_parse", "parser"]].each do |service, system_name|
    # yes, perform this service
    if @exp.get(service)
      sys_class_names[system_name] = @exp.get(system_name)
    end
  end

  interpreter_class = ExternalSystems.get_interpreter(sys_class_names)

  unless interpreter_class
    raise "Shouldn't be here"
  end

  parse_obj = FileParser.new(@exp, @file_suffixes, parse_dir, "tab_dir" => tab_dir, "stxml_dir" => stxml_dir)
  parse_obj.each_parsed_file do |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    LOGGER.debug "Writing #{outfilename}."
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end
    # NOTE(review): outfile is never explicitly closed — relies on GC /
    # process exit to flush; consider an ensure/close.
    if @exp.get("do_parse")
      # read old SalsaTigerXML file
      # so we can integrate the old file's semantics later
      # array of sentence strings
      oldxml = []
      # we assume that the old and the new file have the same name,
      # ending in .xml.
      oldxmlfile = STXML::FilePartsParser.new(stxml_dir + parsed_file_obj.filename + ".xml")
      oldxmlfile.scan_s do |sent_string|
        # remember this sentence by its ID
        oldxml << sent_string
      end
    end
    outfile.puts STXML::SalsaTigerXMLHelper.get_header
    index = 0
    # work with triples
    # SalsaTigerSentence, FNTabSentence,
    # hash: tab sentence index(integer) -> array:SynNode
    parsed_file_obj.each_sentence do |st_sent, tabformat_sent, mapping|
      # parsed? then integrate semantics and lemmas from old file
      if @exp.get("do_parse")
        oldsent_string = oldxml[index]
        index += 1
        if oldsent_string
          oldsent_string = escape_berkeley_chars(oldsent_string)
          # we have both an old and a new sentence, so integrate semantics
          oldsent = STXML::SalsaTigerSentence.new(oldsent_string)

          next if st_sent.nil?

          # NOTE(review): if the first integration attempt fails, the code
          # below advances to the NEXT old sentence and retries once —
          # presumably to re-sync old/new alignment after a skipped
          # sentence; confirm this offset handling is intended.
          unless integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp)
            oldsent_string = oldxml[index]
            index += 1
            if oldsent_string
              oldsent_string = escape_berkeley_chars(oldsent_string)
              # we have both an old and a new sentence, so integrate semantics
              oldsent = STXML::SalsaTigerSentence.new(oldsent_string)

              integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp)
            end
          end
        else
          # no corresponding old sentence for this new sentence
          @logger.warn "Warning: Transporting semantics - missing source sentence, skipping"
        end
      end
      # remove pseudo-frames from FrameNet data
      remove_deprecated_frames(st_sent, @exp)
      # repair syn/sem mapping problems?
      if @exp.get("fe_syn_repair") || @exp.get("fe_rel_repair")
        FixSynSemMapping.fixit(st_sent, @exp, interpreter_class)
      end

      outfile.puts st_sent.get
    end # each ST sentence
    outfile.puts STXML::SalsaTigerXMLHelper.get_footer
  end # each file parsed
end
|
194
|
+
|
195
|
+
####
|
196
|
+
# transform SalsaTigerXML file to Tab format file
|
197
|
+
# @param [String] input_filename Name of input file.
|
198
|
+
# @param [String] output_filename Name of output file.
|
199
|
+
# @param [FrappeConfigData]
|
200
|
+
# Transform a SalsaTigerXML file to a Tab format file: one token per
# line (word + sentence id), sentences separated by blank lines.
#
# Fix: the original applied the same `'' -> "` substitution twice
# (`/''/` and the identical `%r{\'\'}`); the redundant duplicate is
# removed. Behavior is unchanged.
#
# @param input_filename [String] name of the SalsaTigerXML input file
# @param output_filename [String] name of the Tab output file
def stxml_to_tab_file(input_filename, output_filename)
  corpus = STXML::Corpus.new(input_filename)

  File.open(output_filename, 'w') do |f|
    corpus.each_sentence do |sentence|
      # Corpus#each_sentence is expected to yield Nokogiri elements.
      raise 'Interface changed!!!' unless sentence.is_a?(Nokogiri::XML::Element)
      id = sentence.attributes['id'].value
      words = sentence.xpath('.//t')
      words.each do |word|
        word = STXML::SalsaTigerXMLHelper.unescape(word.attributes['word'].value)
        # @todo AB: I don't know why the Berkeley Parser wants this.
        #   Investigate if every Grammar needs this conversion.
        #   Try to move this convertion from FrappeHelper to BerkeleyInterface.
        if @exp.get("parser") == "berkeley"
          word.gsub!(/\(/, "*LRB*")
          word.gsub!(/\)/, "*RRB*")
          word.gsub!(/``/, '"')
          word.gsub!(/''/, '"')
        end
        fields = {'word' => word, 'sent_id' => id}
        f.puts FNTabFormatFile.format_str(fields)
      end
      # blank line terminates the sentence in Tab format
      f.puts
    end
  end
end
|
228
|
+
###############
|
229
|
+
# frprep_dirname:
|
230
|
+
# make directory name for frprep-internal data
|
231
|
+
# of a certain kind described in <subdir>
|
232
|
+
#
|
233
|
+
# frprep_directory has one subdirectory for each experiment ID,
|
234
|
+
# and below that there is one subdir per subtask
|
235
|
+
#
|
236
|
+
# If this is a new directory, it is constructed,
|
237
|
+
# if it should be an existing directory, its existence is checked.
|
238
|
+
# @param subdir [String] designator of a subdirectory
|
239
|
+
# @param neu [Nil] non-nil This may be a new directory
|
240
|
+
###############
# frprep_dirname:
# build the name of a frprep-internal data directory for the subtask
# described by +subdir+.
#
# frprep_directory has one subdirectory for each experiment ID,
# and below that there is one subdir per subtask.
#
# If this is a new directory, it is constructed;
# if it should be an existing directory, its existence is checked.
# @param subdir [String] designator of a subdirectory
# @param neu [Nil] non-nil This may be a new directory
def frprep_dirname(subdir, neu = nil)
  dirname = File.new_dir(@exp.get("frprep_directory"),
                         @exp.get("prep_experiment_ID"),
                         subdir)

  if neu
    File.new_dir(dirname)
  else
    File.existing_dir(dirname)
  end
end
|
245
|
+
|
246
|
+
|
247
|
+
# Replace parenthesis tokens inside word attributes with the Berkeley
# parser placeholders *LRB* / *RRB* (in place, via gsub!).
# Only active when the configured parser is "berkeley"; the string is
# returned either way.
#
# modified by ines, 27/08/08
# @note AB: Duplicated code!! Move it to the Berkeley Interface.
def escape_berkeley_chars(str)
  if @exp.get("parser") == "berkeley"
    substitutions = [
      [/word='\('/,   "word='*LRB*'"],
      [/word='\)'/,   "word='*RRB*'"],
      [/word=\"\(\"/, "word='*LRB*'"],
      [/word=\"\)\"/, "word='*RRB*'"]
    ]
    substitutions.each { |pattern, replacement| str.gsub!(pattern, replacement) }
  end

  str
end
|
260
|
+
|
261
|
+
|
262
|
+
####
|
263
|
+
# stxml_split_dir
|
264
|
+
#
|
265
|
+
# split SalsaTigerXML files into new files of given length,
|
266
|
+
# skipping sentences that are too long
|
267
|
+
#
|
268
|
+
# At the same time, sentences that occur several times (i.e. sentences which are
|
269
|
+
# annotated by SALSA for more than one predicate) are compacted into one occurrence
|
270
|
+
# with combined semantics.
|
271
|
+
#
|
272
|
+
# assumes that all files in input_dir with
|
273
|
+
# extension .xml are SalsaTigerXMl files
|
274
|
+
####
# stxml_split_dir
#
# Split SalsaTigerXML files into new files of given length, skipping
# sentences that are too long, and compacting sentences annotated for
# several predicates into one occurrence with combined semantics.
#
# Currently the splitting algorithm is switched off (see the disabled
# block below): the method simply copies every *.xml file from
# input_dir to split_dir.
#
# Assumes that all files in input_dir with extension .xml are
# SalsaTigerXML files.
#
# @param input_dir [String] input directory with STXML files
# @param split_dir [String] output directory
# @param max_sentnum [Integer] max num of sentences per file (unused while splitting is off)
# @param max_sentlen [Integer] max num of terminals per sentence (unused while splitting is off)
def stxml_split_dir(input_dir, split_dir, max_sentnum, max_sentlen)
  # @note AB: Effectively copying all files.
  Dir.glob("#{input_dir}*.xml") do |xml_file|
    FileUtils.cp(xml_file, split_dir)
  end

  # @note AB: Switch off splitting for now.
  # The algorithms are weird.
=begin
  $stderr.puts "Frprep: splitting data"

  filenames = Dir[input_dir + "*.xml"].to_a

  graph_hash = {} # for each sentence id, keep <s...</graph>
  frame_hash = {} # for each sentence id , keep the <frame... </frame> string
  uspfes_hash = {} # for each sentence id, keep the uspfes stuff
  uspframes_hash = {} # for each sentence id, keep the uspframes stuff

  ########################
  # Traverse of file(s): compute an index of all frames for each sentence, with unique identifiers

  filenames.each { |filename|

    infile = STXML::FilePartsParser.new(filename)
    infile.scan_s { |sent_str|

      sentlen = 0
      sent_str.delete("\n").scan(/<t\s/) { |occ| sentlen += 1}
      if sentlen > max_sentlen
        sent = STXML::RegXML.new(sent_str)
        # revisit handling of long sentences
        # $stderr.puts "I would have skipped overly long sentence " + sent.attributes["id"]+" but Sebastian forbade me.".to_s
        # next
      end

      # substitute old frame identifiers with new, unique ones

      # problem: we may have several frames per sentence, and need to keep track of them
      # if we rename etc sxx_f1 to sxx_f2 and there is already a sxx_f2, then
      # we cannot distinguish between these frames

      # therefore, we substitute temporary identifiers until we have substituted
      # all ids with temporary ones, and re-substitute final ones at the end.

      this_frames = []

      temp_subs = []
      final_subs = []

      sent = STXML::RegXML.new(sent_str)
      sentid = sent.attributes["id"].to_s
      if sentid.nil?
        STDERR.puts "[frprep] Warning: cannot find sentence id, skipping sentence:"
        STDERR.puts sent_str
        # strange sentence, no ID? skip
        next
      end

      unless frame_hash.key? sentid
        frame_hash[sentid] = []
        uspfes_hash[sentid] = []
        uspframes_hash[sentid] = []
      end

      # find everything up to and including the graph
      sent_children = sent.children_and_text
      graph = sent_children.detect { |child| child.name == "graph" }
      graph_hash[sentid] = "<s " +
        sent.attributes.to_a.map { |at, val| "#{at}=\'#{val}\'" }.join(" ") +
        ">" +
        graph.to_s

      # find the usp block

      sem = sent_children.detect { |child| child.name == "sem"}
      usp = ""
      if sem
        usp = sem.children_and_text.detect { |child| child.name == "usp" }
        usp = usp.to_s
      end

      # find all frames
      if sem
        frames = sem.children_and_text.detect { |child| child.name == "frames" }
        if frames
          frames.children_and_text.each { |frame|
            unless frame.name == "frame"
              next
            end
            frameid = frame.attributes["id"]

            temp_frameid = "#{sentid}_temp_f#{frame_hash[sentid].length + this_frames.length + 1}"
            final_frameid = "#{sentid}_f#{frame_hash[sentid].length + this_frames.length + 1}"

            temp_subs << [frameid, temp_frameid]
            final_subs << [temp_frameid, final_frameid]

            this_frames << frame.to_s
          }
        end
      end

      # now first rename all the frames to temporary names

      temp_subs.each {|orig_frameid, temp_frameid|
        this_frames.map! {|frame_str|
          #print "orig ", orig_frameid, " temp ", temp_frameid, "\n"
          frame_str.gsub(orig_frameid,temp_frameid)
        }

        usp.gsub!(orig_frameid,temp_frameid)
      }

      # and re-rename the temporary names

      final_subs.each {|temp_frameid, final_frameid|
        this_frames.map! {|frame_str|
          frame_str.gsub(temp_frameid,final_frameid)
        }
        usp.gsub!(temp_frameid, final_frameid)
      }

      # store frames in data structure
      this_frames.each {|frame_str|
        frame_hash[sentid] << frame_str
      }

      # store uspfes in data structure
      unless usp.empty?
        usp_elt = STXML::RegXML.new(usp)
        uspfes = usp_elt.children_and_text.detect { |child| child.name == "uspfes" }
        uspfes.children_and_text.each { |child|
          unless child.name == "uspblock"
            next
          end
          uspfes_hash[sentid] << child.to_s
        }

        # store uspframes in data structure
        uspframes = usp_elt.children_and_text.detect { |child| child.name == "uspframes" }
        uspframes.children_and_text.each { |child|
          unless child.name == "uspblock"
            next
          end
          uspframes_hash[sentid] << child.to_s
        }
      end
    }
  }

  # now write everything in the data structure back to a file

  filecounter = 0
  sentcounter = 0
  outfile = nil
  sent_stack = []

  graph_hash = graph_hash.sort { |a, b| a[0].to_i <=> b[0].to_i }

  graph_hash.each do |sentid, graph_str|
    unless outfile
      outfile = File.new(split_dir + filecounter.to_s + ".xml", "w")
      outfile.puts STXML::SalsaTigerXMLHelper.get_header
      filecounter += 1
      sentcounter = 0
    end

    xml = []
    xml << graph_str
    xml << "<sem>"
    xml << "<globals>"
    xml << "</globals>"
    xml << "<frames>"

    frame_hash[sentid].each { |frame_str| xml << frame_str }

    xml << "</frames>"
    xml << "<usp>"
    xml << "<uspframes>"

    uspframes_hash[sentid].each { |uspblock_str| xml << uspblock_str }

    xml << "</uspframes>"
    xml << "<uspfes>"

    uspfes_hash[sentid].each { |uspblock_str| xml << uspblock_str }

    xml << "</uspfes>"
    xml << "</usp>"
    xml << "</sem>"
    xml << "</s>"

    outfile.puts xml.join("\n")
    sentcounter += 1
  end

  if outfile
    outfile.puts STXML::SalsaTigerXMLHelper.get_footer
    outfile.close
    outfile = nil
  end
=end
end
|
482
|
+
|
483
|
+
|
484
|
+
#####################
|
485
|
+
#
|
486
|
+
# Integrate the semantic annotation of an old sentence
|
487
|
+
# into the corresponding new sentence
|
488
|
+
# At the same time, integrate the lemma information from the
|
489
|
+
# old sentence into the new sentence
|
490
|
+
# Integrate the semantic annotation (frames, FEs, underspecification)
# and the lemma attributes of an old sentence into the corresponding
# newly parsed sentence. Terminals are matched positionally after
# sorting; if the words of old and new sentence cannot be matched,
# nothing is transferred.
#
# @param oldsent old SalsaTigerSentence carrying the annotation
# @param newsent newly parsed SalsaTigerSentence to annotate
# @param interpreter_class decides how to compute maximum constituents
# @param exp [FrappeConfigData] experiment configuration (currently unused here)
# @return false when the terminals could not be matched; otherwise the
#   return value is not meaningful (nil on early exit or the result of
#   the last copy loop) — callers only test for false-ness.
def integrate_stxml_semantics_and_lemmas(oldsent,
                                         newsent,
                                         interpreter_class,
                                         exp)
  if oldsent.nil? or newsent.nil?
    return
  end
  ##
  # match old and new sentence via terminals
  newterminals = newsent.terminals_sorted
  oldterminals = oldsent.terminals_sorted
  # sanity check: exact match on terminals?
  # NOTE(review): #interleave is a project-local Array extension —
  # presumably pairs elements like zip; confirm against its definition.
  newterminals.interleave(oldterminals).each { |newnode, oldnode|
    #print "old ", oldnode.word, " ", newnode.word, "\n"
    # new and old word: use both unescaped and escaped variant
    if newnode
      newwords = [ newnode.word, STXML::SalsaTigerXMLHelper.escape(newnode.word) ]
    else
      newwords = [nil, nil]
    end
    if oldnode
      oldwords = [ oldnode.word, STXML::SalsaTigerXMLHelper.escape(oldnode.word) ]
    else
      oldwords = [ nil, nil]
    end

    if (newwords & oldwords).empty?
      # old and new word don't match, either escaped or non-escaped

      $stderr.puts "Warning: could not match terminals of sentence #{newsent.id}"
      $stderr.puts "This means that I cannot match the semantic annotation"
      $stderr.puts "to the newly parsed sentence. Skipping."
      #$stderr.puts "Old sentence: "
      #$stderr.puts oldterminals.map { |n| n.word }.join("--")
      #$stderr.puts "New sentence: "
      #$stderr.puts newterminals.map { |n| n.word }.join("--")
      return false
    end
  }

  ##
  # copy lemma information (positional: i-th old terminal -> i-th new terminal)
  oldterminals.each_with_index { |oldnode, ix|
    newnode = newterminals[ix]
    if oldnode.get_attribute("lemma")
      newnode.set_attribute("lemma", oldnode.get_attribute("lemma"))
    end
  }

  ##
  # copy frames
  oldsent.each_frame { |oldframe|
    # make new frame with same ID
    newframe = newsent.add_frame(oldframe.name, oldframe.id)
    # copy FEs
    oldframe.each_child { |oldfe|
      # new nodes: map old terminals to new terminals,
      # then find max constituents covering them
      newnodes = oldfe.descendants.select { |n|
        n.is_terminal?
      }.map { |n|
        oldterminals.index(n)
      }.map { |ix|
        newterminals[ix]
      }

      # let the interpreter class decide on how to determine the maximum constituents
      newnodes = interpreter_class.max_constituents(newnodes, newsent)

      # make new FE with same ID
      new_fe = newsent.add_fe(newframe, oldfe.name, newnodes, oldfe.id)
      # keep all attributes of the FE
      if oldfe.get_f("attributes")
        oldfe.get_f("attributes").each_pair { |attr, value|
          new_fe.set_attribute(attr, value)
        }
      end
    }
  }

  ##
  ### changed by ines => appears twice in stxml file

  # copy underspecification
  # keep as is, since we've kept all frame and FE IDs
  oldsent.each_usp_frameblock { |olduspframe|
    newuspframe = newsent.add_usp("frame")
    olduspframe.each_child { |oldnode|
      newnode = newsent.sem_node_with_id(oldnode.id)
      if newnode
        newuspframe.add_child(newnode)
      else
        $stderr.puts "Error: unknown frame with ID #{oldnode.id}"
      end
    }
  }
  oldsent.each_usp_feblock { |olduspfe|
    newuspfe = newsent.add_usp("fe")
    olduspfe.each_child { |oldnode|
      newnode = newsent.sem_node_with_id(oldnode.id)
      if newnode
        newuspfe.add_child(newnode)
      else
        $stderr.puts "Error: unknown FE with ID #{oldnode.id}"
      end
    }
  }

end
|
599
|
+
####
|
600
|
+
# note salsa targetlemma
|
601
|
+
#
|
602
|
+
# old_dir contains xml files whose name starts with the
|
603
|
+
# target lemma for all frames in the file
|
604
|
+
# record that target lemma in the <target> element of each frame
|
605
|
+
####
# note salsa targetlemma
#
# old_dir contains xml files whose name starts with the
# target lemma for all frames in the file;
# record that target lemma in the <target> element of each frame.
#
# Fix: the fallback branch shelled out to `cp` with a garbled
# interpolation; replaced with the portable FileUtils.cp (fileutils is
# already required at the top of this file).
#
# @param old_dir [String] input directory, string ending in /
# @param new_dir [String] output directory, string ending in /
def note_salsa_targetlemma(old_dir, new_dir)
  # each input file: extract target lemma from filename,
  # note this lemma in the <target> element of each frame
  Dir[old_dir + "*.xml"].each { |filename|
    changedfilename = new_dir + File.basename(filename)

    # lemma = filename prefix up to the first "_" or "."
    if File.basename(filename) =~ /^(.*?)[_\.]/
      lemma = $1

      infile = STXML::FilePartsParser.new(filename)
      outfile = File.new(changedfilename, "w")

      # write header
      outfile.puts infile.head

      # iterate through sentences, yield as SalsaTigerSentence objects
      infile.scan_s { |sent_string|
        sent = STXML::SalsaTigerSentence.new(sent_string)
        sent.each_frame { |frame|
          frame.target.set_attribute("lemma", lemma)
        }

        # write changed sentence
        outfile.puts sent.get
      } # each sentence

      # write footer
      outfile.puts infile.tail
      infile.close
      outfile.close
    else
      # couldn't determine lemma
      # just copy the file
      FileUtils.cp(filename, changedfilename)
    end
  }
end
|
646
|
+
|
647
|
+
###################3
|
648
|
+
# given a SalsaTigerSentence,
|
649
|
+
# look for FrameNet frames that are
|
650
|
+
# test frames, and remove them
|
651
|
+
# @param [SalsaTigerSentence] sent
|
652
|
+
# @param [FrprepConfigData] exp
|
653
|
+
# Given a SalsaTigerSentence, look for FrameNet frames that are test
# frames ("Boulder", or any name starting with "Test") and remove them.
# Does nothing unless the experiment origin is "FrameNet".
# @param [SalsaTigerSentence] sent
# @param [FrprepConfigData] exp
def remove_deprecated_frames(sent, exp)
  return unless exp.get("origin") == "FrameNet"

  deprecated = sent.frames.select do |frame|
    frame.name == "Boulder" || frame.name =~ /^Test/
  end
  deprecated.each { |frame| sent.remove_frame(frame) }
end
|
664
|
+
end
|
665
|
+
end
|
666
|
+
end
|