shalmaneser-prep 1.2.0.rc4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +93 -0
- data/lib/frprep/Ampersand.rb +39 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/TreetaggerInterface.rb +327 -0
- data/lib/frprep/do_parses.rb +143 -0
- data/lib/frprep/frprep.rb +693 -0
- data/lib/frprep/interfaces/berkeley_interface.rb +372 -0
- data/lib/frprep/interfaces/stanford_interface.rb +353 -0
- data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
- data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +58 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +99 -0
- data/test/functional/test_rosy.rb +40 -0
- metadata +85 -0
@@ -0,0 +1,143 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# AB, 2010-11-25
|
4
|
+
|
5
|
+
##############################
|
6
|
+
# class for managing parses:
|
7
|
+
#
|
8
|
+
# Given either a directory with tab format files or
|
9
|
+
# a directory with SalsaTigerXML files (or both) and
|
10
|
+
# a directory for putting parse files:
|
11
|
+
# - parse, unless no parsing set in the experiment file
|
12
|
+
# - for each parsed file: yield one OneParsedFile object
|
13
|
+
require 'frprep/one_parsed_file'
|
14
|
+
|
15
|
+
class DoParses
  # @param exp [FrPrepConfigData] experiment configuration, queried via #get
  # @param file_suffixes [Hash{String=>String}] file type -> file name suffix
  # @param parse_dir [String] name of the directory to put parses in
  # @param var_hash [Hash{String=>String}] further directories; the keys
  #   "tab_dir" and "stxml_dir" are read, both are optional
  def initialize(exp, file_suffixes, parse_dir, var_hash = {})
    @exp = exp
    @file_suffixes = file_suffixes
    @parse_dir = parse_dir
    @tab_dir = var_hash["tab_dir"]
    @stxml_dir = var_hash["stxml_dir"]

    # pre-parsed data available?
    @parsed_files = @exp.get("directory_parserout")
  end

  ###
  # Yields one OneParsedFile object per parsed (or pseudo-parsed) file.
  #
  # Depending on the experiment settings the files come from
  # - a parser run or pre-computed parser output ("do_parse" set),
  # - existing SalsaTigerXML files (an stxml directory was given), or
  # - flat structures built from the tab files.
  #
  # Without a block an Enumerator over the same objects is returned.
  #
  # @raise [RuntimeError] if the parser interface cannot be found, if
  #   'parser_path' is missing, or if tab files are needed but no tab
  #   directory was given
  def each_parsed_file(&block)
    return enum_for(:each_parsed_file) unless block_given?

    postag_suffix = @exp.get("do_postag") ? @file_suffixes["pos"] : nil
    lemma_suffix = @exp.get("do_lemmatize") ? @file_suffixes["lemma"] : nil

    if @exp.get("do_parse")
      each_parser_output(postag_suffix, lemma_suffix, &block)
    elsif @stxml_dir
      each_stxml_pseudo_parse(postag_suffix, lemma_suffix, &block)
    else
      each_flat_pseudo_parse(postag_suffix, lemma_suffix, &block)
    end
  end

  private

  # Parsing is switched on: reuse old parser output or run the parser,
  # then yield one OneParsedFile per parser output file.
  def each_parser_output(postag_suffix, lemma_suffix)
    # get parser interface
    sys_class = SynInterfaces.get_interface("parser", @exp.get("parser"))
    raise "Shouldn't be here" unless sys_class

    parse_suffix = "." + sys_class.name()
    sys = sys_class.new(@exp.get("parser_path"),
                        @file_suffixes["tab"],
                        parse_suffix,
                        @file_suffixes["stxml"],
                        "pos_suffix" => postag_suffix,
                        "lemma_suffix" => lemma_suffix,
                        "tab_dir" => @tab_dir)

    if @parsed_files
      # reuse old parses
      $stderr.puts "Frprep: using pre-computed parses in " + @parsed_files.to_s()
      $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

      Dir[@parsed_files + "*"].each do |parsefilename|
        # skip anything that is not a regular file
        next unless File.file?(parsefilename)

        # core filename: remove directory and anything after the last "."
        filename_core = File.basename(parsefilename, ".*")
        yield OneParsedFile.new(filename_core, parsefilename, sys)
      end
    else
      # do new parses
      $stderr.puts "Frprep: Parsing"

      # sanity check
      unless @exp.get("parser_path")
        raise "Parsing: I need 'parser_path' in the experiment file"
      end
      raise "Cannot parse without tab files" unless @tab_dir

      # AB: NOTE This is the position where a parser is invoked.
      sys.process_dir(@tab_dir, @parse_dir)

      $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

      Dir[@parse_dir + "*" + parse_suffix].each do |parsefilename|
        filename_core = File.basename(parsefilename, parse_suffix)
        yield OneParsedFile.new(filename_core, parsefilename, sys)
      end
    end
  end

  # No parsing, but SalsaTigerXML files exist: use them as pseudo-parses.
  def each_stxml_pseudo_parse(postag_suffix, lemma_suffix)
    Dir[@stxml_dir + "*.xml"].each do |stxmlfilename|
      filename_core = File.basename(stxmlfilename, ".xml")

      # the tab file name is only known if we know the tab directory
      tabfilename = @tab_dir ? @tab_dir + filename_core + @file_suffixes["tab"] : nil
      each_sentence_obj = FrprepReadStxml.new(stxmlfilename, tabfilename,
                                              postag_suffix, lemma_suffix)

      yield OneParsedFile.new(filename_core, stxmlfilename, each_sentence_obj)
    end
  end

  # No parsing and no SalsaTigerXML files: construct flat structures
  # from the tab files.
  def each_flat_pseudo_parse(postag_suffix, lemma_suffix)
    # fail with a readable message instead of a NoMethodError on nil
    raise "Cannot construct SalsaTigerXML without tab files" unless @tab_dir

    Dir[@tab_dir + "*" + @file_suffixes["tab"]].each do |tabfilename|
      each_sentence_obj = FrprepFlatSyntax.new(tabfilename,
                                               postag_suffix,
                                               lemma_suffix)
      filename_core = File.basename(tabfilename, @file_suffixes["tab"])
      yield OneParsedFile.new(filename_core, tabfilename, each_sentence_obj)
    end
  end
end
|
@@ -0,0 +1,693 @@
|
|
1
|
+
require 'frprep/do_parses'
|
2
|
+
require 'common/prep_helper'
|
3
|
+
require 'common/FixSynSemMapping'
|
4
|
+
# For FN input.
|
5
|
+
require 'frprep/FNCorpusXML'
|
6
|
+
require 'frprep/FNDatabase'
|
7
|
+
|
8
|
+
##############################
|
9
|
+
# The class that does all the work
|
10
|
+
module FrPrep
|
11
|
+
class FrPrep
|
12
|
+
# @param exp [FrprepConfigData] Configuration object
|
13
|
+
def initialize(exp)
  @exp = exp

  # AB: move to FRprepOptionParser
  # the frprep internal data directory is mandatory
  missing_dir_message = "Please set 'frprep_directory', the frprep internal data directory,\n" +
                        "in the experiment file."
  raise missing_dir_message unless exp.get("frprep_directory")

  # experiment directory:
  # frprep internal data directory, subdir according to experiment ID.
  # The returned name is not needed here; the call is kept for whatever
  # File.new_dir does with the path (presumably creates it -- TODO confirm).
  File.new_dir(@exp.get("frprep_directory"),
               @exp.get("prep_experiment_ID"))
  # %x{rm -rf #{exp_dir}}

  # suffixes for different types of output files
  @file_suffixes = {
    "lemma" => ".lemma",
    "pos" => ".pos",
    "tab" => ".tab",
    "stxml" => ".xml"
  }
end
|
35
|
+
|
36
|
+
# Runs the preprocessing pipeline described by the experiment file.
#
# Checks the required experiment settings, optionally converts the input
# to UTF-8, and then transforms the data format by format until the target
# format is reached. The target is Tab format (with POS) when
# 'tabformat_output' is set, SalsaTigerXML otherwise.
# Terminates the process with status 1 if a required setting is missing or
# if 'tabformat_output' is combined with 'do_parse'.
#
# @raise [RuntimeError] if the current format is not one of the known formats
def transform
  # AB: Debugging.
  debugger if $DEBUG

  # AB: move to FRprepOptionParser
  unless @exp.get("directory_input")
    $stderr.puts "Please specify 'directory_input' in the experiment file."
    exit 1
  end
  # AB: move to FRprepOptionParser
  unless @exp.get("directory_preprocessed")
    $stderr.puts "Please specify 'directory_preprocessed' in the experiment file."
    exit 1
  end

  ##
  # input and output directories.
  #
  # sanity check: output in tab format will not work
  # if we also do a parse
  if @exp.get("tabformat_output") and @exp.get("do_parse")
    $stderr.puts "Error: Cannot do Tab format output"
    $stderr.puts "when the input text is being parsed."
    $stderr.puts "Please set either 'tabformat_output' or 'do_parse' to false."
    exit 1
  end
  input_dir = File.existing_dir(@exp.get("directory_input"))
  output_dir = File.new_dir(@exp.get("directory_preprocessed"))
  if @exp.get("tabformat_output")
    split_dir = output_dir
  else
    split_dir = frprep_dirname("split", "new")
  end

  ####
  # transform data to UTF-8
  if ["iso", "hex"].include? @exp.get("encoding")
    # transform ISO -> UTF-8 or Hex -> UTF-8
    # write result to encoding_dir,
    # then set encoding_dir to be the new input_dir
    encoding_dir = frprep_dirname("encoding", "new")
    $stderr.puts "Frprep: Transforming to UTF-8."
    Dir[input_dir + "*"].each { |filename|
      # not a file? then skip
      next unless File.file? filename

      outfilename = encoding_dir + File.basename(filename)
      FrprepHelper.to_utf8_file(filename, outfilename, @exp.get("encoding"))
    }

    input_dir = encoding_dir
  end

  ####
  # transform data all the way to the output format,
  # which is SalsaTigerXML by default,
  # except when tabformat_output has been set, in which case it's
  # Tab format.
  current_dir = input_dir

  done_format = @exp.get("tabformat_output") ? 'SalsaTabWithPos' : 'Done'

  current_format = @exp.get("format")

  while current_format != done_format
    # AB: DEBUG Remove it
    STDERR.puts "#{current_format} - #{done_format}"
    # after debugging
    case current_format

    when "BNC"
      # basically plain, plus some tags to be removed
      plain_dir = frprep_dirname("plain", "new")

      $stderr.puts "Frprep: Transforming BNC format text in #{current_dir} to plain format."
      $stderr.puts "Storing the result in #{plain_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_bncformat_dir(current_dir, plain_dir)

      current_dir = plain_dir
      current_format = "Plain"

    when "Plain"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming plain text in #{current_dir} to SalsaTab format."
      $stderr.puts "Storing the result in #{tab_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_plain_dir(current_dir, tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir

      fndata = FNDatabase.new(current_dir)
      fndata.extract_everything(tab_dir)
      # argument-list form: the directory name is passed to chmod as is,
      # without going through a shell
      Kernel.system("chmod", "-R", "g+rx", tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNCorpusXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir
      # assuming that all XML files in the current directory are FN Corpus XML files
      Dir[current_dir + "*.xml"].each { |fncorpusfilename|
        corpus = FNCorpusXMLFile.new(fncorpusfilename)
        tabfilename = tab_dir + File.basename(fncorpusfilename, ".xml") + ".tab"
        # block form closes the file even if printing raises
        File.open(tabfilename, "w") { |outfile|
          corpus.print_conll_style(outfile)
        }
      }

      Kernel.system("chmod", "-R", "g+rx", tab_dir)
      current_dir = tab_dir
      current_format = "SalsaTab"

    when "SalsaTab"
      # lemmatize and POStag
      $stderr.puts "Frprep: Lemmatizing and parsing text in #{current_dir}."
      $stderr.puts "Storing the result in #{split_dir}."
      transform_pos_and_lemmatize(current_dir, split_dir)

      current_dir = split_dir
      current_format = "SalsaTabWithPos"

    when "SalsaTabWithPos"
      # parse
      parse_dir = frprep_dirname("parse", "new")

      $stderr.puts "Frprep: Transforming tabular format text in #{current_dir} to SalsaTigerXML format."
      $stderr.puts "Storing the result in #{parse_dir}."

      transform_salsatab_dir(current_dir, parse_dir, output_dir)

      current_dir = output_dir
      current_format = "Done"

    when "SalsaTigerXML"
      parse_dir = frprep_dirname("parse", "new")
      print "Transform parser output into stxml\n"
      transform_stxml_dir(parse_dir, split_dir, input_dir, output_dir, @exp)
      current_dir = output_dir
      current_format = "Done"

    else
      STDERR.puts "Done format is: #{done_format}"
      $stderr.puts "Unknown data format #{current_format}"
      $stderr.puts "Please check the 'format' entry in your experiment file."
      raise "Experiment file problem"
    end
  end

  STDERR.puts "FrPrep: Done preprocessing."
end
|
214
|
+
|
215
|
+
############################################################################
|
216
|
+
private
|
217
|
+
|
218
|
+
###############
|
219
|
+
# frprep_dirname:
|
220
|
+
# make directory name for frprep-internal data
|
221
|
+
# of a certain kind described in <subdir>
|
222
|
+
#
|
223
|
+
# frprep_directory has one subdirectory for each experiment ID,
|
224
|
+
# and below that there is one subdir per subtask
|
225
|
+
#
|
226
|
+
# If this is a new directory, it is constructed,
|
227
|
+
# if it should be an existing directory, its existence is checked.
|
228
|
+
# @param subdir [String] designator of a subdirectory
|
229
|
+
# @param neu [Object, nil] if non-nil, this may be a new directory
|
230
|
+
def frprep_dirname(subdir, neu = nil)
  # <frprep_directory>/<experiment ID>/<subdir>
  base_dir = @exp.get("frprep_directory")
  experiment_id = @exp.get("prep_experiment_ID")
  path = File.new_dir(base_dir, experiment_id, subdir)

  # a new directory goes through File.new_dir once more,
  # an existing one through File.existing_dir
  return File.new_dir(path) if neu

  File.existing_dir(path)
end
|
238
|
+
|
239
|
+
|
240
|
+
|
241
|
+
###############
|
242
|
+
# transform_bncformat_dir:
|
243
|
+
#
|
244
|
+
# transformation for BNC format:
|
245
|
+
#
|
246
|
+
# transform to plain format, removing <> elements
|
247
|
+
def transform_bncformat_dir(input_dir, output_dir)
  # input_dir, output_dir: strings, directory names.
  # Every entry of the input directory is handed to the BNC-to-plain
  # conversion; the output file keeps the input file's base name.
  bnc_files = Dir[input_dir + "*"]
  bnc_files.each do |bncfilename|
    target = output_dir + File.basename(bncfilename)
    FrprepHelper.bnc_to_plain_file(bncfilename, target)
  end
end
|
258
|
+
|
259
|
+
|
260
|
+
###############
|
261
|
+
# transform_plain_dir:
|
262
|
+
#
|
263
|
+
# transformation for plaintext:
|
264
|
+
#
|
265
|
+
# transform to Tab format, separating punctuation from adjacent words
|
266
|
+
# @param input_dir [String] input directory
|
267
|
+
# @param output_dir [String] output directory
|
268
|
+
def transform_plain_dir(input_dir, output_dir)
  Dir[input_dir + "*"].each { |plainfilename|
    # the output file name has to end in the tab suffix,
    # because that is, at the moment, required
    tab_name = File.basename(plainfilename) + @file_suffixes["tab"]
    FrprepHelper.plain_to_tab_file(plainfilename, output_dir + tab_name)
  }
end
|
276
|
+
|
277
|
+
###############
|
278
|
+
# transform_pos_and_lemmatize
|
279
|
+
#
|
280
|
+
# transformation for Tab format files:
|
281
|
+
#
|
282
|
+
# - Split into parser-size chunks
|
283
|
+
# - POS-tag, lemmatize
|
284
|
+
def transform_pos_and_lemmatize(input_dir, output_dir)
  # input_dir, output_dir: strings, directory names

  ##
  # split the tab files, passing on the parser's sentence limits
  FrprepHelper.split_dir(input_dir, output_dir, @file_suffixes["tab"],
                         @exp.get("parser_max_sent_num"),
                         @exp.get("parser_max_sent_len"))

  ##
  # POS-Tagging: reads from and writes to the output directory
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."

    # AB: TODO Move it to OptionParser.
    tagger_configured = @exp.get("pos_tagger_path") && @exp.get("pos_tagger")
    unless tagger_configured
      raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
    end

    tagger_class = SynInterfaces.get_interface("pos_tagger", @exp.get("pos_tagger"))
    print "pos tagger interface: ", tagger_class, "\n"

    # AB: TODO Remove it.
    raise "Shouldn't be here" unless tagger_class

    tagger = tagger_class.new(@exp.get("pos_tagger_path"),
                              @file_suffixes["tab"],
                              @file_suffixes["pos"])
    tagger.process_dir(output_dir, output_dir)
  end

  ##
  # Lemmatization
  # AB: We're working on the <split> dir and writing there.
  return unless @exp.get("do_lemmatize")

  STDERR.puts 'Frprep: Lemmatizing.'

  # AB: TODO Move it to OptionParser.
  lemmatizer_configured = @exp.get("lemmatizer_path") && @exp.get("lemmatizer")
  unless lemmatizer_configured
    raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
  end

  lemmatizer_class = SynInterfaces.get_interface("lemmatizer", @exp.get("lemmatizer"))
  # AB: TODO make this exception explicit.
  raise 'I got a empty interface class for the lemmatizer!' unless lemmatizer_class

  lemmatizer = lemmatizer_class.new(@exp.get("lemmatizer_path"),
                                    @file_suffixes["tab"],
                                    @file_suffixes["lemma"])
  lemmatizer.process_dir(output_dir, output_dir)
end
|
342
|
+
|
343
|
+
###############
|
344
|
+
# transform_salsatab
|
345
|
+
#
|
346
|
+
# transformation for Tab format files:
|
347
|
+
#
|
348
|
+
# - parse
|
349
|
+
# - Transform parser output to SalsaTigerXML
|
350
|
+
# If no parsing, make flat syntactic structure.
|
351
|
+
# Writes one SalsaTigerXML file per parsed file into output_dir.
#
# @param input_dir [String] input directory with the tab files
# @param parse_dir [String] output directory for parses
# @param output_dir [String] global output directory
# @raise [RuntimeError] if no interpreter class is found or an output file
#   cannot be opened for writing
def transform_salsatab_dir(input_dir, parse_dir, output_dir)
  ##
  # (Parse and) transform to SalsaTigerXML

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
  raise "Shouldn't be here" unless interpreter_class

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => input_dir)
  parse_obj.each_parsed_file { |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    begin
      outfile.puts SalsaTigerXMLHelper.get_header()
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence { |st_sent, tabformat_sent, mapping|
        # parsed: add headwords using parse tree
        if @exp.get("do_parse")
          FrprepHelper.add_head_attributes(st_sent, interpreter_class)
        end

        # add lemmas, if they are there. If they are not, don't print out a warning.
        if @exp.get("do_lemmatize")
          FrprepHelper.add_lemmas_from_tab(st_sent, tabformat_sent, mapping)
        end

        # add semantics, read from the tab sentence
        FrprepHelper.add_semantics_from_tab(st_sent, tabformat_sent, mapping,
                                            interpreter_class, @exp)

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # handle multiword targets
        FrprepHelper.handle_multiword_targets(st_sent,
                                              interpreter_class, @exp.get("language"))

        # handle Unknown frame names
        FrprepHelper.handle_unknown_framenames(st_sent, interpreter_class)

        outfile.puts st_sent.get()
      }
      outfile.puts SalsaTigerXMLHelper.get_footer()
    ensure
      # always close the file: flushes the output and releases the handle
      outfile.close
    end
  }
end
|
416
|
+
|
417
|
+
#############################################
|
418
|
+
# transform_stxml
|
419
|
+
#
|
420
|
+
# transformation for SalsaTigerXML data
|
421
|
+
#
|
422
|
+
# - If the input format was SalsaTigerXML:
|
423
|
+
# - Tag, lemmatize and parse, if the experiment file tells you so
|
424
|
+
#
|
425
|
+
# - If the origin is the Salsa corpus:
|
426
|
+
# Change frame names from Unknown\d+ to lemma_Unknown\d+
|
427
|
+
#
|
428
|
+
# - fix multiword lemmas, or at least try
|
429
|
+
# - transform to UTF 8
|
430
|
+
# Transformation for SalsaTigerXML input: optionally tag, lemmatize and
# parse, then write one SalsaTigerXML file per processed file to output_dir.
#
# @param parse_dir [String] name of directory for parse data
# @param tab_dir [String] name of directory for split/tab data
# @param input_dir [String] name of input directory
# @param output_dir [String] name of final output directory
# @param exp [FrprepConfigData] experiment configuration
# @raise [RuntimeError] if a needed tool is not configured, an interface or
#   interpreter class is not found, or an output file cannot be opened
def transform_stxml_dir(parse_dir, tab_dir, input_dir, output_dir, exp)
  ####
  # Data preparation

  # Data with Salsa as origin:
  # remember the target lemma as an attribute on the
  # <target> elements
  #
  # currently deactivated: encoding problems
  # if @exp.get("origin") == "SalsaTiger"
  #   $stderr.puts "Frprep: noting target lemmas"
  #   changed_input_dir = frprep_dirname("salsalemma", "new")
  #   FrprepHelper.note_salsa_targetlemmas(input_dir, changed_input_dir)
  #
  #   # remember changed input dir as input dir
  #   input_dir = changed_input_dir
  # end

  # If data is to be parsed, split and tabify input files
  # else copy data to stxml_indir.

  # stxml_dir: directory where SalsaTiger data is situated
  if @exp.get("do_parse")
    # split data
    stxml_splitdir = frprep_dirname("stxml_split", "new")
    stxml_dir = stxml_splitdir

    $stderr.puts "Frprep: splitting data"
    FrprepHelper.stxml_split_dir(input_dir, stxml_splitdir,
                                 @exp.get("parser_max_sent_num"),
                                 @exp.get("parser_max_sent_len"))
  else
    # no parsing: copy data to the parse directory
    stxml_dir = parse_dir
    $stderr.puts "Frprep: Copying data to #{stxml_dir}"
    Dir[input_dir + "*.xml"].each { |filename|
      # argument-list form: file names are passed to cp as they are,
      # without going through a shell
      Kernel.system("cp", filename, stxml_dir + File.basename(filename))
    }
  end

  # Some syntactic processing will take place:
  # tabify data
  if @exp.get("do_parse") || @exp.get("do_lemmatize") || @exp.get("do_postag")
    $stderr.puts "Frprep: making input for syn. processing"

    Dir[stxml_dir + "*" + @file_suffixes["stxml"]].each { |stxmlfilename|
      tabfilename = tab_dir + File.basename(stxmlfilename, @file_suffixes["stxml"]) + @file_suffixes["tab"]
      FrprepHelper.stxml_to_tab_file(stxmlfilename, tabfilename, exp)
    }
  end

  ###
  # POS-tagging
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."
    unless @exp.get("pos_tagger_path") and @exp.get("pos_tagger")
      raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
    end

    sys_class = SynInterfaces.get_interface("pos_tagger",
                                            @exp.get("pos_tagger"))
    raise "Shouldn't be here" unless sys_class

    sys = sys_class.new(@exp.get("pos_tagger_path"),
                        @file_suffixes["tab"],
                        @file_suffixes["pos"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Lemmatization
  if @exp.get("do_lemmatize")
    $stderr.puts "Frprep: Lemmatizing."
    unless @exp.get("lemmatizer_path") and @exp.get("lemmatizer")
      raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
    end

    sys_class = SynInterfaces.get_interface("lemmatizer",
                                            @exp.get("lemmatizer"))
    raise "Shouldn't be here" unless sys_class

    sys = sys_class.new(@exp.get("lemmatizer_path"),
                        @file_suffixes["tab"],
                        @file_suffixes["lemma"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Parsing, production of SalsaTigerXML output

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  sys_class_names = Hash.new
  [["do_postag", "pos_tagger"],
   ["do_lemmatize", "lemmatizer"],
   ["do_parse", "parser"]].each { |service, system_name|
    if @exp.get(service) # yes, perform this service
      sys_class_names[system_name] = @exp.get(system_name)
    end
  }
  interpreter_class = SynInterfaces.get_interpreter(sys_class_names)
  raise "Shouldn't be here" unless interpreter_class

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => tab_dir,
                           "stxml_dir" => stxml_dir)
  parse_obj.each_parsed_file { |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    begin
      if @exp.get("do_parse")
        # read old SalsaTigerXML file
        # so we can integrate the old file's semantics later
        oldxml = Array.new # array of sentence strings
        # we assume that the old and the new file have the same name,
        # ending in .xml.
        oldxmlfile = FilePartsParser.new(stxml_dir + parsed_file_obj.filename + ".xml")
        oldxmlfile.scan_s { |sent_string|
          # remember the sentences in file order
          oldxml << sent_string
        }
      end

      outfile.puts SalsaTigerXMLHelper.get_header()
      index = 0
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence { |st_sent, tabformat_sent, mapping|
        # parsed? then integrate semantics and lemmas from old file
        if @exp.get("do_parse")
          oldsent_string = oldxml[index]
          index += 1
          if oldsent_string
            # modified by ines, 27/08/08
            escape_berkeley_brackets!(oldsent_string) if exp.get("parser") == "berkeley"

            # we have both an old and a new sentence, so integrate semantics
            oldsent = SalsaTigerSentence.new(oldsent_string)
            # no new sentence: nothing is written for it
            next if st_sent.nil?

            if FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp) == false
              # integration was rejected: try once more
              # with the next old sentence
              oldsent_string = oldxml[index]
              index += 1
              if oldsent_string
                escape_berkeley_brackets!(oldsent_string) if exp.get("parser") == "berkeley"

                oldsent = SalsaTigerSentence.new(oldsent_string)
                FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp)
              end
            end
          else
            # no corresponding old sentence for this new sentence
            $stderr.puts "Warning: transporting semantics -- missing source sentence, skipping"
          end
        end

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # repair syn/sem mapping problems?
        if @exp.get("fe_syn_repair") or @exp.get("fe_rel_repair")
          FixSynSemMapping.fixit(st_sent, @exp, interpreter_class)
        end

        outfile.puts st_sent.get()
      } # each ST sentence
      outfile.puts SalsaTigerXMLHelper.get_footer()
    ensure
      # always close the file: flushes the output and releases the handle
      outfile.close
    end
  } # each file parsed
end

# For Berkeley: substitute ( ) in word attributes by *LRB* *RRB*.
# Changes sent_string in place and returns it.
def escape_berkeley_brackets!(sent_string)
  sent_string.gsub!(/word='\('/, "word='*LRB*'")
  sent_string.gsub!(/word='\)'/, "word='*RRB*'")
  sent_string.gsub!(/word=\"\(\"/, "word='*LRB*'")
  sent_string.gsub!(/word=\"\)\"/, "word='*RRB*'")
  sent_string
end
|
643
|
+
|
644
|
+
|
645
|
+
###################################
|
646
|
+
# general file iterators
|
647
|
+
|
648
|
+
# yields pairs of [infile name, outfile stream]
|
649
|
+
# Rewrites every matching file in a directory.
#
# Yields pairs of [infile name, outfile stream]; the stream is a temporary
# file. After the block returns, the original file is copied to
# "<name>.bak" and the temporary file is moved to the original location.
#
# @param dir [String] directory name; the file name pattern is appended
#   directly, so it is expected to end in a path separator
# @param suffix [String] tail of the file name pattern; matched files are
#   dir + "*" + suffix
def change_each_file_in_dir(dir, suffix)
  Dir[dir + "*#{suffix}"].each { |filename|
    tempfile = Tempfile.new("FrprepHelper")
    begin
      yield [filename, tempfile]

      # move temp file to original file location;
      # argument-list form passes the file names to cp/mv as they are,
      # without going through a shell
      tempfile.close()
      Kernel.system("cp", filename, "#{filename}.bak")
      Kernel.system("mv", tempfile.path(), filename)
    ensure
      # also removes the temp file if the block raised
      tempfile.close(true)
    end
  } # each file
end
|
662
|
+
|
663
|
+
#######
|
664
|
+
# change_each_stxml_file_in_dir
|
665
|
+
#
|
666
|
+
# use change_each_file_in_dir, but assume that the files
|
667
|
+
# are SalsaTigerXML files: Keep file headers and footers,
|
668
|
+
# and just offer individual sentences for changing
|
669
|
+
#
|
670
|
+
# Yields SalsaTigerSentence objects, each sentence to be changed
|
671
|
+
def change_each_stxml_file_in_dir(dir) # string: directory name
  change_each_file_in_dir(dir, "*.xml") do |stfilename, tf|
    parts = FilePartsParser.new(stfilename)

    # the file header is copied over unchanged
    tf.puts parts.head()

    # each sentence is offered to the caller as a SalsaTigerSentence object
    # and written out after the caller's block has seen it
    parts.scan_s() do |sent_string|
      sentence = SalsaTigerSentence.new(sent_string)
      yield sentence
      tf.puts sentence.get()
    end

    # the file footer is copied over unchanged
    tf.puts parts.tail()
    parts.close()
  end
end
|
692
|
+
end
|
693
|
+
end
|