shalmaneser-prep 1.2.0.rc4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +93 -0
- data/lib/frprep/Ampersand.rb +39 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/TreetaggerInterface.rb +327 -0
- data/lib/frprep/do_parses.rb +143 -0
- data/lib/frprep/frprep.rb +693 -0
- data/lib/frprep/interfaces/berkeley_interface.rb +372 -0
- data/lib/frprep/interfaces/stanford_interface.rb +353 -0
- data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
- data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +58 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +99 -0
- data/test/functional/test_rosy.rb +40 -0
- metadata +85 -0
@@ -0,0 +1,143 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# AB, 2010-11-25
|
4
|
+
|
5
|
+
##############################
|
6
|
+
# class for managing parses:
|
7
|
+
#
|
8
|
+
# Given either a directory with tab format files or
|
9
|
+
# a directory with SalsaTigerXML files (or both) and
|
10
|
+
# a directory for putting parse files:
|
11
|
+
# - parse, unless no parsing set in the experiment file
|
12
|
+
# - for each parsed file: yield one OneParsedFile object
|
13
|
+
require 'frprep/one_parsed_file'
|
14
|
+
|
15
|
+
# Manages the parses of one preprocessing run.
#
# Given a directory with tab format files and/or a directory with
# SalsaTigerXML files, plus a directory for parse files, it parses the data
# (unless parsing is switched off in the experiment file) and yields one
# OneParsedFile object per resulting file.
class DoParses
  # @param exp [#get] experiment configuration (FrPrepConfigData object)
  # @param file_suffixes [Hash{String => String}] file type -> suffix;
  #   the keys "pos", "lemma", "tab" and "stxml" are read here
  # @param parse_dir [String] name of the directory to put parses into
  # @param var_hash [Hash] further directories, keys "tab_dir" and "stxml_dir"
  def initialize(exp, file_suffixes, parse_dir, var_hash = {})
    @exp = exp
    @file_suffixes = file_suffixes
    @parse_dir = parse_dir
    @tab_dir = var_hash["tab_dir"]
    @stxml_dir = var_hash["stxml_dir"]

    # pre-parsed data available?
    @parsed_files = @exp.get("directory_parserout")
  end

  # Iterates over all (pseudo-)parsed files.
  #
  # - "do_parse" set: reuse the parses in "directory_parserout" if that is
  #   configured, otherwise run the parser on the tab directory.
  # - "do_parse" not set: build pseudo-parses from existing SalsaTigerXML
  #   files, or from the tab files if no SalsaTigerXML directory is known.
  #
  # @yieldparam parsed_file [OneParsedFile]
  # @return [Enumerator] if no block is given
  # @raise [RuntimeError] if a required setting or directory is missing
  def each_parsed_file(&block)
    return enum_for(:each_parsed_file) unless block_given?

    # suffixes are only handed on for the services that are switched on
    postag_suffix = @exp.get("do_postag") ? @file_suffixes["pos"] : nil
    lemma_suffix = @exp.get("do_lemmatize") ? @file_suffixes["lemma"] : nil

    if @exp.get("do_parse")
      each_real_parse(postag_suffix, lemma_suffix, &block)
    elsif @stxml_dir
      each_stxml_pseudo_parse(postag_suffix, lemma_suffix, &block)
    else
      each_flat_pseudo_parse(postag_suffix, lemma_suffix, &block)
    end
  end

  private

  # Parser output: either pre-computed parses or a fresh parser run.
  def each_real_parse(postag_suffix, lemma_suffix)
    # get parser interface
    sys_class = SynInterfaces.get_interface("parser", @exp.get("parser"))
    raise "Shouldn't be here" unless sys_class

    # sanity checks for a fresh parse; done before the parser object is
    # built so that a missing setting is reported with a clear message
    unless @parsed_files
      unless @exp.get("parser_path")
        raise "Parsing: I need 'parser_path' in the experiment file"
      end
      raise "Cannot parse without tab files" unless @tab_dir
    end

    parse_suffix = "." + sys_class.name
    sys = sys_class.new(@exp.get("parser_path"),
                        @file_suffixes["tab"],
                        parse_suffix,
                        @file_suffixes["stxml"],
                        "pos_suffix" => postag_suffix,
                        "lemma_suffix" => lemma_suffix,
                        "tab_dir" => @tab_dir)

    if @parsed_files
      # reuse old parses
      $stderr.puts "Frprep: using pre-computed parses in " + @parsed_files.to_s
      $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

      Dir[@parsed_files + "*"].each do |parsefilename|
        # something other than a file? then skip
        next unless File.file?(parsefilename)

        # core filename: remove directory and anything after the last "."
        filename_core = File.basename(parsefilename, ".*")
        yield OneParsedFile.new(filename_core, parsefilename, sys)
      end
    else
      # do new parses
      $stderr.puts "Frprep: Parsing"

      # AB: NOTE This is the position where a parser is invoked.
      sys.process_dir(@tab_dir, @parse_dir)

      $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

      Dir[@parse_dir + "*" + parse_suffix].each do |parsefilename|
        filename_core = File.basename(parsefilename, parse_suffix)
        yield OneParsedFile.new(filename_core, parsefilename, sys)
      end
    end
  end

  # No parse: pseudo-parse trees read from existing SalsaTigerXML files.
  def each_stxml_pseudo_parse(postag_suffix, lemma_suffix)
    Dir[@stxml_dir + "*.xml"].each do |stxmlfilename|
      filename_core = File.basename(stxmlfilename, ".xml")
      # the matching tab file is only named if we know the tab directory
      tabfilename = @tab_dir ? @tab_dir + filename_core + @file_suffixes["tab"] : nil
      each_sentence_obj = FrprepReadStxml.new(stxmlfilename, tabfilename,
                                              postag_suffix, lemma_suffix)
      yield OneParsedFile.new(filename_core, stxmlfilename, each_sentence_obj)
    end
  end

  # No parse: construct flat SalsaTigerXML structures from tab files.
  def each_flat_pseudo_parse(postag_suffix, lemma_suffix)
    unless @tab_dir
      raise "Cannot construct pseudo-parses: neither 'tab_dir' nor 'stxml_dir' given"
    end

    tab_suffix = @file_suffixes["tab"]
    Dir[@tab_dir + "*" + tab_suffix].each do |tabfilename|
      each_sentence_obj = FrprepFlatSyntax.new(tabfilename, postag_suffix, lemma_suffix)
      filename_core = File.basename(tabfilename, tab_suffix)
      yield OneParsedFile.new(filename_core, tabfilename, each_sentence_obj)
    end
  end
end
|
@@ -0,0 +1,693 @@
|
|
1
|
+
require 'frprep/do_parses'
|
2
|
+
require 'common/prep_helper'
|
3
|
+
require 'common/FixSynSemMapping'
|
4
|
+
# For FN input.
|
5
|
+
require 'frprep/FNCorpusXML'
|
6
|
+
require 'frprep/FNDatabase'
|
7
|
+
|
8
|
+
##############################
|
9
|
+
# The class that does all the work
|
10
|
+
module FrPrep
|
11
|
+
class FrPrep
|
12
|
+
# @param exp [FrprepConfigData] Configuration object
|
13
|
+
# Stores the configuration, sets up the experiment's internal data
# directory and defines the suffixes of the different output file types.
#
# @raise [RuntimeError] if 'frprep_directory' is not set in the experiment file
def initialize(exp)
  @exp = exp

  # AB: move to FRprepOptionParser
  unless @exp.get("frprep_directory")
    raise "Please set 'frprep_directory', the frprep internal data directory,\n" +
          "in the experiment file."
  end

  # experiment directory:
  # frprep internal data directory, subdir according to experiment ID.
  # The returned path is not needed here; the call is kept because
  # File.new_dir presumably creates the directory (project helper, not
  # visible in this file -- TODO confirm).
  File.new_dir(@exp.get("frprep_directory"), @exp.get("prep_experiment_ID"))

  # suffixes for different types of output files
  @file_suffixes = {
    "lemma" => ".lemma",
    "pos" => ".pos",
    "tab" => ".tab",
    "stxml" => ".xml"
  }
end
|
35
|
+
|
36
|
+
# Runs the whole preprocessing pipeline.
#
# Starting from the input format named in the experiment file ("format"),
# the data is converted step by step (BNC -> Plain -> SalsaTab ->
# SalsaTabWithPos -> SalsaTigerXML output) until the target format is
# reached. The target is SalsaTigerXML by default, or Tab format if
# 'tabformat_output' is set.
#
# Exits with status 1 on missing or contradictory settings.
# @raise [RuntimeError] on an unknown data format
def transform
  # AB: Debugging.
  debugger if $DEBUG

  # AB: move to FRprepOptionParser
  unless @exp.get("directory_input")
    $stderr.puts "Please specify 'directory_input' in the experiment file."
    exit 1
  end
  # AB: move to FRprepOptionParser
  unless @exp.get("directory_preprocessed")
    $stderr.puts "Please specify 'directory_preprocessed' in the experiment file."
    exit 1
  end

  ##
  # input and output directories.
  #
  # sanity check: output in tab format will not work
  # if we also do a parse
  if @exp.get("tabformat_output") && @exp.get("do_parse")
    $stderr.puts "Error: Cannot do Tab format output"
    $stderr.puts "when the input text is being parsed."
    $stderr.puts "Please set either 'tabformat_output' or 'do_parse' to false."
    exit 1
  end
  input_dir = File.existing_dir(@exp.get("directory_input"))
  output_dir = File.new_dir(@exp.get("directory_preprocessed"))
  # with Tab format output, the split files are the final output
  split_dir = @exp.get("tabformat_output") ? output_dir : frprep_dirname("split", "new")

  ####
  # transform data to UTF-8
  if ["iso", "hex"].include? @exp.get("encoding")
    # transform ISO -> UTF-8 or Hex -> UTF-8
    # write result to encoding_dir,
    # then set encoding_dir to be the new input_dir
    encoding_dir = frprep_dirname("encoding", "new")
    $stderr.puts "Frprep: Transforming to UTF-8."
    Dir[input_dir + "*"].each do |filename|
      # not a file? then skip
      next unless File.file? filename

      outfilename = encoding_dir + File.basename(filename)
      FrprepHelper.to_utf8_file(filename, outfilename, @exp.get("encoding"))
    end

    input_dir = encoding_dir
  end

  ####
  # transform data all the way to the output format,
  # which is SalsaTigerXML by default,
  # except when tabformat_output has been set, in which case it's
  # Tab format.
  current_dir = input_dir
  done_format = @exp.get("tabformat_output") ? 'SalsaTabWithPos' : 'Done'
  current_format = @exp.get("format")

  while current_format != done_format
    # AB: DEBUG Remove it
    STDERR.puts "#{current_format} - #{done_format}"
    # after debugging
    case current_format
    when "BNC"
      # basically plain, plus some tags to be removed
      plain_dir = frprep_dirname("plain", "new")

      $stderr.puts "Frprep: Transforming BNC format text in #{current_dir} to plain format."
      $stderr.puts "Storing the result in #{plain_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_bncformat_dir(current_dir, plain_dir)

      current_dir = plain_dir
      current_format = "Plain"

    when "Plain"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming plain text in #{current_dir} to SalsaTab format."
      $stderr.puts "Storing the result in #{tab_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_plain_dir(current_dir, tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir

      fndata = FNDatabase.new(current_dir)
      fndata.extract_everything(tab_dir)
      # argument list instead of a shell string: the directory name is
      # passed verbatim, so blanks or shell metacharacters cannot break it
      Kernel.system("chmod", "-R", "g+rx", tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNCorpusXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir
      # assuming that all XML files in the current directory are FN Corpus XML files
      Dir[current_dir + "*.xml"].each do |fncorpusfilename|
        corpus = FNCorpusXMLFile.new(fncorpusfilename)
        outfilename = tab_dir + File.basename(fncorpusfilename, ".xml") + ".tab"
        # block form closes the file even if printing raises
        File.open(outfilename, "w") { |outfile| corpus.print_conll_style(outfile) }
      end

      Kernel.system("chmod", "-R", "g+rx", tab_dir)
      current_dir = tab_dir
      current_format = "SalsaTab"

    when "SalsaTab"
      # lemmatize and POStag
      $stderr.puts "Frprep: Lemmatizing and parsing text in #{current_dir}."
      $stderr.puts "Storing the result in #{split_dir}."
      transform_pos_and_lemmatize(current_dir, split_dir)

      current_dir = split_dir
      current_format = "SalsaTabWithPos"

    when "SalsaTabWithPos"
      # parse
      parse_dir = frprep_dirname("parse", "new")

      $stderr.puts "Frprep: Transforming tabular format text in #{current_dir} to SalsaTigerXML format."
      $stderr.puts "Storing the result in #{parse_dir}."

      transform_salsatab_dir(current_dir, parse_dir, output_dir)

      current_dir = output_dir
      current_format = "Done"

    when "SalsaTigerXML"
      parse_dir = frprep_dirname("parse", "new")
      print "Transform parser output into stxml\n"
      transform_stxml_dir(parse_dir, split_dir, input_dir, output_dir, @exp)
      current_dir = output_dir
      current_format = "Done"

    else
      STDERR.puts "Done format is: #{done_format}"
      $stderr.puts "Unknown data format #{current_format}"
      $stderr.puts "Please check the 'format' entry in your experiment file."
      raise "Experiment file problem"
    end
  end

  STDERR.puts "FrPrep: Done preprocessing."
end
|
214
|
+
|
215
|
+
############################################################################
|
216
|
+
private
|
217
|
+
|
218
|
+
###############
|
219
|
+
# frprep_dirname:
|
220
|
+
# make directory name for frprep-internal data
|
221
|
+
# of a certain kind described in <subdir>
|
222
|
+
#
|
223
|
+
# frprep_directory has one subdirectory for each experiment ID,
|
224
|
+
# and below that there is one subdir per subtask
|
225
|
+
#
|
226
|
+
# If this is a new directory, it is constructed,
|
227
|
+
# if it should be an existing directory, its existence is checked.
|
228
|
+
# @param subdir [String] designator of a subdirectory
|
229
|
+
# @param neu [Object, nil] if non-nil, this may be a new directory
|
230
|
+
# Builds the name of the frprep-internal directory
# <frprep_directory>/<prep_experiment_ID>/<subdir>.
#
# With a non-nil +neu+ the directory is handed to File.new_dir; otherwise
# it is handed to File.existing_dir only, so that a missing directory is
# actually noticed instead of being set up first.
# NOTE(review): File.new_dir / File.existing_dir are project helpers not
# visible in this file; their create/check semantics are assumed from their
# names and from the comment preceding this method -- confirm.
def frprep_dirname(subdir, neu = nil)
  base_dir = @exp.get("frprep_directory")
  experiment_id = @exp.get("prep_experiment_ID")

  if neu
    File.new_dir(base_dir, experiment_id, subdir)
  else
    File.existing_dir(File.join(base_dir, experiment_id, subdir))
  end
end
|
238
|
+
|
239
|
+
|
240
|
+
|
241
|
+
###############
|
242
|
+
# transform_bncformat_dir:
|
243
|
+
#
|
244
|
+
# transformation for BNC format:
|
245
|
+
#
|
246
|
+
# transform to plain format, removing <> elements
|
247
|
+
# Converts every file in +input_dir+ from BNC format to plain format.
# The output file keeps the base name of the input file.
# Entries that are not regular files (e.g. subdirectories) are skipped,
# as in the UTF-8 conversion step of #transform.
#
# @param input_dir [String] input directory
# @param output_dir [String] output directory
def transform_bncformat_dir(input_dir, output_dir)
  Dir[input_dir + "*"].each do |bncfilename|
    next unless File.file?(bncfilename)

    outfilename = output_dir + File.basename(bncfilename)
    FrprepHelper.bnc_to_plain_file(bncfilename, outfilename)
  end
end
|
258
|
+
|
259
|
+
|
260
|
+
###############
|
261
|
+
# transform_plain:
|
262
|
+
#
|
263
|
+
# transformation for plaintext:
|
264
|
+
#
|
265
|
+
# transform to Tab format, separating punctuation from adjacent words
|
266
|
+
# @param input_dir [String] input directory
|
267
|
+
# @param output_dir [String] output directory
|
268
|
+
# Converts every plain text file in +input_dir+ to Tab format.
# Entries that are not regular files (e.g. subdirectories) are skipped,
# as in the UTF-8 conversion step of #transform.
def transform_plain_dir(input_dir, output_dir)
  tab_suffix = @file_suffixes["tab"]

  Dir[input_dir + "*"].each do |plainfilename|
    next unless File.file?(plainfilename)

    # end output file name in "tab" because that is, at the moment, required
    outfilename = output_dir + File.basename(plainfilename) + tab_suffix
    FrprepHelper.plain_to_tab_file(plainfilename, outfilename)
  end
end
|
276
|
+
|
277
|
+
###############
|
278
|
+
# transform_pos_and_lemmatize
|
279
|
+
#
|
280
|
+
# transformation for Tab format files:
|
281
|
+
#
|
282
|
+
# - Split into parser-size chunks
|
283
|
+
# - POS-tag, lemmatize
|
284
|
+
# Splits the Tab format files of +input_dir+ into parser-size chunks in
# +output_dir+, then runs the POS tagger and the lemmatizer on the chunks
# (each only if switched on in the experiment file). Both tools read from
# and write to +output_dir+.
#
# @param input_dir [String] input directory
# @param output_dir [String] output directory
# @raise [RuntimeError] if a tool is switched on but not fully configured
def transform_pos_and_lemmatize(input_dir, output_dir)
  tab_suffix = @file_suffixes["tab"]

  # split the TabFormatFile into chunks of max_sent_num size
  FrprepHelper.split_dir(input_dir, output_dir, tab_suffix,
                         @exp.get("parser_max_sent_num"),
                         @exp.get("parser_max_sent_len"))

  # POS-Tagging
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."

    # AB: TODO Move it to OptionParser.
    tagger_configured = @exp.get("pos_tagger_path") && @exp.get("pos_tagger")
    unless tagger_configured
      raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
    end

    tagger_class = SynInterfaces.get_interface("pos_tagger", @exp.get("pos_tagger"))
    print "pos tagger interface: ", tagger_class, "\n"

    # AB: TODO Remove it.
    raise "Shouldn't be here" unless tagger_class

    tagger = tagger_class.new(@exp.get("pos_tagger_path"), tab_suffix, @file_suffixes["pos"])
    tagger.process_dir(output_dir, output_dir)
  end

  # Lemmatization
  # AB: We're working on the <split> dir and writing there.
  if @exp.get("do_lemmatize")
    STDERR.puts 'Frprep: Lemmatizing.'

    # AB: TODO Move it to OptionParser.
    lemmatizer_configured = @exp.get("lemmatizer_path") && @exp.get("lemmatizer")
    unless lemmatizer_configured
      raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
    end

    lemmatizer_class = SynInterfaces.get_interface("lemmatizer", @exp.get("lemmatizer"))
    # AB: TODO make this exception explicit.
    raise 'I got a empty interface class for the lemmatizer!' unless lemmatizer_class

    lemmatizer = lemmatizer_class.new(@exp.get("lemmatizer_path"), tab_suffix, @file_suffixes["lemma"])
    lemmatizer.process_dir(output_dir, output_dir)
  end
end
|
342
|
+
|
343
|
+
###############
|
344
|
+
# transform_salsatab
|
345
|
+
#
|
346
|
+
# transformation for Tab format files:
|
347
|
+
#
|
348
|
+
# - parse
|
349
|
+
# - Transform parser output to SalsaTigerXML
|
350
|
+
# If no parsing, make flat syntactic structure.
|
351
|
+
# (Parses and) transforms Tab format files to SalsaTigerXML.
# For every file delivered by DoParses, one <filename>.xml file is written
# to +output_dir+. If no parsing is done, the sentences carry a flat
# syntactic structure.
#
# @param input_dir [String] input directory (tab files)
# @param parse_dir [String] output directory for parses
# @param output_dir [String] global output directory
# @raise [RuntimeError] if an output file cannot be opened for writing
def transform_salsatab_dir(input_dir, parse_dir, output_dir)
  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
  raise "Shouldn't be here" unless interpreter_class

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => input_dir)
  parse_obj.each_parsed_file do |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    begin
      outfile.puts SalsaTigerXMLHelper.get_header
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence do |st_sent, tabformat_sent, mapping|
        # parsed: add headwords using parse tree
        if @exp.get("do_parse")
          FrprepHelper.add_head_attributes(st_sent, interpreter_class)
        end

        # add lemmas, if they are there. If they are not, don't print out a warning.
        if @exp.get("do_lemmatize")
          FrprepHelper.add_lemmas_from_tab(st_sent, tabformat_sent, mapping)
        end

        # add semantics: read semantic information from the tab file
        # and combine all targets of a sentence into one frame
        FrprepHelper.add_semantics_from_tab(st_sent, tabformat_sent, mapping,
                                            interpreter_class, @exp)

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # handle multiword targets
        FrprepHelper.handle_multiword_targets(st_sent,
                                              interpreter_class, @exp.get("language"))

        # handle Unknown frame names
        FrprepHelper.handle_unknown_framenames(st_sent, interpreter_class)

        outfile.puts st_sent.get
      end
      outfile.puts SalsaTigerXMLHelper.get_footer
    ensure
      # the file used to stay open: close it so that the data is flushed
      # and the handle is released, also when a sentence raises
      outfile.close
    end
  end
end
|
416
|
+
|
417
|
+
#############################################
|
418
|
+
# transform_stxml
|
419
|
+
#
|
420
|
+
# transformation for SalsaTigerXML data
|
421
|
+
#
|
422
|
+
# - If the input format was SalsaTigerXML:
|
423
|
+
# - Tag, lemmatize and parse, if the experiment file tells you so
|
424
|
+
#
|
425
|
+
# - If the origin is the Salsa corpus:
|
426
|
+
# Change frame names from Unknown\d+ to lemma_Unknown\d+
|
427
|
+
#
|
428
|
+
# - fix multiword lemmas, or at least try
|
429
|
+
# - transform to UTF 8
|
430
|
+
# Transformation for SalsaTigerXML input data:
# - tag, lemmatize and parse, if the experiment file says so
# - when parsing, carry semantics and lemmas over from the old
#   SalsaTigerXML sentences to the newly parsed ones
# - remove deprecated frames, optionally repair syn/sem mapping problems
# One <filename>.xml file per file delivered by DoParses is written to
# +output_dir+.
#
# @param parse_dir [String] name of directory for parse data
# @param tab_dir [String] name of directory for split/tab data
# @param input_dir [String] name of input directory
# @param output_dir [String] name of final output directory
# @param exp [FrprepConfigData] experiment configuration
# @raise [RuntimeError] on incomplete tool configuration or if an output
#   file cannot be opened for writing
def transform_stxml_dir(parse_dir, tab_dir, input_dir, output_dir, exp)
  ####
  # Data preparation

  # Data with Salsa as origin:
  # remember the target lemma as an attribute on the
  # <target> elements
  #
  # currently deactivated: encoding problems
  # if @exp.get("origin") == "SalsaTiger"
  #   $stderr.puts "Frprep: noting target lemmas"
  #   changed_input_dir = frprep_dirname("salsalemma", "new")
  #   FrprepHelper.note_salsa_targetlemmas(input_dir, changed_input_dir)
  #
  #   # remember changed input dir as input dir
  #   input_dir = changed_input_dir
  # end

  # for Berkeley => substitute ( ) for *LRB* *RRB*
  # (modified by ines, 27/08/08); changes the sentence string in place
  escape_brackets = lambda do |sent_string|
    if exp.get("parser") == "berkeley"
      sent_string.gsub!(/word='\('/, "word='*LRB*'")
      sent_string.gsub!(/word='\)'/, "word='*RRB*'")
      sent_string.gsub!(/word=\"\(\"/, "word='*LRB*'")
      sent_string.gsub!(/word=\"\)\"/, "word='*RRB*'")
    end
  end

  # If data is to be parsed, split and tabify input files
  # else copy data to stxml_indir.

  # stxml_dir: directory where SalsaTiger data is situated
  if @exp.get("do_parse")
    # split data
    stxml_splitdir = frprep_dirname("stxml_split", "new")
    stxml_dir = stxml_splitdir

    $stderr.puts "Frprep: splitting data"
    FrprepHelper.stxml_split_dir(input_dir, stxml_splitdir,
                                 @exp.get("parser_max_sent_num"),
                                 @exp.get("parser_max_sent_len"))
  else
    # no parsing: copy data to split dir
    stxml_dir = parse_dir
    $stderr.puts "Frprep: Copying data to #{stxml_dir}"
    Dir[input_dir + "*.xml"].each do |filename|
      # argument list instead of a shell string, so that file names with
      # blanks or shell metacharacters are copied correctly
      Kernel.system("cp", filename, stxml_dir + File.basename(filename))
    end
  end

  # Some syntactic processing will take place:
  # tabify data
  if @exp.get("do_parse") || @exp.get("do_lemmatize") || @exp.get("do_postag")
    $stderr.puts "Frprep: making input for syn. processing"

    Dir[stxml_dir + "*" + @file_suffixes["stxml"]].each do |stxmlfilename|
      tabfilename = tab_dir + File.basename(stxmlfilename, @file_suffixes["stxml"]) + @file_suffixes["tab"]
      FrprepHelper.stxml_to_tab_file(stxmlfilename, tabfilename, exp)
    end
  end

  ###
  # POS-tagging
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."
    unless @exp.get("pos_tagger_path") && @exp.get("pos_tagger")
      raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
    end

    sys_class = SynInterfaces.get_interface("pos_tagger", @exp.get("pos_tagger"))
    raise "Shouldn't be here" unless sys_class

    sys = sys_class.new(@exp.get("pos_tagger_path"),
                        @file_suffixes["tab"],
                        @file_suffixes["pos"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Lemmatization
  if @exp.get("do_lemmatize")
    $stderr.puts "Frprep: Lemmatizing."
    unless @exp.get("lemmatizer_path") && @exp.get("lemmatizer")
      raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
    end

    sys_class = SynInterfaces.get_interface("lemmatizer", @exp.get("lemmatizer"))
    raise "Shouldn't be here" unless sys_class

    sys = sys_class.new(@exp.get("lemmatizer_path"),
                        @file_suffixes["tab"],
                        @file_suffixes["lemma"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Parsing, production of SalsaTigerXML output

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  sys_class_names = {}
  [["do_postag", "pos_tagger"],
   ["do_lemmatize", "lemmatizer"],
   ["do_parse", "parser"]].each do |service, system_name|
    # yes, perform this service
    sys_class_names[system_name] = @exp.get(system_name) if @exp.get(service)
  end
  interpreter_class = SynInterfaces.get_interpreter(sys_class_names)
  raise "Shouldn't be here" unless interpreter_class

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => tab_dir,
                           "stxml_dir" => stxml_dir)
  parse_obj.each_parsed_file do |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    begin
      if @exp.get("do_parse")
        # read old SalsaTigerXML file
        # so we can integrate the old file's semantics later
        oldxml = [] # array of sentence strings
        # we assume that the old and the new file have the same name,
        # ending in .xml.
        oldxmlfile = FilePartsParser.new(stxml_dir + parsed_file_obj.filename + ".xml")
        oldxmlfile.scan_s { |sent_string| oldxml << sent_string }
      end

      outfile.puts SalsaTigerXMLHelper.get_header
      index = 0
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence do |st_sent, _tabformat_sent, _mapping|
        # parsed? then integrate semantics and lemmas from old file
        if @exp.get("do_parse")
          oldsent_string = oldxml[index]
          index += 1
          if oldsent_string
            escape_brackets.call(oldsent_string)

            # we have both an old and a new sentence, so integrate semantics
            oldsent = SalsaTigerSentence.new(oldsent_string)
            next if st_sent.nil?

            if FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp) == false
              # integration failed: try once more with the next old sentence
              oldsent_string = oldxml[index]
              index += 1
              if oldsent_string
                escape_brackets.call(oldsent_string)

                oldsent = SalsaTigerSentence.new(oldsent_string)
                FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp)
              end
            end
          else
            # no corresponding old sentence for this new sentence
            $stderr.puts "Warning: transporting semantics -- missing source sentence, skipping"
          end
        end

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # repair syn/sem mapping problems?
        if @exp.get("fe_syn_repair") || @exp.get("fe_rel_repair")
          FixSynSemMapping.fixit(st_sent, @exp, interpreter_class)
        end

        outfile.puts st_sent.get
      end # each ST sentence
      outfile.puts SalsaTigerXMLHelper.get_footer
    ensure
      # the file used to stay open: close it so that the data is flushed
      # and the handle is released, also when processing raises
      outfile.close
    end
  end # each file parsed
end
|
643
|
+
|
644
|
+
|
645
|
+
###################################
|
646
|
+
# general file iterators
|
647
|
+
|
648
|
+
# yields pairs of [infile name, outfile stream]
|
649
|
+
# Rewrites files in place: for every file matching <dir>*<suffix>, yields
# the pair [infile name, temp file stream]. After the block has written the
# new content to the temp file, the original is copied to <filename>.bak
# and the temp file is moved to the original location.
#
# @param dir [String] directory name
# @param suffix [String] filename pattern appended to "*" in the glob
# @yieldparam pair [Array(String, Tempfile)] infile name and outfile stream
def change_each_file_in_dir(dir, suffix)
  Dir[dir + "*#{suffix}"].each do |filename|
    tempfile = Tempfile.new("FrprepHelper")
    begin
      yield [filename, tempfile]

      # move temp file to original file location;
      # argument lists instead of shell strings, so that file names with
      # blanks or shell metacharacters are handled correctly
      tempfile.close
      Kernel.system("cp", filename, "#{filename}.bak")
      Kernel.system("mv", tempfile.path, filename)
    ensure
      # remove the temp file, also when the block raises
      tempfile.close(true)
    end
  end # each file
end
|
662
|
+
|
663
|
+
#######
|
664
|
+
# change_each_stxml_file_in_dir
|
665
|
+
#
|
666
|
+
# use change_each_file_in_dir, but assume that the files
|
667
|
+
# are SalsaTigerXML files: Keep file headers and footers,
|
668
|
+
# and just offer individual sentences for changing
|
669
|
+
#
|
670
|
+
# Yields SalsaTigerSentence objects, each sentence to be changed
|
671
|
+
# Like change_each_file_in_dir, but for SalsaTigerXML files: header and
# footer of each file are copied unchanged, and every sentence is offered
# to the caller's block as a SalsaTigerSentence object. What is written
# back is the sentence as it looks after the block has run.
#
# @param dir [String] directory name
# @yieldparam sent [SalsaTigerSentence] sentence to be changed
def change_each_stxml_file_in_dir(dir)
  change_each_file_in_dir(dir, "*.xml") do |stxml_path, out|
    parts = FilePartsParser.new(stxml_path)

    # header first
    out.puts parts.head

    # one sentence at a time: let the caller change it, then write it out
    parts.scan_s do |raw_sentence|
      sentence = SalsaTigerSentence.new(raw_sentence)
      yield sentence
      out.puts sentence.get
    end

    # footer last
    out.puts parts.tail
    parts.close
  end
end
|
692
|
+
end
|
693
|
+
end
|