shalmaneser 1.2.0.rc4 → 1.2.rc5
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/shalmaneser +8 -2
- data/doc/index.md +1 -0
- data/lib/shalmaneser/opt_parser.rb +68 -67
- metadata +49 -119
- data/bin/fred +0 -16
- data/bin/frprep +0 -34
- data/bin/rosy +0 -17
- data/lib/common/AbstractSynInterface.rb +0 -1229
- data/lib/common/Counter.rb +0 -18
- data/lib/common/EnduserMode.rb +0 -27
- data/lib/common/Eval.rb +0 -480
- data/lib/common/FixSynSemMapping.rb +0 -196
- data/lib/common/Graph.rb +0 -345
- data/lib/common/ISO-8859-1.rb +0 -24
- data/lib/common/ML.rb +0 -186
- data/lib/common/Mallet.rb +0 -236
- data/lib/common/Maxent.rb +0 -229
- data/lib/common/Optimise.rb +0 -195
- data/lib/common/Parser.rb +0 -213
- data/lib/common/RegXML.rb +0 -269
- data/lib/common/RosyConventions.rb +0 -171
- data/lib/common/STXmlTerminalOrder.rb +0 -194
- data/lib/common/SalsaTigerRegXML.rb +0 -2347
- data/lib/common/SalsaTigerXMLHelper.rb +0 -99
- data/lib/common/SynInterfaces.rb +0 -282
- data/lib/common/TabFormat.rb +0 -721
- data/lib/common/Tiger.rb +0 -1448
- data/lib/common/Timbl.rb +0 -144
- data/lib/common/Tree.rb +0 -61
- data/lib/common/config_data.rb +0 -470
- data/lib/common/config_format_element.rb +0 -220
- data/lib/common/headz.rb +0 -338
- data/lib/common/option_parser.rb +0 -13
- data/lib/common/prep_config_data.rb +0 -62
- data/lib/common/prep_helper.rb +0 -1330
- data/lib/common/ruby_class_extensions.rb +0 -310
- data/lib/db/db_interface.rb +0 -48
- data/lib/db/db_mysql.rb +0 -145
- data/lib/db/db_sqlite.rb +0 -280
- data/lib/db/db_table.rb +0 -239
- data/lib/db/db_wrapper.rb +0 -176
- data/lib/db/sql_query.rb +0 -243
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/fred/Baseline.rb +0 -150
- data/lib/fred/FileZipped.rb +0 -31
- data/lib/fred/FredBOWContext.rb +0 -877
- data/lib/fred/FredConventions.rb +0 -232
- data/lib/fred/FredDetermineTargets.rb +0 -319
- data/lib/fred/FredEval.rb +0 -312
- data/lib/fred/FredFeatureExtractors.rb +0 -322
- data/lib/fred/FredFeatures.rb +0 -1061
- data/lib/fred/FredFeaturize.rb +0 -602
- data/lib/fred/FredNumTrainingSenses.rb +0 -27
- data/lib/fred/FredParameters.rb +0 -402
- data/lib/fred/FredSplit.rb +0 -84
- data/lib/fred/FredSplitPkg.rb +0 -180
- data/lib/fred/FredTest.rb +0 -606
- data/lib/fred/FredTrain.rb +0 -144
- data/lib/fred/PlotAndREval.rb +0 -480
- data/lib/fred/fred.rb +0 -47
- data/lib/fred/fred_config_data.rb +0 -185
- data/lib/fred/md5.rb +0 -23
- data/lib/fred/opt_parser.rb +0 -250
- data/lib/frprep/Ampersand.rb +0 -39
- data/lib/frprep/CollinsInterface.rb +0 -1165
- data/lib/frprep/Counter.rb +0 -18
- data/lib/frprep/FNCorpusXML.rb +0 -643
- data/lib/frprep/FNDatabase.rb +0 -144
- data/lib/frprep/FrameXML.rb +0 -513
- data/lib/frprep/Graph.rb +0 -345
- data/lib/frprep/MiniparInterface.rb +0 -1388
- data/lib/frprep/RegXML.rb +0 -269
- data/lib/frprep/STXmlTerminalOrder.rb +0 -194
- data/lib/frprep/SleepyInterface.rb +0 -384
- data/lib/frprep/TntInterface.rb +0 -44
- data/lib/frprep/TreetaggerInterface.rb +0 -327
- data/lib/frprep/do_parses.rb +0 -143
- data/lib/frprep/frprep.rb +0 -693
- data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
- data/lib/frprep/interfaces/stanford_interface.rb +0 -353
- data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
- data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
- data/lib/frprep/one_parsed_file.rb +0 -28
- data/lib/frprep/opt_parser.rb +0 -94
- data/lib/frprep/ruby_class_extensions.rb +0 -310
- data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
- data/lib/rosy/ExternalConfigData.rb +0 -58
- data/lib/rosy/FailedParses.rb +0 -130
- data/lib/rosy/FeatureInfo.rb +0 -242
- data/lib/rosy/GfInduce.rb +0 -1115
- data/lib/rosy/GfInduceFeature.rb +0 -148
- data/lib/rosy/InputData.rb +0 -294
- data/lib/rosy/RosyConfusability.rb +0 -338
- data/lib/rosy/RosyEval.rb +0 -465
- data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
- data/lib/rosy/RosyFeaturize.rb +0 -281
- data/lib/rosy/RosyInspect.rb +0 -336
- data/lib/rosy/RosyIterator.rb +0 -478
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
- data/lib/rosy/RosyPruning.rb +0 -165
- data/lib/rosy/RosyServices.rb +0 -744
- data/lib/rosy/RosySplit.rb +0 -232
- data/lib/rosy/RosyTask.rb +0 -19
- data/lib/rosy/RosyTest.rb +0 -829
- data/lib/rosy/RosyTrain.rb +0 -234
- data/lib/rosy/RosyTrainingTestTable.rb +0 -787
- data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
- data/lib/rosy/View.rb +0 -418
- data/lib/rosy/opt_parser.rb +0 -379
- data/lib/rosy/rosy.rb +0 -78
- data/lib/rosy/rosy_config_data.rb +0 -121
- data/lib/shalmaneser/version.rb +0 -3
data/lib/common/option_parser.rb
DELETED
@@ -1,13 +0,0 @@
-# -*- encoding: us-ascii -*-
-
-# AB, 2010-11-25
-
-
-# It is a general class for parsing options.
-# It is now emtpy, we are implementing three different classes:
-# FRPrepOptionParser, RosyOptionParser and FredOptionParser.
-# All classes above inherit from OptionParser.
-#--
-# TODO: move the functionality to the parent class.
-class OptionParser
-end
data/lib/common/prep_config_data.rb
DELETED
@@ -1,62 +0,0 @@
-# FPrepConfigData
-# Katrin Erk July 05
-#
-# Preprocessing for Fred and Rosy:
-# access to a configuration and experiment description file
-
-require "common/config_data"
-
-##############################
-# Class FrPrepConfigData
-#
-# inherits from ConfigData,
-# sets variable names appropriate to preprocessing task
-
-class FrPrepConfigData < ConfigData
-
-CONFIG_DEFS = {"prep_experiment_ID" => "string", # experiment identifier
-"frprep_directory" => "string", # dir for frprep internal data
-# information about the dataset
-"language" => "string", # en, de
-"origin"=> "string", # FrameNet, Salsa, or nothing
-"format" => "string", # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
-"encoding" => "string", # utf8, iso, hex, or nothing
-
-# directories
-"directory_input" => "string", # dir with input data
-"directory_preprocessed" => "string", # dir with output Salsa/Tiger XML data
-"directory_parserout" => "string", # dir with parser output for the parser named below
-
-# syntactic processing
-"pos_tagger" => "string", # name of POS tagger
-"lemmatizer" => "string", # name of lemmatizer
-"parser" => "string", # name of parser
-"pos_tagger_path" => "string", # path to POS tagger
-"lemmatizer_path" => "string", # path to lemmatizer
-"parser_path" => "string", # path to parser
-"parser_max_sent_num" => "integer", # max number of sentences per parser input file
-"parser_max_sent_len" => "integer", # max sentence length the parser handles
-
-"do_parse" => "bool", # use parser?
-"do_lemmatize" => "bool",# use lemmatizer?
-"do_postag" => "bool", # use POS tagger?
-
-# output format: if tabformat_output == true,
-# output in Tab format rather than Salsa/Tiger XML
-# (this will not work if do_parse == true)
-"tabformat_output" => "bool",
-
-# syntactic repairs, dependent on existing semantic role annotation
-"fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
-"fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
-}
-
-def initialize(filename)
-# @param filename [String] path to a config file
-# @param CONFIG_DEFS [Hash] a list of configuration definitions
-super(filename, CONFIG_DEFS, [])
-end
-end
-
-
-
data/lib/common/prep_helper.rb
DELETED
@@ -1,1330 +0,0 @@
-# Salsa packages
-require "common/ISO-8859-1"
-require "common/Parser"
-require "common/RegXML"
-require "common/SalsaTigerRegXML"
-require "common/SalsaTigerXMLHelper"
-require "common/TabFormat"
-require "common/ruby_class_extensions"
-require "common/AbstractSynInterface"
-
-############################################3
-# Module FrprepHelper:
-# diverse transformation methods for frprep.rb
-# moved over here to make the main file less crowded
-module FrprepHelper
-
-####
-# transform a file to UTF-8 from a given encoding
-def FrprepHelper.to_utf8_file(input_filename, # string: name of input file
-output_filename, # string: name of output file
-encoding) # string: "iso", "hex"
-begin
-infile = File.new(input_filename)
-outfile = File.new(output_filename, "w")
-rescue
-raise "Could not read #{input_filename}, or could not write to #{output_filename}."
-end
-
-while (line = infile.gets())
-case encoding
-when "iso"
-outfile.puts UtfIso.from_iso_8859_1(line)
-when "hex"
-outfile.puts UtfIso.from_iso_8859_1(Ampersand.hex_to_iso(line))
-else
-raise "Shouldn't be here."
-end
-end
-infile.close()
-outfile.close()
-end
-
-####
-# transform BNC format file to plaintext file
-def FrprepHelper.bnc_to_plain_file(input_filename, # string: name of input file
-output_filename) # string: name of output file
-begin
-infile = File.new(input_filename)
-outfile = File.new(output_filename, "w")
-rescue
-raise "Could not read #{input_filename}, or could not write to #{output_filename}."
-end
-
-infile.each { |line|
-# does this line contain a sentence?
-if line =~ /^\s*<s\s+n=/
-# remove all tags, replace by spaces,
-# then remove superfluous spaces
-textline = line.gsub(/<.+?>/, " ").strip().squeeze(" ")
-
-
-textline.gsub!(/&bquo;/, '"')
-textline.gsub!(/&equo;/, '"')
-textline.gsub!(/—/, "-")
-textline.gsub!(/–/, "-")
-textline.gsub!(/%/, "%")
-textline.gsub!(/£/, " pounds ")
-textline.gsub!(/&/, " and ")
-textline.gsub!(/…/, "...")
-textline.gsub!(/©/, "(copyright)")
-textline.gsub!(/é/, "e")
-textline.gsub!(/•/, "*")
-textline.gsub!(/$/, "$")
-textline.gsub!(/°/, " degree ")
-
-textline.gsub!(/½/, "1/2")
-textline.gsub!(/¾/, "3/4")
-
-textline.gsub!(/[/, "[")
-textline.gsub!(/]/, "]")
-
-textline.gsub!(/&ins;/, "i")
-textline.gsub!(/&ft;/, "ft")
-
-textline.gsub!(/→/, ">")
-textline.gsub!(/←/, "<")
-
-
-textline.gsub!(/á/, "a")
-textline.gsub!(/ä/, "a")
-textline.gsub!(/à/, "a")
-textline.gsub!(/ã/, "a")
-textline.gsub!(/â/, "a")
-textline.gsub!(/Á/, "A")
-textline.gsub!(/Ä/, "A")
-textline.gsub!(/À/, "A")
-textline.gsub!(/Ã/, "A")
-textline.gsub!(/Â/, "A")
-
-textline.gsub!(/é/, "e")
-textline.gsub!(/è/, "e")
-textline.gsub!(/ê/, "e")
-textline.gsub!(/ë/, "e")
-textline.gsub!(/É/, "E")
-textline.gsub!(/È/, "E")
-textline.gsub!(/Ê/, "E")
-textline.gsub!(/Ë/, "E")
-
-textline.gsub!(/í/, "i")
-textline.gsub!(/ì/, "i")
-textline.gsub!(/î/, "i")
-textline.gsub!(/ï/, "i")
-textline.gsub!(/Í/, "I")
-textline.gsub!(/Ì/, "I")
-textline.gsub!(/Î/, "I")
-
-textline.gsub!(/ó/, "o")
-textline.gsub!(/ò/, "o")
-textline.gsub!(/ô/, "o")
-textline.gsub!(/ö/, "o")
-textline.gsub!(/Ó/, "O")
-textline.gsub!(/Ò/, "O")
-textline.gsub!(/Ô/, "O")
-textline.gsub!(/Ö/, "O")
-
-textline.gsub!(/ú/, "u")
-textline.gsub!(/ù/, "u")
-textline.gsub!(/û/, "u")
-textline.gsub!(/ü/, "u")
-textline.gsub!(/Ú/, "U")
-textline.gsub!(/Ù/, "U")
-textline.gsub!(/Û/, "U")
-textline.gsub!(/Ü/, "U")
-
-textline.gsub!(/ÿ/, "y")
-textline.gsub!(/Ÿ/, "Y")
-
-textline.gsub!(/ñ/, "n")
-textline.gsub!(/Ñ/, "N")
-
-textline.gsub!(/ç/, "c")
-textline.gsub!(/Ç/, "C")
-
-
-outfile.puts textline
-end
-}
-infile.close()
-outfile.close()
-end
-
-
-####
-# transform plaintext file to Tab format file
-def FrprepHelper.plain_to_tab_file(input_filename,# string: name of input file
-output_filename) # string: name of output file
-begin
-infile = File.new(input_filename)
-outfile = File.new(output_filename, "w")
-rescue
-raise "Could not read #{input_filename}, or could not write to #{output_filename}."
-end
-
-# AB: TODO This assumes all input files have the extension <txt>.
-# Is it good?
-filename_core = File.basename(input_filename, 'txt')
-
-# array(string): keep the words of each sentence
-sentence = []
-# sentence number for making the sentence ID:
-# global count, over all input files
-sentno = 0
-
-while line = infile.gets
-
-# make a sentence ID for the next sentence: running number
-sentid = "#{filename_core}_#{sentno}"
-sentno += 1
-
-# read words into the sentence array,
-# separating out punctuation attached to the beginning or end of words
-sentence.clear
-
-# AB: TODO Remove this naive tokenizer, better to have a fully
-# tokenized input using an external tokenizer than that.
-line.split.each { |word|
-# punctuation at the beginning of the word
-#if word =~ /^([\(\[`'\"-]+)(.*)$/
-if word =~ /^([\(\[`\"-]+)(.*)$/
-punct = $1
-word = $2
-punct.scan(/./) { |single_punct|
-sentence << single_punct
-}
-
-end
-# punctuation at the end of the word
-#if word =~ /[,:;-\`?!'\"\.\)\]]+$/
-if word =~ /[,:;-\`?!\"\.\)\]]+$/
-sentence << $` # part before the match: the word
-punct = $&
-punct.scan(/./) { |single_punct|
-sentence << single_punct
-}
-
-else
-# no punctuation recognized
-sentence << word
-end
-}
-
-
-
-# remove empty words
-# AB: TODO Is it possible? Remove this.
-sentence.reject! { |word| word.nil? or word.strip.empty? }
-
-# write words to tab file
-# KE Dec 06: TabFormat changed
-sentence.each { |word|
-# for each word, one line, entries in the line tab-separated
-# the 'word' entry is the word, the 'lu_sent_ids' entry is the sentence ID sentid,
-# all other entries (gf, pt, frame etc.) are not set
-outfile.puts FNTabFormatFile.format_str({
-"word" => word,
-"sent_id" => sentid
-})
-}
-outfile.puts
-end
-outfile.close
-end
-
-###########
-#
-# class method split_dir:
-# read all files in one directory and produce chunk files with _suffix_ in outdir
-# with a certain number of files in them (sent_num).
-# Optionally, remove all sentences longer than sent_leng
-#
-# produces output files 1.<suffix>, 2.<suffix>, etc.
-#
-# assumes TabFormat sentences
-#
-# example: split_all("/tmp/in","/tmp/out",".tab",2000,80)
-
-def FrprepHelper.split_dir(indir,
-outdir,
-suffix,
-sent_num,
-sent_leng=nil)
-
-unless indir[-1,1] == "/"
-indir += "/"
-end
-unless outdir[-1,1] == "/"
-outdir += "/"
-end
-
-outfile_counter = 0
-line_stack = Array.new
-sent_stack = Array.new
-
-Dir[indir+"*#{suffix}"].each {|infilename|
-STDERR.puts "Now splitting #{infilename}"
-infile = File.new(infilename)
-
-while line = infile.gets
-line.chomp!
-case line
-when "" # end of sentence
-if !(sent_leng.nil? or line_stack.length < sent_leng) # record sentence
-# suppress multiple empty lines
-# to avoid problems with lemmatiser
-# only record sent_stack if it is not empty.
-
-# change (sp 15 01 07): just cut off sentence at sent_leng.
-
-STDERR.puts "Cutting off long sentence #{line_stack.last.split("\t").last}"
-line_stack = line_stack[0..sent_leng-1]
-end
-unless line_stack.empty?
-sent_stack << line_stack
-# reset line_stack
-line_stack = Array.new
-end
-
-
-# check if we have to empty the sent stack
-if sent_stack.length == sent_num # enough sentences for new outfile?
-outfile = File.new(outdir+outfile_counter.to_s+"#{suffix}","w")
-sent_stack.each {|l_stack|
-outfile.puts l_stack.join("\n")
-outfile.puts
-}
-outfile.close
-outfile_counter += 1
-sent_stack = Array.new
-end
-
-else # for any other line
-line_stack << line
-end
-end
-infile.close
-}
-# the last remaining sentences
-unless sent_stack.empty?
-outfile = File.new(outdir+outfile_counter.to_s+"#{suffix}","w")
-sent_stack.each {|l_stack|
-l_stack << "\n"
-outfile.puts l_stack.join("\n")
-}
-outfile.close
-end
-end
-
-####
-# note salsa targetlemma
-#
-# old_dir contains xml files whose name starts with the
-# target lemma for all frames in the file
-# record that target lemma in the <target> element of each frame
-def FrprepHelper.note_salsa_targetlemma(old_dir, # string ending in /
-new_dir) # string ending in /
-
-
-# each input file: extract target lemma from filename,
-# not this lemma in the <target> element of each frame
-Dir[old_dir + "*.xml"].each { |filename|
-changedfilename = new_dir + File.basename(filename)
-
-if File.basename(filename) =~ /^(.*?)[_\.]/
-lemma = $1
-
-infile = FilePartsParser.new(filename)
-outfile = File.new(changedfilename, "w")
-
-# write header
-outfile.puts infile.head()
-
-# iterate through sentences, yield as SalsaTigerSentence objects
-infile.scan_s() { |sent_string|
-sent = SalsaTigerSentence.new(sent_string)
-sent.each_frame { |frame|
-frame.target.set_attribute("lemma", lemma)
-}
-
-# write changed sentence
-outfile.puts sent.get()
-} # each sentence
-
-# write footer
-outfile.puts infile.tail()
-infile.close()
-outfile.close()
-
-else
-# couldn't determine lemma
-# just copy the file
-`cp #{filename} #{changedfilename}`
-end
-}
-end
-
-####
-# stxml_split_dir
-#
-# split SalsaTigerXML files into new files of given length,
-# skipping sentences that are too long
-#
-# At the same time, sentences that occur several times (i.e. sentences which are
-# annotated by SALSA for more than one predicate) are compacted into one occurrence
-# with combined semantics.
-#
-# assumes that all files in input_dir with
-# extension .xml are SalsaTigerXMl files
-def FrprepHelper.stxml_split_dir(input_dir, # string: input directory with STXML files
-split_dir, # string: output directory
-max_sentnum, # integer: max num of sentences per file
-max_sentlen) # integer: max num of terminals per sentence
-
-filenames = Dir[input_dir+"*.xml"].to_a
-
-graph_hash = Hash.new # for each sentence id, keep <s...</graph>
-frame_hash = Hash.new # for each sentence id , keep the <frame... </frame> string
-uspfes_hash = Hash.new # for each sentence id, keep the uspfes stuff
-uspframes_hash = Hash.new # for each sentence id, keep the uspframes stuff
-
-########################
-# Traverse of file(s): compute an index of all frames for each sentence, with unique identifiers
-
-filenames.each {|filename|
-
-infile = FilePartsParser.new(filename)
-infile.scan_s {|sent_str|
-
-sentlen = 0
-sent_str.delete("\n").scan(/<t\s/) { |occ| sentlen += 1}
-if sentlen > max_sentlen
-sent = RegXML.new(sent_str)
-# revisit handling of long sentences
-# $stderr.puts "I would have skipped overly long sentence " + sent.attributes["id"]+" but Sebastian forbade me.".to_s
-# next
-end
-
-# substitute old frame identifiers with new, unique ones
-
-# problem: we may have several frames per sentence, and need to keep track of them
-# if we rename etc sxx_f1 to sxx_f2 and there is already a sxx_f2, then
-# we cannot distinguish between these frames
-
-# therefore, we substitute temporary identifiers until we have substituted
-# all ids with temporary ones, and re-substitute final ones at the end.
-
-this_frames = Array.new
-
-temp_subs = Array.new
-final_subs = Array.new
-
-sent = RegXML.new(sent_str)
-sentid = sent.attributes["id"].to_s
-if sentid.nil?
-STDERR.puts "[frprep] Warning: cannot find sentence id, skipping sentence:"
-STDERR.puts sent_str
-# strange sentence, no ID? skip
-next
-end
-
-unless frame_hash.key? sentid
-frame_hash[sentid] = Array.new
-uspfes_hash[sentid] = Array.new
-uspframes_hash[sentid] = Array.new
-end
-
-# find everything up to and including the graph
-sent_children = sent.children_and_text()
-graph = sent_children.detect { |child| child.name == "graph" }
-graph_hash[sentid] = "<s " +
-sent.attributes.to_a.map { |at, val| "#{at}=\'#{val}\'" }.join(" ") +
-">" +
-graph.to_s
-
-# find the usp block
-
-sem = sent_children.detect { |child| child.name == "sem"}
-usp = ""
-if sem
-usp = sem.children_and_text.detect { |child| child.name == "usp" }
-usp = usp.to_s
-end
-
-# find all frames
-if sem
-frames = sem.children_and_text.detect { |child| child.name == "frames" }
-if frames
-frames.children_and_text.each { |frame|
-unless frame.name == "frame"
-next
-end
-frameid = frame.attributes["id"]
-
-temp_frameid = "#{sentid}_temp_f#{frame_hash[sentid].length+this_frames.length+1}"
-final_frameid = "#{sentid}_f#{frame_hash[sentid].length+this_frames.length+1}"
-
-temp_subs << [frameid,temp_frameid]
-final_subs << [temp_frameid,final_frameid]
-
-this_frames << frame.to_s
-}
-end
-end
-
-# now first rename all the frames to temporary names
-
-temp_subs.each {|orig_frameid, temp_frameid|
-this_frames.map! {|frame_str|
-#print "orig ", orig_frameid, " temp ", temp_frameid, "\n"
-frame_str.gsub(orig_frameid,temp_frameid)
-}
-
-usp.gsub!(orig_frameid,temp_frameid)
-}
-
-# and re-rename the temporary names
-
-final_subs.each {|temp_frameid, final_frameid|
-this_frames.map! {|frame_str|
-frame_str.gsub(temp_frameid,final_frameid)
-}
-usp.gsub!(temp_frameid, final_frameid)
-}
-
-# store frames in data structure
-this_frames.each {|frame_str|
-frame_hash[sentid] << frame_str
-}
-
-# store uspfes in data structure
-unless usp.empty?
-usp_elt = RegXML.new(usp)
-uspfes = usp_elt.children_and_text.detect { |child| child.name == "uspfes" }
-uspfes.children_and_text.each { |child|
-unless child.name == "uspblock"
-next
-end
-uspfes_hash[sentid] << child.to_s
-}
-
-# store uspframes in data structure
-uspframes = usp_elt.children_and_text.detect { |child| child.name == "uspframes" }
-uspframes.children_and_text.each { |child|
-unless child.name == "uspblock"
-next
-end
-uspframes_hash[sentid] << child.to_s
-}
-end
-}
-}
-
-# now write everything in the data structure back to a file
-
-filecounter = 0
-sentcounter = 0
-outfile = nil
-sent_stack = Array.new
-
-graph_hash.sort {|a,b| a[0].to_i <=> b[0].to_i}.each {|sentid,graph_str|
-
-if sentcounter == max_sentnum
-outfile.puts SalsaTigerXMLHelper.get_footer
-outfile.close
-outfile = nil
-end
-
-unless outfile
-outfile = File.new(split_dir+filecounter.to_s+".xml","w")
-outfile.puts SalsaTigerXMLHelper.get_header
-filecounter +=1
-sentcounter = 0
-end
-
-xml = Array.new
-xml << graph_str
-xml << "<sem>"
-xml << "<globals>"
-xml << "</globals>"
-xml << "<frames>"
-frame_hash[sentid].each {|frame_str|
-xml << frame_str
-}
-xml << "</frames>"
-xml << "<usp>"
-xml << "<uspframes>"
-uspframes_hash[sentid].each {|uspblock_str|
-xml << uspblock_str
-}
-xml << "</uspframes>"
-xml << "<uspfes>"
-uspfes_hash[sentid].each {|uspblock_str|
-xml << uspblock_str
-}
-xml << "</uspfes>"
-xml << "</usp>"
-xml << "</sem>"
-xml << "</s>"
-
-outfile.puts xml.join("\n")
-sentcounter += 1
-}
-
-if outfile
-outfile.puts SalsaTigerXMLHelper.get_footer
-outfile.close
-outfile = nil
-end
-
-end
-
-
-####
-# transform SalsaTigerXML file to Tab format file
-def FrprepHelper.stxml_to_tab_file(input_filename, # string: name of input file
-output_filename, # string: name of output file
-exp) # FrprepConfigData
-infile = FilePartsParser.new(input_filename)
-begin
-outfile = File.new(output_filename,"w")
-rescue
-raise "Stxml to tab: could not write to tab file #{output_filename}"
-end
-
-infile.scan_s {|sent_string|
-
-# determine sentence ID
-sentid = RegXML.new(sent_string).attributes["id"]
-unless sentid
-$stderr.puts "No sentence ID in sentence:\n "+ sent_string
-$stderr.puts "Making a new one up."
-sentid = Time.new().to_f.to_s
-end
-
-# find terminals and process them
-unless sent_string.delete("\n") =~ /<terminals[ >].+<\/terminals>/
-$stderr.puts "Warning: could not find terminals in sentence:"
-$stderr.puts sent_string
-$stderr.puts "Skipping"
-next
-end
-
-# modified by ines, 27/08/08
-# for Berkeley => convert ( ) to -LRB- -RRB-
-
-text = $&
-if exp.get("parser") == "berkeley"
-text.gsub!(/word='\('/, "word='*LRB*'")
-text.gsub!(/word='\)'/, "word='*RRB*'")
-text.gsub!(/word=['"]``['"]/, "word='\"'")
-text.gsub!(/word=['"]''['"]/, "word='\"'")
-text.gsub!(/word=['"]\'\'['"]/, "word='\"'")
-#text.gsub!(/word=['"]\(['"]/, "word='-LRB-'")
-#text.gsub!(/word=['"]\)['"]/, "word='-RRB-'")
-
-end
-terminals = text
-#terminals = sent_string
-terminals = RegXML.new(terminals)
-terminals.children_and_text.each { |terminal|
-
-unless terminal.name == "t"
-# not a terminal after all
-next
-end
-
-
-outfile.puts FNTabFormatFile.format_str({
-"word" => SalsaTigerXMLHelper.unescape(terminal.attributes["word"]),
-"sent_id" => sentid
-})
-} # each terminal
-outfile.puts
-} # each sentence
-outfile.close
-end
-
-###
-# add semantics from tab:
-#
-# add information about semantics from a FN tab sentence
-# to a SalsaTigerSentence object:
-# - frames (one frame per sentence)
-# - roles
-# - FrameNet grammatical functions
-# - FrameNet POS of target
-def FrprepHelper.add_semantics_from_tab(st_sent, # SalsaTigerSentence object
-tab_sent, # FNTabFormatSentence object
-mapping, # hash: tab lineno -> array:SynNode
-interpreter_class, # SynInterpreter class
-exp) # FrprepConfigData
-
-if tab_sent.nil?
-# tab sentence not found
-return
-end
-
-# iterate through frames in the tabsent
-frame_index = 0
-tab_sent.each_frame { |tab_frame_obj|
-frame_name = tab_frame_obj.get_frame() # string
-
-if frame_name.nil? or frame_name =~ /^-*$/
-# weird: a frame without a frame
-$stderr.puts "Warning: frame entry without a frame in tab sentence #{st_sent.id}."
-$stderr.puts "Skipping"
-next
-end
-
-frame_node = st_sent.add_frame(frame_name, tab_sent.get_sent_id() + "_f#{frame_index}")
-frame_index += 1
-
-# target
-target_nodes = Array.new
-tab_frame_obj.get_target_indices.each {|terminal_id|
-if mapping[terminal_id]
-target_nodes.concat mapping[terminal_id]
-end
-}
-
-# let the interpreter class decide on how to determine the maximum constituents
-target_maxnodes = interpreter_class.max_constituents(target_nodes, st_sent)
-if target_maxnodes.empty?
-# HIEr
-STDERR.puts "Warning: no target in frame entry, sentence #{st_sent.id}."
-$stderr.puts "frame is #{frame_name}, frame no #{frame_index}"
-$stderr.puts "Skipping."
-$stderr.puts "target indices: " + tab_frame_obj.get_target_indices.join(", ")
-#tab_sent.each_line { |line|
-# $stderr.puts line
-# $stderr.puts "--"
-#}
-next
-end
-frame_node.add_fe("target",target_maxnodes)
-
-# set features on target: target lemma, target POS
-target_lemma = tab_frame_obj.get_target()
-target_pos = nil
-if target_lemma
-if exp.get("origin") == "FrameNet"
-# FrameNet data: here the lemma in the tab file has the form
-# <lemma>.<POS>
-# separate the two
-if target_lemma =~ /^(.*)\.(.*)$/
-target_lemma = $1
-target_pos = $2
-end
-end
-frame_node.target.set_attribute("lemma", target_lemma)
-if target_pos
-frame_node.target.set_attribute("pos", target_pos)
-end
-end
-
-# roles, GF, PT
-# synnode_markable_label:
-# hash "role" | "gf" | "pt" -> SynNode -> array: label(string)
-layer_synnode_label = Hash.new
-["gf", "pt", "role"].each {|layer|
-termids2labels = tab_frame_obj.markables(layer)
-
-unless layer_synnode_label[layer]
-layer_synnode_label[layer] = Hash.new
-end
-
-termids2labels.each {|terminal_indices, label|
-terminal_indices.each { |t_i|
-
-if (nodes = mapping[t_i])
-
-nodes.each { |node|
-unless layer_synnode_label[layer][node]
-layer_synnode_label[layer][node] = Array.new
-end
-
-layer_synnode_label[layer][node] << label
-} # each node that t_i maps to
-end # if t_i maps to anything
-
-} # each terminal index
-} # each mapping terminal indices -> label
-} # each layer
-
-# 'stuff' (Support and other things)
-layer_synnode_label["stuff"] = Hash.new
-tab_frame_obj.each_line_parsed { |line_obj|
-if (label = line_obj.get("stuff")) != "-"
-if (nodes = mapping[line_obj.get("lineno")])
-nodes.each { |node|
-unless layer_synnode_label["stuff"][node]
-layer_synnode_label["stuff"][node] = Array.new
-end
-layer_synnode_label["stuff"][node] << label
-}
-end
-end
-}
-
-# reencode:
-# hash role_label(string) -> array of tuples [synnodes, gflabels, ptlabels]
-# synnodes: array:SynNode. gflabels, ptlabels: array:String
-#
-# note that in this step, any gf or pt labels that have been
-# assigned to a SynNode that has not also been assigned a role
-# will be lost
-role2nodes_labels = Hash.new
-layer_synnode_label["role"].each_pair { |synnode, labels|
-labels.each { | rolelabel|
-unless role2nodes_labels[rolelabel]
-role2nodes_labels[rolelabel] = Array.new
-end
-
-role2nodes_labels[rolelabel] << [
-synnode,
-layer_synnode_label["gf"][synnode],
-layer_synnode_label["pt"][synnode]
-]
-} # each role label
-} # each pair SynNode/role labels
-
-# reencode "stuff", but only the support cases
-role2nodes_labels["Support"] = Array.new()
-
-layer_synnode_label["stuff"].each_pair { |synnode, labels|
-labels.each { |stufflabel|
-if stufflabel =~ /Supp/
-# some sort of support
-role2nodes_labels["Support"] << [synnode, nil, nil]
-end
-}
-}
-
-##
-# each role label:
-# make FeNode for the current frame
-role2nodes_labels.each_pair { |rolelabel, node_gf_pt|
-
-# get list of syn nodes, GF and PT labels for this role
-# shortcut for GF and PT labels: take any labels that have
-# been assigned for _some_ Synnode of this role
-synnodes = node_gf_pt.map { |ngp| ngp[0] }
-gflabels = node_gf_pt.map { |ngp| ngp[1] }.compact.flatten.uniq
-ptlabels = node_gf_pt.map { |ngp| ngp[2] }.compact.flatten.uniq
-
-
-# let the interpreter class decide on how to
-# determine the maximum constituents
-maxnodes = interpreter_class.max_constituents(synnodes, st_sent)
-
-fe_node = st_sent.add_fe(frame_node, rolelabel, maxnodes)
-unless gflabels.empty?
-fe_node.set_attribute("gf", gflabels.join(","))
-end
-unless ptlabels.empty?
-fe_node.set_attribute("pt", ptlabels.join(","))
-end
-} # each role label
-} # each frame
-end
-
-
-######
-# handle multiword targets:
-# if you find a verb with a separate prefix,
-# change the verb's lemma information accordingly
-# and add an attribute "other_words" to the verb node
-# pointing to the other node
-#
-# In general, it will be assumed that "other_words" contains
-# a list of node IDs for other nodes belonging to the same
-# group, node IDs separated by spaces, and that
-# each node of a group has the "other_words" attribute.
-#
-def FrprepHelper.handle_multiword_targets(sent, # SalsaTigerSentence object
-interpreter, # SynInterpreter object
-language) # string: en, de
-##
-# only retain the interesting words of the sentence:
-# content words and prepositions
-if sent.nil?
-return
-end
-
-nodes = sent.terminals.select { |node|
-[
-"adj", "adv", "card", "noun", "part", "prep", "verb"
-].include? interpreter.category(node)
-}
-
-##
-# group:
-# group verbs with their separate particles
-# (at a later point, other types of grouping can be inserted here)
-groups = FrprepHelper.group_words(nodes, interpreter)
-
-##
-# record grouping information as attributes on the terminals.
-groups.each { |descr, group_of_nodes|
-case descr
-when "none"
-# no grouping
-when "part"
-# separate particle belonging to a verb
-
-# group_of_nodes is a pair [verb, particle]
-verb, particle = group_of_nodes
-
-verb.set_attribute("other_words", particle.id())
-particle.set_attribute("other_words", verb.id())
-
-if verb.get_attribute("lemma") and particle.get_attribute("lemma")
-case language
-when "de"
-# German: prepend SVP to get the real lemma of the verb
-verb.set_attribute("lemma",
-particle.get_attribute("lemma") +
-verb.get_attribute("lemma"))
-when "en"
-# English: append particle as separate word after the lemma of the verb
-verb.set_attribute("lemma",
-verb.get_attribute("lemma") + " " +
-particle.get_attribute("lemma"))
-else
-# default
-verb.set_attribute("lemma",
-verb.get_attribute("lemma") + " " +
-particle.get_attribute("lemma"))
-end
-end
-
-else
-raise "Shouldn't be here: unexpected description #{descr}"
-end
-}
-end
-
-########################
-# group_words
-#
-# auxiliary of transform_multiword targets
-#
-# Group terminals:
-# At the moment, just find separate prefixes and particles
-# for verbs
-#
-# returns: list of pairs [descr, nodes]
-# descr: string, "none" (no group), "part" (separate verb particle)
-# nodes: array:SynNode
-def FrprepHelper.group_words(nodes, # array: SynNode
-interpreter) # SynInterpreter object
-
-retv = Array.new # array of groups, array:array:SynNode
-done = Array.new # remember nodes already covered
-
-nodes.each { |terminal_node|
-if done.include? terminal_node
-# we have already included this node in one of the groups
-next
-end
-
-if (svp = interpreter.particle_of_verb(terminal_node, nodes))
-retv << ["part", [terminal_node, svp]]
-done << terminal_node
-done << svp
-else
-retv << ["none", [terminal_node]]
-done << terminal_node
-end
-
-}
-
-return retv
-end
-
-
-######
-# handle unknown framenames
-#
-# For all frames with names matching Unknown\d+,
-# rename them to <lemma>_Unknown\d+
-def FrprepHelper.handle_unknown_framenames(sent, # SalsaTigerSentence
-interpreter) # SynInterpreter class
-if sent.nil?
-return
-end
-
-sent.each_frame { |frame|
-if frame.name() =~ /^Unknown/
-if frame.target
-maintarget = interpreter.main_node_of_expr(frame.target.children(), "no_mwe")
-else
-maintarget = nil
-end
-unless maintarget
-$stderr.puts "Warning: Unknown frame, and I could not determine the target lemma: Frame #{frame.id()}"
-$stderr.puts "Cannot repair frame name, leaving it as is."
-return
-end
-
-# get lemma, if it exists, otherwise get word
-# also, if the lemmatizer has returned a disjunction of lemmas,
-# get the first disjunct
-lemma = interpreter.lemma_backoff(maintarget)
-if lemma
-# we have a lemma
-frame.set_name(lemma + "_" + frame.name())
-else
-# the main target word has no lemma attribute,
-# and somehow I couldn't even get the target word
-$stderr.puts "Warning: Salsa 'Unknown' frame."
-$stderr.puts "Trying to make its lemma-specificity explicit, but"
-$stderr.puts "I could not determine the target lemma nor the target word: frame #{frame.id()}"
-$stderr.puts "Leaving 'Unknown' as it is."
-end
-end
-}
-end
-
-
-#####################
-#
-# Integrate the semantic annotation of an old sentence
-# into the corresponding new sentence
-# At the same time, integrate the lemma information from the
-# old sentence into the new sentence
-def FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent,
-newsent,
-interpreter_class,
-exp)
-if oldsent.nil? or newsent.nil?
-return
-end
-##
-# match old and new sentence via terminals
-newterminals = newsent.terminals_sorted()
-oldterminals = oldsent.terminals_sorted()
-# sanity check: exact match on terminals?
-newterminals.interleave(oldterminals).each { |newnode, oldnode|
-#print "old ", oldnode.word, " ", newnode.word, "\n"
-# new and old word: use both unescaped and escaped variant
-if newnode
-newwords = [ newnode.word, SalsaTigerXMLHelper.escape(newnode.word) ]
-else
-newwords = [nil, nil]
-end
-if oldnode
-oldwords = [ oldnode.word, SalsaTigerXMLHelper.escape(oldnode.word) ]
-else
-oldwords = [ nil, nil]
-end
-
-if (newwords & oldwords).empty?
-# old and new word don't match, either escaped or non-escaped
-
-$stderr.puts "Warning: could not match terminals of sentence #{newsent.id()}"
-$stderr.puts "This means that I cannot match the semantic annotation"
-$stderr.puts "to the newly parsed sentence. Skipping."
-#$stderr.puts "Old sentence: "
-#$stderr.puts oldterminals.map { |n| n.word }.join("--")
-#$stderr.puts "New sentence: "
-#$stderr.puts newterminals.map { |n| n.word }.join("--")
-return false
-end
-}
-
-##
-# copy lemma information
-oldterminals.each_with_index { |oldnode, ix|
-newnode = newterminals[ix]
-if oldnode.get_attribute("lemma")
-newnode.set_attribute("lemma", oldnode.get_attribute("lemma"))
-end
-}
-
-##
-# copy frames
-oldsent.each_frame { |oldframe|
-# make new frame with same ID
-newframe = newsent.add_frame(oldframe.name, oldframe.id())
-# copy FEs
-oldframe.each_child { |oldfe|
-# new nodes: map old terminals to new terminals,
-# then find max constituents covering them
-newnodes = oldfe.descendants.select { |n|
-n.is_terminal?
-}.map { |n|
-oldterminals.index(n)
-}.map { |ix|
-newterminals[ix]
-}
-
-# let the interpreter class decide on how to determine the maximum constituents
-newnodes = interpreter_class.max_constituents(newnodes, newsent)
-
-# make new FE with same ID
-new_fe = newsent.add_fe(newframe, oldfe.name(), newnodes, oldfe.id())
-# keep all attributes of the FE
-if oldfe.get_f("attributes")
-oldfe.get_f("attributes").each_pair { |attr, value|
-new_fe.set_attribute(attr, value)
-}
-end
-}
-}
-
-##
-### changed by ines => appears twice in stxml file
-
-# copy underspecification
-# keep as is, since we've kept all frame and FE IDs
-oldsent.each_usp_frameblock { |olduspframe|
-newuspframe = newsent.add_usp("frame")
-olduspframe.each_child { |oldnode|
-newnode = newsent.sem_node_with_id(oldnode.id())
-if newnode
-newuspframe.add_child(newnode)
-else
-$stderr.puts "Error: unknown frame with ID #{oldnode.id()}"
-end
-}
-}
-oldsent.each_usp_feblock { |olduspfe|
-newuspfe = newsent.add_usp("fe")
-olduspfe.each_child { |oldnode|
-newnode = newsent.sem_node_with_id(oldnode.id())
-if newnode
-newuspfe.add_child(newnode)
-else
-$stderr.puts "Error: unknown FE with ID #{oldnode.id()}"
-end
-}
-}
-
-end
-
-####################
-# add head attributes to each nonterminal in each
-# SalsaTigerXML file in a directory
-
-def FrprepHelper.add_head_attributes(st_sent, # SalsaTigerSentence object
-interpreter) # SynInterpreter class
-st_sent.each_nonterminal {|nt_node|
-head_term = interpreter.head_terminal(nt_node)
-if head_term and head_term.word()
-nt_node.set_attribute("head", head_term.word())
-else
-nt_node.set_attribute("head", "--")
-end
-} # each nonterminal
-end
-
-# add lemma information to each terminal in a given SalsaTigerSentence object
-def FrprepHelper.add_lemmas_from_tab(st_sent, # SalsaTigerSentence object
-tab_sent,# FNTabFormatSentence object
-mapping) # hash: tab lineno -> array:SynNode
-if tab_sent.nil?
-# tab sentence not found
-return
-end
-
-# produce list with word, lemma pairs
-lemmat = Array.new
-tab_sent.each_line_parsed {|line|
-word = line.get("word")
-lemma = line.get("lemma")
-lemmat << [word,lemma]
-}
-
-# match with st_sent terminal list and add lemma attributes
-# KE Jan 07: if word mismatch,
-# set to Lemmatizer file version,
-# but count mismatches
-word_mismatches = Array.new()
-
-st_sent.each_terminal_sorted {|t|
-matching_lineno = (0..lemmat.length()-1).to_a.detect { |tab_lineno|
-mapping[tab_lineno].include? t
-}
-unless matching_lineno
-next
-end
-word, lemma = lemmat[matching_lineno]
-
-# transform characters to XML-friendly form
-# for comparison with st_word, which is also escaped
-word = SalsaTigerXMLHelper.escape(word)
-st_word = t.word()
-if word != st_word and
-word != SalsaTigerXMLHelper.escape(st_word)
-# true mismatch.
-# use the Lemmatizer version of the word, remember the mismatch
-word_mismatches << [st_word, word]
-t.set_attribute("word", word)
-end
-
-if lemma
-# we actually do have lemma information
-lemmatised_head = SalsaTigerXMLHelper.escape(lemma)
-t.set_attribute("lemma",lemmatised_head)
-end
-} # each terminal
-
-# did we have mismatches? then report them
-unless word_mismatches.empty?
-$stderr.puts "Warning: Word mismatches found between Lemmatizer file and SalsaTigerXML file generalted from parser output."
-$stderr.puts "(May be due to failed reencoding of special character in the parser output.)"
-$stderr.puts "I am using the Lemmatizer version by default."
-$stderr.puts "Version used:"
-$stderr.print "\t"
-st_sent.each_terminal_sorted { |t| $stderr.print ">>#{t}<<" }
-$stderr.puts
-$stderr.print "SalsaTigerXML file had: "
-$stderr.print word_mismatches.map { |st_word, tab_word|
-"#{st_word} instead of #{tab_word}"
-}.join(", ")
-$stderr.puts
-end
-end
-
-###################3
-# given a SalsaTigerSentence,
-# look for FrameNet frames that are
-# test frames, and remove them
-def FrprepHelper.remove_deprecated_frames(sent, # SalsaTigerSentence
-exp) # FrprepConfigData
-
-unless exp.get("origin") == "FrameNet"
-return
-end
-
-sent.frames.each { |frame_obj|
-if frame_obj.name() == "Boulder" or
-frame_obj.name() =~ /^Test/
-sent.remove_frame(frame_obj)
-end
-}
-end
-
-end
-
-############################################3
-# Class FrprepFlatSyntax:
-#
-# given a FNTabFormat file,
-# yield each of its sentences in SalsaTigerXML,
-# constructing a flat syntax
-class FrprepFlatSyntax
-def initialize(tabfilename, # string: name of tab file
-postag_suffix, # postag file suffix (or nil)
-lemma_suffix) # lemmatisation file suffix (or nil)
-
-@tabfilename = tabfilename
-@pos_suffix = postag_suffix
-@lemma_suffix = lemma_suffix
-end
-
-# yield each non-parse sentence as a tuple
-# [ salsa/tiger xml sentence, tab format sentence, mapping]
-# of a SalsaTigerSentence object, a FNTabSentence object,
-# and a hash: FNTab sentence lineno(integer) -> array:SynNode
-# pointing each tab word to one or more SalsaTigerSentence terminals
-def each_sentence(dummy)
-
-# read tab file with lemma and POS info
-tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)
-
-tabfile.each_sentence() { |tabsent|
-# start new, empty sentence with "failed" attribute (i.e. no parse)
-# and with the ID of the corresponding TabFormat sentence
-sentid = tabsent.get_sent_id()
-if sentid.nil? or sentid =~ /^-*$/
-$stderr.puts "No sentence ID for sentence:"
-tabsent.each_line_parsed { |l| $stderr.print l.get("word"), " "}
-$stderr.puts
-sentid = Time.new().to_f.to_s
-end
-sent = SalsaTigerSentence.new("<s id=\"#{SalsaTigerXMLHelper.escape(sentid)}\" failed=\"true\"></s>")
-
-# add single nonterminal node, category "S"
-single_nonterminal_id = SalsaTigerXMLHelper.escape(sentid.to_s + "_NT")
-vroot = sent.add_syn("nt", "S", # category
-nil, # word
-nil, # pos
-single_nonterminal_id)
-
-# add terminals
-tabsent.each_line_parsed() { |line_obj|
-# make terminal node with tab sent info
-node_id = sentid.to_s + "_" + line_obj.get("lineno").to_s
-word = line_obj.get("word")
-unless word
-word = ""
-end
-word = SalsaTigerXMLHelper.escape(word)
-pos = line_obj.get("pos")
-unless pos
-pos = ""
-end
-pos = SalsaTigerXMLHelper.escape(pos)
-terminal = sent.add_syn("t", nil, # category
-word, pos,
-node_id)
-
-if line_obj.get("lemma")
-# lemma
-terminal.set_attribute("lemma", SalsaTigerXMLHelper.escape(line_obj.get("lemma")))
-end
-
-# add new terminal as child of vroot
-vroot.add_child(terminal, nil)
-terminal.add_parent(vroot, nil)
-} # each line of tab file
-
-# yield newly constructed SalsaTigerXMl sentence plus tab sentence
-yield [sent, tabsent, SynInterfaceSTXML.standard_mapping(sent, tabsent)]
-}
-end
-end
-
-############################################3
-# Class FrprepReadStxml
-#
-# given a STXML file,
-# yield each of its sentences
-class FrprepReadStxml
-def initialize(stxmlfilename, # string: name of SalsaTigerXML file
-tabfilename, # string: name of corresponding tab file (or nil)
-postag_suffix, # POS tag file suffix (or nil)
-lemma_suffix) # lemmatization file suffix (or nil)
-
-@stxmlfilename = stxmlfilename
-@tabfilename = tabfilename
-@pos_suffix = postag_suffix
-@lemma_suffix = lemma_suffix
-end
-# yield each non-parse sentence as a tuple
-# [ salsa/tiger xml sentence, tab format sentence, mapping]
-# of a SalsaTigerSentence object, a FNTabSentence object,
-# and a hash: FNTab sentence lineno(integer) -> array:SynNode
-# pointing each tab word to one or more SalsaTigerSentence terminals
-def each_sentence(dummy)
-# read corresponding tab file?
-tab_sents = Array.new()
-if File.exists? @tabfilename
-tabfile = FNTabFormatFile.new(@tabfilename,@pos_suffix,@lemma_suffix)
-tabfile.each_sentence { |tabsent|
-tab_sents << tabsent
-}
-end
-
-# read STXML file
-infile = FilePartsParser.new(@stxmlfilename)
-index = 0
-infile.scan_s { |sent_string|
-sent = SalsaTigerSentence.new(sent_string)
-yield [sent, tab_sents.at(index), SynInterfaceSTXML.standard_mapping(sent, tab_sents.at(index))]
-index += 1
-}
-end
-end