shalmaneser 1.2.0.rc4 → 1.2.rc5

Files changed (115)
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,13 +0,0 @@
- # -*- encoding: us-ascii -*-
-
- # AB, 2010-11-25
-
-
- # It is a general class for parsing options.
- # It is now empty; we are implementing three different classes:
- # FRPrepOptionParser, RosyOptionParser and FredOptionParser.
- # All classes above inherit from OptionParser.
- #--
- # TODO: move the functionality to the parent class.
- class OptionParser
- end
@@ -1,62 +0,0 @@
- # FrPrepConfigData
- # Katrin Erk July 05
- #
- # Preprocessing for Fred and Rosy:
- # access to a configuration and experiment description file
-
- require "common/config_data"
-
- ##############################
- # Class FrPrepConfigData
- #
- # inherits from ConfigData,
- # sets variable names appropriate to the preprocessing task
-
- class FrPrepConfigData < ConfigData
-
- CONFIG_DEFS = {"prep_experiment_ID" => "string", # experiment identifier
- "frprep_directory" => "string", # dir for frprep internal data
- # information about the dataset
- "language" => "string", # en, de
- "origin" => "string", # FrameNet, Salsa, or nothing
- "format" => "string", # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
- "encoding" => "string", # utf8, iso, hex, or nothing
-
- # directories
- "directory_input" => "string", # dir with input data
- "directory_preprocessed" => "string", # dir with output Salsa/Tiger XML data
- "directory_parserout" => "string", # dir with parser output for the parser named below
-
- # syntactic processing
- "pos_tagger" => "string", # name of POS tagger
- "lemmatizer" => "string", # name of lemmatizer
- "parser" => "string", # name of parser
- "pos_tagger_path" => "string", # path to POS tagger
- "lemmatizer_path" => "string", # path to lemmatizer
- "parser_path" => "string", # path to parser
- "parser_max_sent_num" => "integer", # max number of sentences per parser input file
- "parser_max_sent_len" => "integer", # max sentence length the parser handles
-
- "do_parse" => "bool", # use parser?
- "do_lemmatize" => "bool", # use lemmatizer?
- "do_postag" => "bool", # use POS tagger?
-
- # output format: if tabformat_output == true,
- # output in Tab format rather than Salsa/Tiger XML
- # (this will not work if do_parse == true)
- "tabformat_output" => "bool",
-
- # syntactic repairs, dependent on existing semantic role annotation
- "fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
- "fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
- }
-
- def initialize(filename)
- # @param filename [String] path to a config file
- # @param CONFIG_DEFS [Hash] a list of configuration definitions
- super(filename, CONFIG_DEFS, [])
- end
- end
-
-
-
@@ -1,1330 +0,0 @@
- # Salsa packages
- require "common/ISO-8859-1"
- require "common/Parser"
- require "common/RegXML"
- require "common/SalsaTigerRegXML"
- require "common/SalsaTigerXMLHelper"
- require "common/TabFormat"
- require "common/ruby_class_extensions"
- require "common/AbstractSynInterface"
-
- ############################################
- # Module FrprepHelper:
- # diverse transformation methods for frprep.rb
- # moved over here to make the main file less crowded
- module FrprepHelper
-
- ####
- # transform a file to UTF-8 from a given encoding
- def FrprepHelper.to_utf8_file(input_filename, # string: name of input file
- output_filename, # string: name of output file
- encoding) # string: "iso", "hex"
- begin
- infile = File.new(input_filename)
- outfile = File.new(output_filename, "w")
- rescue
- raise "Could not read #{input_filename}, or could not write to #{output_filename}."
- end
-
- while (line = infile.gets())
- case encoding
- when "iso"
- outfile.puts UtfIso.from_iso_8859_1(line)
- when "hex"
- outfile.puts UtfIso.from_iso_8859_1(Ampersand.hex_to_iso(line))
- else
- raise "Shouldn't be here."
- end
- end
- infile.close()
- outfile.close()
- end
-
- ####
- # transform BNC format file to plaintext file
- def FrprepHelper.bnc_to_plain_file(input_filename, # string: name of input file
- output_filename) # string: name of output file
- begin
- infile = File.new(input_filename)
- outfile = File.new(output_filename, "w")
- rescue
- raise "Could not read #{input_filename}, or could not write to #{output_filename}."
- end
-
- infile.each { |line|
- # does this line contain a sentence?
- if line =~ /^\s*<s\s+n=/
- # remove all tags, replace by spaces,
- # then remove superfluous spaces
- textline = line.gsub(/<.+?>/, " ").strip().squeeze(" ")
-
-
- textline.gsub!(/&bquo;/, '"')
- textline.gsub!(/&equo;/, '"')
- textline.gsub!(/&mdash;/, "-")
- textline.gsub!(/&ndash;/, "-")
- textline.gsub!(/&percnt;/, "%")
- textline.gsub!(/&pound;/, " pounds ")
- textline.gsub!(/&amp;/, " and ")
- textline.gsub!(/&hellip;/, "...")
- textline.gsub!(/&copy;/, "(copyright)")
- textline.gsub!(/&eacute;/, "e")
- textline.gsub!(/&bull;/, "*")
- textline.gsub!(/&dollar;/, "$")
- textline.gsub!(/&deg;/, " degree ")
-
- textline.gsub!(/&frac12;/, "1/2")
- textline.gsub!(/&frac34;/, "3/4")
-
- textline.gsub!(/&lsqb;/, "[")
- textline.gsub!(/&rsqb;/, "]")
-
- textline.gsub!(/&ins;/, "i")
- textline.gsub!(/&ft;/, "ft")
-
- textline.gsub!(/&rarr;/, ">")
- textline.gsub!(/&larr;/, "<")
-
-
- textline.gsub!(/&aacute;/, "a")
- textline.gsub!(/&auml;/, "a")
- textline.gsub!(/&agrave;/, "a")
- textline.gsub!(/&atilde;/, "a")
- textline.gsub!(/&acirc;/, "a")
- textline.gsub!(/&Aacute;/, "A")
- textline.gsub!(/&Auml;/, "A")
- textline.gsub!(/&Agrave;/, "A")
- textline.gsub!(/&Atilde;/, "A")
- textline.gsub!(/&Acirc;/, "A")
-
- textline.gsub!(/&eacute;/, "e")
- textline.gsub!(/&egrave;/, "e")
- textline.gsub!(/&ecirc;/, "e")
- textline.gsub!(/&euml;/, "e")
- textline.gsub!(/&Eacute;/, "E")
- textline.gsub!(/&Egrave;/, "E")
- textline.gsub!(/&Ecirc;/, "E")
- textline.gsub!(/&Euml;/, "E")
-
- textline.gsub!(/&iacute;/, "i")
- textline.gsub!(/&igrave;/, "i")
- textline.gsub!(/&icirc;/, "i")
- textline.gsub!(/&iuml;/, "i")
- textline.gsub!(/&Iacute;/, "I")
- textline.gsub!(/&Igrave;/, "I")
- textline.gsub!(/&Icirc;/, "I")
-
- textline.gsub!(/&oacute;/, "o")
- textline.gsub!(/&ograve;/, "o")
- textline.gsub!(/&ocirc;/, "o")
- textline.gsub!(/&ouml;/, "o")
- textline.gsub!(/&Oacute;/, "O")
- textline.gsub!(/&Ograve;/, "O")
- textline.gsub!(/&Ocirc;/, "O")
- textline.gsub!(/&Ouml;/, "O")
-
- textline.gsub!(/&uacute;/, "u")
- textline.gsub!(/&ugrave;/, "u")
- textline.gsub!(/&ucirc;/, "u")
- textline.gsub!(/&uuml;/, "u")
- textline.gsub!(/&Uacute;/, "U")
- textline.gsub!(/&Ugrave;/, "U")
- textline.gsub!(/&Ucirc;/, "U")
- textline.gsub!(/&Uuml;/, "U")
-
- textline.gsub!(/&yuml;/, "y")
- textline.gsub!(/&Yuml;/, "Y")
-
- textline.gsub!(/&ntilde;/, "n")
- textline.gsub!(/&Ntilde;/, "N")
-
- textline.gsub!(/&ccedil;/, "c")
- textline.gsub!(/&Ccedil;/, "C")
-
-
- outfile.puts textline
- end
- }
- infile.close()
- outfile.close()
- end
-
-
- ####
- # transform plaintext file to Tab format file
- def FrprepHelper.plain_to_tab_file(input_filename, # string: name of input file
- output_filename) # string: name of output file
- begin
- infile = File.new(input_filename)
- outfile = File.new(output_filename, "w")
- rescue
- raise "Could not read #{input_filename}, or could not write to #{output_filename}."
- end
-
- # AB: TODO This assumes all input files have the extension <txt>.
- # Is it good?
- filename_core = File.basename(input_filename, 'txt')
-
- # array(string): keep the words of each sentence
- sentence = []
- # sentence number for making the sentence ID:
- # global count, over all input files
- sentno = 0
-
- while line = infile.gets
-
- # make a sentence ID for the next sentence: running number
- sentid = "#{filename_core}_#{sentno}"
- sentno += 1
-
- # read words into the sentence array,
- # separating out punctuation attached to the beginning or end of words
- sentence.clear
-
- # AB: TODO Remove this naive tokenizer; better to have fully
- # tokenized input from an external tokenizer.
- line.split.each { |word|
- # punctuation at the beginning of the word
- #if word =~ /^([\(\[`'\"-]+)(.*)$/
- if word =~ /^([\(\[`\"-]+)(.*)$/
- punct = $1
- word = $2
- punct.scan(/./) { |single_punct|
- sentence << single_punct
- }
-
- end
- # punctuation at the end of the word
- #if word =~ /[,:;-\`?!'\"\.\)\]]+$/
- if word =~ /[,:;-\`?!\"\.\)\]]+$/
- sentence << $` # part before the match: the word
- punct = $&
- punct.scan(/./) { |single_punct|
- sentence << single_punct
- }
-
- else
- # no punctuation recognized
- sentence << word
- end
- }
-
-
-
- # remove empty words
- # AB: TODO Is it possible? Remove this.
- sentence.reject! { |word| word.nil? or word.strip.empty? }
-
- # write words to tab file
- # KE Dec 06: TabFormat changed
- sentence.each { |word|
- # for each word, one line, entries in the line tab-separated
- # the 'word' entry is the word, the 'lu_sent_ids' entry is the sentence ID sentid,
- # all other entries (gf, pt, frame etc.) are not set
- outfile.puts FNTabFormatFile.format_str({
- "word" => word,
- "sent_id" => sentid
- })
- }
- outfile.puts
- end
- outfile.close
- end
-
- ###########
- #
- # class method split_dir:
- # read all files in one directory and produce chunk files with _suffix_ in outdir
- # with a certain number of sentences in them (sent_num).
- # Optionally, remove all sentences longer than sent_leng
- #
- # produces output files 0.<suffix>, 1.<suffix>, etc.
- #
- # assumes TabFormat sentences
- #
- # example: split_dir("/tmp/in","/tmp/out",".tab",2000,80)
-
- def FrprepHelper.split_dir(indir,
- outdir,
- suffix,
- sent_num,
- sent_leng=nil)
-
- unless indir[-1,1] == "/"
- indir += "/"
- end
- unless outdir[-1,1] == "/"
- outdir += "/"
- end
-
- outfile_counter = 0
- line_stack = Array.new
- sent_stack = Array.new
-
- Dir[indir+"*#{suffix}"].each {|infilename|
- STDERR.puts "Now splitting #{infilename}"
- infile = File.new(infilename)
-
- while line = infile.gets
- line.chomp!
- case line
- when "" # end of sentence
- if !(sent_leng.nil? or line_stack.length < sent_leng) # record sentence
- # suppress multiple empty lines
- # to avoid problems with lemmatiser
- # only record sent_stack if it is not empty.
-
- # change (sp 15 01 07): just cut off sentence at sent_leng.
-
- STDERR.puts "Cutting off long sentence #{line_stack.last.split("\t").last}"
- line_stack = line_stack[0..sent_leng-1]
- end
- unless line_stack.empty?
- sent_stack << line_stack
- # reset line_stack
- line_stack = Array.new
- end
-
-
- # check if we have to empty the sent stack
- if sent_stack.length == sent_num # enough sentences for new outfile?
- outfile = File.new(outdir+outfile_counter.to_s+"#{suffix}","w")
- sent_stack.each {|l_stack|
- outfile.puts l_stack.join("\n")
- outfile.puts
- }
- outfile.close
- outfile_counter += 1
- sent_stack = Array.new
- end
-
- else # for any other line
- line_stack << line
- end
- end
- infile.close
- }
- # the last remaining sentences
- unless sent_stack.empty?
- outfile = File.new(outdir+outfile_counter.to_s+"#{suffix}","w")
- sent_stack.each {|l_stack|
- l_stack << "\n"
- outfile.puts l_stack.join("\n")
- }
- outfile.close
- end
- end
-
- ####
- # note salsa targetlemma
- #
- # old_dir contains xml files whose name starts with the
- # target lemma for all frames in the file
- # record that target lemma in the <target> element of each frame
- def FrprepHelper.note_salsa_targetlemma(old_dir, # string ending in /
- new_dir) # string ending in /
-
-
- # each input file: extract target lemma from filename,
- # note this lemma in the <target> element of each frame
- Dir[old_dir + "*.xml"].each { |filename|
- changedfilename = new_dir + File.basename(filename)
-
- if File.basename(filename) =~ /^(.*?)[_\.]/
- lemma = $1
-
- infile = FilePartsParser.new(filename)
- outfile = File.new(changedfilename, "w")
-
- # write header
- outfile.puts infile.head()
-
- # iterate through sentences, yield as SalsaTigerSentence objects
- infile.scan_s() { |sent_string|
- sent = SalsaTigerSentence.new(sent_string)
- sent.each_frame { |frame|
- frame.target.set_attribute("lemma", lemma)
- }
-
- # write changed sentence
- outfile.puts sent.get()
- } # each sentence
-
- # write footer
- outfile.puts infile.tail()
- infile.close()
- outfile.close()
-
- else
- # couldn't determine lemma
- # just copy the file
- `cp #{filename} #{changedfilename}`
- end
- }
- end
-
- ####
- # stxml_split_dir
- #
- # split SalsaTigerXML files into new files of given length,
- # skipping sentences that are too long
- #
- # At the same time, sentences that occur several times (i.e. sentences which are
- # annotated by SALSA for more than one predicate) are compacted into one occurrence
- # with combined semantics.
- #
- # assumes that all files in input_dir with
- # extension .xml are SalsaTigerXML files
- def FrprepHelper.stxml_split_dir(input_dir, # string: input directory with STXML files
- split_dir, # string: output directory
- max_sentnum, # integer: max num of sentences per file
- max_sentlen) # integer: max num of terminals per sentence
-
- filenames = Dir[input_dir+"*.xml"].to_a
-
- graph_hash = Hash.new # for each sentence id, keep <s...</graph>
- frame_hash = Hash.new # for each sentence id, keep the <frame... </frame> string
- uspfes_hash = Hash.new # for each sentence id, keep the uspfes stuff
- uspframes_hash = Hash.new # for each sentence id, keep the uspframes stuff
-
- ########################
- # Traversal of the file(s): compute an index of all frames for each sentence, with unique identifiers
-
- filenames.each {|filename|
-
- infile = FilePartsParser.new(filename)
- infile.scan_s {|sent_str|
-
- sentlen = 0
- sent_str.delete("\n").scan(/<t\s/) { |occ| sentlen += 1}
- if sentlen > max_sentlen
- sent = RegXML.new(sent_str)
- # revisit handling of long sentences
- # $stderr.puts "I would have skipped overly long sentence " + sent.attributes["id"]+" but Sebastian forbade me.".to_s
- # next
- end
-
- # substitute old frame identifiers with new, unique ones
-
- # problem: we may have several frames per sentence, and need to keep track of them
- # if we rename e.g. sxx_f1 to sxx_f2 and there is already an sxx_f2, then
- # we cannot distinguish between these frames
-
- # therefore, we substitute temporary identifiers until we have substituted
- # all ids with temporary ones, and re-substitute final ones at the end.
-
- this_frames = Array.new
-
- temp_subs = Array.new
- final_subs = Array.new
-
- sent = RegXML.new(sent_str)
- sentid = sent.attributes["id"].to_s
- if sentid.nil?
- STDERR.puts "[frprep] Warning: cannot find sentence id, skipping sentence:"
- STDERR.puts sent_str
- # strange sentence, no ID? skip
- next
- end
-
- unless frame_hash.key? sentid
- frame_hash[sentid] = Array.new
- uspfes_hash[sentid] = Array.new
- uspframes_hash[sentid] = Array.new
- end
-
- # find everything up to and including the graph
- sent_children = sent.children_and_text()
- graph = sent_children.detect { |child| child.name == "graph" }
- graph_hash[sentid] = "<s " +
- sent.attributes.to_a.map { |at, val| "#{at}=\'#{val}\'" }.join(" ") +
- ">" +
- graph.to_s
-
- # find the usp block
-
- sem = sent_children.detect { |child| child.name == "sem"}
- usp = ""
- if sem
- usp = sem.children_and_text.detect { |child| child.name == "usp" }
- usp = usp.to_s
- end
-
- # find all frames
- if sem
- frames = sem.children_and_text.detect { |child| child.name == "frames" }
- if frames
- frames.children_and_text.each { |frame|
- unless frame.name == "frame"
- next
- end
- frameid = frame.attributes["id"]
-
- temp_frameid = "#{sentid}_temp_f#{frame_hash[sentid].length+this_frames.length+1}"
- final_frameid = "#{sentid}_f#{frame_hash[sentid].length+this_frames.length+1}"
-
- temp_subs << [frameid,temp_frameid]
- final_subs << [temp_frameid,final_frameid]
-
- this_frames << frame.to_s
- }
- end
- end
-
- # now first rename all the frames to temporary names
-
- temp_subs.each {|orig_frameid, temp_frameid|
- this_frames.map! {|frame_str|
- #print "orig ", orig_frameid, " temp ", temp_frameid, "\n"
- frame_str.gsub(orig_frameid,temp_frameid)
- }
-
- usp.gsub!(orig_frameid,temp_frameid)
- }
-
- # and re-rename the temporary names
-
- final_subs.each {|temp_frameid, final_frameid|
- this_frames.map! {|frame_str|
- frame_str.gsub(temp_frameid,final_frameid)
- }
- usp.gsub!(temp_frameid, final_frameid)
- }
-
- # store frames in data structure
- this_frames.each {|frame_str|
- frame_hash[sentid] << frame_str
- }
-
- # store uspfes in data structure
- unless usp.empty?
- usp_elt = RegXML.new(usp)
- uspfes = usp_elt.children_and_text.detect { |child| child.name == "uspfes" }
- uspfes.children_and_text.each { |child|
- unless child.name == "uspblock"
- next
- end
- uspfes_hash[sentid] << child.to_s
- }
-
- # store uspframes in data structure
- uspframes = usp_elt.children_and_text.detect { |child| child.name == "uspframes" }
- uspframes.children_and_text.each { |child|
- unless child.name == "uspblock"
- next
- end
- uspframes_hash[sentid] << child.to_s
- }
- end
- }
- }
-
- # now write everything in the data structure back to a file
-
- filecounter = 0
- sentcounter = 0
- outfile = nil
- sent_stack = Array.new
-
- graph_hash.sort {|a,b| a[0].to_i <=> b[0].to_i}.each {|sentid,graph_str|
-
- if sentcounter == max_sentnum
- outfile.puts SalsaTigerXMLHelper.get_footer
- outfile.close
- outfile = nil
- end
-
- unless outfile
- outfile = File.new(split_dir+filecounter.to_s+".xml","w")
- outfile.puts SalsaTigerXMLHelper.get_header
- filecounter += 1
- sentcounter = 0
- end
-
- xml = Array.new
- xml << graph_str
- xml << "<sem>"
- xml << "<globals>"
- xml << "</globals>"
- xml << "<frames>"
- frame_hash[sentid].each {|frame_str|
- xml << frame_str
- }
- xml << "</frames>"
- xml << "<usp>"
- xml << "<uspframes>"
- uspframes_hash[sentid].each {|uspblock_str|
- xml << uspblock_str
- }
- xml << "</uspframes>"
- xml << "<uspfes>"
- uspfes_hash[sentid].each {|uspblock_str|
- xml << uspblock_str
- }
- xml << "</uspfes>"
- xml << "</usp>"
- xml << "</sem>"
- xml << "</s>"
-
- outfile.puts xml.join("\n")
- sentcounter += 1
- }
-
- if outfile
- outfile.puts SalsaTigerXMLHelper.get_footer
- outfile.close
- outfile = nil
- end
-
- end
-
-
- ####
- # transform SalsaTigerXML file to Tab format file
- def FrprepHelper.stxml_to_tab_file(input_filename, # string: name of input file
- output_filename, # string: name of output file
- exp) # FrprepConfigData
- infile = FilePartsParser.new(input_filename)
- begin
- outfile = File.new(output_filename,"w")
- rescue
- raise "Stxml to tab: could not write to tab file #{output_filename}"
- end
-
- infile.scan_s {|sent_string|
-
- # determine sentence ID
- sentid = RegXML.new(sent_string).attributes["id"]
- unless sentid
- $stderr.puts "No sentence ID in sentence:\n "+ sent_string
- $stderr.puts "Making a new one up."
- sentid = Time.new().to_f.to_s
- end
-
- # find terminals and process them
- unless sent_string.delete("\n") =~ /<terminals[ >].+<\/terminals>/
- $stderr.puts "Warning: could not find terminals in sentence:"
- $stderr.puts sent_string
- $stderr.puts "Skipping"
- next
- end
-
- # modified by ines, 27/08/08
- # for Berkeley => convert ( ) to -LRB- -RRB-
-
- text = $&
- if exp.get("parser") == "berkeley"
- text.gsub!(/word='\('/, "word='*LRB*'")
- text.gsub!(/word='\)'/, "word='*RRB*'")
- text.gsub!(/word=['"]``['"]/, "word='\"'")
- text.gsub!(/word=['"]''['"]/, "word='\"'")
- text.gsub!(/word=['"]\&apos;\&apos;['"]/, "word='\"'")
- #text.gsub!(/word=['"]\(['"]/, "word='-LRB-'")
- #text.gsub!(/word=['"]\)['"]/, "word='-RRB-'")
-
- end
- terminals = text
- #terminals = sent_string
- terminals = RegXML.new(terminals)
- terminals.children_and_text.each { |terminal|
-
- unless terminal.name == "t"
- # not a terminal after all
- next
- end
-
-
- outfile.puts FNTabFormatFile.format_str({
- "word" => SalsaTigerXMLHelper.unescape(terminal.attributes["word"]),
- "sent_id" => sentid
- })
- } # each terminal
- outfile.puts
- } # each sentence
- outfile.close
- end
-
- ###
- # add semantics from tab:
- #
- # add information about semantics from a FN tab sentence
- # to a SalsaTigerSentence object:
- # - frames (one frame per sentence)
- # - roles
- # - FrameNet grammatical functions
- # - FrameNet POS of target
- def FrprepHelper.add_semantics_from_tab(st_sent, # SalsaTigerSentence object
- tab_sent, # FNTabFormatSentence object
- mapping, # hash: tab lineno -> array:SynNode
- interpreter_class, # SynInterpreter class
- exp) # FrprepConfigData
-
- if tab_sent.nil?
- # tab sentence not found
- return
- end
-
- # iterate through frames in the tabsent
- frame_index = 0
- tab_sent.each_frame { |tab_frame_obj|
- frame_name = tab_frame_obj.get_frame() # string
-
- if frame_name.nil? or frame_name =~ /^-*$/
- # weird: a frame entry without a frame name
- $stderr.puts "Warning: frame entry without a frame in tab sentence #{st_sent.id}."
- $stderr.puts "Skipping"
- next
- end
-
- frame_node = st_sent.add_frame(frame_name, tab_sent.get_sent_id() + "_f#{frame_index}")
- frame_index += 1
-
- # target
- target_nodes = Array.new
- tab_frame_obj.get_target_indices.each {|terminal_id|
- if mapping[terminal_id]
- target_nodes.concat mapping[terminal_id]
- end
- }
-
- # let the interpreter class decide on how to determine the maximum constituents
- target_maxnodes = interpreter_class.max_constituents(target_nodes, st_sent)
- if target_maxnodes.empty?
- # HIEr
- STDERR.puts "Warning: no target in frame entry, sentence #{st_sent.id}."
- $stderr.puts "frame is #{frame_name}, frame no #{frame_index}"
- $stderr.puts "Skipping."
- $stderr.puts "target indices: " + tab_frame_obj.get_target_indices.join(", ")
- #tab_sent.each_line { |line|
- # $stderr.puts line
- # $stderr.puts "--"
- #}
- next
- end
- frame_node.add_fe("target",target_maxnodes)
-
- # set features on target: target lemma, target POS
- target_lemma = tab_frame_obj.get_target()
- target_pos = nil
- if target_lemma
- if exp.get("origin") == "FrameNet"
- # FrameNet data: here the lemma in the tab file has the form
- # <lemma>.<POS>
- # separate the two
- if target_lemma =~ /^(.*)\.(.*)$/
- target_lemma = $1
- target_pos = $2
- end
- end
- frame_node.target.set_attribute("lemma", target_lemma)
- if target_pos
- frame_node.target.set_attribute("pos", target_pos)
- end
- end
-
- # roles, GF, PT
- # synnode_markable_label:
- # hash "role" | "gf" | "pt" -> SynNode -> array: label(string)
- layer_synnode_label = Hash.new
- ["gf", "pt", "role"].each {|layer|
- termids2labels = tab_frame_obj.markables(layer)
-
- unless layer_synnode_label[layer]
- layer_synnode_label[layer] = Hash.new
- end
-
- termids2labels.each {|terminal_indices, label|
- terminal_indices.each { |t_i|
-
- if (nodes = mapping[t_i])
-
- nodes.each { |node|
- unless layer_synnode_label[layer][node]
- layer_synnode_label[layer][node] = Array.new
- end
-
- layer_synnode_label[layer][node] << label
- } # each node that t_i maps to
- end # if t_i maps to anything
-
- } # each terminal index
- } # each mapping terminal indices -> label
- } # each layer
-
- # 'stuff' (Support and other things)
- layer_synnode_label["stuff"] = Hash.new
- tab_frame_obj.each_line_parsed { |line_obj|
- if (label = line_obj.get("stuff")) != "-"
- if (nodes = mapping[line_obj.get("lineno")])
- nodes.each { |node|
- unless layer_synnode_label["stuff"][node]
- layer_synnode_label["stuff"][node] = Array.new
- end
- layer_synnode_label["stuff"][node] << label
- }
- end
- end
- }
-
- # reencode:
- # hash role_label(string) -> array of tuples [synnodes, gflabels, ptlabels]
- # synnodes: array:SynNode. gflabels, ptlabels: array:String
- #
- # note that in this step, any gf or pt labels that have been
- # assigned to a SynNode that has not also been assigned a role
- # will be lost
- role2nodes_labels = Hash.new
- layer_synnode_label["role"].each_pair { |synnode, labels|
- labels.each { |rolelabel|
- unless role2nodes_labels[rolelabel]
- role2nodes_labels[rolelabel] = Array.new
- end
-
- role2nodes_labels[rolelabel] << [
- synnode,
- layer_synnode_label["gf"][synnode],
- layer_synnode_label["pt"][synnode]
- ]
- } # each role label
- } # each pair SynNode/role labels
-
- # reencode "stuff", but only the support cases
- role2nodes_labels["Support"] = Array.new()
-
- layer_synnode_label["stuff"].each_pair { |synnode, labels|
- labels.each { |stufflabel|
- if stufflabel =~ /Supp/
- # some sort of support
- role2nodes_labels["Support"] << [synnode, nil, nil]
- end
- }
- }
-
- ##
- # each role label:
- # make FeNode for the current frame
- role2nodes_labels.each_pair { |rolelabel, node_gf_pt|
-
- # get list of syn nodes, GF and PT labels for this role
- # shortcut for GF and PT labels: take any labels that have
- # been assigned for _some_ Synnode of this role
- synnodes = node_gf_pt.map { |ngp| ngp[0] }
- gflabels = node_gf_pt.map { |ngp| ngp[1] }.compact.flatten.uniq
- ptlabels = node_gf_pt.map { |ngp| ngp[2] }.compact.flatten.uniq
-
-
- # let the interpreter class decide on how to
- # determine the maximum constituents
- maxnodes = interpreter_class.max_constituents(synnodes, st_sent)
-
- fe_node = st_sent.add_fe(frame_node, rolelabel, maxnodes)
- unless gflabels.empty?
- fe_node.set_attribute("gf", gflabels.join(","))
- end
- unless ptlabels.empty?
- fe_node.set_attribute("pt", ptlabels.join(","))
- end
- } # each role label
- } # each frame
- end
-
-
- ######
- # handle multiword targets:
- # if you find a verb with a separate prefix,
- # change the verb's lemma information accordingly
- # and add an attribute "other_words" to the verb node
- # pointing to the other node
- #
- # In general, it will be assumed that "other_words" contains
- # a list of node IDs for other nodes belonging to the same
- # group, node IDs separated by spaces, and that
- # each node of a group has the "other_words" attribute.
- #
- def FrprepHelper.handle_multiword_targets(sent, # SalsaTigerSentence object
- interpreter, # SynInterpreter object
- language) # string: en, de
- ##
- # only retain the interesting words of the sentence:
- # content words and prepositions
- if sent.nil?
- return
- end
-
- nodes = sent.terminals.select { |node|
- [
- "adj", "adv", "card", "noun", "part", "prep", "verb"
- ].include? interpreter.category(node)
- }
-
- ##
- # group:
- # group verbs with their separate particles
- # (at a later point, other types of grouping can be inserted here)
- groups = FrprepHelper.group_words(nodes, interpreter)
-
- ##
- # record grouping information as attributes on the terminals.
- groups.each { |descr, group_of_nodes|
- case descr
- when "none"
- # no grouping
- when "part"
- # separate particle belonging to a verb
-
- # group_of_nodes is a pair [verb, particle]
- verb, particle = group_of_nodes
-
- verb.set_attribute("other_words", particle.id())
- particle.set_attribute("other_words", verb.id())
-
- if verb.get_attribute("lemma") and particle.get_attribute("lemma")
- case language
- when "de"
- # German: prepend SVP to get the real lemma of the verb
- verb.set_attribute("lemma",
- particle.get_attribute("lemma") +
- verb.get_attribute("lemma"))
- when "en"
- # English: append particle as separate word after the lemma of the verb
- verb.set_attribute("lemma",
- verb.get_attribute("lemma") + " " +
- particle.get_attribute("lemma"))
- else
- # default
- verb.set_attribute("lemma",
- verb.get_attribute("lemma") + " " +
- particle.get_attribute("lemma"))
- end
- end
-
- else
- raise "Shouldn't be here: unexpected description #{descr}"
- end
- }
- end
-
- ########################
- # group_words
- #
- # auxiliary of handle_multiword_targets
- #
- # Group terminals:
- # At the moment, just find separate prefixes and particles
- # for verbs
- #
- # returns: list of pairs [descr, nodes]
- # descr: string, "none" (no group), "part" (separate verb particle)
- # nodes: array:SynNode
- def FrprepHelper.group_words(nodes, # array: SynNode
- interpreter) # SynInterpreter object
-
- retv = Array.new # array of groups, array:array:SynNode
- done = Array.new # remember nodes already covered
-
- nodes.each { |terminal_node|
- if done.include? terminal_node
- # we have already included this node in one of the groups
- next
- end
-
- if (svp = interpreter.particle_of_verb(terminal_node, nodes))
- retv << ["part", [terminal_node, svp]]
- done << terminal_node
- done << svp
- else
- retv << ["none", [terminal_node]]
- done << terminal_node
- end
-
- }
-
- return retv
- end
-
-
- ######
- # handle unknown framenames
- #
- # For all frames with names matching Unknown\d+,
- # rename them to <lemma>_Unknown\d+
- def FrprepHelper.handle_unknown_framenames(sent, # SalsaTigerSentence
- interpreter) # SynInterpreter class
- if sent.nil?
- return
- end
-
- sent.each_frame { |frame|
- if frame.name() =~ /^Unknown/
- if frame.target
- maintarget = interpreter.main_node_of_expr(frame.target.children(), "no_mwe")
- else
- maintarget = nil
- end
- unless maintarget
- $stderr.puts "Warning: Unknown frame, and I could not determine the target lemma: Frame #{frame.id()}"
- $stderr.puts "Cannot repair frame name, leaving it as is."
- return
- end
-
- # get lemma, if it exists, otherwise get word
- # also, if the lemmatizer has returned a disjunction of lemmas,
- # get the first disjunct
- lemma = interpreter.lemma_backoff(maintarget)
- if lemma
- # we have a lemma
- frame.set_name(lemma + "_" + frame.name())
- else
- # the main target word has no lemma attribute,
- # and somehow I couldn't even get the target word
- $stderr.puts "Warning: Salsa 'Unknown' frame."
- $stderr.puts "Trying to make its lemma-specificity explicit, but"
- $stderr.puts "I could not determine the target lemma nor the target word: frame #{frame.id()}"
- $stderr.puts "Leaving 'Unknown' as it is."
- end
- end
- }
- end
-
-
- #####################
- #
- # Integrate the semantic annotation of an old sentence
- # into the corresponding new sentence
- # At the same time, integrate the lemma information from the
- # old sentence into the new sentence
- def FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent,
- newsent,
- interpreter_class,
- exp)
- if oldsent.nil? or newsent.nil?
- return
- end
- ##
- # match old and new sentence via terminals
- newterminals = newsent.terminals_sorted()
- oldterminals = oldsent.terminals_sorted()
- # sanity check: exact match on terminals?
- newterminals.interleave(oldterminals).each { |newnode, oldnode|
- #print "old ", oldnode.word, " ", newnode.word, "\n"
- # new and old word: use both unescaped and escaped variant
- if newnode
- newwords = [ newnode.word, SalsaTigerXMLHelper.escape(newnode.word) ]
- else
- newwords = [nil, nil]
- end
- if oldnode
- oldwords = [ oldnode.word, SalsaTigerXMLHelper.escape(oldnode.word) ]
- else
- oldwords = [nil, nil]
- end
-
- if (newwords & oldwords).empty?
- # old and new word don't match, either escaped or non-escaped
-
- $stderr.puts "Warning: could not match terminals of sentence #{newsent.id()}"
- $stderr.puts "This means that I cannot match the semantic annotation"
- $stderr.puts "to the newly parsed sentence. Skipping."
- #$stderr.puts "Old sentence: "
- #$stderr.puts oldterminals.map { |n| n.word }.join("--")
- #$stderr.puts "New sentence: "
- #$stderr.puts newterminals.map { |n| n.word }.join("--")
- return false
- end
- }
-
- ##
- # copy lemma information
- oldterminals.each_with_index { |oldnode, ix|
- newnode = newterminals[ix]
- if oldnode.get_attribute("lemma")
- newnode.set_attribute("lemma", oldnode.get_attribute("lemma"))
- end
- }
-
- ##
- # copy frames
- oldsent.each_frame { |oldframe|
- # make new frame with same ID
- newframe = newsent.add_frame(oldframe.name, oldframe.id())
- # copy FEs
- oldframe.each_child { |oldfe|
- # new nodes: map old terminals to new terminals,
- # then find max constituents covering them
- newnodes = oldfe.descendants.select { |n|
- n.is_terminal?
- }.map { |n|
- oldterminals.index(n)
- }.map { |ix|
- newterminals[ix]
- }
-
- # let the interpreter class decide on how to determine the maximum constituents
- newnodes = interpreter_class.max_constituents(newnodes, newsent)
-
- # make new FE with same ID
- new_fe = newsent.add_fe(newframe, oldfe.name(), newnodes, oldfe.id())
- # keep all attributes of the FE
- if oldfe.get_f("attributes")
- oldfe.get_f("attributes").each_pair { |attr, value|
- new_fe.set_attribute(attr, value)
- }
- end
- }
- }
-
- ##
- ### changed by ines => appears twice in stxml file
-
- # copy underspecification
- # keep as is, since we've kept all frame and FE IDs
- oldsent.each_usp_frameblock { |olduspframe|
- newuspframe = newsent.add_usp("frame")
- olduspframe.each_child { |oldnode|
- newnode = newsent.sem_node_with_id(oldnode.id())
- if newnode
- newuspframe.add_child(newnode)
- else
- $stderr.puts "Error: unknown frame with ID #{oldnode.id()}"
- end
- }
- }
- oldsent.each_usp_feblock { |olduspfe|
- newuspfe = newsent.add_usp("fe")
- olduspfe.each_child { |oldnode|
- newnode = newsent.sem_node_with_id(oldnode.id())
- if newnode
- newuspfe.add_child(newnode)
- else
- $stderr.puts "Error: unknown FE with ID #{oldnode.id()}"
- end
- }
- }
-
- end
-
- ####################
- # add head attributes to each nonterminal in each
- # SalsaTigerXML file in a directory
-
- def FrprepHelper.add_head_attributes(st_sent, # SalsaTigerSentence object
- interpreter) # SynInterpreter class
- st_sent.each_nonterminal {|nt_node|
- head_term = interpreter.head_terminal(nt_node)
- if head_term and head_term.word()
- nt_node.set_attribute("head", head_term.word())
- else
- nt_node.set_attribute("head", "--")
- end
- } # each nonterminal
- end
-
- # add lemma information to each terminal in a given SalsaTigerSentence object
- def FrprepHelper.add_lemmas_from_tab(st_sent, # SalsaTigerSentence object
- tab_sent, # FNTabFormatSentence object
- mapping) # hash: tab lineno -> array:SynNode
- if tab_sent.nil?
- # tab sentence not found
- return
- end
-
- # produce list with word, lemma pairs
- lemmat = Array.new
- tab_sent.each_line_parsed {|line|
- word = line.get("word")
- lemma = line.get("lemma")
- lemmat << [word,lemma]
- }
-
- # match with st_sent terminal list and add lemma attributes
- # KE Jan 07: if word mismatch,
- # set to Lemmatizer file version,
- # but count mismatches
- word_mismatches = Array.new()
-
- st_sent.each_terminal_sorted {|t|
- matching_lineno = (0..lemmat.length()-1).to_a.detect { |tab_lineno|
- mapping[tab_lineno].include? t
- }
- unless matching_lineno
- next
- end
- word, lemma = lemmat[matching_lineno]
-
- # transform characters to XML-friendly form
- # for comparison with st_word, which is also escaped
- word = SalsaTigerXMLHelper.escape(word)
- st_word = t.word()
- if word != st_word and
- word != SalsaTigerXMLHelper.escape(st_word)
- # true mismatch.
- # use the Lemmatizer version of the word, remember the mismatch
- word_mismatches << [st_word, word]
- t.set_attribute("word", word)
- end
-
- if lemma
- # we actually do have lemma information
- lemmatised_head = SalsaTigerXMLHelper.escape(lemma)
- t.set_attribute("lemma",lemmatised_head)
- end
- } # each terminal
-
- # did we have mismatches? then report them
- unless word_mismatches.empty?
- $stderr.puts "Warning: Word mismatches found between Lemmatizer file and SalsaTigerXML file generated from parser output."
- $stderr.puts "(May be due to failed reencoding of special character in the parser output.)"
- $stderr.puts "I am using the Lemmatizer version by default."
- $stderr.puts "Version used:"
- $stderr.print "\t"
- st_sent.each_terminal_sorted { |t| $stderr.print ">>#{t}<<" }
- $stderr.puts
- $stderr.print "SalsaTigerXML file had: "
- $stderr.print word_mismatches.map { |st_word, tab_word|
- "#{st_word} instead of #{tab_word}"
- }.join(", ")
- $stderr.puts
- end
- end
-
- ###################
- # given a SalsaTigerSentence,
- # look for FrameNet frames that are
- # test frames, and remove them
- def FrprepHelper.remove_deprecated_frames(sent, # SalsaTigerSentence
- exp) # FrprepConfigData
-
- unless exp.get("origin") == "FrameNet"
- return
- end
-
- sent.frames.each { |frame_obj|
- if frame_obj.name() == "Boulder" or
- frame_obj.name() =~ /^Test/
- sent.remove_frame(frame_obj)
- end
- }
- end
-
- end
-
- ############################################
- # Class FrprepFlatSyntax:
- #
- # given a FNTabFormat file,
- # yield each of its sentences in SalsaTigerXML,
- # constructing a flat syntax
- class FrprepFlatSyntax
- def initialize(tabfilename, # string: name of tab file
- postag_suffix, # postag file suffix (or nil)
- lemma_suffix) # lemmatisation file suffix (or nil)
-
- @tabfilename = tabfilename
- @pos_suffix = postag_suffix
- @lemma_suffix = lemma_suffix
- end
-
- # yield each non-parse sentence as a tuple
- # [ salsa/tiger xml sentence, tab format sentence, mapping]
- # of a SalsaTigerSentence object, a FNTabSentence object,
- # and a hash: FNTab sentence lineno(integer) -> array:SynNode
- # pointing each tab word to one or more SalsaTigerSentence terminals
- def each_sentence(dummy)
-
- # read tab file with lemma and POS info
- tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)
-
- tabfile.each_sentence() { |tabsent|
- # start new, empty sentence with "failed" attribute (i.e. no parse)
- # and with the ID of the corresponding TabFormat sentence
- sentid = tabsent.get_sent_id()
- if sentid.nil? or sentid =~ /^-*$/
- $stderr.puts "No sentence ID for sentence:"
- tabsent.each_line_parsed { |l| $stderr.print l.get("word"), " "}
- $stderr.puts
- sentid = Time.new().to_f.to_s
- end
- sent = SalsaTigerSentence.new("<s id=\"#{SalsaTigerXMLHelper.escape(sentid)}\" failed=\"true\"></s>")
-
- # add single nonterminal node, category "S"
- single_nonterminal_id = SalsaTigerXMLHelper.escape(sentid.to_s + "_NT")
- vroot = sent.add_syn("nt", "S", # category
- nil, # word
- nil, # pos
- single_nonterminal_id)
-
- # add terminals
- tabsent.each_line_parsed() { |line_obj|
- # make terminal node with tab sent info
- node_id = sentid.to_s + "_" + line_obj.get("lineno").to_s
- word = line_obj.get("word")
- unless word
- word = ""
- end
- word = SalsaTigerXMLHelper.escape(word)
- pos = line_obj.get("pos")
- unless pos
- pos = ""
- end
- pos = SalsaTigerXMLHelper.escape(pos)
- terminal = sent.add_syn("t", nil, # category
- word, pos,
- node_id)
-
- if line_obj.get("lemma")
- # lemma
- terminal.set_attribute("lemma", SalsaTigerXMLHelper.escape(line_obj.get("lemma")))
- end
-
- # add new terminal as child of vroot
- vroot.add_child(terminal, nil)
- terminal.add_parent(vroot, nil)
- } # each line of tab file
-
- # yield newly constructed SalsaTigerXML sentence plus tab sentence
- yield [sent, tabsent, SynInterfaceSTXML.standard_mapping(sent, tabsent)]
- }
- end
- end
-
- ############################################
- # Class FrprepReadStxml
- #
- # given an STXML file,
- # yield each of its sentences
- class FrprepReadStxml
- def initialize(stxmlfilename, # string: name of SalsaTigerXML file
- tabfilename, # string: name of corresponding tab file (or nil)
- postag_suffix, # POS tag file suffix (or nil)
- lemma_suffix) # lemmatization file suffix (or nil)
-
- @stxmlfilename = stxmlfilename
- @tabfilename = tabfilename
- @pos_suffix = postag_suffix
- @lemma_suffix = lemma_suffix
- end
- # yield each non-parse sentence as a tuple
- # [ salsa/tiger xml sentence, tab format sentence, mapping]
- # of a SalsaTigerSentence object, a FNTabSentence object,
- # and a hash: FNTab sentence lineno(integer) -> array:SynNode
- # pointing each tab word to one or more SalsaTigerSentence terminals
- def each_sentence(dummy)
- # read corresponding tab file?
- tab_sents = Array.new()
- if File.exists? @tabfilename
- tabfile = FNTabFormatFile.new(@tabfilename,@pos_suffix,@lemma_suffix)
- tabfile.each_sentence { |tabsent|
- tab_sents << tabsent
- }
- end
-
- # read STXML file
- infile = FilePartsParser.new(@stxmlfilename)
- index = 0
- infile.scan_s { |sent_string|
- sent = SalsaTigerSentence.new(sent_string)
- yield [sent, tab_sents.at(index), SynInterfaceSTXML.standard_mapping(sent, tab_sents.at(index))]
- index += 1
- }
- end
- end