frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,686 @@
1
+ require 'frprep/do_parses'
2
+ require 'frprep/FrprepHelper'
3
+ require 'frprep/FixSynSemMapping'
4
+
5
+ ##############################
6
+ # The class that does all the work
7
+ module FrPrep
8
+ class FrPrep
9
+
10
def initialize(exp) # FrprepConfigData object
  # Stores the experiment configuration, insists that the frprep internal
  # data directory is configured, and sets up the output file suffixes.
  @exp = exp

  # AB: move to FRprepOptionParser
  # Without 'frprep_directory' there is nowhere to keep intermediate data.
  exp.get("frprep_directory") or
    raise "Please set 'frprep_directory', the frprep internal data directory,\nin the experiment file."

  # Experiment directory: frprep internal data directory, with one
  # subdirectory per experiment ID. The returned name is not needed here;
  # the call is kept for whatever File.new_dir does with the path
  # (presumably creating it -- TODO confirm against ruby_class_extensions).
  # Wiping previous contents (rm -rf) is intentionally not done.
  File.new_dir(@exp.get("frprep_directory"),
               @exp.get("prep_experiment_ID"))

  # Suffixes for the different types of output files.
  @file_suffixes = {
    "lemma" => ".lemma",
    "pos" => ".pos",
    "tab" => ".tab",
    "stxml" => ".xml"
  }
end
32
+
33
def transform()
  # Runs the preprocessing pipeline described by the experiment file.
  #
  # Starting from the configured input 'format', the data is converted
  # step by step until the target format is reached:
  #
  #   BNC -> Plain -> SalsaTab -> SalsaTabWithPos -> Done
  #   FNXml / FNCorpusXml -> SalsaTab -> ...
  #   SalsaTigerXML -> Done
  #
  # The target is "SalsaTabWithPos" when 'tabformat_output' is set and
  # "Done" (SalsaTigerXML output) otherwise.
  #
  # Exits with status 1 on missing or contradictory settings and raises
  # "Experiment file problem" on an unknown format.
  #
  # NOTE(review): with 'tabformat_output' set, a "SalsaTigerXML" input ends
  # in state "Done", which never equals "SalsaTabWithPos"; the next loop
  # iteration then raises "Experiment file problem". Confirm whether that
  # combination is meant to be rejected earlier.

  # AB: Debugging.
  debugger if $DEBUG

  current_format = @exp.get("format")

  # AB: move to FRprepOptionParser
  # Both directories are mandatory; checked in this order.
  ["directory_input", "directory_preprocessed"].each { |key|
    next if @exp.get(key)

    $stderr.puts "Please specify '#{key}' in the experiment file."
    exit 1
  }

  ##
  # input and output directories.
  #
  # sanity check: output in tab format will not work
  # if we also do a parse
  if @exp.get("tabformat_output") and @exp.get("do_parse")
    $stderr.puts "Error: Cannot do Tab format output"
    $stderr.puts "when the input text is being parsed."
    $stderr.puts "Please set either 'tabformat_output' or 'do_parse' to false."
    exit 1
  end

  input_dir = File.existing_dir(@exp.get("directory_input"))
  output_dir = File.new_dir(@exp.get("directory_preprocessed"))
  # Tab output goes straight to the final directory; otherwise the
  # split/tagged files live in an frprep-internal directory.
  if @exp.get("tabformat_output")
    split_dir = output_dir
  else
    split_dir = frprep_dirname("split", "new")
  end

  ####
  # transform data to UTF-8
  if ["iso", "hex"].include? @exp.get("encoding")
    # transform ISO -> UTF-8 or Hex -> UTF-8,
    # write the result to encoding_dir,
    # then use encoding_dir as the new input_dir
    encoding_dir = frprep_dirname("encoding", "new")
    $stderr.puts "Frprep: Transforming to UTF-8."
    Dir[input_dir + "*"].each { |filename|
      # only regular files are converted
      next unless File.file? filename

      outfilename = encoding_dir + File.basename(filename)
      FrprepHelper.to_utf8_file(filename, outfilename, @exp.get("encoding"))
    }

    input_dir = encoding_dir
  end

  ####
  # transform data all the way to the output format,
  # which is SalsaTigerXML by default,
  # except when tabformat_output has been set, in which case it's
  # Tab format.
  current_dir = input_dir
  done_format = @exp.get("tabformat_output") ? "SalsaTabWithPos" : "Done"

  until current_format == done_format
    case current_format

    when "BNC"
      # basically plain, plus some tags to be removed
      plain_dir = frprep_dirname("plain", "new")

      $stderr.puts "Frprep: Transforming BNC format text in #{current_dir} to plain format."
      $stderr.puts "Storing the result in #{plain_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_bncformat_dir(current_dir, plain_dir)

      current_dir = plain_dir
      current_format = "Plain"

    when "Plain"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming plain text in #{current_dir} to SalsaTab format."
      $stderr.puts "Storing the result in #{tab_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_plain_dir(current_dir, tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir

      fndata = FNDatabase.new(current_dir)
      fndata.extract_everything(tab_dir)
      # argument-list form: no shell involved, so the directory name
      # cannot be split or interpreted
      Kernel.system("chmod", "-R", "g+rx", tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNCorpusXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir
      # assuming that all XML files in the current directory are FN Corpus XML files
      Dir[current_dir + "*.xml"].each { |fncorpusfilename|
        corpus = FNCorpusXMLFile.new(fncorpusfilename)
        tabfilename = tab_dir + File.basename(fncorpusfilename, ".xml") + ".tab"
        # block form closes the file even if printing raises
        File.open(tabfilename, "w") { |outfile|
          corpus.print_conll_style(outfile)
        }
      }

      Kernel.system("chmod", "-R", "g+rx", tab_dir)
      current_dir = tab_dir
      current_format = "SalsaTab"

    when "SalsaTab"
      # lemmatize and POStag
      $stderr.puts "Frprep: Lemmatizing and parsing text in #{current_dir}."
      $stderr.puts "Storing the result in #{split_dir}."
      transform_pos_and_lemmatize(current_dir, split_dir)

      current_dir = split_dir
      current_format = "SalsaTabWithPos"

    when "SalsaTabWithPos"
      # parse
      parse_dir = frprep_dirname("parse", "new")

      $stderr.puts "Frprep: Transforming tabular format text in #{current_dir} to SalsaTigerXML format."
      $stderr.puts "Storing the result in #{parse_dir}."

      transform_salsatab_dir(current_dir, parse_dir, output_dir)

      current_dir = output_dir
      current_format = "Done"

    when "SalsaTigerXML"
      parse_dir = frprep_dirname("parse", "new")
      print "Transform parser output into stxml\n"
      transform_stxml_dir(parse_dir, split_dir, input_dir, output_dir, @exp)
      current_dir = output_dir
      current_format = "Done"

    else
      $stderr.puts "Unknown data format #{current_format}"
      $stderr.puts "Please check the 'format' entry in your experiment file."
      raise "Experiment file problem"
    end
  end

  $stderr.puts "Frprep: Done preprocessing."
end
210
+
211
+ ############################################################################3
212
+ private
213
+ ############################################################################3
214
+
215
+ ###############
216
+ # frprep_dirname:
217
+ # make directory name for frprep-internal data
218
+ # of a certain kind described in <subdir>
219
+ #
220
+ # frprep_directory has one subdirectory for each experiment ID,
221
+ # and below that there is one subdir per subtask
222
+ #
223
+ # If this is a new directory, it is constructed,
224
+ # if it should be an existing directory, its existence is checked.
225
def frprep_dirname(subdir, # string: designator of subdirectory
                   new = nil) # non-nil: this may be a new directory
  # Builds <frprep_directory>/<prep_experiment_ID>/<subdir> and hands the
  # name to File.new_dir (when `new` is set) or File.existing_dir
  # (otherwise); the value of that call is returned.
  path = File.new_dir(@exp.get("frprep_directory"),
                      @exp.get("prep_experiment_ID"),
                      subdir)

  new ? File.new_dir(path) : File.existing_dir(path)
end
239
+
240
+
241
+
242
+ ###############
243
+ # transform_bncformat_dir:
244
+ #
245
+ # transformation for BNC format:
246
+ #
247
+ # transform to plain format, removing <> elements
248
def transform_bncformat_dir(input_dir, # string: input directory
                            output_dir) # string: output directory
  # Every entry matched in input_dir is converted by
  # FrprepHelper.bnc_to_plain_file into a file of the same base name
  # in output_dir.
  Dir.glob(input_dir + "*").each do |source_name|
    target_name = output_dir + File.basename(source_name)
    FrprepHelper.bnc_to_plain_file(source_name, target_name)
  end
end
259
+
260
+
261
+ ###############
262
+ # transform_plain:
263
+ #
264
+ # transformation for plaintext:
265
+ #
266
+ # transform to Tab format, separating punctuation from adjacent words
267
def transform_plain_dir(input_dir, # string: input directory
                        output_dir) # string: output directory
  # Every entry matched in input_dir is converted by
  # FrprepHelper.plain_to_tab_file. The output keeps the base name and
  # gets the "tab" suffix appended, because that suffix is, at the
  # moment, required by later steps.
  tab_suffix = @file_suffixes["tab"]
  Dir.glob(input_dir + "*").each do |source_name|
    target_name = output_dir + File.basename(source_name) + tab_suffix
    FrprepHelper.plain_to_tab_file(source_name, target_name)
  end
end
278
+
279
+ ###############
280
+ # transform_pos_and_lemmatize
281
+ #
282
+ # transformation for Tab format files:
283
+ #
284
+ # - Split into parser-size chunks
285
+ # - POS-tag, lemmatize
286
def transform_pos_and_lemmatize(input_dir, # string: input directory
                                output_dir) # string: output directory
  # Splits the tab files of input_dir into chunks in output_dir, then runs
  # the configured POS tagger and lemmatizer (each only if switched on in
  # the experiment file) in place on output_dir.

  ##
  # split the TabFormatFile into chunks of max_sent_num size
  FrprepHelper.split_dir(input_dir, output_dir, @file_suffixes["tab"],
                         @exp.get("parser_max_sent_num"),
                         @exp.get("parser_max_sent_len"))

  ##
  # POS-Tagging
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."
    unless @exp.get("pos_tagger_path") && @exp.get("pos_tagger")
      raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
    end

    tagger_class = SynInterfaces.get_interface("pos_tagger",
                                               @exp.get("pos_tagger"))
    print "pos tagger interface: ", tagger_class, "\n"
    raise "Shouldn't be here" unless tagger_class

    tagger = tagger_class.new(@exp.get("pos_tagger_path"),
                              @file_suffixes["tab"],
                              @file_suffixes["pos"])
    tagger.process_dir(output_dir, output_dir)
  end

  ##
  # Lemmatization
  if @exp.get("do_lemmatize")
    $stderr.puts "Frprep: Lemmatizing."
    unless @exp.get("lemmatizer_path") && @exp.get("lemmatizer")
      raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
    end

    lemmatizer_class = SynInterfaces.get_interface("lemmatizer",
                                                   @exp.get("lemmatizer"))
    # AB: make this exception explicit.
    raise 'I got a empty interface class for the lemmatizer!' unless lemmatizer_class

    lemmatizer = lemmatizer_class.new(@exp.get("lemmatizer_path"),
                                      @file_suffixes["tab"],
                                      @file_suffixes["lemma"])
    lemmatizer.process_dir(output_dir, output_dir)
  end
end
335
+
336
+ ###############
337
+ # transform_salsatab
338
+ #
339
+ # transformation for Tab format files:
340
+ #
341
+ # - parse
342
+ # - Transform parser output to SalsaTigerXML
343
+ # If no parsing, make flat syntactic structure.
344
def transform_salsatab_dir(input_dir, # string: input directory
                           parse_dir, # string: output directory for parses
                           output_dir) # string: global output directory
  # For every file delivered by DoParses, writes
  # <output_dir><filename>.xml containing the SalsaTigerXML header, one
  # entry per sentence (enriched with heads, lemmas and semantics as
  # configured) and the footer.
  #
  # Raises "Shouldn't be here" if no interpreter class is found and
  # "Cannot write to SalsaTigerXML output file ..." if an output file
  # cannot be opened.

  ##
  # (Parse and) transform to SalsaTigerXML

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
  raise "Shouldn't be here" unless interpreter_class

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => input_dir)
  parse_obj.each_parsed_file { |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    # From here on the handle is always released, also when one of the
    # helpers below raises.
    begin
      outfile.puts SalsaTigerXMLHelper.get_header()
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence { |st_sent, tabformat_sent, mapping|
        # parsed: add headwords using parse tree
        if @exp.get("do_parse")
          FrprepHelper.add_head_attributes(st_sent, interpreter_class)
        end

        # add lemmas, if they are there. If they are not, don't print out a warning.
        if @exp.get("do_lemmatize")
          FrprepHelper.add_lemmas_from_tab(st_sent, tabformat_sent, mapping)
        end

        # add semantics: read semantic information from the tab file
        # and combine all targets of a sentence into one frame
        FrprepHelper.add_semantics_from_tab(st_sent, tabformat_sent, mapping,
                                            interpreter_class, @exp)

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # handle multiword targets
        FrprepHelper.handle_multiword_targets(st_sent,
                                              interpreter_class, @exp.get("language"))

        # handle Unknown frame names
        FrprepHelper.handle_unknown_framenames(st_sent, interpreter_class)

        outfile.puts st_sent.get()
      }
      outfile.puts SalsaTigerXMLHelper.get_footer()
    ensure
      outfile.close()
    end
  }
end
409
+
410
+ #############################################
411
+ # transform_stxml
412
+ #
413
+ # transformation for SalsaTigerXML data
414
+ #
415
+ # - If the input format was SalsaTigerXML:
416
+ # - Tag, lemmatize and parse, if the experiment file tells you so
417
+ #
418
+ # - If the origin is the Salsa corpus:
419
+ # Change frame names from Unknown\d+ to lemma_Unknown\d+
420
+ #
421
+ # - fix multiword lemmas, or at least try
422
+ # - transform to UTF 8
423
def transform_stxml_dir(parse_dir, # string: name of directory for parse data
                        tab_dir, # string: name of directory for split/tab data
                        input_dir, # string: name of input directory
                        output_dir, # string: name of final output directory
                        exp) # FrprepConfigData
  # Re-processes SalsaTigerXML input: optionally splits, tabifies, tags,
  # lemmatizes and parses it, then writes one <output_dir><filename>.xml
  # per parsed file. When parsing is switched on, semantics and lemmas of
  # the old sentences are carried over into the newly parsed ones.
  #
  # Raises on missing tagger/lemmatizer settings, on a missing interface or
  # interpreter class ("Shouldn't be here"), and when an output file
  # cannot be opened.

  ####
  # Data preparation

  # Data with Salsa as origin:
  # remember the target lemma as an attribute on the
  # <target> elements
  #
  # currently deactivated: encoding problems
  # if @exp.get("origin") == "SalsaTiger"
  #   $stderr.puts "Frprep: noting target lemmas"
  #   changed_input_dir = frprep_dirname("salsalemma", "new")
  #   FrprepHelper.note_salsa_targetlemmas(input_dir, changed_input_dir)
  #
  #   # remember changed input dir as input dir
  #   input_dir = changed_input_dir
  # end

  # If data is to be parsed, split and tabify input files
  # else copy data to stxml_dir.

  # stxml_dir: directory where SalsaTiger data is situated
  if @exp.get("do_parse")
    # split data
    stxml_splitdir = frprep_dirname("stxml_split", "new")
    stxml_dir = stxml_splitdir

    $stderr.puts "Frprep: splitting data"
    FrprepHelper.stxml_split_dir(input_dir, stxml_splitdir,
                                 @exp.get("parser_max_sent_num"),
                                 @exp.get("parser_max_sent_len"))
  else
    # no parsing: copy data to the parse dir
    stxml_dir = parse_dir
    $stderr.puts "Frprep: Copying data to #{stxml_dir}"
    Dir[input_dir + "*.xml"].each { |filename|
      # argument-list form: no shell, so names with spaces or
      # metacharacters are copied correctly
      Kernel.system("cp", filename, stxml_dir + File.basename(filename))
    }
  end

  # Some syntactic processing will take place:
  # tabify data
  if @exp.get("do_parse") or @exp.get("do_lemmatize") or @exp.get("do_postag")
    $stderr.puts "Frprep: making input for syn. processing"

    Dir[stxml_dir + "*" + @file_suffixes["stxml"]].each { |stxmlfilename|
      tabfilename = tab_dir + File.basename(stxmlfilename, @file_suffixes["stxml"]) + @file_suffixes["tab"]
      FrprepHelper.stxml_to_tab_file(stxmlfilename, tabfilename, exp)
    }
  end

  ###
  # POS-tagging
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."
    unless @exp.get("pos_tagger_path") and @exp.get("pos_tagger")
      raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
    end

    sys_class = SynInterfaces.get_interface("pos_tagger",
                                            @exp.get("pos_tagger"))
    raise "Shouldn't be here" unless sys_class

    sys = sys_class.new(@exp.get("pos_tagger_path"),
                        @file_suffixes["tab"],
                        @file_suffixes["pos"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Lemmatization
  if @exp.get("do_lemmatize")
    $stderr.puts "Frprep: Lemmatizing."
    unless @exp.get("lemmatizer_path") and @exp.get("lemmatizer")
      raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
    end

    sys_class = SynInterfaces.get_interface("lemmatizer",
                                            @exp.get("lemmatizer"))
    raise "Shouldn't be here" unless sys_class

    sys = sys_class.new(@exp.get("lemmatizer_path"),
                        @file_suffixes["tab"],
                        @file_suffixes["lemma"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Parsing, production of SalsaTigerXML output

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  sys_class_names = Hash.new
  [["do_postag", "pos_tagger"],
   ["do_lemmatize", "lemmatizer"],
   ["do_parse", "parser"]].each { |service, system_name|
    if @exp.get(service) # yes, perform this service
      sys_class_names[system_name] = @exp.get(system_name)
    end
  }
  interpreter_class = SynInterfaces.get_interpreter(sys_class_names)
  raise "Shouldn't be here" unless interpreter_class

  # Turns the XML string of an old sentence into a SalsaTigerSentence.
  # modified by ines, 27/08/08:
  # for Berkeley => substitute ( ) for *LRB* *RRB* (done in place)
  old_sentence_for = lambda { |sent_string|
    if exp.get("parser") == "berkeley"
      sent_string.gsub!(/word='\('/, "word='*LRB*'")
      sent_string.gsub!(/word='\)'/, "word='*RRB*'")
      sent_string.gsub!(/word=\"\(\"/, "word='*LRB*'")
      sent_string.gsub!(/word=\"\)\"/, "word='*RRB*'")
    end
    SalsaTigerSentence.new(sent_string)
  }

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => tab_dir,
                           "stxml_dir" => stxml_dir)
  parse_obj.each_parsed_file { |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    # From here on the output handle is always released.
    begin
      oldxml = Array.new # array of sentence strings
      if @exp.get("do_parse")
        # read old SalsaTigerXML file
        # so we can integrate the old file's semantics later.
        # we assume that the old and the new file have the same name,
        # ending in .xml.
        oldxmlfile = FilePartsParser.new(stxml_dir + parsed_file_obj.filename + ".xml")
        begin
          oldxmlfile.scan_s { |sent_string|
            # remember this sentence by its position
            oldxml << sent_string
          }
        ensure
          oldxmlfile.close()
        end
      end

      outfile.puts SalsaTigerXMLHelper.get_header()
      index = 0
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence { |st_sent, tabformat_sent, mapping|

        # parsed? then integrate semantics and lemmas from old file
        if @exp.get("do_parse")
          oldsent_string = oldxml[index]
          index += 1
          if oldsent_string
            # we have both an old and a new sentence, so integrate semantics
            oldsent = old_sentence_for.call(oldsent_string)
            next if st_sent.nil?

            # An explicit false means old and new sentence did not match:
            # try once more with the following old sentence.
            if FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp) == false
              oldsent_string = oldxml[index]
              index += 1
              if oldsent_string
                oldsent = old_sentence_for.call(oldsent_string)
                FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp)
              end
            end
          else
            # no corresponding old sentence for this new sentence
            $stderr.puts "Warning: transporting semantics -- missing source sentence, skipping"
          end
        end

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # repair syn/sem mapping problems?
        if @exp.get("fe_syn_repair") or @exp.get("fe_rel_repair")
          FixSynSemMapping.fixit(st_sent, @exp, interpreter_class)
        end

        outfile.puts st_sent.get()
      } # each ST sentence
      outfile.puts SalsaTigerXMLHelper.get_footer()
    ensure
      outfile.close()
    end
  } # each file parsed
end
636
+
637
+
638
+ ###################################
639
+ # general file iterators
640
+
641
+ # yields pairs of [infile name, outfile stream]
642
def change_each_file_in_dir(dir, # string: directory name
                            suffix) # string: filename pattern, e.g. "*.xml"
  # Rewrites every file matching <dir>*<suffix>.
  #
  # Yields pairs of [infile name, outfile stream]; the stream is a
  # temporary file the caller writes the new content to. Afterwards the
  # original is copied to <name>.bak and replaced by the temporary file.
  #
  # Raises RuntimeError if the backup or the replacement fails, so that a
  # file is never overwritten without a backup.
  Dir[dir + "*#{suffix}"].each { |filename|
    tempfile = Tempfile.new("FrprepHelper")
    begin
      yield [filename, tempfile]

      # flush the new content before moving it into place
      tempfile.close()

      # argument-list form of system: no shell, so names containing
      # spaces or shell metacharacters are handled correctly
      unless Kernel.system("cp", filename, "#{filename}.bak")
        raise "Could not back up #{filename} to #{filename}.bak"
      end
      unless Kernel.system("mv", tempfile.path(), filename)
        raise "Could not replace #{filename} by its changed version"
      end
    ensure
      # also removes the temporary file when the caller's block raised
      tempfile.close(true)
    end
  } # each file
end
655
+
656
+ #######
657
+ # change_each_stxml_file_in_dir
658
+ #
659
+ # use change_each_file_in_dir, but assume that the files
660
+ # are SalsaTigerXML files: Keep file headers and footers,
661
+ # and just offer individual sentences for changing
662
+ #
663
+ # Yields SalsaTigerSentence objects, each sentence to be changed
664
def change_each_stxml_file_in_dir(dir) # string: directory name
  # Goes through the "*.xml" files of dir via change_each_file_in_dir.
  # Head and tail of each file are copied unchanged; every sentence is
  # yielded as a SalsaTigerSentence and written back after the caller's
  # block has seen it.
  change_each_file_in_dir(dir, "*.xml") do |xml_name, out|
    parts = FilePartsParser.new(xml_name)

    # header first
    out.puts parts.head()

    # then the sentences, one object per sentence
    parts.scan_s() do |sentence_xml|
      sentence = SalsaTigerSentence.new(sentence_xml)
      yield sentence
      # write the sentence as it is after the caller's block
      out.puts sentence.get()
    end

    # footer last
    out.puts parts.tail()
    parts.close()
  end
end
685
+ end
686
+ end