frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,686 @@
1
+ require 'frprep/do_parses'
2
+ require 'frprep/FrprepHelper'
3
+ require 'frprep/FixSynSemMapping'
4
+
5
+ ##############################
6
+ # The class that does all the work
7
+ module FrPrep
8
+ class FrPrep
9
+
10
# Set up the preprocessing driver for one experiment.
#
# exp:: FrprepConfigData object; its settings are read with exp.get(key).
#
# Raises RuntimeError if 'frprep_directory' is not set in the experiment.
def initialize(exp)
  @exp = exp

  # AB: move to FRprepOptionParser
  if !exp.get("frprep_directory")
    missing_dir_message = "Please set 'frprep_directory', the frprep internal data directory,\n" +
                          "in the experiment file."
    raise missing_dir_message
  end

  # Experiment directory: the frprep internal data directory, with one
  # subdirectory per experiment ID. File.new_dir is defined outside this
  # file (presumably it creates the directory -- confirm); its return
  # value is not needed here. Wiping previous contents of the directory
  # is currently disabled.
  File.new_dir(@exp.get("frprep_directory"), @exp.get("prep_experiment_ID"))

  # suffixes for the different kinds of output files
  @file_suffixes = {
    "lemma" => ".lemma",
    "pos" => ".pos",
    "tab" => ".tab",
    "stxml" => ".xml"
  }
end
32
+
33
# Run the preprocessing pipeline configured in the experiment file.
#
# Starting from the format named by the 'format' setting, the data is
# converted one step at a time until the target format is reached:
# "SalsaTabWithPos" when 'tabformat_output' is set, "Done" (SalsaTigerXML
# written to the output directory) otherwise.
#
# Terminates the process with exit status 1 if 'directory_input' or
# 'directory_preprocessed' is missing, or if both 'tabformat_output' and
# 'do_parse' are set.
# Raises RuntimeError ("Experiment file problem") on an unknown format.
def transform()
  # AB: Debugging.
  debugger if $DEBUG

  current_format = @exp.get("format")

  # AB: move to FRprepOptionParser
  unless @exp.get("directory_input")
    $stderr.puts "Please specify 'directory_input' in the experiment file."
    exit 1
  end
  # AB: move to FRprepOptionParser
  unless @exp.get("directory_preprocessed")
    $stderr.puts "Please specify 'directory_preprocessed' in the experiment file."
    exit 1
  end

  ##
  # input and output directories.
  #
  # sanity check: output in tab format will not work
  # if we also do a parse
  if @exp.get("tabformat_output") && @exp.get("do_parse")
    $stderr.puts "Error: Cannot do Tab format output"
    $stderr.puts "when the input text is being parsed."
    $stderr.puts "Please set either 'tabformat_output' or 'do_parse' to false."
    exit 1
  end
  input_dir = File.existing_dir(@exp.get("directory_input"))
  output_dir = File.new_dir(@exp.get("directory_preprocessed"))
  # With Tab output the split/tagged files are the final result, so they
  # go straight to the output directory.
  if @exp.get("tabformat_output")
    split_dir = output_dir
  else
    split_dir = frprep_dirname("split", "new")
  end

  ####
  # transform data to UTF-8
  if ["iso", "hex"].include? @exp.get("encoding")
    # transform ISO -> UTF-8 or Hex -> UTF-8
    # write result to encoding_dir,
    # then set encoding_dir to be the new input_dir
    encoding_dir = frprep_dirname("encoding", "new")
    $stderr.puts "Frprep: Transforming to UTF-8."
    Dir[input_dir + "*"].each { |filename|
      # not a file? then skip
      next unless File.file? filename

      outfilename = encoding_dir + File.basename(filename)
      FrprepHelper.to_utf8_file(filename, outfilename, @exp.get("encoding"))
    }

    input_dir = encoding_dir
  end

  ####
  # transform data all the way to the output format,
  # which is SalsaTigerXML by default,
  # except when tabformat_output has been set, in which case it's
  # Tab format.
  current_dir = input_dir

  if @exp.get("tabformat_output")
    done_format = "SalsaTabWithPos"
  else
    done_format = "Done"
  end

  until current_format == done_format
    case current_format

    when "BNC"
      # basically plain, plus some tags to be removed
      plain_dir = frprep_dirname("plain", "new")

      $stderr.puts "Frprep: Transforming BNC format text in #{current_dir} to plain format."
      $stderr.puts "Storing the result in #{plain_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_bncformat_dir(current_dir, plain_dir)

      current_dir = plain_dir
      current_format = "Plain"

    when "Plain"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming plain text in #{current_dir} to SalsaTab format."
      $stderr.puts "Storing the result in #{tab_dir}."
      $stderr.puts "Expecting one sentence per line."

      transform_plain_dir(current_dir, tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir

      fndata = FNDatabase.new(current_dir)
      fndata.extract_everything(tab_dir)
      # Argument-list form: the path is passed to chmod verbatim instead
      # of being re-parsed by a shell (safe for spaces/metacharacters).
      Kernel.system("chmod", "-R", "g+rx", tab_dir)

      current_dir = tab_dir
      current_format = "SalsaTab"

    when "FNCorpusXml"
      # transform to tab format
      tab_dir = frprep_dirname("tab", "new")

      $stderr.puts "Frprep: Transforming FN data in #{current_dir} to tabular format."
      $stderr.puts "Storing the result in " + tab_dir
      # assuming that all XML files in the current directory are FN Corpus XML files
      Dir[current_dir + "*.xml"].each { |fncorpusfilename|
        corpus = FNCorpusXMLFile.new(fncorpusfilename)
        tabfilename = tab_dir + File.basename(fncorpusfilename, ".xml") + ".tab"
        # Block form closes the file even if printing raises.
        File.open(tabfilename, "w") { |outfile|
          corpus.print_conll_style(outfile)
        }
      }

      Kernel.system("chmod", "-R", "g+rx", tab_dir)
      current_dir = tab_dir
      current_format = "SalsaTab"

    when "SalsaTab"
      # lemmatize and POStag
      $stderr.puts "Frprep: Lemmatizing and parsing text in #{current_dir}."
      $stderr.puts "Storing the result in #{split_dir}."
      transform_pos_and_lemmatize(current_dir, split_dir)

      current_dir = split_dir
      current_format = "SalsaTabWithPos"

    when "SalsaTabWithPos"
      # parse
      parse_dir = frprep_dirname("parse", "new")

      $stderr.puts "Frprep: Transforming tabular format text in #{current_dir} to SalsaTigerXML format."
      $stderr.puts "Storing the result in #{parse_dir}."

      transform_salsatab_dir(current_dir, parse_dir, output_dir)

      current_dir = output_dir
      current_format = "Done"

    when "SalsaTigerXML"
      parse_dir = frprep_dirname("parse", "new")
      print "Transform parser output into stxml\n"
      transform_stxml_dir(parse_dir, split_dir, input_dir, output_dir, @exp)
      current_dir = output_dir
      current_format = "Done"

    else
      $stderr.puts "Unknown data format #{current_format}"
      $stderr.puts "Please check the 'format' entry in your experiment file."
      raise "Experiment file problem"
    end
  end

  $stderr.puts "Frprep: Done preprocessing."
end
210
+
211
+ ############################################################################3
212
+ private
213
+ ############################################################################3
214
+
215
+ ###############
216
+ # frprep_dirname:
217
+ # make directory name for frprep-internal data
218
+ # of a certain kind described in <subdir>
219
+ #
220
+ # frprep_directory has one subdirectory for each experiment ID,
221
+ # and below that there is one subdir per subtask
222
+ #
223
+ # If this is a new directory, it is constructed,
224
+ # if it should be an existing directory, its existence is checked.
225
# Build the name of a frprep-internal data directory:
# <frprep_directory>/<prep_experiment_ID>/<subdir>.
#
# subdir:: string, designator of the subtask subdirectory
# new::    non-nil/true: the directory may be new and is handed to
#          File.new_dir; otherwise it is handed to File.existing_dir.
#
# Returns whatever File.new_dir / File.existing_dir return (both are
# defined outside this file; presumably the directory path -- confirm).
def frprep_dirname(subdir, new = nil)
  path = File.new_dir(@exp.get("frprep_directory"),
                      @exp.get("prep_experiment_ID"),
                      subdir)

  return File.new_dir(path) if new

  File.existing_dir(path)
end
239
+
240
+
241
+
242
+ ###############
243
+ # transform_bncformat_dir:
244
+ #
245
+ # transformation for BNC format:
246
+ #
247
+ # transform to plain format, removing <> elements
248
# Convert every entry matched by "<input_dir>*" from BNC format to plain
# text via FrprepHelper.bnc_to_plain_file.
#
# input_dir::  string, input directory (used as a glob prefix)
# output_dir:: string, output directory (used as a path prefix)
#
# Each output file keeps the base name of its input file; no suffix is
# appended.
def transform_bncformat_dir(input_dir, output_dir)
  Dir[input_dir + "*"].each do |bnc_path|
    plain_path = output_dir + File.basename(bnc_path)
    FrprepHelper.bnc_to_plain_file(bnc_path, plain_path)
  end
end
259
+
260
+
261
+ ###############
262
+ # transform_plain_dir:
263
+ #
264
+ # transformation for plaintext:
265
+ #
266
+ # transform to Tab format, separating punctuation from adjacent words
267
# Convert every entry matched by "<input_dir>*" from plain text to Tab
# format via FrprepHelper.plain_to_tab_file.
#
# input_dir::  string, input directory (used as a glob prefix)
# output_dir:: string, output directory (used as a path prefix)
#
# Output names are the input base name plus the "tab" suffix, because
# later steps currently require that suffix.
def transform_plain_dir(input_dir, output_dir)
  Dir[input_dir + "*"].each do |plain_path|
    tab_path = output_dir + File.basename(plain_path) + @file_suffixes["tab"]
    FrprepHelper.plain_to_tab_file(plain_path, tab_path)
  end
end
278
+
279
+ ###############
280
+ # transform_pos_and_lemmatize
281
+ #
282
+ # transformation for Tab format files:
283
+ #
284
+ # - Split into parser-size chunks
285
+ # - POS-tag, lemmatize
286
# Split Tab format files into parser-size chunks, then POS-tag and
# lemmatize the chunks as requested by the experiment file.
#
# input_dir::  string, directory with the Tab files to split
# output_dir:: string, directory receiving the chunks; the tagger and the
#              lemmatizer both read from and write to this directory
#
# Raises RuntimeError when a requested service lacks its settings
# ('pos_tagger'/'pos_tagger_path', 'lemmatizer'/'lemmatizer_path') or when
# SynInterfaces returns no interface class for it.
def transform_pos_and_lemmatize(input_dir, output_dir)
  tab_suffix = @file_suffixes["tab"]

  ##
  # split the TabFormatFile into chunks of max_sent_num size
  FrprepHelper.split_dir(input_dir, output_dir, tab_suffix,
                         @exp.get("parser_max_sent_num"),
                         @exp.get("parser_max_sent_len"))

  ##
  # POS-Tagging
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."
    if !@exp.get("pos_tagger_path") || !@exp.get("pos_tagger")
      raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
    end

    tagger_class = SynInterfaces.get_interface("pos_tagger", @exp.get("pos_tagger"))
    print "pos tagger interface: ", tagger_class, "\n"
    raise "Shouldn't be here" unless tagger_class

    tagger = tagger_class.new(@exp.get("pos_tagger_path"),
                              tab_suffix,
                              @file_suffixes["pos"])
    tagger.process_dir(output_dir, output_dir)
  end

  ##
  # Lemmatization
  if @exp.get("do_lemmatize")
    $stderr.puts "Frprep: Lemmatizing."
    if !@exp.get("lemmatizer_path") || !@exp.get("lemmatizer")
      raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
    end

    lemmatizer_class = SynInterfaces.get_interface("lemmatizer", @exp.get("lemmatizer"))
    # AB: make this exception explicit.
    raise 'I got a empty interface class for the lemmatizer!' unless lemmatizer_class

    lemmatizer = lemmatizer_class.new(@exp.get("lemmatizer_path"),
                                      tab_suffix,
                                      @file_suffixes["lemma"])
    lemmatizer.process_dir(output_dir, output_dir)
  end
end
335
+
336
+ ###############
337
+ # transform_salsatab
338
+ #
339
+ # transformation for Tab format files:
340
+ #
341
+ # - parse
342
+ # - Transform parser output to SalsaTigerXML
343
+ # If no parsing, make flat syntactic structure.
344
# (Parse and) transform Tab format files to SalsaTigerXML.
#
# input_dir::  string, directory with the Tab files (handed to DoParses
#              as "tab_dir")
# parse_dir::  string, output directory for parses
# output_dir:: string, global output directory; one
#              "<parsed file name>.xml" is written per parsed file
#
# Per sentence: head attributes are added when 'do_parse' is set, lemmas
# when 'do_lemmatize' is set; semantics from the Tab file are always
# added, deprecated frames removed, multiword targets and unknown frame
# names handled.
#
# Raises RuntimeError if no interpreter class is found or an output file
# cannot be opened for writing.
def transform_salsatab_dir(input_dir, parse_dir, output_dir)
  ##
  # (Parse and) transform to SalsaTigerXML

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
  unless interpreter_class
    raise "Shouldn't be here"
  end

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => input_dir)
  parse_obj.each_parsed_file { |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue StandardError
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    begin
      outfile.puts SalsaTigerXMLHelper.get_header()
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence { |st_sent, tabformat_sent, mapping|
        # parsed: add headwords using parse tree
        if @exp.get("do_parse")
          FrprepHelper.add_head_attributes(st_sent, interpreter_class)
        end

        # add lemmas, if they are there. If they are not, don't print out a warning.
        if @exp.get("do_lemmatize")
          FrprepHelper.add_lemmas_from_tab(st_sent, tabformat_sent, mapping)
        end

        # add semantics
        # we can use the method in SalsaTigerXMLHelper
        # that reads semantic information from the tab file
        # and combines all targets of a sentence into one frame
        FrprepHelper.add_semantics_from_tab(st_sent, tabformat_sent, mapping,
                                            interpreter_class, @exp)

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # handle multiword targets
        FrprepHelper.handle_multiword_targets(st_sent,
                                              interpreter_class, @exp.get("language"))

        # handle Unknown frame names
        FrprepHelper.handle_unknown_framenames(st_sent, interpreter_class)

        outfile.puts st_sent.get()
      }
      outfile.puts SalsaTigerXMLHelper.get_footer()
    ensure
      # Always flush and release the handle, even if a sentence fails;
      # previously the file was never closed.
      outfile.close
    end
  }
end
409
+
410
+ #############################################
411
+ # transform_stxml
412
+ #
413
+ # transformation for SalsaTigerXML data
414
+ #
415
+ # - If the input format was SalsaTigerXML:
416
+ # - Tag, lemmatize and parse, if the experiment file tells you so
417
+ #
418
+ # - If the origin is the Salsa corpus:
419
+ # Change frame names from Unknown\d+ to lemma_Unknown\d+
420
+ #
421
+ # - fix multiword lemmas, or at least try
422
+ # - transform to UTF 8
423
# Transformation for SalsaTigerXML input data.
#
# parse_dir::  string, name of directory for parse data
# tab_dir::    string, name of directory for split/tab data
# input_dir::  string, name of input directory
# output_dir:: string, name of final output directory
# exp::        FrprepConfigData
#
# Steps:
# - with 'do_parse': split the input into parser-size chunks; otherwise
#   copy the input *.xml files to parse_dir
# - tabify, POS-tag and lemmatize as requested by the experiment file
# - write one "<parsed file name>.xml" per parsed file to output_dir; when
#   parsing, semantics and lemmas of the old sentences are integrated
#   into the newly parsed ones
#
# Raises RuntimeError when a requested service lacks its settings, when no
# interface/interpreter class is found, or when an output file cannot be
# opened for writing.
def transform_stxml_dir(parse_dir, tab_dir, input_dir, output_dir, exp)
  ####
  # Data preparation

  # Data with Salsa as origin:
  # remember the target lemma as an attribute on the
  # <target> elements
  #
  # currently deactivated: encoding problems
  # if @exp.get("origin") == "SalsaTiger"
  #   $stderr.puts "Frprep: noting target lemmas"
  #   changed_input_dir = frprep_dirname("salsalemma", "new")
  #   FrprepHelper.note_salsa_targetlemmas(input_dir, changed_input_dir)
  #
  #   # remember changed input dir as input dir
  #   input_dir = changed_input_dir
  # end

  # If data is to be parsed, split and tabify input files
  # else copy data to stxml_indir.

  # stxml_dir: directory where SalsaTiger data is situated
  if @exp.get("do_parse")
    # split data
    stxml_splitdir = frprep_dirname("stxml_split", "new")
    stxml_dir = stxml_splitdir

    $stderr.puts "Frprep: splitting data"
    FrprepHelper.stxml_split_dir(input_dir, stxml_splitdir,
                                 @exp.get("parser_max_sent_num"),
                                 @exp.get("parser_max_sent_len"))
  else
    # no parsing: copy data to split dir
    stxml_dir = parse_dir
    $stderr.puts "Frprep: Copying data to #{stxml_dir}"
    Dir[input_dir + "*.xml"].each { |filename|
      # Argument-list form: file names are handed to cp verbatim, so
      # spaces or shell metacharacters in a name cannot break the copy.
      Kernel.system("cp", filename, stxml_dir + File.basename(filename))
    }
  end

  # Some syntactic processing will take place:
  # tabify data
  if @exp.get("do_parse") or @exp.get("do_lemmatize") or @exp.get("do_postag")
    $stderr.puts "Frprep: making input for syn. processing"

    Dir[stxml_dir + "*" + @file_suffixes["stxml"]].each { |stxmlfilename|
      tabfilename = tab_dir + File.basename(stxmlfilename, @file_suffixes["stxml"]) + @file_suffixes["tab"]
      FrprepHelper.stxml_to_tab_file(stxmlfilename, tabfilename, exp)
    }
  end

  ###
  # POS-tagging
  if @exp.get("do_postag")
    $stderr.puts "Frprep: Tagging."
    unless @exp.get("pos_tagger_path") and @exp.get("pos_tagger")
      raise "POS-tagging: I need 'pos_tagger' and 'pos_tagger_path' in the experiment file."
    end

    sys_class = SynInterfaces.get_interface("pos_tagger",
                                            @exp.get("pos_tagger"))
    unless sys_class
      raise "Shouldn't be here"
    end
    sys = sys_class.new(@exp.get("pos_tagger_path"),
                        @file_suffixes["tab"],
                        @file_suffixes["pos"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Lemmatization
  if @exp.get("do_lemmatize")
    $stderr.puts "Frprep: Lemmatizing."
    unless @exp.get("lemmatizer_path") and @exp.get("lemmatizer")
      raise "Lemmatization: I need 'lemmatizer' and 'lemmatizer_path' in the experiment file."
    end

    sys_class = SynInterfaces.get_interface("lemmatizer",
                                            @exp.get("lemmatizer"))
    unless sys_class
      raise "Shouldn't be here"
    end
    sys = sys_class.new(@exp.get("lemmatizer_path"),
                        @file_suffixes["tab"],
                        @file_suffixes["lemma"])
    sys.process_dir(tab_dir, tab_dir)
  end

  ###
  # Parsing, production of SalsaTigerXML output

  # get interpretation class for this
  # parser/lemmatizer/POS tagger combination
  sys_class_names = Hash.new
  [["do_postag", "pos_tagger"],
   ["do_lemmatize", "lemmatizer"],
   ["do_parse", "parser"]].each { |service, system_name|
    if @exp.get(service) # yes, perform this service
      sys_class_names[system_name] = @exp.get(system_name)
    end
  }
  interpreter_class = SynInterfaces.get_interpreter(sys_class_names)
  unless interpreter_class
    raise "Shouldn't be here"
  end

  # Builds the old sentence stored at position idx of sentence_strings,
  # or returns nil when there is none. For the Berkeley parser, bracket
  # tokens are first rewritten in place to *LRB* / *RRB*
  # (modified by ines, 27/08/08).
  old_sentence_at = lambda { |sentence_strings, idx|
    sent_string = sentence_strings[idx]
    if sent_string
      if exp.get("parser") == "berkeley"
        sent_string.gsub!(/word='\('/, "word='*LRB*'")
        sent_string.gsub!(/word='\)'/, "word='*RRB*'")
        sent_string.gsub!(/word=\"\(\"/, "word='*LRB*'")
        sent_string.gsub!(/word=\"\)\"/, "word='*RRB*'")
      end
      SalsaTigerSentence.new(sent_string)
    end
  }

  parse_obj = DoParses.new(@exp, @file_suffixes,
                           parse_dir,
                           "tab_dir" => tab_dir,
                           "stxml_dir" => stxml_dir)
  parse_obj.each_parsed_file { |parsed_file_obj|
    outfilename = output_dir + parsed_file_obj.filename + ".xml"
    $stderr.puts "Writing #{outfilename}"
    begin
      outfile = File.new(outfilename, "w")
    rescue StandardError
      raise "Cannot write to SalsaTigerXML output file #{outfilename}"
    end

    begin
      oldxml = Array.new # array of sentence strings
      if @exp.get("do_parse")
        # read old SalsaTigerXML file
        # so we can integrate the old file's semantics later.
        # we assume that the old and the new file have the same name,
        # ending in .xml.
        oldxmlfile = FilePartsParser.new(stxml_dir + parsed_file_obj.filename + ".xml")
        oldxmlfile.scan_s { |sent_string|
          oldxml << sent_string
        }
        # release the reader once all sentences are buffered
        oldxmlfile.close()
      end

      outfile.puts SalsaTigerXMLHelper.get_header()
      index = 0
      # work with triples
      # SalsaTigerSentence, FNTabSentence,
      # hash: tab sentence index(integer) -> array:SynNode
      parsed_file_obj.each_sentence { |st_sent, tabformat_sent, mapping|

        # parsed? then integrate semantics and lemmas from old file
        if @exp.get("do_parse")
          oldsent = old_sentence_at.call(oldxml, index)
          index += 1
          if oldsent
            # no new sentence to integrate into: nothing is written
            next if st_sent.nil?

            # we have both an old and a new sentence, so integrate semantics
            if FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp) == false
              # integration failed: try once more with the next old sentence
              oldsent = old_sentence_at.call(oldxml, index)
              index += 1
              if oldsent
                FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, st_sent, interpreter_class, @exp)
              end
            end
          else
            # no corresponding old sentence for this new sentence
            $stderr.puts "Warning: transporting semantics -- missing source sentence, skipping"
          end
        end

        # remove pseudo-frames from FrameNet data
        FrprepHelper.remove_deprecated_frames(st_sent, @exp)

        # repair syn/sem mapping problems?
        if @exp.get("fe_syn_repair") or @exp.get("fe_rel_repair")
          FixSynSemMapping.fixit(st_sent, @exp, interpreter_class)
        end

        outfile.puts st_sent.get()
      } # each ST sentence
      outfile.puts SalsaTigerXMLHelper.get_footer()
    ensure
      # Always flush and release the handle; previously the file was
      # never closed.
      outfile.close
    end
  } # each file parsed
end
636
+
637
+
638
+ ###################################
639
+ # general file iterators
640
+
641
+ # yields pairs of [infile name, outfile stream]
642
# General file iterator: rewrite each matching file via a temp file.
#
# dir::    string, directory name (used as a glob prefix)
# suffix:: string, filename pattern; files matching "<dir>*<suffix>" are
#          processed
#
# Yields pairs of [infile name, outfile stream], where the stream is a
# Tempfile the caller writes the new content to. After the block returns,
# the original file is copied to "<name>.bak" and the temp file is moved
# into its place. If the block raises, the original file is left
# untouched and the temp file is removed.
def change_each_file_in_dir(dir, suffix)
  Dir[dir + "*#{suffix}"].each { |filename|
    tempfile = Tempfile.new("FrprepHelper")
    begin
      yield [filename, tempfile]

      # move temp file to original file location.
      # Argument-list form: names are passed to cp/mv verbatim, so spaces
      # or shell metacharacters in a file name cannot break the commands.
      tempfile.close()
      Kernel.system("cp", filename, "#{filename}.bak")
      Kernel.system("mv", tempfile.path(), filename)
    ensure
      # close and unlink; after a successful mv the temp path is already gone
      tempfile.close(true)
    end
  } # each file
end
655
+
656
+ #######
657
+ # change_each_stxml_file_in_dir
658
+ #
659
+ # use change_each_file_in_dir, but assume that the files
660
+ # are SalsaTigerXML files: Keep file headers and footers,
661
+ # and just offer individual sentences for changing
662
+ #
663
+ # Yields SalsaTigerSentence objects, each sentence to be changed
664
# Sentence-level variant of change_each_file_in_dir for SalsaTigerXML
# files: the file head and tail are copied through unchanged, and only
# the individual sentences are offered for changing.
#
# dir:: string, directory name
#
# Yields each sentence as a SalsaTigerSentence object; after the block
# returns, sentence.get() is written to the replacement file.
def change_each_stxml_file_in_dir(dir)
  change_each_file_in_dir(dir, "*.xml") do |stxml_path, out|
    parser = FilePartsParser.new(stxml_path)

    # header first
    out.puts parser.head()

    # then every sentence, after the caller had a chance to modify it
    parser.scan_s() do |raw_sentence|
      sentence = SalsaTigerSentence.new(raw_sentence)
      yield sentence
      out.puts sentence.get()
    end

    # footer last
    out.puts parser.tail()
    parser.close()
  end
end
685
+ end
686
+ end