frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,31 @@
1
# FileZipped: open gzipped files for reading, writing or appending
# by piping the data through the external gunzip/gzip commands.
#
# Note: FileZipped.new returns the IO object of the pipe (not a
# FileZipped instance), so callers use the result like a File handle.
class FileZipped

  # filename: string, path to the gzipped file
  # mode: "r" (read), "w" (write/truncate), "a" (append)
  #
  # Returns an IO object wrapping the gzip/gunzip pipe.
  # Raises a RuntimeError if the file cannot be opened;
  # exits with status 1 on an unsupported mode.
  def FileZipped.new(filename,
                     mode = "r")

    # backslash-escape characters in the filename that
    # would make the shell hiccup on the command line
    filename = filename.gsub(/([();:!?'`])/, 'XXSLASHXX\1')
    filename = filename.gsub(/XXSLASHXX/, "\\")

    begin
      case mode
      when "r"
        # File.exist? replaces the deprecated File.exists?
        unless File.exist? filename
          raise "catchme"
        end
        return IO.popen("gunzip -c #{filename}")
      when "w"
        return IO.popen("gzip > #{filename}", "w")
      when "a"
        return IO.popen("gzip >> #{filename}", "w")
      else
        $stderr.puts "FileZipped error: only modes r, w, a are implemented. I got: #{mode}."
        exit 1
      end
    rescue
      raise "Error opening file #{filename}."
    end
  end

end
@@ -0,0 +1,863 @@
1
+ require "tempfile"
2
+ require 'fileutils'
3
+
4
+ require "common/RegXML"
5
+ require "common/SynInterfaces"
6
+ require "common/TabFormat"
7
+ require "common/SalsaTigerRegXML"
8
+ require "common/SalsaTigerXMLHelper"
9
+
10
+ require 'fred/md5'
11
+ require "fred/FredConfigData"
12
+ require "fred/FredConventions"
13
+ require "fred/FredDetermineTargets"
14
+ require "common/DBInterface"
15
+ require "common/RosyConventions"
16
+ require "common/SQLQuery"
17
+
18
+ ########################################
19
+ # Context Provider classes:
20
+ # read in text, collecting context windows of given size
21
+ # around target words, yield contexts as soon as they are complete
22
+ #
23
+ # Target words are determined by delegating to either TargetsFromFrames or AllTargets
24
+ #
25
class AbstractContextProvider

  include WordLemmaPosNe

  ################
  # window_size: int, size of the context window (one-sided)
  # exp: experiment file object
  # interpreter_class: SynInterpreter class
  # target_obj: AbstractTargetDeterminer object
  # dataset: string, "train" or "test"
  def initialize(window_size, exp, interpreter_class, target_obj, dataset)
    @window_size = window_size
    @exp = exp
    @interpreter_class = interpreter_class
    @target_obj = target_obj
    @dataset = dataset

    # Three parallel arrays of length 2 * window size + 1,
    # shifted in lockstep as words stream through:
    #   @context:   [word, lemma, pos, ne] tuples (or nil padding)
    #   @is_target: nil for non-targets; for targets, a triple of
    #               [main target ID, all target IDs, senses]
    #   @sentence:  the sentence object each context word came from
    window_length = 2 * @window_size + 1
    @context = Array.new(window_length, nil)
    @is_target = Array.new(window_length, nil)
    @sentence = Array.new(window_length, nil)
  end

  ###################
  # each_window: iterator
  #
  # Given a directory with Salsa/Tiger XML data, iterate through the
  # data, yielding each target word as soon as its context window is
  # filled (or the last file is at an end).
  #
  # Yields tuples of:
  # - a context: array of [word, lemma, pos, ne] tuples,
  #   each component string or nil
  # - ID of main target: string
  # - target IDs: array:string, list of IDs of target words
  # - senses: array:string, the senses for the target
  # - sent: SalsaTigerSentence object
  #
  # Abstract here; subclasses decide how to traverse the directory.
  def each_window(dir) # string: directory containing Salsa/Tiger XML data
    raise "overwrite me"
  end

  ####################
  protected

  ############################
  # Shift a sentence through the @context window, yielding
  # (context, main target ID, all target IDs, senses, sentence)
  # tuples whenever a target arrives at the center position.
  # Dispatches on the concrete sentence type.
  def each_window_for_sent(sent) # SalsaTigerSentence or TabFormatSentence
    if sent.kind_of? SalsaTigerSentence
      each_window_for_stsent(sent) { |result| yield result }
    elsif sent.kind_of? TabFormatSentence
      each_window_for_tabsent(sent) { |result| yield result }
    else
      $stderr.puts "Error: got #{sent.class()}, expected SalsaTigerSentence or TabFormatSentence."
      exit 1
    end
  end

  ###
  # sent is a SalsaTigerSentence object: there may be targets.
  #
  # determine_targets() returns a hash:
  #   [list of terminal IDs, main terminal ID] -> list of senses
  # where each sense is a hash with keys
  #   "sense", "obj", "all_targets", "lex", "sid"
  def each_window_for_stsent(sent)
    original_targets = @target_obj.determine_targets(sent)

    # re-encode as two hashes keyed by the main target ID:
    #   main target ID -> list of senses
    #   main target ID -> all target IDs
    maintarget_to_senses = {}
    main_to_all_targets = {}
    original_targets.each_key do |alltargets, maintarget|
      main_to_all_targets[maintarget] = alltargets
      maintarget_to_senses[maintarget] = original_targets[[alltargets, maintarget]]
    end

    # shift each non-punctuation terminal into the context window,
    # checking for a target at the center after every shift
    sent_terminals_nopunct(sent).each do |term_obj|
      @context.push(word_lemma_pos_ne(term_obj, @interpreter_class))

      if maintarget_to_senses.key? term_obj.id()
        @is_target.push([term_obj.id(),
                         main_to_all_targets[term_obj.id()],
                         maintarget_to_senses[term_obj.id()]])
      else
        @is_target.push(nil)
      end

      @sentence.push(sent)

      # drop the word that falls out of the window
      @context.shift()
      @is_target.shift()
      @sentence.shift()

      # target at center position? then yield it
      if @is_target[@window_size]
        main_target_id, all_target_ids, senses = @is_target[@window_size]
        yield [@context,
               main_target_id, all_target_ids,
               senses,
               @sentence[@window_size]]
      end
    end
  end

  ###
  # sent is a TabFormatSentence object.
  # Shift word/lemma/pos/ne tuples through the context window.
  # Tab sentences carry no targets of their own, so a yield here can
  # only be triggered by a target from a previous sentence reaching
  # the center of the window.
  def each_window_for_tabsent(sent)
    sent.each_line_parsed() do |line_obj|
      # push [word, lemma, pos, ne], all lowercase; no NE info
      @context.push([line_obj.get("word").downcase(),
                     line_obj.get("lemma").downcase(),
                     line_obj.get("pos").downcase(),
                     nil])
      @is_target.push(nil)
      @sentence.push(nil)

      # drop the word that falls out of the window
      @context.shift()
      @is_target.shift()
      @sentence.shift()

      # target at center position? then yield it
      if @is_target[@window_size]
        main_target_id, all_target_ids, senses = @is_target[@window_size]
        yield [@context,
               main_target_id, all_target_ids,
               senses,
               @sentence[@window_size]]
      end
    end
  end

  ############################
  # each_remaining_target:
  # call this to empty the context window after everything has been
  # shifted in: pad with nils until no real entries remain, yielding
  # any targets that pass the center position on the way out.
  def each_remaining_target()
    while @context.detect { |entry| not(entry.nil?) }
      @context.push(nil)
      @is_target.push(nil)
      @sentence.push(nil)

      @context.shift()
      @is_target.shift()
      @sentence.shift()

      if @is_target[@window_size]
        main_target_id, all_target_ids, senses = @is_target[@window_size]
        yield [@context,
               main_target_id, all_target_ids,
               senses,
               @sentence[@window_size]]
      end
    end
  end

  ############################
  # helper: the sentence's terminals, sorted, minus punctuation
  def sent_terminals_nopunct(sent)
    sent.terminals_sorted.reject do |node|
      @interpreter_class.category(node) == "pun"
    end
  end
end
255
+
256
####################################
# ContextProvider:
# subclass of AbstractContextProvider
# that assumes that the input text is a contiguous text
# and computes the context accordingly.
class ContextProvider < AbstractContextProvider
  ###
  # each_window: iterator
  #
  # given a directory with Salsa/Tiger XML data,
  # iterate through the data,
  # yielding each target word as soon as its context window is filled
  # (or the last file is at an end)
  def each_window(dir) # string: directory containing Salsa/Tiger XML data

    # iterate through files in the directory.
    # Try sorting filenames numerically, since this is
    # what frprep mostly does with filenames.
    # File.join tolerates directories given with or without a
    # trailing slash (dir + "*.xml" silently matched nothing
    # when the slash was missing).
    Dir[File.join(dir, "*.xml")].sort { |a, b|
      File.basename(a, ".xml").to_i() <=> File.basename(b, ".xml").to_i()
    }.each { |filename|

      # progress bar
      if @exp.get("verbose")
        $stderr.puts "Featurizing #{File.basename(filename)}"
      end
      f = FilePartsParser.new(filename)
      each_window_for_file(f) { |result|
        yield result
      }
    }
    # and empty the context array
    each_remaining_target() { |result| yield result }
  end

  ##################################
  protected

  ######################
  # each_window_for_file: iterator
  # same as each_window, but only for a single file
  # (to be called from each_window())
  def each_window_for_file(fpp) # FilePartsParser object: Salsa/Tiger XML data
    fpp.scan_s() { |sent_string|
      sent = SalsaTigerSentence.new(sent_string)
      each_window_for_sent(sent) { |result| yield result }
    }
  end
end
305
+
306
####################################
# SingleSentContextProvider:
# subclass of AbstractContextProvider
# that assumes that each sentence of the input text
# stands on its own
class SingleSentContextProvider < AbstractContextProvider
  ###
  # each_window: iterator
  #
  # given a directory with Salsa/Tiger XML data,
  # iterate through the data,
  # yielding each target word as soon as its context window is filled
  # (or the last file is at an end)
  def each_window(dir) # string: directory containing Salsa/Tiger XML data
    # iterate through files in the directory.
    # Try sorting filenames numerically, since this is
    # what frprep mostly does with filenames.
    # File.join tolerates directories given with or without a
    # trailing slash (dir + "*.xml" silently matched nothing
    # when the slash was missing).
    Dir[File.join(dir, "*.xml")].sort { |a, b|
      File.basename(a, ".xml").to_i() <=> File.basename(b, ".xml").to_i()
    }.each { |filename|
      # progress bar
      if @exp.get("verbose")
        $stderr.puts "Featurizing #{File.basename(filename)}"
      end
      f = FilePartsParser.new(filename)
      each_window_for_file(f) { |result|
        yield result
      }
    }
  end

  ##################################
  protected

  ######################
  # each_window_for_file: iterator
  # same as each_window, but only for a single file
  # (to be called from each_window())
  def each_window_for_file(fpp) # FilePartsParser object: Salsa/Tiger XML data
    fpp.scan_s() { |sent_string|
      sent = SalsaTigerSentence.new(sent_string)

      each_window_for_sent(sent) { |result|
        yield result
      }
    }
    # no need to clear the context: we're doing this after each sentence
  end

  ###
  # each_window_for_sent: empty context after each sentence
  def each_window_for_sent(sent)
    if sent.kind_of? SalsaTigerSentence
      each_window_for_stsent(sent) { |result| yield result }
    elsif sent.kind_of? TabFormatSentence
      each_window_for_tabsent(sent) { |result| yield result }
    else
      $stderr.puts "Error: got #{sent.class()}, expected SalsaTigerSentence or TabFormatSentence."
      exit 1
    end

    # clear the context
    each_remaining_target() { |result| yield result }
  end
end
374
+
375
+
376
+ ####################################
377
+ # NoncontiguousContextProvider:
378
+ # subclass of AbstractContextProvider
379
+ #
380
+ # This class assumes that the input text consists of single sentences
381
+ # drawn from a larger corpus.
382
+ # It first constructs an index to the sentences of the input text,
383
+ # then reads the larger corpus
384
+
385
+ class NoncontiguousContextProvider < AbstractContextProvider
386
+
387
###
# each_window: iterator
#
# given a directory with Salsa/Tiger XML data,
# iterate through the data and construct an index to the sentences.
#
# Then iterate through the larger corpus,
# yielding contexts for sentences that also occur in the input corpus.
def each_window(dir) # string: directory containing Salsa/Tiger XML data

  # sanity check: do we know where the larger corpus is?
  unless @exp.get("larger_corpus_dir")
    $stderr.puts "Error: 'noncontiguous_input' has been set in the experiment file"
    $stderr.puts "but no location for the larger corpus has been given."
    $stderr.puts "Please set 'larger_corpus_dir' in the experiment file"
    $stderr.puts "to indicate the larger corpus from which the input corpus sentences are drawn."
    exit 1
  end

  ##
  # remember all sentences from the main corpus
  temptable_obj, sentkeys = make_index(dir)

  ##
  # make frprep experiment file
  # for lemmatization and POS-tagging of larger corpus files
  tf_exp_frprep = Tempfile.new("fred_bow_context")
  frprep_in, frprep_out, frprep_dir = write_frprep_experiment_file(tf_exp_frprep)

  ##
  # Iterate through the files of the larger corpus,
  # check for each sentence whether it is also in the input corpus
  # and yield it if it does.
  # larger corpus may contain subdirectories
  initialize_match_check()

  each_infile(@exp.get("larger_corpus_dir")) { |filename|
    $stderr.puts "Larger corpus: reading #{filename}"

    # remove previous data from temp directories
    remove_files(frprep_in)
    remove_files(frprep_out)
    remove_files(frprep_dir)

    # link the input file to input directory for frprep
    File.symlink(filename, frprep_in + "infile")

    # call frprep
    # NOTE(review): the path is interpolated into a shell command
    # unescaped; paths containing spaces or shell metacharacters
    # will break or be interpreted by the shell.
    retv = Kernel.system("ruby frprep.rb -e #{tf_exp_frprep.path()}")
    unless retv
      $stderr.puts "Error analyzing #{filename}. Exiting."
      exit 1
    end

    # read the resulting Tab format file, one sentence at a time:
    # - check to see if the checksum of the sentence is in sentkeys
    #   (which means it is an input sentence)
    #   If it is, retrieve the sentence and determine targets
    # - shift the sentence through the context window
    # - whenever a target word comes to be in the center of the
    #   context window, yield.
    $stderr.puts "Computing context features from frprep output."
    Dir[frprep_out + "*.tab"].each { |tabfilename|
      tabfile = FNTabFormatFile.new(tabfilename, ".pos", ".lemma")
      tabfile.each_sentence() { |tabsent|

        # get as Salsa/Tiger XML sentence, or TabSentence
        sent = get_stxml_sent(tabsent, sentkeys, temptable_obj)

        # shift sentence through context window
        each_window_for_sent(sent) { |result|
          yield result
        }

      } # each tab sent
    } # each tab file
  } # each infile from the larger corpus

  # empty the context array
  each_remaining_target() { |result| yield result }
  each_unmatched(sentkeys, temptable_obj) { |result| yield result }

  # remove temporary data
  temptable_obj.drop_temp_table()
  %x{rm -rf #{frprep_in}}
  %x{rm -rf #{frprep_out}}
  %x{rm -rf #{frprep_dir}}
end
476
+
477
+ ##################################
478
+ private
479
+
480
###
# for each sentence of each file in the given directory:
# remember the sentence in a temporary DB,
# indexed by a hash key computed from the plaintext sentence.
#
# return:
# - DBTempTable object containing the temporary DB
# - hash table containing all hash keys
def make_index(dir)

  # column widths of the temporary table; sentences or checksums
  # that do not fit are skipped with a warning below
  space_for_sentstring = 30000
  space_for_hashkey = 500

  $stderr.puts "Indexing input corpus:"

  # start temporary table
  temptable_obj = get_db_interface(@exp).make_temp_table([
                                                          ["hashkey", "varchar(#{space_for_hashkey})"],
                                                          ["sent", "varchar(#{space_for_sentstring})"]
                                                        ],
                                                        ["hashkey"],
                                                        "autoinc_index")

  # and hash table for the keys
  retv_keys = Hash.new()

  # iterate through files in the directory,
  # make an index for each sentence, and store
  # the sentence under that index
  Dir[dir + "*.xml"].each { |filename|
    $stderr.puts "\t#{filename}"
    f = FilePartsParser.new(filename)
    f.scan_s() { |sent_string|

      xml_obj = RegXML.new(sent_string)

      # make hash key from words of sentence:
      # the terminals live under <graph><terminals> in Salsa/Tiger XML
      graph = xml_obj.children_and_text().detect { |c| c.name() == "graph" }
      unless graph
        next
      end
      terminals = graph.children_and_text().detect { |c| c.name() == "terminals" }
      unless terminals
        next
      end
      # in making a hash key, use special characters
      # rather than their escaped &..; form
      hashkey = checksum(terminals.children_and_text().select { |c| c.name() == "t"
                         }.map { |t|
                           SalsaTigerXMLHelper.unescape(t.attributes()["word"].to_s() )
                         })

      # sanity check: if the checksum or the sentence is longer than
      # the space currently allotted, we won't be able to recover it.
      if SQLQuery.stringify_value(hashkey).length() > space_for_hashkey
        $stderr.puts "Warning: sentence checksum too long, cannot store it."
        $stderr.print "Max length: #{space_for_hashkey}. "
        $stderr.puts "Required: #{SQLQuery.stringify_value(hashkey).length()}."
        $stderr.puts "Skipping."
        next
      end

      if SQLQuery.stringify_value(sent_string).length() > space_for_sentstring
        $stderr.puts "Warning: sentence too long, cannot store it."
        $stderr.print "Max length: #{space_for_sentstring}. "
        $stderr.puts "Required: #{SQLQuery.stringify_value(sent_string).length()}."
        $stderr.puts "Skipping."
        next
      end

      # store
      temptable_obj.query_noretv(SQLQuery.insert(temptable_obj.table_name,
                                                 [["hashkey", hashkey],
                                                  ["sent", sent_string]]))
      retv_keys[hashkey] = true
    }
  }
  $stderr.puts "Indexing finished."

  return [ temptable_obj, retv_keys ]
end
566
+
567
######
# compute checksum from the given sentence,
# and return as string
def checksum(words) # array: string
  # lowercase every word, strip anything that is not a plain
  # letter, concatenate, and digest the result
  normalized = words.map { |w| w.to_s.downcase.gsub(/[^a-z]/, "") }
  return MD5.new(normalized.join).hexdigest
end
579
+
580
#####
# yield each file of the given directory
# or one of its subdirectories
def each_infile(indir)
  # normalize: the directory path must end in a slash for globbing
  indir = indir + "/" unless indir =~ /\/$/

  # plain files directly inside this directory
  Dir[indir + "*"].each do |entry|
    yield entry if File.file?(entry)
  end

  # recurse into subdirectories
  Dir[indir + "**"].each do |subdir|
    # same directory we had before? don't redo
    next if indir == subdir

    begin
      next unless File.stat(subdir).directory?
    rescue
      # no access, I assume
      next
    end

    each_infile(subdir) do |inf|
      yield inf
    end
  end
end
615
+
616
###
# remove files: remove all files and subdirectories in the given directory
# (indir is expected to end in a slash, as elsewhere in this class)
def remove_files(indir)
  # delete plain files and symlinks directly in this directory
  Dir[indir + "*"].each { |filename|
    if File.file?(filename) or File.symlink?(filename)
      retv = File.delete(filename)
    end
  }

  # enter recursion
  Dir[indir + "**"].each { |subdir|
    # same directory we had before? don't redo
    if indir == subdir
      next
    end

    begin
      unless File.stat(subdir).directory?
        next
      end
    rescue
      # no access, I assume
      next
    end

    # subdir must end in slash
    unless subdir =~ /\/$/
      subdir = subdir + "/"
    end
    # and enter recursion
    remove_files(subdir)
    # bug fix: File.rm_f does not exist (it raised NoMethodError);
    # FileUtils.rm_rf removes the now-emptied subdirectory.
    # ('fileutils' is required at the top of this file.)
    FileUtils.rm_rf(subdir)
  }
end
650
+
651
# Write a minimal frprep experiment file for preprocessing
# (lemmatization and POS tagging, no parsing) of the larger corpus.
#
# tf_exp_frprep: Tempfile object, the experiment file to write;
#                it is closed before returning.
# Returns [input dir, output dir, frprep internal dir] made for frprep.
def write_frprep_experiment_file(tf_exp_frprep) # Tempfile object

  # make unique experiment ID
  experiment_id = "larger_corpus"
  # input and output directory for frprep
  frprep_in = fred_dirname(@exp, "temp", "in", "new")
  frprep_out = fred_dirname(@exp, "temp", "out", "new")
  frprep_dir = fred_dirname(@exp, "temp", "frprep", "new")

  # experiment ID and directories
  tf_exp_frprep.puts "prep_experiment_ID = #{experiment_id}"
  tf_exp_frprep.puts "directory_input = #{frprep_in}"
  tf_exp_frprep.puts "directory_preprocessed = #{frprep_out}"
  tf_exp_frprep.puts "frprep_directory = #{frprep_dir}"

  # output format: tab
  tf_exp_frprep.puts "tabformat_output = true"

  # corpus description: language, format, encoding
  if @exp.get("language")
    tf_exp_frprep.puts "language = #{@exp.get("language")}"
  end

  # format: prefer the dedicated setting, fall back to 'format'
  if @exp.get("larger_corpus_format")
    tf_exp_frprep.puts "format = #{@exp.get("larger_corpus_format")}"
  elsif @exp.get("format")
    $stderr.puts "Warning: 'larger_corpus_format' not set in experiment file,"
    $stderr.puts "using 'format' setting of frprep experiment file instead."
    tf_exp_frprep.puts "format = #{@exp.get("format")}"
  else
    $stderr.puts "Warning: 'larger_corpus_format' not set in experiment file,"
    $stderr.puts "relying on default setting."
  end

  # encoding: same fallback scheme as for the format
  if @exp.get("larger_corpus_encoding")
    tf_exp_frprep.puts "encoding = #{@exp.get("larger_corpus_encoding")}"
  elsif @exp.get("encoding")
    $stderr.puts "Warning: 'larger_corpus_encoding' not set in experiment file,"
    $stderr.puts "using 'encoding' setting of frprep experiment file instead."
    tf_exp_frprep.puts "encoding = #{@exp.get("encoding")}"
  else
    $stderr.puts "Warning: 'larger_corpus_encoding' not set in experiment file,"
    $stderr.puts "relying on default setting."
  end

  # processing: lemmatization, POS tagging, no parsing
  tf_exp_frprep.puts "do_lemmatize = true"
  tf_exp_frprep.puts "do_postag = true"
  tf_exp_frprep.puts "do_parse = false"

  # lemmatizer and POS tagger settings:
  # copy the relevant lines verbatim from the frprep experiment file
  begin
    frprep_exp_file = File.new(@exp.get("preproc_descr_file_" + @dataset))
  rescue
    $stderr.puts "Error: could not read frprep experiment file #{@exp.get("preproc_descr_file_" + @dataset)}"
    exit 1
  end
  frprep_exp_file.each do |line|
    if line =~ /pos_tagger\s*=/ or
       line =~ /pos_tagger_path\s*=/ or
       line =~ /lemmatizer\s*=/ or
       line =~ /lemmatizer_path\s*=/

      tf_exp_frprep.puts line
    end
  end

  # finalize frprep experiment file
  tf_exp_frprep.close()

  return [frprep_in, frprep_out, frprep_dir]
end
723
+
724
####
# get SalsaTigerXML sentence and targets:
#
# given a Tab format sentence:
# - check whether it is in the table of input sentences.
#   if so, retrieve it.
# - otherwise, return the tab sentence itself as a fallback
#   (a makeshift stand-in built from words, lemmas and POS)
def get_stxml_sent(tabsent,
                   sentkeys,
                   temptable_obj)

  # SalsaTigerSentence object, if we can retrieve one
  sent = nil

  # make checksum from the sentence's unescaped words
  # (removed: unused parallel 'words2' array and leftover debug output)
  words = Array.new()
  tabsent.each_line_parsed { |line_obj|
    words << SalsaTigerXMLHelper.unescape(line_obj.get("word"))
  }
  hashkey_this_sentence = checksum(words)

  if sentkeys[hashkey_this_sentence]
    # sentence from the input corpus.

    # register
    register_matched(hashkey_this_sentence)

    # select "sent" column from temp table
    # where "hashkey" == sent_checksum
    # returns a DBResult object
    query_result = temptable_obj.query(SQLQuery.select([ SelectTableAndColumns.new(temptable_obj, ["sent"]) ],
                                                       [ ValueRestriction.new("hashkey", hashkey_this_sentence) ]))
    query_result.each { |row|

      sent_string = SQLQuery.unstringify_value(row.first().to_s())
      begin
        sent = SalsaTigerSentence.new(sent_string)
      rescue
        $stderr.puts "Error reading Salsa/Tiger XML sentence."
        $stderr.puts
        $stderr.puts "SQL-stored sentence was:"
        $stderr.puts row.first().to_s()
        $stderr.puts
        $stderr.puts "==================="
        $stderr.puts "With restored quotes:"
        $stderr.puts sent_string
        exit 1
      end
      break
    }
    unless sent
      $stderr.puts "Warning: could not retrieve input corpus sentence: " + words.join(" ")
    end
  end

  # fall back to the tab sentence if retrieval failed
  if sent
    return sent
  else
    return tabsent
  end
end
795
+
796
###
# Reset the record of which sentences from the smaller, noncontiguous
# corpus have been matched in the larger corpus.
# Afterwards, no sentence counts as matched.
def initialize_match_check
  @index_matched = {}
end
802
+
803
###
# Mark the sentence identified by the given checksum as matched
# in the larger corpus (see initialize_match_check / each_unmatched).
#
# hash_key: checksum of a sentence from the smaller corpus
def register_matched(hash_key)
  # Hash#store is equivalent to []=
  @index_matched.store(hash_key, true)
end
809
+
810
###
# Call this method after all sentences from the larger corpus
# have been checked against the smaller corpus.
# For each sentence of the smaller corpus that was never matched
# (per @index_matched), this method prints a warning and yields it
# in the same format as each_window(), so that unmatched sentences
# can still be processed — just without a larger context.
#
# all_keys:      Hash whose keys are the checksums of all sentences
#                of the smaller corpus
# temptable_obj: DB temp table object with columns "hashkey", "sent"
def each_unmatched(all_keys,
                   temptable_obj)

  unmatched_count = 0

  all_keys.each_key do |key|
    # matched sentences need no treatment here
    next if @index_matched[key]

    # unmatched sentence:
    unmatched_count += 1

    # retrieve the stored SalsaTigerXML string for this checksum
    rows = temptable_obj.query(SQLQuery.select([ SelectTableAndColumns.new(temptable_obj, ["sent"]) ],
                                               [ ValueRestriction.new("hashkey", key) ]))

    # report and yield
    rows.each do |row|
      xml_string = SQLQuery.unstringify_value(row.first().to_s())
      begin
        # report on unmatched sentence
        stsent = SalsaTigerSentence.new(xml_string)
        $stderr.puts "Unmatched sentence from noncontiguous input:\n" +
                     stsent.id().to_s() + " " + stsent.to_s()

        # push the sentence through the context window,
        # filling it up with "nil",
        # and yield when we reach the target at center position.
        each_window_for_stsent(stsent) { |res| yield res }
        each_remaining_target() { |res| yield res }

      rescue
        # Couldn't turn it into a SalsaTigerSentence object:
        # just report, don't yield
        $stderr.puts "Unmatched sentence from noncontiguous input (raw):\n" +
                     xml_string
        $stderr.puts "ERROR: cannot process this sentence, skipping."
      end
    end
  end

  $stderr.puts "Unmatched sentences: #{unmatched_count} all in all."
end
862
+
863
+ end