frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,31 @@
1
# FileZipped: access gzip-compressed files through pipes.
#
# Note that FileZipped.new does NOT return a FileZipped instance:
# it returns an IO object connected to a gunzip (mode "r")
# or gzip (modes "w" and "a") child process.
class FileZipped

  # filename: string, path of the gzip-compressed file
  # mode:     "r" (read), "w" (write, truncating) or "a" (append)
  #
  # returns: IO object, a pipe to the gunzip/gzip child process
  # raises:  RuntimeError if the file does not exist (mode "r")
  #          or if the child process cannot be started
  #
  # Any other mode prints an error message to stderr and exits with status 1.
  def FileZipped.new(filename,
                     mode = "r")

    unless ["r", "w", "a"].include?(mode)
      $stderr.puts "FileZipped error: only modes r, w, a are implemented. I got: #{mode}."
      exit 1
    end

    begin
      if mode == "r"
        unless File.exist?(filename)
          raise "no such file"
        end
        # Command given as an argument array: no shell is involved,
        # so the filename needs no escaping, whatever characters it contains.
        return IO.popen(["gunzip", "-c", filename])
      else
        # gzip reads from our pipe; its stdout is redirected to the target
        # file, opened in the requested mode ("w" truncates, "a" appends).
        return IO.popen(["gzip", "-c"], "w", :out => [filename, mode])
      end
    rescue
      raise "Error opening file #{filename}."
    end
  end

end
@@ -0,0 +1,863 @@
1
+ require "tempfile"
2
+ require 'fileutils'
3
+
4
+ require "common/RegXML"
5
+ require "common/SynInterfaces"
6
+ require "common/TabFormat"
7
+ require "common/SalsaTigerRegXML"
8
+ require "common/SalsaTigerXMLHelper"
9
+
10
+ require 'fred/md5'
11
+ require "fred/FredConfigData"
12
+ require "fred/FredConventions"
13
+ require "fred/FredDetermineTargets"
14
+ require "common/DBInterface"
15
+ require "common/RosyConventions"
16
+ require "common/SQLQuery"
17
+
18
+ ########################################
19
+ # Context Provider classes:
20
+ # read in text, collecting context windows of given size
21
+ # around target words, yield contexts as soon as they are complete
22
+ #
23
+ # Target words are determined by delegating to either TargetsFromFrames or AllTargets
24
+ #
25
+ class AbstractContextProvider
26
+
27
+ include WordLemmaPosNe
28
+
29
+ ################
30
def initialize(window_size, # int: size of context window (one-sided)
               exp, # experiment file object
               interpreter_class, # SynInterpreter class
               target_obj, # AbstractTargetDeterminer object
               dataset) # "train", "test"

  # remember the configuration
  @window_size = window_size
  @exp = exp
  @interpreter_class = interpreter_class
  @target_obj = target_obj
  @dataset = dataset

  # Three parallel sliding windows, each holding the center position
  # plus window_size slots on either side, all initially nil:
  slots = 2 * @window_size + 1
  # the context words
  @context = Array.new(slots)
  # nil for non-targets, all information on the target for targets
  @is_target = Array.new(slots)
  # the sentence object each word came from
  @sentence = Array.new(slots)
end
51
+
52
+ ###################
53
+ # each_window: iterator
54
+ #
55
+ # given a directory with Salsa/Tiger XML data,
56
+ # iterate through the data,
57
+ # yielding each target word as soon as its context window is filled
58
+ # (or the last file is at an end)
59
+ #
60
+ # yields tuples of:
61
+ # - a context, an array of tuples [word,lemma, pos, ne]
62
+ # string/nil*string/nil*string/nil*string/nil
63
+ # - ID of main target: string
64
+ # - target_IDs: array:string, list of IDs of target words
65
+ # - senses: array:string, the senses for the target
66
+ # - sent: SalsaTigerSentence object
67
def each_window(dir) # string: directory containing Salsa/Tiger XML data
  # abstract method: this implementation always raises
  raise(RuntimeError, "overwrite me")
end
70
+
71
+ ####################
72
+ protected
73
+
74
+ ############################
75
+ # shift a sentence through the @context window,
76
+ # yield when at target
77
+ #
78
+ # yields tuples of:
79
+ # - a context, an array of tuples [word,lemma, pos, ne]
80
+ # string/nil*string/nil*string/nil*string/nil
81
+ # - ID of main target: string
82
+ # - target_IDs: array:string, list of IDs of target words
83
+ # - senses: array:string, the senses for the target
84
+ # - sent: SalsaTigerSentence object
85
def each_window_for_sent(sent) # SalsaTigerSentence object or TabFormatSentence object
  # dispatch on the kind of sentence and pass every window on to our caller;
  # anything else is a fatal error
  case sent
  when SalsaTigerSentence
    each_window_for_stsent(sent) { |window| yield window }
  when TabFormatSentence
    each_window_for_tabsent(sent) { |window| yield window }
  else
    $stderr.puts "Error: got #{sent.class()}, expected SalsaTigerSentence or TabFormatSentence."
    exit 1
  end
end
97
+
98
+ ###
99
+ # sent is a SalsaTigerSentence object:
100
+ # there may be targets
101
+ #
102
+ # yields tuples of:
103
+ # - a context, an array of tuples [word,lemma, pos, ne]
104
+ # string/nil*string/nil*string/nil*string/nil
105
+ # - ID of main target: string
106
+ # - target_IDs: array:string, list of IDs of target words
107
+ # - senses: array:string, the senses for the target
108
+ # - sent: SalsaTigerSentence object
109
def each_window_for_stsent(sent)
  # Ask the target determiner for the targets of this sentence.
  # Result: hash [list of terminal IDs, main terminal ID] -> list of senses,
  # where each sense is a hash with the entries
  # "sense", "obj", "all_targets", "lex", "sid".
  targets = @target_obj.determine_targets(sent)

  # Re-index by main target ID:
  #  main target ID -> list of senses
  #  main target ID -> all target IDs
  senses_by_main = {}
  all_ids_by_main = {}
  targets.each_key do |key|
    all_ids, main_id = key
    all_ids_by_main[main_id] = all_ids
    senses_by_main[main_id] = targets[[all_ids, main_id]]
  end

  # Shift the non-punctuation terminals through the window one by one.
  sent_terminals_nopunct(sent).each do |terminal|
    # new word enters at the end of each window ...
    @context.push(word_lemma_pos_ne(terminal, @interpreter_class))

    target_info = nil
    if senses_by_main.has_key? terminal.id()
      target_info = [terminal.id(),
                     all_ids_by_main[terminal.id()],
                     senses_by_main[terminal.id()]]
    end
    @is_target.push(target_info)
    @sentence.push(sent)

    # ... and the oldest one drops out at the front
    @context.shift()
    @is_target.shift()
    @sentence.shift()

    # Target at the center position? Then yield
    # [context, main target ID, all target IDs, senses, sentence].
    centered = @is_target[@window_size]
    next unless centered

    main_target_id, all_target_ids, senses = centered
    yield [@context, main_target_id, all_target_ids, senses, @sentence[@window_size]]
  end
end
179
+
180
+ ###
181
+ # sent is a TabFormatSentence object.
182
+ # shift word/lemma/pos/ne tuples through the context window.
183
+ # Whenever this brings a target (from another sentence, necessarily)
184
+ # to the center of the context window, yield it.
185
+ def each_window_for_tabsent(sent)
186
+ sent.each_line_parsed() { |line_obj|
187
+ # push onto the context array:
188
+ # [word, lemma, pos, ne], all lowercase
189
+ @context.push([ line_obj.get("word").downcase(),
190
+ line_obj.get("lemma").downcase(),
191
+ line_obj.get("pos").downcase(),
192
+ nil])
193
+ @is_target.push(nil)
194
+ @sentence.push(nil)
195
+
196
+ # remove first word from context array
197
+ @context.shift()
198
+ @is_target.shift()
199
+ @sentence.shift()
200
+
201
+ # check for target at center
202
+ if @is_target[@window_size]
203
+ # yes, we have a target at center position.
204
+ # yield it:
205
+ # context window, main target ID, all target IDs,
206
+ # senses (as FrameNode objects), sentence as XML
207
+ main_target_id, all_target_ids, senses = @is_target[@window_size]
208
+ yield [ @context,
209
+ main_target_id, all_target_ids,
210
+ senses,
211
+ @sentence[@window_size]
212
+ ]
213
+ end
214
+ }
215
+ end
216
+
217
+ ############################
218
+ # each remaining target:
219
+ # call this to empty the context window after everything has been shifted in
220
def each_remaining_target()
  # keep shifting nil into the windows until no context word is left
  while @context.find { |slot| !slot.nil? }
    @context.push(nil)
    @context.shift()
    @is_target.push(nil)
    @is_target.shift()
    @sentence.push(nil)
    @sentence.shift()

    # Target at the center position? Then yield
    # context window, main target ID, all target IDs, senses, sentence.
    centered = @is_target[@window_size]
    next unless centered

    main_target_id, all_target_ids, senses = centered
    yield [@context, main_target_id, all_target_ids, senses, @sentence[@window_size]]
  end
end
247
+ ############################
248
+ # helper: remove punctuation
249
def sent_terminals_nopunct(sent)
  # keep only the terminals whose category, according to the
  # interpreter class, is something other than "pun"
  sent.terminals_sorted.select do |terminal|
    @interpreter_class.category(terminal) != "pun"
  end
end
254
+ end
255
+
256
+ ####################################
257
+ # ContextProvider:
258
+ # subclass of AbstractContextProvider
259
+ # that assumes that the input text is a contiguous text
260
+ # and computes the context accordingly.
261
+ class ContextProvider < AbstractContextProvider
262
+ ###
263
+ # each_window: iterator
264
+ #
265
+ # given a directory with Salsa/Tiger XML data,
266
+ # iterate through the data,
267
+ # yielding each target word as soon as its context window is filled
268
+ # (or the last file is at an end)
269
def each_window(dir) # string: directory containing Salsa/Tiger XML data

  # Order the XML files by the integer value of their basename,
  # since frprep mostly uses numeric filenames.
  xml_files = Dir[dir + "*.xml"].sort do |left, right|
    File.basename(left, ".xml").to_i() <=> File.basename(right, ".xml").to_i()
  end

  xml_files.each do |filename|
    # progress report
    $stderr.puts "Featurizing #{File.basename(filename)}" if @exp.get("verbose")

    parser = FilePartsParser.new(filename)
    each_window_for_file(parser) { |window| yield window }
  end

  # the text is contiguous across files, so the context window
  # is only emptied after the last file
  each_remaining_target() { |window| yield window }
end
290
+
291
+ ##################################
292
+ protected
293
+
294
+ ######################
295
+ # each_window_for_file: iterator
296
+ # same as each_window, but only for a single file
297
+ # (to be called from each_window())
298
def each_window_for_file(fpp) # FilePartsParser object: Salsa/Tiger XML data
  # build a sentence object from each sentence string of the file
  # and shift it through the context window
  fpp.scan_s() do |sent_string|
    each_window_for_sent(SalsaTigerSentence.new(sent_string)) { |window| yield window }
  end
end
304
+ end
305
+
306
+ ####################################
307
+ # SingleSentContextProvider:
308
+ # subclass of AbstractContextProvider
309
+ # that assumes that each sentence of the input text
310
+ # stands on its own
311
+ class SingleSentContextProvider < AbstractContextProvider
312
+ ###
313
+ # each_window: iterator
314
+ #
315
+ # given a directory with Salsa/Tiger XML data,
316
+ # iterate through the data,
317
+ # yielding each target word as soon as its context window is filled
318
+ # (or the last file is at an end)
319
def each_window(dir) # string: directory containing Salsa/Tiger XML data
  # Collect the XML files, ordered by the integer value of their basename
  # (frprep mostly uses numeric filenames).
  by_number = lambda { |path| File.basename(path, ".xml").to_i() }
  ordered = Dir[dir + "*.xml"].sort { |a, b| by_number.call(a) <=> by_number.call(b) }

  ordered.each do |filename|
    # progress report
    if @exp.get("verbose")
      $stderr.puts "Featurizing #{File.basename(filename)}"
    end

    # windows are passed on file by file; nothing is flushed here
    each_window_for_file(FilePartsParser.new(filename)) do |window|
      yield window
    end
  end
end
336
+
337
+ ##################################
338
+ protected
339
+
340
+
341
+ ######################
342
+ # each_window_for_file: iterator
343
+ # same as each_window, but only for a single file
344
+ # (to be called from each_window())
345
def each_window_for_file(fpp) # FilePartsParser object: Salsa/Tiger XML data
  fpp.scan_s() do |sent_string|
    sentence = SalsaTigerSentence.new(sent_string)
    # each_window_for_sent is called once per sentence; no flushing is done here
    each_window_for_sent(sentence) do |window|
      yield window
    end
  end
end
355
+
356
+ ###
357
+ # each_window_for_sent: empty context after each sentence
358
def each_window_for_sent(sent)
  # shift the sentence through the window according to its kind
  case sent
  when SalsaTigerSentence
    each_window_for_stsent(sent) { |window| yield window }
  when TabFormatSentence
    each_window_for_tabsent(sent) { |window| yield window }
  else
    $stderr.puts "Error: got #{sent.class()}, expected SalsaTigerSentence or TabFormatSentence."
    exit 1
  end

  # each sentence stands on its own: flush the context window right away
  each_remaining_target() { |window| yield window }
end
373
+ end
374
+
375
+
376
+ ####################################
377
+ # NoncontiguousContextProvider:
378
+ # subclass of AbstractContextProvider
379
+ #
380
+ # This class assumes that the input text consists of single sentences
381
+ # drawn from a larger corpus.
382
+ # It first constructs an index to the sentences of the input text,
383
+ # then reads the larger corpus
384
+
385
+ class NoncontiguousContextProvider < AbstractContextProvider
386
+
387
+ ###
388
+ # each_window: iterator
389
+ #
390
+ # given a directory with Salsa/Tiger XML data,
391
+ # iterate through the data and construct an index to the sentences.
392
+ #
393
+ # Then iterate through the larger corpus,
394
+ # yielding contexts.
395
def each_window(dir) # string: directory containing Salsa/Tiger XML data

  # sanity check: do we know where the larger corpus is?
  unless @exp.get("larger_corpus_dir")
    $stderr.puts "Error: 'noncontiguous_input' has been set in the experiment file"
    $stderr.puts "but no location for the larger corpus has been given."
    $stderr.puts "Please set 'larger_corpus_dir' in the experiment file"
    $stderr.puts "to indicate the larger corpus from which the input corpus sentences are drawn."
    exit 1
  end

  ##
  # remember all sentences from the main corpus
  temptable_obj, sentkeys = make_index(dir)

  ##
  # make frprep experiment file
  # for lemmatization and POS-tagging of larger corpus files
  tf_exp_frprep = Tempfile.new("fred_bow_context")
  frprep_in = frprep_out = frprep_dir = nil

  begin
    frprep_in, frprep_out, frprep_dir = write_frprep_experiment_file(tf_exp_frprep)

    ##
    # Iterate through the files of the larger corpus,
    # check for each sentence whether it is also in the input corpus
    # and yield it if it is.
    # larger corpus may contain subdirectories
    initialize_match_check()

    each_infile(@exp.get("larger_corpus_dir")) { |filename|
      $stderr.puts "Larger corpus: reading #{filename}"

      # remove previous data from temp directories
      remove_files(frprep_in)
      remove_files(frprep_out)
      remove_files(frprep_dir)

      # link the input file to input directory for frprep
      File.symlink(filename, frprep_in + "infile")

      # call frprep.
      # Multi-argument form: the command is run without a shell,
      # so the experiment file path needs no quoting.
      retv = Kernel.system("ruby", "frprep.rb", "-e", tf_exp_frprep.path())
      unless retv
        $stderr.puts "Error analyzing #{filename}. Exiting."
        exit 1
      end

      # read the resulting Tab format file, one sentence at a time:
      # - check to see if the checksum of the sentence is in sentkeys
      #   (which means it is an input sentence)
      #   If it is, retrieve the sentence and determine targets
      # - shift the sentence through the context window
      # - whenever a target word comes to be in the center of the context window,
      #   yield.
      $stderr.puts "Computing context features from frprep output."
      Dir[frprep_out + "*.tab"].each { |tabfilename|
        tabfile = FNTabFormatFile.new(tabfilename, ".pos", ".lemma")
        tabfile.each_sentence() { |tabsent|

          # get as Salsa/Tiger XML sentence, or TabSentence
          sent = get_stxml_sent(tabsent, sentkeys, temptable_obj)

          # shift sentence through context window
          each_window_for_sent(sent) { |result|
            yield result
          }

        } # each tab sent
      } # each tab file
    } # each infile from the larger corpus

    # empty the context array
    each_remaining_target() { |result| yield result }
    each_unmatched(sentkeys, temptable_obj) { |result| yield result }

  ensure
    # remove temporary data, also when we leave early
    # (error exit, exception, or the caller breaking out of the iteration)
    temptable_obj.drop_temp_table()
    [frprep_in, frprep_out, frprep_dir].compact.each { |tempdir|
      FileUtils.rm_rf(tempdir)
    }
    tf_exp_frprep.unlink()
  end
end
476
+
477
+ ##################################
478
+ private
479
+
480
+ ###
481
+ # for each sentence of each file in the given directory:
482
+ # remember the sentence in a temporary DB,
483
+ # indexed by a hash key computed from the plaintext sentence.
484
+ #
485
+ # return:
486
+ # - DBTempTable object containing the temporary DB
487
+ # - hash table containing all hash keys
488
def make_index(dir)

  # column widths of the temporary table
  max_sent_length = 30000
  max_key_length = 500

  $stderr.puts "Indexing input corpus:"

  # temporary table: hash key of a sentence -> sentence string
  columns = [
    ["hashkey", "varchar(#{max_key_length})"],
    ["sent", "varchar(#{max_sent_length})"]
  ]
  temptable_obj = get_db_interface(@exp).make_temp_table(columns, ["hashkey"], "autoinc_index")

  # all hash keys that were stored
  retv_keys = {}

  # index every sentence of every XML file in the directory
  Dir[dir + "*.xml"].each do |filename|
    $stderr.puts "\t#{filename}"

    FilePartsParser.new(filename).scan_s() do |sent_string|
      xml_obj = RegXML.new(sent_string)

      # find the terminals of the sentence; skip sentences without them
      graph = xml_obj.children_and_text().detect { |c| c.name() == "graph" }
      next unless graph
      terminals = graph.children_and_text().detect { |c| c.name() == "terminals" }
      next unless terminals

      # hash key from the words of the sentence, using special characters
      # rather than their escaped &..; form
      t_nodes = terminals.children_and_text().select { |c| c.name() == "t" }
      words = t_nodes.map { |t| SalsaTigerXMLHelper.unescape(t.attributes()["word"].to_s()) }
      hashkey = checksum(words)

      # sanity checks: whatever exceeds the allotted column width
      # could not be recovered from the table later
      key_length = SQLQuery.stringify_value(hashkey).length()
      if key_length > max_key_length
        $stderr.puts "Warning: sentence checksum too long, cannot store it."
        $stderr.print "Max length: #{max_key_length}. "
        $stderr.puts "Required: #{key_length}."
        $stderr.puts "Skipping."
        next
      end

      sent_length = SQLQuery.stringify_value(sent_string).length()
      if sent_length > max_sent_length
        $stderr.puts "Warning: sentence too long, cannot store it."
        $stderr.print "Max length: #{max_sent_length}. "
        $stderr.puts "Required: #{sent_length}."
        $stderr.puts "Skipping."
        next
      end

      # store sentence and remember its key
      insert_query = SQLQuery.insert(temptable_obj.table_name,
                                     [["hashkey", hashkey],
                                      ["sent", sent_string]])
      temptable_obj.query_noretv(insert_query)
      retv_keys[hashkey] = true
    end
  end
  $stderr.puts "Indexing finished."

  return [temptable_obj, retv_keys]
end
566
+
567
+ ######
568
+ # compute checksum from the given sentence,
569
+ # and return as string
570
def checksum(words) # array: string
  # concatenate the words in their given order, lowercased
  # and stripped of everything except the letters a-z
  letters = words.inject("") do |collected, word|
    collected << word.to_s.downcase.gsub(/[^a-z]/, "")
  end

  # the MD5 hex digest of that string is the checksum
  MD5.new(letters).hexdigest
end
579
+
580
+ #####
581
+ # yield each file of the given directory
582
+ # or one of its subdirectories
583
def each_infile(indir)
  # directory name must end in a slash for the glob patterns below
  indir += "/" unless indir =~ /\/$/

  # first the plain files of this directory ...
  Dir[indir + "*"].each do |path|
    yield path if File.file?(path)
  end

  # ... then descend into the subdirectories
  Dir[indir + "**"].each do |entry|
    # same directory we had before? don't redo
    next if indir == entry

    # entries we cannot stat (no access, I assume) are skipped
    is_directory = begin
                     File.stat(entry).directory?
                   rescue
                     false
                   end
    next unless is_directory

    each_infile(entry) { |inner_file| yield inner_file }
  end
end
615
+
616
+ ###
617
+ # remove files: remove all files and subdirectories in the given directory
618
# indir: string, directory name ending in a slash
#
# Removes the entries of the directory (as matched by "*", so top-level
# dotfiles stay); the directory itself is kept.
def remove_files(indir)
  Dir[indir + "*"].each { |entry|
    if File.symlink?(entry) or File.file?(entry)
      # plain file or symbolic link: remove the entry itself.
      # The symlink test comes first, so a link to a directory is
      # unlinked rather than followed.
      File.delete(entry)
    elsif File.directory?(entry)
      # subdirectory: remove it with all its contents.
      # rm_rf is best-effort, so an inaccessible subdirectory is
      # skipped silently instead of raising.
      FileUtils.rm_rf(entry)
    end
  }
end
650
+
651
# Write a frprep experiment file for lemmatizing and POS-tagging
# (but not parsing) the larger corpus, with tab format output.
#
# tf_exp_frprep: Tempfile object, open for writing; it is closed on return
#
# returns: [input directory, output directory, frprep working directory],
#          as computed by fred_dirname
# Prints an error to stderr and exits with status 1 if the preprocessing
# description file for the current dataset cannot be read.
def write_frprep_experiment_file(tf_exp_frprep) # Tempfile object

  # make unique experiment ID
  experiment_id = "larger_corpus"
  # input and output directory for frprep
  frprep_in = fred_dirname(@exp, "temp", "in", "new")
  frprep_out = fred_dirname(@exp, "temp", "out", "new")
  frprep_dir = fred_dirname(@exp, "temp", "frprep", "new")

  # write file:

  # experiment ID and directories
  tf_exp_frprep.puts "prep_experiment_ID = #{experiment_id}"
  tf_exp_frprep.puts "directory_input = #{frprep_in}"
  tf_exp_frprep.puts "directory_preprocessed = #{frprep_out}"
  tf_exp_frprep.puts "frprep_directory = #{frprep_dir}"

  # output format: tab
  tf_exp_frprep.puts "tabformat_output = true"

  # corpus description: language, format, encoding
  if @exp.get("language")
    tf_exp_frprep.puts "language = #{@exp.get("language")}"
  end
  if @exp.get("larger_corpus_format")
    tf_exp_frprep.puts "format = #{@exp.get("larger_corpus_format")}"
  elsif @exp.get("format")
    $stderr.puts "Warning: 'larger_corpus_format' not set in experiment file,"
    $stderr.puts "using 'format' setting of frprep experiment file instead."
    tf_exp_frprep.puts "format = #{@exp.get("format")}"
  else
    $stderr.puts "Warning: 'larger_corpus_format' not set in experiment file,"
    $stderr.puts "relying on default setting."
  end
  if @exp.get("larger_corpus_encoding")
    tf_exp_frprep.puts "encoding = #{@exp.get("larger_corpus_encoding")}"
  elsif @exp.get("encoding")
    $stderr.puts "Warning: 'larger_corpus_encoding' not set in experiment file,"
    $stderr.puts "using 'encoding' setting of frprep experiment file instead."
    tf_exp_frprep.puts "encoding = #{@exp.get("encoding")}"
  else
    $stderr.puts "Warning: 'larger_corpus_encoding' not set in experiment file,"
    $stderr.puts "relying on default setting."
  end

  # processing: lemmatization, POS tagging, no parsing
  tf_exp_frprep.puts "do_lemmatize = true"
  tf_exp_frprep.puts "do_postag = true"
  tf_exp_frprep.puts "do_parse = false"

  # lemmatizer and POS tagger settings:
  # take verbatim from frprep file.
  # File.readlines opens, reads and closes the file in one go,
  # so no file handle is left open.
  begin
    descr_lines = File.readlines(@exp.get("preproc_descr_file_" + @dataset))
  rescue
    $stderr.puts "Error: could not read frprep experiment file #{@exp.get("preproc_descr_file_" + @dataset)}"
    exit 1
  end
  copied_settings = [/pos_tagger\s*=/, /pos_tagger_path\s*=/,
                     /lemmatizer\s*=/, /lemmatizer_path\s*=/]
  descr_lines.each { |line|
    if copied_settings.any? { |pattern| line =~ pattern }
      tf_exp_frprep.puts line
    end
  }
  # finalize frprep experiment file
  tf_exp_frprep.close()

  return [frprep_in, frprep_out, frprep_dir]
end
723
+
724
+ ####
725
+ # get SalsaTigerXML sentence and targets:
726
+ #
727
+ # given a Tab format sentence:
728
+ # - check whether it is in the table of input sentences.
729
+ # if so, retrieve it.
730
+ # - otherwise, fashion a makeshift SalsaTigerSentence object
731
+ # from the words, lemmas and POS
732
# tabsent:       Tab format sentence from the larger corpus
#                (must respond to each_line_parsed)
# sentkeys:      hash: checksum -> true, for all input corpus sentences
# temptable_obj: temporary DB table holding the input corpus sentences
#
# returns: the stored input corpus sentence as a SalsaTigerSentence object
#          if the checksum of tabsent is in sentkeys and the sentence
#          can be retrieved; otherwise tabsent itself
def get_stxml_sent(tabsent,
                   sentkeys,
                   temptable_obj)

  # make checksum from the unescaped words of the sentence
  words = Array.new()
  tabsent.each_line_parsed { |line_obj|
    words << SalsaTigerXMLHelper.unescape(line_obj.get("word"))
  }
  hashkey_this_sentence = checksum(words)

  # not a sentence from the input corpus: hand back the tab sentence
  unless sentkeys[hashkey_this_sentence]
    return tabsent
  end

  # sentence from the input corpus: register it as matched
  register_matched(hashkey_this_sentence)

  # select "sent" columns from temp table
  # where "hashkey" == sent_checksum
  # returns a DBResult object
  query_result = temptable_obj.query(SQLQuery.select([ SelectTableAndColumns.new(temptable_obj, ["sent"]) ],
                                                     [ ValueRestriction.new("hashkey", hashkey_this_sentence) ]))

  # SalsaTigerSentence object, built from the first row of the result
  sent = nil
  query_result.each { |row|
    sent_string = SQLQuery.unstringify_value(row.first().to_s())
    begin
      sent = SalsaTigerSentence.new(sent_string)
    rescue
      $stderr.puts "Error reading Salsa/Tiger XML sentence."
      $stderr.puts
      $stderr.puts "SQL-stored sentence was:"
      $stderr.puts row.first().to_s()
      $stderr.puts
      $stderr.puts "==================="
      $stderr.puts "With restored quotes:"
      $stderr.puts sent_string
      exit 1
    end
    # only the first row is used
    break
  }

  unless sent
    $stderr.puts "Warning: could not retrieve input corpus sentence: " + words.join(" ")
  end

  # fall back to the tab sentence if nothing could be retrieved
  return sent || tabsent
end
795
+
796
+ ###
797
+ # Keep track of which sentences from the smaller, noncontiguous corpus
798
+ # have been matched in the larger corpus
799
def initialize_match_check
  # Start with an empty index: hash key of a sentence => true
  # once that sentence has been recorded by register_matched().
  # Any previously recorded matches are discarded.
  @index_matched = {}
end
802
+
803
+ ###
804
+ # Record a sentence from the smaller, noncontiguous corpus
805
+ # as matched in the larger corpus
806
def register_matched(hash_key)
  # Mark the sentence with the given hash key as matched.
  #
  # hash_key: checksum key of the matched sentence
  #
  # returns: true
  #
  # The index is created on demand, so that registering a match does not
  # fail with a NoMethodError on nil when initialize_match_check()
  # has not been called beforehand.
  @index_matched ||= {}
  @index_matched[hash_key] = true
end
809
+
810
+ ###
811
+ # Call this method after all sentences from the larger corpus
812
+ # have been checked against the smaller corpus.
813
+ # This method prints a warning message for each sentence from the smaller corpus
814
+ # that has not been matched,
815
+ # and yields it in the same format as each_window(),
816
+ # such that the unmatched sentences can still be processed,
817
+ # but without a larger context.
818
def each_unmatched(all_keys,
                   temptable_obj)
  # Report every sentence whose hash key is in all_keys but has not been
  # recorded in @index_matched, and push it through the context window.
  #
  # all_keys:      hash whose keys are the sentence hash keys to check
  # temptable_obj: table object queried for the "sent" column
  #                of the rows with a given "hashkey"
  #
  # yields: whatever each_window_for_stsent() and each_remaining_target()
  #         yield for the unmatched sentence
  #
  # Writes one report per unmatched sentence plus a final count to $stderr.

  unmatched_count = 0

  all_keys.each_key do |hash_key|
    # matched sentences need no further treatment
    next if @index_matched[hash_key]

    unmatched_count += 1

    # retrieve the stored sentence(s) for this key
    columns = [SelectTableAndColumns.new(temptable_obj, ["sent"])]
    restrictions = [ValueRestriction.new("hashkey", hash_key)]
    rows = temptable_obj.query(SQLQuery.select(columns, restrictions))

    rows.each do |row|
      sent_string = SQLQuery.unstringify_value(row.first.to_s)

      # NOTE: the rescue below covers not only the construction of the
      # sentence object but also the window processing and the yields.
      begin
        sent = SalsaTigerSentence.new(sent_string)

        # report on unmatched sentence
        $stderr.puts "Unmatched sentence from noncontiguous input:\n" +
                     sent.id.to_s + " " + sent.to_s

        # run the sentence through the context window and hand every
        # result on to the caller's block
        each_window_for_stsent(sent) { |result| yield result }
        each_remaining_target { |result| yield result }
      rescue
        # sentence could not be processed: report it raw and move on
        $stderr.puts "Unmatched sentence from noncontiguous input (raw):\n" +
                     sent_string
        $stderr.puts "ERROR: cannot process this sentence, skipping."
      end
    end
  end

  $stderr.puts "Unmatched sentences: #{unmatched_count} all in all."
end
862
+
863
+ end