frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,44 @@
1
+ require "tempfile"
2
+ require "frprep/AbstractSynInterface"
3
+
4
+ ################################################
5
+ # Interface class
6
class TntInterface < SynInterfaceTab
  # Shellwords is used to quote file names that are handed to the shell.
  require "shellwords"

  TntInterface.announce_me()

  # Name of the external system wrapped by this interface.
  def TntInterface.system()
    return "tnt"
  end

  # Name of the service this interface offers.
  def TntInterface.service()
    return "pos_tagger"
  end

  # Run the tagger on one file and write one tag per line to outfilename.
  #
  # infilename:  string, name of input file
  # outfilename: string, name of output file
  #
  # Raises a RuntimeError if the output does not have the same number of
  # lines as the input.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    tempfile = Tempfile.new("Tnt")
    begin
      TntInterface.fntab_words_to_file(infilename, tempfile)
      tempfile.close

      # 1. use grep to remove commentaries from file
      # 2. use sed to extract tags tag list:
      #    - match one or more non-spaces
      #    - match one or more spaces
      #    - match one or more non-spaces and write to outfilename

      # This assumes that the experiment file entry for pos_tagger_path
      # has the form
      # pos_tagger_path = <program_name> <model>
      # so @program_path is deliberately left unquoted (the shell has to
      # split it); the file names, however, are quoted.
      Kernel.system(@program_path + " " + Shellwords.escape(tempfile.path) +
                    ' | grep -v -E "^%%" | sed -e\'s/^[^ ]\{1,\}[[:space:]]\{1,\}\([^ ]\{1,\}\)/\1/\' > ' +
                    Shellwords.escape(outfilename))
    ensure
      # delete tempfile, also when something above raised
      tempfile.close(true)
    end

    unless `cat #{Shellwords.escape(infilename)} | wc -l`.strip ==
           `cat #{Shellwords.escape(outfilename)} | wc -l`.strip
      raise "Error: tagged file has different line number from corpus file!"
    end
  end
end
44
+
@@ -0,0 +1,61 @@
1
+ require 'frprep/Graph'
2
+
3
# A graph node that has at most one parent.
class TreeNode < GraphNode

  def initialize(id)
    super(id)
  end

  # redo the ancestor-related methods,
  # since here we only have one parent per node

  # returns: the first entry of parents(), or nil if parents() is nil
  def parent()
    retv = parents()
    return retv.nil? ? nil : retv.first
  end

  # returns: the first entry of parent_labels(), or nil if that is nil
  def parent_label()
    retv = parent_labels()
    return retv.nil? ? nil : retv.first
  end

  # returns: the first entry of parents_with_edgelabel(), or nil if that is nil
  def parent_with_edgelabel()
    retv = parents_with_edgelabel()
    return retv.nil? ? nil : retv.first
  end

  # Same as set_parent: a tree node keeps a single parent.
  def add_parent(parent, edgelabel, varhash={})
    set_parent(parent, edgelabel, varhash)
  end

  # Replace the current parent (if any) by the given one.
  #
  # parent:    node that becomes the parent
  # edgelabel: label of the edge from parent to this node
  # varhash:   options; with "pointer_insteadof_edge" set, this node is
  #            not registered as a child of the parent
  def set_parent(parent, edgelabel, varhash={})
    # remove old parent
    # Collect the existing links first, so that nothing is removed while
    # the parent list is being traversed. The block parameters use their
    # own names so they cannot clobber the 'parent' argument (in Ruby 1.8
    # a block parameter with the same name overwrites the outer variable).
    old_links = []
    each_parent_with_edgelabel { |old_label, old_parent|
      old_links << [old_label, old_parent]
    }
    old_links.each { |old_label, old_parent|
      remove_parent(old_parent, old_label, varhash)
    }

    # set new parent
    @parents << [edgelabel, parent]

    # and vice versa: add self as child to parent
    unless varhash["pointer_insteadof_edge"]
      unless parent.children_with_edgelabel().include? [edgelabel, self]
        parent.add_child(self, edgelabel)
      end
    end
  end
end
@@ -0,0 +1,303 @@
1
+ # sp 30 11 06
2
+ # extended by TreeTaggerPOSInterface
3
+
4
+ require "tempfile"
5
+
6
+ require "frprep/AbstractSynInterface"
7
+
8
+ ###########
9
+ # KE dec 7, 06
10
+ # common mixin for both Treetagger modules, doing the actual processing
11
+ module TreetaggerModule
12
+ ###
13
+ # Treetagger does both lemmatization and POS-tagging.
14
+ # However, the way the SynInterface system is set up in Shalmaneser,
15
+ # each SynInterface can offer only _one_ service.
16
+ # This means that we cannot do a SynInterface that writes
17
+ # both a POS file and a lemma file.
18
+ # Instead, both will include this module, which does the
19
+ # actual TreeTagger call and then stores the result in a file
20
+ # of its own, similar to the 'outfilename' given to TreetaggerInterface.process_file
21
+ # but with a separate extension.
22
+ # really_process_file checks for existence of this file because,
23
+ # if the TreeTagger lemmatization and POS-tagging classes are called separately,
24
+ # one of them will go first, and the 2nd one will not need to do the
25
+ # TreeTagger call anymore
26
+ #
27
+ # really_process_file returns a filename, the name of the file containing
28
+ # the TreeTagger output with both POS tags and lemma information
29
+ #
30
+ # WARNING: this method assumes that outfilename contains a suffix
31
+ # that can be replaced by .TreeTagger
32
+ def really_process_file(infilename, # string: name of input file
33
+ outfilename,# string: name of file that the caller is to produce
34
+ make_new_outfile_anyway = false) # Boolean: run TreeTagger in any case?
35
+
36
+ # fabricate the filename in which the
37
+ # actual TreeTagger output will be placed:
38
+ # <directory> + <outfilename minus last suffix> + ".TreeTagger"
39
+ current_suffix = outfilename[outfilename.rindex(".")..-1]
40
+ my_outfilename = File.dirname(outfilename) + "/" +
41
+ File.basename(outfilename, current_suffix) +
42
+ ".TreeTagger"
43
+
44
+ ##
45
+ # does it exist? then just return it
46
+ if not(make_new_outfile_anyway) and File.exists?(my_outfilename)
47
+ return my_outfilename
48
+ end
49
+
50
+ ##
51
+ # else construct it, then return it
52
+ tempfile = Tempfile.new("Treetagger")
53
+ TreetaggerInterface.fntab_words_to_file(infilename, tempfile, "<EOS>", "iso")
54
+ tempfile.close
55
+
56
+ # call TreeTagger
57
+ Kernel.system(@program_path+" "+tempfile.path +
58
+ " > " + my_outfilename)
59
+ tempfile.close(true) # delete first tempfile
60
+
61
+ # external problem: sometimes, the treetagger keeps the last <EOS> for itself,
62
+ # resulting on a .tagged file missing the last (blank) line
63
+
64
+ original_length = `cat #{infilename} | wc -l`.strip.to_i
65
+ puts infilename
66
+ lemmatised_length = `cat #{my_outfilename} | wc -l`.strip.to_i
67
+
68
+ # `cp #{tempfile2.path()} /tmp/lout`
69
+
70
+ case original_length - lemmatised_length
71
+ when 0
72
+ # everything ok, don't do anything
73
+ when 1
74
+ # add one more newline to the .tagged file
75
+ `echo "" >> #{my_outfilename}`
76
+ else
77
+ # this is "real" error
78
+ STDERR.puts "Original length: #{original_length}\tLemmatised length: #{lemmatised_length}"
79
+ STDERR.puts "Error: lemmatiser/tagger output for for #{File.basename(infilename)}"
80
+ $stderr.puts "has different line number from corpus file!"
81
+ raise
82
+ end
83
+
84
+
85
+ return my_outfilename
86
+ end
87
+ end
88
+
89
+ #######################################
90
# Lemmatizer interface: extracts the third column of the TreeTagger output.
class TreetaggerInterface < SynInterfaceTab
  # Shellwords is used to quote file names that are handed to the shell.
  require "shellwords"

  TreetaggerInterface.announce_me()

  include TreetaggerModule

  ###
  def TreetaggerInterface.system()
    return "treetagger"
  end

  ###
  def TreetaggerInterface.service()
    return "lemmatizer"
  end

  ###
  # Replace round brackets by -LRB-/-RRB- and the quote pairs '' and ``
  # by a double quote character. The trailing line break is dropped.
  # The argument itself is left untouched.
  #
  # returns: the converted string
  def convert_to_berkeley(line)
    return line.chomp.gsub(/\(/,"-LRB-").gsub(/\)/,"-RRB-").gsub(/''/,"\"").gsub(/\`\`/,"\"")
  end

  ###
  # Write one lemma per line to outfilename.
  #
  # Raises a RuntimeError if outfilename cannot be opened for writing.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    ttfilename = really_process_file(infilename, outfilename)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    begin
      tempfile2.close()

      # 2. use cut to get the actual lemmatisation
      Kernel.system("cat " + Shellwords.escape(ttfilename) +
                    ' | sed -e\'s/<EOS>//\' | cut -f3 > ' + Shellwords.escape(tempfile2.path()))

      # transform ISO-8859-1 back to UTF-8,
      # write to 'outfilename'
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Could not write to #{outfilename}"
      end

      begin
        tempfile2.open
        # AB: Internally all the flow is an utf-8 encoded stream.
        # TreeTagger consumes one byte encodings (but we should provide a
        # utf-8 model for German). So we convert utf-8 to latin1, then
        # process the text and convert it back to utf-8.
        #
        while line = tempfile2.gets
          utf8line = UtfIso.from_iso_8859_1(line)
          outfile.puts convert_to_berkeley(utf8line)
        end
      ensure
        # finalize output file, also when the conversion raised
        outfile.close()
      end
    ensure
      # remove second tempfile
      tempfile2.close(true)
    end
  end
end
156
+
157
+
158
+ # sp 30 11 06
159
+ #
160
+ # using TreeTagger for POS tagging of English text
161
+ #
162
+ # copy-and-paste from lemmatisation
163
+ #
164
+ # differences:
165
+ # 1. use field 2 and not 3 from the output
166
+ # 2. convert tags from what Treetagger thinks is the Penn Tagset to what TnT and Collins think is the Penn Tagset
167
+ #
168
+ # KE 7 12 06
169
+ # change interface such that TreeTagger is called only once
170
+ # and both POS tags and lemma are read from the same files,
171
+ # rather than calling the tagger twice
172
# POS tagger interface: extracts the second column of the TreeTagger output.
class TreetaggerPOSInterface < SynInterfaceTab
  # Shellwords is used to quote file names that are handed to the shell.
  require "shellwords"

  TreetaggerPOSInterface.announce_me()
  include TreetaggerModule

  ###
  def TreetaggerPOSInterface.system()
    return "treetagger"
  end

  ###
  def TreetaggerPOSInterface.service()
    return "pos_tagger"
  end

  ###
  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
  #
  # Rewrites the tag prefixes PP, NP, VV, VH and SENT; the trailing line
  # break is dropped. The argument itself is left untouched.
  #
  # returns: the converted string
  def convert_to_collins(line)
    return line.chomp.gsub(/^PP/,"PRP").gsub(/^NP/,"NNP").gsub(/^VV/,"VB").gsub(/^VH/,"VB").gsub(/^SENT/,".")
  end

  ###
  # Write one POS tag per line to outfilename.
  #
  # Raises a RuntimeError if outfilename cannot be opened for writing.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    # (third argument: run TreeTagger even if its output file exists)
    tt_filename = really_process_file(infilename, outfilename, true)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    begin
      tempfile2.close()

      # 2. use cut to get the POS tag column
      Kernel.system("cat " + Shellwords.escape(tt_filename) +
                    ' | sed -e\'s/<EOS>//\' | cut -f2 > ' + Shellwords.escape(tempfile2.path()))

      # transform ISO-8859-1 back to UTF-8,
      # write to 'outfilename'
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Could not write to #{outfilename}"
      end

      begin
        tempfile2.open()
        while (line = tempfile2.gets())
          outfile.puts UtfIso.from_iso_8859_1(convert_to_collins(line))
        end
      ensure
        # finalize output file, also when the conversion raised
        outfile.close()
      end
    ensure
      # remove second tempfile
      tempfile2.close(true)
    end
  end
end
228
+
229
+ ###############
230
+ # an interpreter that only has Treetagger, no parser
231
class TreetaggerInterpreter < SynInterpreter
  TreetaggerInterpreter.announce_me()

  ###
  # Systems interpreted by this class, as a hash
  # service(string) -> system name(string).
  def self.systems
    return { "pos_tagger" => "treetagger" }
  end

  ###
  # Additional systems that may be interpreted by this class,
  # in the same format as systems().
  def self.optional_systems
    return { "lemmatizer" => "treetagger" }
  end

  ###
  # Map the label of a node to a coarse category.
  #
  # Only the part of the label before the first "-" is looked at.
  # The rules are tried from top to bottom, the first hit wins.
  # Possible results:
  #
  # adj, adv, card, con, det, for, noun, prep, pun, sent, top, trace, verb
  #
  # returns: string, or nil if the label is unknown or
  # could not be determined
  def self.category(node) # SynNode
    pt = TreetaggerInterpreter.pt(node)
    # phrase type could not be determined
    return nil if pt.nil?

    label = pt.to_s.strip[/^([^-]*)/, 1]

    rules = [
      [[/^JJ/, /(WH)?ADJP/, /^PDT/], "adj"],
      [[/^RB/, /(WH)?ADVP/, /^UH/], "adv"],
      [[/^CD/, /^QP/], "card"],
      [[/^CC/, /^WRB/, /^CONJP/], "con"],
      [[/^DT/, /^POS/], "det"],
      [[/^FW/, /^SYM/], "for"],
      [[/^N/, "WHAD", "WDT", /^PRP/, /^WHNP/, /^EX/, /^WP/], "noun"],
      [[/^IN/, /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/], "prep"],
      [[/^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/], "pun"],
      [[/^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/], "sent"],
      [[/^TOP/], "top"],
      [[/^TRACE/], "trace"],
      [[/^V/, /^MD/], "verb"]
    ]

    rules.each do |patterns, category_name|
      return category_name if patterns.any? { |pattern| pattern === label }
    end

    # unknown category/POS
    return nil
  end
end
@@ -0,0 +1,142 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ # AB, 2010-11-25
4
+
5
+ ##############################
6
+ # class for managing parses:
7
+ #
8
+ # Given either a directory with tab format files or
9
+ # a directory with SalsaTigerXML files (or both) and
10
+ # a directory for putting parse files:
11
+ # - parse, unless no parsing set in the experiment file
12
+ # - for each parsed file: yield one OneParsedFile object
13
+ require 'frprep/one_parsed_file'
14
+
15
class DoParses
  # exp:           FrPrepConfigData object
  # file_suffixes: hash: file type(string) -> suffix(string)
  # parse_dir:     string: name of directory to put parses
  # var_hash:      further directories, keys "tab_dir" and "stxml_dir"
  def initialize(exp,            # FrPrepConfigData object
                 file_suffixes,  # hash: file type(string) -> suffix(string)
                 parse_dir,      # string: name of directory to put parses
                 var_hash = {})  # further directories
    @exp = exp
    @file_suffixes = file_suffixes
    @parse_dir = parse_dir
    @tab_dir = var_hash["tab_dir"]
    @stxml_dir = var_hash["stxml_dir"]

    # pre-parsed data available?
    @parsed_files = @exp.get("directory_parserout")
  end

  ###
  # Yields one OneParsedFile object per file.
  #
  # With "do_parse" set in the experiment file, the files are either the
  # pre-computed parses from "directory_parserout" or freshly made parses
  # in the parse directory. Without it, they are the SalsaTigerXML files
  # of stxml_dir or, failing that, the tab files of tab_dir.
  #
  # Directory names may be given with or without a trailing "/".
  #
  # Raises a RuntimeError if a required setting or directory is missing.
  def each_parsed_file()
    postag_suffix = @exp.get("do_postag") ? @file_suffixes["pos"] : nil
    lemma_suffix = @exp.get("do_lemmatize") ? @file_suffixes["lemma"] : nil

    if @exp.get("do_parse")

      # get parser interface
      sys_class = SynInterfaces.get_interface("parser",
                                              @exp.get("parser"))
      unless sys_class
        raise "Shouldn't be here"
      end
      parse_suffix = "." + sys_class.name()
      sys = sys_class.new(@exp.get("parser_path"),
                          @file_suffixes["tab"],
                          parse_suffix,
                          @file_suffixes["stxml"],
                          "pos_suffix" => postag_suffix,
                          "lemma_suffix" => lemma_suffix,
                          "tab_dir" => @tab_dir)

      if @parsed_files
        # reuse old parses

        $stderr.puts "Frprep: using pre-computed parses in " + @parsed_files.to_s()
        $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

        # File.join puts exactly one "/" between directory and pattern,
        # whether or not the directory name already ends in one
        Dir[File.join(@parsed_files, "*")].each { |parsefilename|

          if File.stat(parsefilename).ftype != "file"
            # something other than a file
            next
          end

          # core filename: remove directory and anything after the last "."
          filename_core = File.basename(parsefilename, ".*")
          # use iterator to read each parsed file
          yield OneParsedFile.new(filename_core, parsefilename, sys)
        }

      else
        # do new parses
        $stderr.puts "Frprep: Parsing"

        # sanity check
        unless @exp.get("parser_path")
          raise "Parsing: I need 'parser_path' in the experiment file"
        end
        unless @tab_dir
          raise "Cannot parse without tab files"
        end

        # parse
        sys.process_dir(@tab_dir, @parse_dir)

        $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

        Dir[File.join(@parse_dir, "*" + parse_suffix)].each { |parsefilename|
          filename_core = File.basename(parsefilename, parse_suffix)

          # use iterator to read each parsed file
          yield OneParsedFile.new(filename_core, parsefilename, sys)
        }
      end

    else
      # no parse:
      # get pseudo-parse tree

      if @stxml_dir
        # use existing SalsaTigerXML files
        Dir[File.join(@stxml_dir, "*.xml")].each { |stxmlfilename|

          filename_core = File.basename(stxmlfilename, ".xml")
          if @tab_dir
            # we know the tab directory too
            tabfilename = File.join(@tab_dir, filename_core + @file_suffixes["tab"])
            each_sentence_obj = FrprepReadStxml.new(stxmlfilename, tabfilename,
                                                    postag_suffix, lemma_suffix)
          else
            # we have no tab directory
            each_sentence_obj = FrprepReadStxml.new(stxmlfilename, nil,
                                                    postag_suffix, lemma_suffix)
          end

          yield OneParsedFile.new(filename_core, stxmlfilename, each_sentence_obj)
        }

      else
        # construct SalsaTigerXML from tab files
        unless @tab_dir
          raise "Cannot construct SalsaTigerXML: neither 'stxml_dir' nor 'tab_dir' given"
        end

        Dir[File.join(@tab_dir, "*" + @file_suffixes["tab"])].each { |tabfilename|
          each_sentence_obj = FrprepFlatSyntax.new(tabfilename,
                                                   postag_suffix,
                                                   lemma_suffix)
          filename_core = File.basename(tabfilename, @file_suffixes["tab"])
          yield OneParsedFile.new(filename_core, tabfilename, each_sentence_obj)
        }
      end # source of pseudo-parse
    end # parse or no parse
  end
end