frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,44 @@
1
+ require "tempfile"
2
+ require "frprep/AbstractSynInterface"
3
+
4
+ ################################################
5
+ # Interface class
6
# Interface class: runs the TnT part-of-speech tagger on one tab-format file.
class TntInterface < SynInterfaceTab
  TntInterface.announce_me()

  # returns: string, the name of the wrapped system
  def TntInterface.system()
    return "tnt"
  end

  # returns: string, the service this interface offers
  def TntInterface.service()
    return "pos_tagger"
  end

  ###
  # Tag the words of 'infilename' and write the tags to 'outfilename'.
  #
  # This assumes that the experiment file entry for pos_tagger_path
  # has the form
  #   pos_tagger_path = <program_name> <model>
  #
  # raises RuntimeError if the tagged file does not have as many
  # lines as the corpus file.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    tempfile = Tempfile.new("Tnt")
    begin
      TntInterface.fntab_words_to_file(infilename, tempfile)
      tempfile.close

      # 1. use grep to remove comment lines (starting with "%%") from the tagger output
      # 2. use sed to extract the tag column:
      #    - match one or more non-spaces
      #    - match one or more spaces
      #    - match one or more non-spaces and keep only this part
      #
      # The output file is attached through a spawn redirection rather than
      # being pasted into the command line, so its name needs no shell quoting.
      Kernel.system(@program_path + " " + tempfile.path +
                    ' | grep -v -E "^%%" | sed -e\'s/^[^ ]\{1,\}[[:space:]]\{1,\}\([^ ]\{1,\}\)/\1/\'',
                    :out => outfilename)
    ensure
      # delete the tempfile even if one of the steps above raised
      tempfile.close(true)
    end

    unless count_newlines(infilename) == count_newlines(outfilename)
      raise "Error: tagged file has different line number from corpus file!"
    end
  end

  private

  ###
  # Number of newline-terminated lines in the file 'path'
  # (the same figure that "wc -l" reports); 0 if the file does not exist.
  def count_newlines(path) # string: file name
    return 0 unless File.file?(path)

    File.open(path, "rb") { |f|
      f.each_line.count { |line| line.end_with?("\n") }
    }
  end
end
44
+
@@ -0,0 +1,61 @@
1
+ require 'frprep/Graph'
2
+
3
# A GraphNode that is used as a tree node: the parent-related
# accessors return a single value instead of a list, and setting
# a parent replaces any parent that was registered before.
class TreeNode < GraphNode

  def initialize(id)
    super(id)
  end

  # redo the ancestor-related methods,
  # since here we only have one parent per node

  # returns: the first entry of parents(), or nil if parents() is nil
  def parent()
    all_parents = parents()
    return all_parents.nil? ? nil : all_parents.first
  end

  # returns: the first entry of parent_labels(), or nil if parent_labels() is nil
  def parent_label()
    all_labels = parent_labels()
    return all_labels.nil? ? nil : all_labels.first
  end

  # returns: the first entry of parents_with_edgelabel(),
  # or nil if parents_with_edgelabel() is nil
  def parent_with_edgelabel()
    all_links = parents_with_edgelabel()
    return all_links.nil? ? nil : all_links.first
  end

  # Same as set_parent: a tree node has one parent, so adding replaces.
  def add_parent(parent, edgelabel, varhash={})
    set_parent(parent, edgelabel, varhash)
  end

  ###
  # Make 'parent' the parent of this node, reached via 'edgelabel'.
  #
  # varhash: hash of options; with a true value for "pointer_insteadof_edge"
  #          this node is not registered as a child of the new parent.
  #          The hash is also handed on to remove_parent.
  def set_parent(parent, edgelabel, varhash={})
    # remove old parent.
    # The block parameters must not be called 'parent': that would shadow
    # the method argument (and on Ruby 1.8 overwrite it).
    # The links are collected first so that nothing is removed while the
    # iterator is still running (presumably it walks the live parent list).
    old_links = []
    each_parent_with_edgelabel { |old_label, old_parent|
      old_links << [old_label, old_parent]
    }
    old_links.each { |old_label, old_parent|
      remove_parent(old_parent, old_label, varhash)
    }

    # set new parent
    @parents << [edgelabel, parent]

    # and vice versa: add self as child to parent
    unless varhash["pointer_insteadof_edge"]
      unless parent.children_with_edgelabel().include? [edgelabel, self]
        parent.add_child(self, edgelabel)
      end
    end
  end
end
@@ -0,0 +1,303 @@
1
+ # sp 30 11 06
2
+ # extended by TreeTaggerPOSInterface
3
+
4
+ require "tempfile"
5
+
6
+ require "frprep/AbstractSynInterface"
7
+
8
+ ###########
9
+ # KE dec 7, 06
10
+ # common mixin for both Treetagger modules, doing the actual processing
11
module TreetaggerModule
  ###
  # Treetagger does both lemmatization and POS-tagging.
  # However, the way the SynInterface system is set up in Shalmaneser,
  # each SynInterface can offer only _one_ service.
  # This means that we cannot do a SynInterface that writes
  # both a POS file and a lemma file.
  # Instead, both will include this module, which does the
  # actual TreeTagger call and then stores the result in a file
  # of its own, similar to the 'outfilename' given to TreetaggerInterface.process_file
  # but with a separate extension.
  # really_process_file checks for existence of this file because,
  # if the TreeTagger lemmatization and POS-tagging classes are called separately,
  # one of them will go first, and the 2nd one will not need to do the
  # TreeTagger call anymore
  #
  # really_process_file returns a filename, the name of the file containing
  # the TreeTagger output with both POS tags and lemma information
  #
  # The last suffix of outfilename (if it has one) is replaced by .TreeTagger
  #
  # raises RuntimeError if the TreeTagger output differs from the corpus
  # file by anything other than one missing final line.
  def really_process_file(infilename, # string: name of input file
                          outfilename,# string: name of file that the caller is to produce
                          make_new_outfile_anyway = false) # Boolean: run TreeTagger in any case?

    # fabricate the filename in which the
    # actual TreeTagger output will be placed:
    # <directory> + <outfilename minus last suffix> + ".TreeTagger"
    last_dot = outfilename.rindex(".")
    current_suffix = last_dot ? outfilename[last_dot..-1] : ""
    my_outfilename = File.dirname(outfilename) + "/" +
                     File.basename(outfilename, current_suffix) +
                     ".TreeTagger"

    ##
    # does it exist? then just return it
    if not(make_new_outfile_anyway) and File.exist?(my_outfilename)
      return my_outfilename
    end

    ##
    # else construct it, then return it
    tempfile = Tempfile.new("Treetagger")
    begin
      TreetaggerInterface.fntab_words_to_file(infilename, tempfile, "<EOS>", "iso")
      tempfile.close

      # call TreeTagger; the output file is attached through a spawn
      # redirection, so its name needs no shell quoting
      Kernel.system(@program_path + " " + tempfile.path,
                    :out => my_outfilename)
    ensure
      # delete the tempfile even if one of the steps above raised
      tempfile.close(true)
    end

    # external problem: sometimes, the treetagger keeps the last <EOS> for itself,
    # resulting on a .tagged file missing the last (blank) line

    original_length = treetagger_count_newlines(infilename)
    puts infilename
    lemmatised_length = treetagger_count_newlines(my_outfilename)

    case original_length - lemmatised_length
    when 0
      # everything ok, don't do anything
    when 1
      # add one more newline to the .tagged file
      File.open(my_outfilename, "a") { |f| f.write("\n") }
    else
      # this is "real" error
      STDERR.puts "Original length: #{original_length}\tLemmatised length: #{lemmatised_length}"
      STDERR.puts "Error: lemmatiser/tagger output for #{File.basename(infilename)}"
      $stderr.puts "has different line number from corpus file!"
      raise "Error: lemmatiser/tagger output for #{File.basename(infilename)} " +
            "has different line number from corpus file! " +
            "(original: #{original_length}, lemmatised: #{lemmatised_length})"
    end

    return my_outfilename
  end

  private

  ###
  # Number of newline-terminated lines in the file 'path'
  # (the same figure that "wc -l" reports); 0 if the file does not exist.
  def treetagger_count_newlines(path) # string: file name
    return 0 unless File.file?(path)

    File.open(path, "rb") { |f|
      f.each_line.count { |line| line.end_with?("\n") }
    }
  end
end
88
+
89
+ #######################################
90
# SynInterface offering TreeTagger as a lemmatizer.
# The TreeTagger call itself lives in TreetaggerModule#really_process_file;
# this class only extracts the lemma column from its output.
class TreetaggerInterface < SynInterfaceTab
  TreetaggerInterface.announce_me()

  include TreetaggerModule

  ###
  # returns: string, the name of the wrapped system
  def TreetaggerInterface.system()
    return "treetagger"
  end

  ###
  # returns: string, the service this interface offers
  def TreetaggerInterface.service()
    return "lemmatizer"
  end

  ###
  # Rewrite brackets and quotes in one output line:
  # "(" becomes "-LRB-", ")" becomes "-RRB-",
  # and both '' and `` become a plain double quote.
  # The trailing line break is dropped. The argument itself is not modified.
  #
  # returns: string, the converted line
  def convert_to_berkeley(line) # string: one line of lemmatizer output
    return line.chomp.gsub(/\(/,"-LRB-").gsub(/\)/,"-RRB-").gsub(/''/,"\"").gsub(/\`\`/,"\"")
  end

  ###
  # Lemmatize 'infilename' and write one lemma per line to 'outfilename'.
  #
  # raises RuntimeError if 'outfilename' cannot be opened for writing.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    ttfilename = really_process_file(infilename, outfilename)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    begin
      tempfile2.close()

      # remove the <EOS> markers, then use cut to get the actual
      # lemmatisation (third column).
      # Input and output files are attached through spawn redirections,
      # so their names need no shell quoting.
      Kernel.system('sed -e\'s/<EOS>//\' | cut -f3',
                    :in => ttfilename, :out => tempfile2.path())

      # transform ISO-8859-1 back to UTF-8,
      # write to 'outfilename'
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Could not write to #{outfilename}"
      end

      begin
        tempfile2.open
        # AB: Internally all the flow is an utf-8 encoded stream.
        # TreeTagger consumes one byte encodings (but we should provide a
        # utf-8 model for German). So we convert utf-8 to latin1, then
        # process the text and convert it back to utf-8.
        while line = tempfile2.gets
          utf8line = UtfIso.from_iso_8859_1(line)
          outfile.puts convert_to_berkeley(utf8line)
        end
      ensure
        # finalize output file, also when the conversion fails
        outfile.close()
      end
    ensure
      # remove second tempfile
      tempfile2.close(true)
    end
  end
end
156
+
157
+
158
+ # sp 30 11 06
159
+ #
160
+ # using TreeTagger for POS tagging of English text
161
+ #
162
+ # copy-and-paste from lemmatisation
163
+ #
164
+ # differences:
165
+ # 1. use field 2 and not 3 from the output
166
+ # 2. convert tags from what Treetagger thinks is the Penn Tagset to what TnT and Collins think is the Penn Tagset
167
+ #
168
+ # KE 7 12 06
169
+ # change interface such that TreeTagger is called only once
170
+ # and both POS tags and lemma are read from the same files,
171
+ # rather than calling the tagger twice
172
# SynInterface offering TreeTagger as a POS tagger.
# The TreeTagger call itself lives in TreetaggerModule#really_process_file;
# this class only extracts the tag column from its output.
class TreetaggerPOSInterface < SynInterfaceTab
  TreetaggerPOSInterface.announce_me()
  include TreetaggerModule

  ###
  # returns: string, the name of the wrapped system
  def TreetaggerPOSInterface.system()
    return "treetagger"
  end

  ###
  # returns: string, the service this interface offers
  def TreetaggerPOSInterface.service()
    return "pos_tagger"
  end

  ###
  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
  #
  # Rewrites the start of a tag: PP -> PRP, NP -> NNP, VV -> VB,
  # VH -> VB, SENT -> "." . The trailing line break is dropped.
  # The argument itself is not modified.
  #
  # returns: string, the converted tag
  def convert_to_collins(line) # string: one line of tagger output
    return line.chomp.gsub(/^PP/,"PRP").gsub(/^NP/,"NNP").gsub(/^VV/,"VB").gsub(/^VH/,"VB").gsub(/^SENT/,".")
  end

  ###
  # POS-tag 'infilename' and write one tag per line to 'outfilename'.
  #
  # raises RuntimeError if 'outfilename' cannot be opened for writing.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    # (the third argument makes really_process_file run TreeTagger
    #  even if a .TreeTagger file is already there)
    tt_filename = really_process_file(infilename, outfilename, true)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    begin
      tempfile2.close()

      # remove the <EOS> markers, then use cut to get the actual
      # POS tags (second column).
      # Input and output files are attached through spawn redirections,
      # so their names need no shell quoting.
      Kernel.system('sed -e\'s/<EOS>//\' | cut -f2',
                    :in => tt_filename, :out => tempfile2.path())

      # transform ISO-8859-1 back to UTF-8,
      # write to 'outfilename'
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Could not write to #{outfilename}"
      end

      begin
        tempfile2.open()
        while (line = tempfile2.gets())
          outfile.puts UtfIso.from_iso_8859_1(convert_to_collins(line))
        end
      ensure
        # finalize output file, also when the conversion fails
        outfile.close()
      end
    ensure
      # remove second tempfile
      tempfile2.close(true)
    end
  end
end
228
+
229
+ ###############
230
+ # an interpreter that only has Treetagger, no parser
231
# an interpreter that only has Treetagger, no parser
class TreetaggerInterpreter < SynInterpreter
  TreetaggerInterpreter.announce_me()

  ###
  # names of the systems interpreted by this class:
  # returns a hash service(string) -> system name (string),
  # e.g.
  # { "parser" => "collins", "lemmatizer" => "treetagger" }
  def TreetaggerInterpreter.systems()
    return {
      "pos_tagger" => "treetagger",
    }
  end

  ###
  # names of additional systems that may be interpreted by this class
  # returns a hash service(string) -> system name(string),
  # in the same format as systems()
  def TreetaggerInterpreter.optional_systems()
    return {
      "lemmatizer" => "treetagger"
    }
  end

  ###
  # generalize over POS tags.
  #
  # returns one of:
  #
  # adj:   adjective (phrase)
  # adv:   adverb (phrase)
  # card:  numbers, quantity phrases
  # con:   conjunction
  # det:   determiner, including possessive/demonstrative pronouns etc.
  # for:   foreign material
  # noun:  noun (phrase), including personal pronouns, proper names, expletives
  # part:  particles, truncated words (German compound parts)
  # prep:  preposition (phrase)
  # pun:   punctuation, brackets, etc.
  # sent:  sentence
  # top:   top node of a sentence
  # trace: trace
  # verb:  verb (phrase)
  # nil:   something went wrong
  #
  # returns: string, or nil
  def TreetaggerInterpreter.category(node) # SynNode
    pt = TreetaggerInterpreter.pt(node)
    if pt.nil?
      # phrase type could not be determined
      return nil
    end

    # only the part of the label before the first hyphen is looked at.
    # NOTE(review): a label that starts with a hyphen (e.g. "-LRB-") is
    # reduced to "" here and ends up as nil -- confirm that this is intended.
    label = pt.to_s.strip()[/^([^-]*)/, 1]

    case label
    when /^JJ/ ,/(WH)?ADJP/, /^PDT/ then return "adj"
    when /^RB/, /(WH)?ADVP/, /^UH/ then return "adv"
    when /^CD/, /^QP/ then return "card"
    when /^CC/, /^WRB/, /^CONJP/ then return "con"
    when /^DT/, /^POS/ then return "det"
    when /^FW/, /^SYM/ then return "for"
    when /^N/, "WHAD", "WDT", /^PRP/ , /^WHNP/, /^EX/, /^WP/ then return "noun"
    # INTJ and TOP have to be tested before the preposition patterns:
    # /^IN/ also matches "INTJ" and /^TO/ also matches "TOP".
    when /^INTJ/ then return "sent"
    when /^TOP/ then return "top"
    when /^IN/ , /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/ then return "prep"
    when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/ then return "pun"
    when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/ then return "sent"
    when /^TRACE/ then return "trace"
    when /^V/ , /^MD/ then return "verb"
    else
      # unknown category/POS
      return nil
    end
  end
end
@@ -0,0 +1,142 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ # AB, 2010-11-25
4
+
5
+ ##############################
6
+ # class for managing parses:
7
+ #
8
+ # Given either a directory with tab format files or
9
+ # a directory with SalsaTigerXML files (or both) and
10
+ # a directory for putting parse files:
11
+ # - parse, unless no parsing set in the experiment file
12
+ # - for each parsed file: yield one OneParsedFile object
13
+ require 'frprep/one_parsed_file'
14
+
15
# Manages parses: depending on the experiment file, it runs the parser,
# reuses existing parser output, or builds pseudo-parses, and yields one
# OneParsedFile object per file.
class DoParses
  ###
  # Directory names may be given with or without a trailing "/".
  def initialize(exp, # FrPrepConfigData object
                 file_suffixes, # hash: file type(string) -> suffix(string)
                 parse_dir, # string: name of directory to put parses
                 var_hash = {}) # further directories: "tab_dir", "stxml_dir"
    @exp = exp
    @file_suffixes = file_suffixes
    @parse_dir = parse_dir
    @tab_dir = var_hash["tab_dir"]
    @stxml_dir = var_hash["stxml_dir"]

    # pre-parsed data available?
    @parsed_files = @exp.get("directory_parserout")
  end

  ###
  # Yields one OneParsedFile object per input file, in sorted file name order.
  #
  # - "do_parse" set: run the parser on the tab files, or reuse the files
  #   in "directory_parserout" if that is set
  # - otherwise: pseudo-parses, read from the SalsaTigerXML directory if
  #   there is one, else built from the tab files
  #
  # raises RuntimeError if a setting or directory needed for the chosen
  # route is missing.
  def each_parsed_file(&block)
    postag_suffix = @exp.get("do_postag") ? @file_suffixes["pos"] : nil
    lemma_suffix = @exp.get("do_lemmatize") ? @file_suffixes["lemma"] : nil

    if @exp.get("do_parse")
      each_file_from_parser(postag_suffix, lemma_suffix, &block)
    elsif @stxml_dir
      # no parse: use existing SalsaTigerXML files
      each_file_from_stxml(postag_suffix, lemma_suffix, &block)
    else
      # no parse: construct SalsaTigerXML from tab files
      each_file_from_tab(postag_suffix, lemma_suffix, &block)
    end
  end

  private

  ###
  # Parser route: build the parser interface, then either reuse
  # pre-computed parses or parse the tab files.
  def each_file_from_parser(postag_suffix, lemma_suffix, &block)
    # get parser interface
    sys_class = SynInterfaces.get_interface("parser",
                                            @exp.get("parser"))
    unless sys_class
      raise "Shouldn't be here: no parser interface for #{@exp.get("parser").inspect}"
    end
    parse_suffix = "." + sys_class.name()
    sys = sys_class.new(@exp.get("parser_path"),
                        @file_suffixes["tab"],
                        parse_suffix,
                        @file_suffixes["stxml"],
                        "pos_suffix" => postag_suffix,
                        "lemma_suffix" => lemma_suffix,
                        "tab_dir" => @tab_dir)

    if @parsed_files
      each_precomputed_parse(sys, &block)
    else
      each_new_parse(sys, parse_suffix, &block)
    end
  end

  ###
  # Reuse old parses: yield every regular file in the pre-parsed directory.
  def each_precomputed_parse(sys)
    $stderr.puts "Frprep: using pre-computed parses in " + @parsed_files.to_s()
    $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

    files_matching(@parsed_files, "*").each { |parsefilename|
      # skip anything other than a file
      next unless File.file?(parsefilename)

      # core filename: remove directory and anything after the last "."
      filename_core = File.basename(parsefilename, ".*")
      yield OneParsedFile.new(filename_core, parsefilename, sys)
    }
  end

  ###
  # Do new parses: run the parser over the tab directory, then yield
  # every file in the parse directory that carries the parser's suffix.
  def each_new_parse(sys, parse_suffix)
    $stderr.puts "Frprep: Parsing"

    # sanity check
    unless @exp.get("parser_path")
      raise "Parsing: I need 'parser_path' in the experiment file"
    end
    unless @tab_dir
      raise "Cannot parse without tab files"
    end

    # parse
    sys.process_dir(@tab_dir, @parse_dir)

    $stderr.puts "Frprep: Postprocessing SalsaTigerXML data"

    files_matching(@parse_dir, "*" + parse_suffix).each { |parsefilename|
      filename_core = File.basename(parsefilename, parse_suffix)
      yield OneParsedFile.new(filename_core, parsefilename, sys)
    }
  end

  ###
  # Pseudo-parses from existing SalsaTigerXML files (*.xml);
  # the matching tab file name is handed on if the tab directory is known.
  def each_file_from_stxml(postag_suffix, lemma_suffix)
    files_matching(@stxml_dir, "*.xml").each { |stxmlfilename|
      filename_core = File.basename(stxmlfilename, ".xml")
      if @tab_dir
        # we know the tab directory too
        tabfilename = path_in(@tab_dir, filename_core + @file_suffixes["tab"])
      else
        # we have no tab directory
        tabfilename = nil
      end
      each_sentence_obj = FrprepReadStxml.new(stxmlfilename, tabfilename,
                                              postag_suffix, lemma_suffix)

      yield OneParsedFile.new(filename_core, stxmlfilename, each_sentence_obj)
    }
  end

  ###
  # Pseudo-parses constructed from the tab files.
  def each_file_from_tab(postag_suffix, lemma_suffix)
    unless @tab_dir
      raise "Cannot construct pseudo-parses without tab files"
    end

    tab_suffix = @file_suffixes["tab"]
    files_matching(@tab_dir, "*" + tab_suffix).each { |tabfilename|
      each_sentence_obj = FrprepFlatSyntax.new(tabfilename,
                                               postag_suffix,
                                               lemma_suffix)
      filename_core = File.basename(tabfilename, tab_suffix)
      yield OneParsedFile.new(filename_core, tabfilename, each_sentence_obj)
    }
  end

  ###
  # 'name' placed inside 'dir'; works whether or not 'dir' ends in "/".
  # An empty 'dir' leaves 'name' relative to the current directory.
  def path_in(dir, name)
    dir = dir.to_s
    return name if dir.empty?
    File.join(dir, name)
  end

  ###
  # Sorted list of the paths in 'dir' that match the glob 'pattern'.
  def files_matching(dir, pattern)
    Dir[path_in(dir, pattern)].sort
  end
end