frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,338 @@
1
+ # name: Module Headz
2
+ # auth: albu@coli.uni-sb.de
3
+ #
4
+ # modified KE Sept 04:
5
+ # changed from old Sentence pkg to new SalsaTigerSentence pkg
6
+ #
7
+ # modified KE April 05:
8
+ # suppress the flood of warnings
9
+ #
10
+ # modified SP June 05: added some more cases; changed to SalsaTigerRegXML
11
+ #
12
+ #
13
+ # INIT: REXML TIGER sentence,
14
+ # FUNC: syn_nodes(term/non_term) -> heads
15
+ #
16
+ #
17
+ # usage:
18
+ #
19
+ # h = Headz.new()
20
+ #
21
+ # hash = h.get_sem_head(node) # node is a SalsaTigerXmlNode obj
22
+ #
23
+ # head = hash["head"]
24
+ # prep = hash["prep"]
25
+ #
26
+ # if h.complex(hash)
27
+ # print "preposition or conjunction involved"
28
+ # end
29
+
30
+ require "frprep/SalsaTigerRegXML"
31
+
32
# Computes "semantic heads" for nodes of a syntax tree.
#
# Every head is described by a Hash. The key 'head' holds the terminal node
# that was selected; depending on the rule that fired, further keys are added:
#   'prep' - preposition terminal found inside a PP
#   'conj' - flat Array of head Hashes for the non-first conjuncts
#   'pncs' - the PNC daughters of a PN node
#   'svp', 'oc', 'da', 'oa' - head Hashes of the respective daughters
#
# Nodes only need to respond to the methods used below (is_terminal?,
# category, part_of_speech, children, children_by_edgelabels, outdeg,
# terminals_sorted).
class Headz

  def initialize()
    @helpers = HeadzHelpers.new()
    @verbose = false # KE 13.4.05: please not that many messages!
  end

  # Head of one node. See #gsh for the possible return values.
  def get_sem_head(node)
    gsh(node)
  end

  # Heads of all top nodes covering the frame element +fe+.
  # Returns an Array with one #get_sem_head result per child of +fe+,
  # or [] (after a warning on $stderr) if fe.children() is nil/false.
  def get_fe_heads(fe)
    constituents = fe.children()
    return constituents.map { |node| get_sem_head(node) } if constituents

    $stderr.puts "Headz.get_fe_heads: no children for FE #{fe}"
    []
  end

  # Core dispatcher: picks the head rule according to node.category.
  #
  # Returns a head Hash. The Hash is empty when +node+ is nil or when no
  # rule exists for its category. nil is returned when a sub-rule gives up
  # (e.g. VROOT without a sentence daughter, PP without NK/RE daughter).
  def gsh(node)
    unless node
      note "Headz.gsh: no input node"
      return {}
    end
    return { 'head' => node } if node.is_terminal?

    case node.category
    when 'AP', 'AVP', 'VZ'
      gsh(@helpers.get_dtr(node, 'HD'))

    when 'CAP', 'CAVP', 'CNP', 'CPP', 'CS', 'CVP'
      # First conjunct supplies the head, the rest is stored under "conj".
      # Destructuring (instead of shift) leaves the node's own list intact.
      first_conj, *other_conjs = (@helpers.get_conjuncts(node) || [])
      head = gsh(first_conj)
      head.update("conj" => gsh_conjs(other_conjs)) if head
      head

    when 'NM'
      gsh(@helpers.get_rightmost_dtr(node, 'NMC'))

    when 'NP'
      nk = @helpers.get_rightmost_dtr(node, 'NK')
      nk ? gsh(nk) : gsh(@helpers.get_rightmost_dtr(node, "NN"))

    when 'PN'
      pncs = @helpers.get_dtrs(node, 'PNC') || []
      head = gsh(pncs.last)
      head.update("pncs" => pncs) if head
      head

    when 'PP'
      pp(node)

    when 'S'
      s(node)

    when 'VROOT'
      vroot(node)

    when 'VP'
      vp(node)

    when 'MTA'
      gsh(@helpers.get_rightmost_dtr(node, 'ADC'))

    else
      note " Headz.gsh: no rule for #{node.category}"
      {}
    end
  end

  # Flatten the processed conjuncts to a list of (head) Hashes
  # containing no "conj" features themselves.
  def gsh_conjs(conjs)
    conjs.each_with_object([]) do |conj, flat|
      @helpers.descend(gsh(conj), flat)
    end
  end

  # Head rule for PP nodes.
  #
  # The preposition is the first terminal (in terminals_sorted order) whose
  # part of speech starts with APPR, PWAV, PP or CPP. The head comes from the
  # rightmost NK daughter or, failing that, from the RE daughter.
  # Returns the head Hash (with 'prep' added when both were found), or nil
  # when the node has neither an NK nor an RE daughter.
  def pp(node)
    prep = node.terminals_sorted().detect do |terminal|
      pos = terminal.part_of_speech()
      pos && pos =~ /^(?:APPR|PWAV|C?PP)/
    end

    complement = @helpers.get_rightmost_dtr(node, 'NK') ||
                 @helpers.get_dtr(node, 'RE')
    unless complement
      note " pp: no rule for #{node}"
      return nil
    end

    head = gsh(complement)
    head.update('prep' => prep) if head && prep
    head
  end

  # Head rule for S nodes.
  #
  # Returns {} when there is no HD daughter, the head Hash of the HD daughter
  # when that daughter has no outgoing edges, and otherwise a result that
  # depends on the HD daughter's category (see below). nil if no rule fits.
  #
  # NOTE(review): the category-specific rules are only reached for HD
  # daughters with outdeg() != 0, yet they test for 'VVFIN'/'VAFIN' --
  # confirm against the node class that such nodes can carry these categories.
  def s(node)
    head = @helpers.get_dtr(node, 'HD')
    return {} unless head
    return gsh(head) if head.outdeg() == 0

    oc = @helpers.get_dtr(node, 'OC')
    case head.category
    when 'VVFIN'
      svp = @helpers.get_dtr(node, 'SVP')
      verb = gsh(head)
      return verb unless svp && verb
      return verb.update('svp' => gsh(svp), 'oc' => gsh(oc))

    when 'VAFIN'
      if oc && (oc_head = @helpers.get_dtr(oc, 'HD'))
        main = gsh(oc_head)
        main.update('oc' => gsh(oc)) if main
        return main
      end

      # Parentheses are essential: without them `pd = a && head = b` binds
      # as `pd = (a && (head = b))`, so pd is still nil when it is used.
      if (pd = @helpers.get_dtr(node, 'PD')) &&
         (pd_head = @helpers.get_dtr(pd, 'HD'))
        return gsh(pd_head)
      end
    end

    note " s: no rule for #{node}"
    nil
  end

  # Head rule for VP nodes: head Hash of the HD daughter, extended by the
  # head Hashes of the DA and OA daughters under the keys "da" and "oa".
  # If the HD rule yields nil, only the "da"/"oa" Hash is returned.
  def vp(node)
    head = gsh(@helpers.get_dtr(node, 'HD'))

    objects = {}
    saved_verbosity = @verbose
    @verbose = false # missing DA/OA daughters are nothing to complain about
    begin
      %w[da oa].each do |type|
        dtr = @helpers.get_dtr(node, type.upcase)
        objects[type] = gsh(dtr) if dtr
      end
    ensure
      @verbose = saved_verbosity
    end

    head ? head.update(objects) : objects
  end

  ################
  # Access to the parts of a head Hash

  def head(h)
    h['head']
  end

  # True-ish if a preposition or a conjunction is involved.
  def complex(h)
    prep(h) || conj(h)
  end

  def prep(h)
    h['prep']
  end

  def conj(h)
    h['conj']
  end

  private

  # Head rule for VROOT nodes: descend into a DL (discourse level) or CO
  # (coordination) daughter if there is one, then use the first daughter
  # whose category starts with S or CS. Returns nil without such a daughter.
  def vroot(node)
    dtrs = @helpers.get_dtrs(node, '--') || []

    discourselevel_dtr = dtrs.detect { |n| n.category == "DL" }
    co_dtr = dtrs.detect { |n| n.category == "CO" }
    if discourselevel_dtr
      dtrs = discourselevel_dtr.children()
    elsif co_dtr
      dtrs = co_dtr.children()
    end

    sent_dtr = dtrs.detect { |n| n.category =~ /^C?S/ }
    sent_dtr ? gsh(sent_dtr) : nil
  end

  # Write a diagnostic to $stderr, but only in verbose mode.
  def note(message)
    $stderr.puts message if @verbose
    nil
  end

end # Class Headz
269
+
270
+
271
# Tree-access helpers used by Headz.
#
# Nodes passed in must respond to children_by_edgelabels(labels), which is
# expected to return a list of daughters (nil/false is tolerated and treated
# as "no daughters").
class HeadzHelpers

  # verbose: when true, a message is written to $stderr whenever a requested
  # daughter cannot be found. Defaults to false, which matches the previous
  # effective behaviour: the flag used to be assigned in the class body, where
  # it created a class-level variable that instances never saw.
  def initialize(verbose = false)
    @verbose = verbose
  end

  # Conjunction

  # All daughters of +node+ attached with the edge label CJ.
  def get_conjuncts(node)
    get_dtrs(node, 'CJ')
  end

  # Flatten: append +current+ (a head Hash) to +flat+, followed by
  # everything found recursively under its "conj" key. The "conj" key is
  # removed from each Hash on the way. A nil +current+ is ignored.
  # Returns +flat+.
  def descend(current, flat)
    return flat if current.nil?

    nested = current.delete("conj")
    flat.push(current)
    nested.each { |item| descend(item, flat) } if nested
    flat
  end

  # Access

  # First daughter of +node+ with edge label +label+, or nil if none.
  def get_dtr(node, label)
    dtrs = node.children_by_edgelabels([label])
    return dtrs.first if dtrs

    report_missing(label, node, 'dtr')
    nil
  end

  # All daughters of +node+ with edge label +label+.
  # Always returns a list; it is empty if the node reports no daughters.
  def get_dtrs(node, label)
    dtrs = node.children_by_edgelabels([label])
    return dtrs if dtrs

    report_missing(label, node, 'dtr')
    []
  end

  # Last daughter of +node+ with edge label +label+, or nil if none.
  def get_rightmost_dtr(node, label)
    rightmost = (node.children_by_edgelabels([label]) || []).last
    return rightmost if rightmost

    report_missing(label, node, 'dtrs')
    nil
  end

  private

  # Diagnostic for a daughter lookup that found nothing (verbose mode only).
  def report_missing(label, node, what)
    $stderr.puts " SelectHeadDtr: no #{label} #{what} for #{node}" if @verbose
  end

end # Class HeadzHelpers
336
+
337
+
338
+
@@ -0,0 +1,28 @@
1
+ # -*- encoding: utf-8 -*-
2
+ # AB, 2010-11-25
3
+
4
+
5
+ ##############################
6
+ # class for managing the parses of one file
7
# Manages the parses of one file: remembers which parse file to read and
# delegates the actual reading to an iterator object.
class OneParsedFile
  # string: core of the filename of the parse file
  attr_reader :filename

  # filename:          string, core of filename for the parse file
  # complete_filename: string, complete filename of the parse file
  # obj_with_iterator: object with a method each_sentence(complete_filename)
  #                    that yields three values per sentence
  def initialize(filename, complete_filename, obj_with_iterator)
    @obj_with_iterator = obj_with_iterator
    @filename = filename
    @complete_filename = complete_filename
  end

  # Yield each parsed sentence as one tuple (Array)
  #   [salsa/tiger xml sentence, tab format sentence, mapping]
  # of a SalsaTigerSentence object, a FNTabSentence object,
  # and a hash: FNTab sentence lineno (integer) -> array:SynNode
  # pointing each tab word to one or more SalsaTigerSentence terminals.
  #
  # Without a block an Enumerator over the same tuples is returned.
  # With a block the return value is whatever the iterator object returns.
  def each_sentence()
    return enum_for(:each_sentence) unless block_given?

    @obj_with_iterator.each_sentence(@complete_filename) do |st_sent, tab_sent, mapping|
      yield [st_sent, tab_sent, mapping]
    end
  end
end
@@ -0,0 +1,94 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ # AB, 2010-11-25
4
+
5
+ require 'optparse'
6
+ require 'frprep/FrPrepConfigData'
7
+ require 'frprep/SynInterfaces'
8
module FrPrep

  # This class parses the command line options for FrPrep.
  class OptParser

    # Name of the program as shown in help and error messages.
    PROGRAM_NAME = 'frprep'

    # Main class method.
    #
    # cmd_args: array of strings like ARGV.
    #
    # Returns the FrPrepConfigData object built from the experiment file
    # given with -e/--expfile.
    #
    # Terminates the program with exit status 1 (after a message on $stderr)
    # if no arguments are given, if an option is unknown, malformed or lacks
    # its argument, or if no experiment file was specified.
    # Raises a RuntimeError if the experiment ID in the experiment file
    # contains anything but A-Za-z0-9_.
    def self.parse(cmd_args)
      @options = {}
      parser = create_parser

      # If no options are provided, point the user to the help.
      if cmd_args.empty?
        $stderr.puts('You have to provide some options.',
                     "Please start with <#{PROGRAM_NAME} --help>.")
        exit(1)
      end

      # Parse the arguments and fill the options hash.
      # Any other exception is deliberately left to propagate.
      begin
        parser.parse(cmd_args)
      rescue OptionParser::InvalidArgument => e
        arg = e.message.split.last
        abort_with_hint("The provided argument #{arg} is currently not supported!")
      rescue OptionParser::InvalidOption => e
        abort_with_hint("You have provided an #{e.message}.")
      rescue OptionParser::MissingArgument => e
        # e.g. "-e" without a filename
        abort_with_hint("Incomplete option (#{e.message}).")
      end

      # Only non-option arguments were given: nothing to work on.
      unless @options[:exp_file]
        abort_with_hint('You have to provide an experiment file (-e FILENAME).')
      end

      exp = FrPrepConfigData.new(@options[:exp_file])

      # AB: this stuff should be moved into FrPrepConfigData.
      # Sanity check. \A and \z anchor the whole string; ^ and $ would
      # accept any multi-line value that merely contains one valid line.
      unless exp.get("prep_experiment_ID") =~ /\A[A-Za-z0-9_]+\z/
        raise "Please choose an experiment ID consisting only of the letters A-Za-z0-9_."
      end

      SynInterfaces.check_interfaces_abort_if_missing(exp)

      exp
    end

    # Build the OptionParser instance that knows all FrPrep options.
    # Handlers store their values in the class-level @options hash, which
    # self.parse resets on every call.
    #
    # A bare `private` keyword used to precede this method; it has no effect
    # on methods defined with `def self.`, so the method has always been
    # public and is left public.
    def self.create_parser
      OptionParser.new do |opts|
        opts.banner = [
          'Fred Preprocessor <FrPrep>. Preprocessing stage before Fred and Rosy',
          'for further frame/word sense assignment and semantic role assignment.',
          '',
          "Usage: #{PROGRAM_NAME} -h|-e FILENAME",
          ''
        ].join("\n")
        opts.separator ''
        opts.separator 'Program specific options:'

        opts.on('-e', '--expfile FILENAME',
                'Provide the path to an experiment file.',
                'FrPrep will preprocess data according to the specifications',
                'given in your experiment file.',
                'This option is required!',
                'Also consider the documentation on format and features.'
               ) do |exp_file|
          (@options ||= {})[:exp_file] = File.expand_path(exp_file)
        end

        opts.separator ''
        opts.separator 'Common options:'

        opts.on_tail('-h', '--help', 'Show this help message.') do
          puts opts
          exit
        end
      end
    end # def self.create_parser

    # Print +message+ plus a pointer to --help on $stderr and exit with 1.
    def self.abort_with_hint(message)
      $stderr.puts message
      $stderr.puts "Please consult <#{PROGRAM_NAME} --help>."
      exit(1)
    end
    private_class_method :abort_with_hint

  end # class OptParser
end # module FrPrep