shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,27 +0,0 @@
1
- require "fred/FredFeatures"
2
-
3
# Determine the senses that occur in the training data for one lemma.
#
# lemma::                 the lemma whose training senses are wanted
# exp::                   experiment configuration object; only handed on to
#                         AnswerKeyAccess
# lemmas_and_senses_obj:: object answering get_senses(lemma); consulted only
#                         when no split is in use
# split_id::              ID of a random train/test split, or nil/false when
#                         separate test data is used
#
# Returns an array of senses; empty when nothing is known for the lemma.
def determine_training_senses(lemma, exp, lemmas_and_senses_obj, split_id)
  # Separate test data: the training senses can be looked up directly.
  return lemmas_and_senses_obj.get_senses(lemma) || [] unless split_id

  # The dataset is split into random training and test portions, so the
  # training part of the answer keys has to be read to see which senses occur.
  seen_senses = {}
  reader = AnswerKeyAccess.new(exp, "train", lemma, "r", split_id, "train")
  # Only the gold senses are of interest; the remaining yielded values are
  # ignored (and deliberately do not reuse the name of the `lemma` parameter).
  reader.each do |_lemma, _pos, _ids, _sids, gold_senses, _transformed_gold_senses|
    gold_senses.each { |sense| seen_senses[sense] = true }
  end

  # Hash keys keep insertion order: each sense once, in order of first sight.
  seen_senses.keys
end
@@ -1,402 +0,0 @@
1
- # FredParameters
2
- # Katrin Erk, April 05
3
- #
4
- # Frame disambiguation system:
5
- # test different values for system parameters,
6
- # construct text and graphical output
7
-
8
- # Salsa packages
9
- require "PlotAndREval"
10
-
11
- # Fred packages
12
- require "FredConfigData"
13
- require "FredConventions"
14
- require "FredSplit"
15
- require "FredTrain"
16
- require "FredTest"
17
- require "FredEval"
18
-
19
- ##########################################
20
-
21
- ################
22
- # SlideVar:
23
- # keeps a single sliding variable,
24
- # has an iterator that yields each value of the slide as a pair
25
- # [lhs, rhs] to be passed on to FredConfigData.set_entry()
26
- #
27
- # Initialization with the value of a --slide command line parameter.
28
- # Valid forms:
29
- #
30
- # feature=<f>:<what>:<start>-<end>:<slide>
31
- # with f in { context, ngram, syn, grfunc, fe }
32
- # what in { weight, dist } (dist only available for context)
33
- # start, end, slide floats represented as strings
34
- #
35
- # <var>:<start>-<end>:<slide>
36
- # with var in { smoothing_lambda, window_size }
37
# SlideVar keeps a single sliding variable and iterates over its values,
# writing each value into the experiment configuration object.
#
# It is initialized with the value of a --slide command line parameter:
#
#   feature=<f>:<what>:<start>-<end>:<slide>
#   <var>:<start>-<end>:<slide>
#
# An empty string yields an "empty" slide variable (see #empty?).
class SlideVar
  # descriptive name of the variable being slid ("" for an empty slide variable)
  attr_reader :var_name

  # string:: value of the --slide parameter ("" for no slide variable)
  # exp::    experiment configuration object; must answer get_lf and get_type
  #
  # Raises RuntimeError if the argument cannot be parsed, names a variable
  # that is neither integer nor float, or has a step size that is not
  # greater than zero.
  def initialize(string, exp)
    # start and end value and step size for the sliding
    @startval = @endval = @step = @current = 0.0

    # lhs and rhs of what needs to be set in the experiment data for each
    # step; the rhs contains the string REPLACEME, which is replaced by the
    # current value
    @exp_lhs = ""
    @exp_rhs = ""
    @var_name = ""
    @remove_list_variable_regexp = nil # set non-nil if we need unset_list_entry()

    # empty slide variable
    return if string == ""

    if (match = /^feature=(\w+):(\w+):([\d\.]+)-([\d\.]+):([\d\.]+)$/.match(string))
      # --slide feature=ngram:weight:0.8-4.0:0.3
      # --slide feature=context:dist:0.7-0.9:0.05
      init_feature_slide(match, exp)
    elsif (match = /^(\w+):([\d\.]+)-([\d\.]+):([\d\.]+)$/.match(string))
      # --slide window_size:0-4:1
      # --slide smoothing_lambda:0.3-0.9:0.05
      init_variable_slide(match, exp)
    else
      raise "Sorry, could not parse argument of --slide. \nI got: "+ string
    end

    # A zero step (e.g. a fractional step for an integer variable, which
    # to_i turns into 0) would never reach the end value.
    unless @step > 0
      raise "Error in argument of --slide: step size must be greater than zero, I got #{@step}"
    end
  end

  # Iterate through each value of the slide variable and set it in the
  # experiment data structure. Yields a pair [value, descriptive text] per
  # step. For an empty slide variable, yields [0, ""] exactly once and
  # touches nothing.
  #
  # exp:: experiment configuration object; must answer set_entry and
  #       unset_list_entry
  def each_slide_value(exp)
    if empty?
      yield [0, ""]
      return
    end

    # Numeric#step derives each value from the start value and the step
    # count, so float rounding error does not build up and the end value
    # is not skipped.
    @startval.step(@endval, @step) do |value|
      @current = value

      if @remove_list_variable_regexp
        # list feature: unset the old entry before setting the new one
        exp.unset_list_entry(@exp_lhs, @remove_list_variable_regexp)
      end
      exp.set_entry(@exp_lhs, @exp_rhs.sub(/REPLACEME/, @current.to_s))

      yield [@current, @var_name + "=" + @current.to_s]
    end
    nil
  end

  # true if no slide variable has been given
  def empty?
    @exp_lhs.empty?
  end

  private

  # Set up sliding over the weight (or, for "context", the distance) of a
  # feature. match holds: feature name, parameter name, start, end, step.
  def init_feature_slide(match, exp)
    featurename = match[1]
    parname = match[2]
    @startval = match[3].to_f
    @endval = match[4].to_f
    @step = match[5].to_f

    @exp_lhs = "feature"

    if featurename == "context"
      # both weight and dist possible; the parameter that is not slid keeps
      # the value currently stored in the experiment data
      case parname
      when "weight"
        @exp_rhs = "#{featurename} REPLACEME #{exp.get_lf("feature", "context", "wtdist")}"
      when "dist"
        @exp_rhs = "#{featurename} #{exp.get_lf("feature", "context", "weight")} REPLACEME"
      else
        raise "Error in argument of --slide: I found a value of neither 'weight' nor 'dist': "+ parname
      end

      @exp_rhs << " mwedist" if exp.get_lf("feature", "context", "mwedist")
    else
      # feature name not "context": only weight possible
      unless parname == "weight"
        raise "Error in argument of --slide: can only do 'weight', what I got is "+ parname
      end

      @exp_rhs = "#{featurename} REPLACEME"
    end

    @var_name = "feature #{featurename} #{parname}"
    # Regexp literal: \s is the whitespace class here. Inside a double-quoted
    # string, "\s" would be a literal space character.
    @remove_list_variable_regexp = /^#{Regexp.escape(featurename)}\s/
  end

  # Set up sliding over a plain integer- or float-valued variable.
  # match holds: variable name, start, end, step.
  def init_variable_slide(match, exp)
    featurename = match[1]

    case exp.get_type(featurename)
    when "integer"
      @startval = match[2].to_i
      @endval = match[3].to_i
      @step = match[4].to_i
    when "float"
      @startval = match[2].to_f
      @endval = match[3].to_f
      @step = match[4].to_f
    else
      raise "Unslidable variable "+ featurename
    end

    @exp_lhs = featurename
    @exp_rhs = "REPLACEME"
    @var_name = featurename
  end
end
164
-
165
- ################
166
- # ToggleVar:
167
- # keeps a single toggle variable,
168
- # and has a method that sets this toggle variable to a given value
169
- # in the experiment file data structure.
170
# ToggleVar keeps a single toggle variable and can write a given boolean
# value for it into the experiment configuration object.
class ToggleVar
  # descriptive name of the toggled variable
  attr_reader :var_name

  # string:: one toggle variable name: either "feature_dim=<dimension>" or
  #          the name of a variable whose type is "bool"
  # exp::    experiment configuration object; must answer get_type
  #
  # Raises RuntimeError for an unknown feature dimension, for a variable
  # that is not of type "bool", and for use_fn_gf / window_size.
  def initialize(string, exp)
    dimension = string[/^feature_dim=(\w+)$/, 1]

    if dimension
      # feature dimension: a list-valued parameter
      @exp_lhs = "feature_dim"
      @exp_rhs = dimension
      @unset_at_false = true # "false" means: remove the entry from the list
      @var_name = "feature_dim #{@exp_rhs}"

      raise "Unknown feature dimension "+ @exp_rhs unless %w[word lemma pos ne].include?(@exp_rhs)
    else
      # normal variable: must be boolean
      raise "Unknown value in --toggle: "+ string unless exp.get_type(string) == "bool"

      if %w[use_fn_gf window_size].include?(string)
        raise "Sorry, cannot toggle #{string}, since this variable takes its effect during featurization."
      end

      @exp_lhs = string
      @exp_rhs = "REPLACEME"
      @unset_at_false = false # "false" means: set the parameter to false
      @var_name = @exp_lhs
    end
  end

  # Set the toggle variable to the given boolean in the given experiment
  # data structure.
  #
  # boolean:: true or false
  # exp::     experiment configuration object; must answer set_entry and
  #           unset_list_entry
  #
  # Returns a descriptive text of the current setting.
  def set_value_to(boolean, exp)
    if boolean || !@unset_at_false
      exp.set_entry(@exp_lhs, @exp_rhs.sub(/REPLACEME/, boolean.to_s))
    else
      exp.unset_list_entry(@exp_lhs, @exp_rhs)
    end

    "#{@var_name}=#{boolean}"
  end
end
223
-
224
-
225
- ##########################################
226
- # main class of this package:
227
- # try out different values for system parameters,
228
- # and record the result.
229
- #
230
- # One value can be a slide variable, taking on several numerical values.
231
- # 0 or more values can be toggle variables, taking on the values true and false.
232
# Main class of this package: try out different values for system
# parameters and record the result.
#
# One value can be a slide variable, taking on several numerical values.
# Zero or more values can be toggle variables, taking on the values true
# and false.
class FredParameters
  # exp_obj:: experiment configuration object
  # options:: hash: runtime option name (string) => value (string).
  #           Recognized: "--slide", "--toggle" (variable names separated
  #           by ":"), "--output_to" (prefix for the output files).
  #           Other options are ignored here.
  def initialize(exp_obj, options)
    in_enduser_mode_unavailable()
    @exp = exp_obj

    # evaluate runtime options:
    # record the slide variable (if any) plus all toggle variables
    @slide = SlideVar.new("", @exp)
    @toggle = []
    @outfile_prefix = "fred_parameters"

    options.each_pair do |opt, arg|
      case opt
      when "--slide"
        @slide = SlideVar.new(arg, @exp)
      when "--toggle"
        arg.split(":").each { |toggle_var| @toggle << ToggleVar.new(toggle_var, @exp) }
      when "--output_to"
        @outfile_prefix = arg
      end
    end

    # announce the task
    $stderr.puts "---------"
    $stderr.puts "Fred parameter exploration, experiment #{@exp.get("experiment_ID")}"
    $stderr.puts "---------"
  end

  # Run the parameter exploration: make a split of the training data, then
  # train, test and evaluate on it for every combination of toggle values
  # and slide values. Scores are written to <prefix>.txt; if a slide
  # variable is in use, a plot is requested for <prefix>.ps.
  #
  # Exits with status 1 if fred_dirname fails for the training features.
  # Raises RuntimeError if the text output file cannot be opened.
  def compute
    begin
      fred_dirname(@exp, "train", "features")
    rescue
      $stderr.puts "To experiment with system parameters, please first featurize training data."
      exit 1
    end

    # new split ID from system time; split with 80% training, 20% test data
    split_id = Time.new.to_f.to_s
    split_task = FredSplit.new(@exp,
                               { "--logID" => split_id, "--trainpercent" => "80" },
                               true) # ignore unambiguous

    begin
      split_task.compute
      values_to_score, max_setting = explore(split_id)
    ensure
      # the split is only needed for this exploration: remove it even when
      # one of the runs fails
      FredSplit.remove_split(@exp, split_id)
    end

    report(values_to_score, max_setting)
  end

  private

  # Run train/test/eval for every toggle combination and slide value,
  # writing one line per run to the text output file.
  #
  # Returns [values_to_score, max_setting]:
  # values_to_score:: hash toggle_values_descr (string) =>
  #                   hash slide_value => score
  # max_setting::     string describing the run with the maximum score,
  #                   "" if no score was above 0.0
  def explore(split_id)
    textout_file =
      begin
        File.new(@outfile_prefix + ".txt", "w")
      rescue
        raise "Could not write to output file #{@outfile_prefix}.txt"
      end

    values_to_score = {}
    max_score = 0.0
    max_setting = ""

    begin
      # each integer in this range encodes one combination of toggle values
      (0...2**@toggle.length).each do |binary|
        textout_line = apply_toggles(binary)
        values_to_score[textout_line] = {}

        @slide.each_slide_value(@exp) do |slide_value, slide_value_description|
          # progress bar
          $stderr.puts "Parameter exploration: #{textout_line} #{slide_value_description}"

          # @exp has been modified to fit the current values of the slide
          # and toggle variables
          score = train_test_eval(split_id)
          result_line = textout_line + slide_value_description + " : " + score.to_s

          textout_file.puts result_line
          textout_file.flush
          values_to_score[textout_line][slide_value] = score

          if score > max_score
            max_score = score
            max_setting = result_line
          end
        end
      end
    ensure
      textout_file.close
    end

    [values_to_score, max_setting]
  end

  # Set every toggle variable according to the bits of 'binary': the i-th
  # toggle is set to true iff the i-th bit is set.
  #
  # Returns the concatenated descriptions of all settings, each followed by
  # a space ("" if there are no toggle variables).
  def apply_toggles(binary)
    @toggle.each_with_index.map do |toggle_obj, i|
      toggle_obj.set_value_to(binary[i] == 1, @exp) + " "
    end.join
  end

  # Train, test and evaluate on the given split with the current settings
  # in @exp. Returns the value of FredEval#f (stored as the F-score).
  def train_test_eval(split_id)
    FredTrain.new(@exp, { "--logID" => split_id }).compute
    FredTest.new(@exp, { "--logID" => split_id, "--nooutput" => true }).compute

    evaluation = FredEval.new(@exp, { "--logID" => split_id })
    evaluation.compute(false) # don't print evaluation results to file
    evaluation.f
  end

  # Plot the outcome (only if a slide variable has been used) and report
  # where the output went and what the overall maximum was.
  def report(values_to_score, max_setting)
    unless @slide.empty?
      # joining all names avoids a dangling ", " when there are no toggles
      title = "Exploring " + [@slide.var_name, *@toggle.map(&:var_name)].join(", ")
      PlotAndREval.gnuplot_direct(values_to_score,
                                  title,
                                  @slide.var_name,
                                  "F-score",
                                  @outfile_prefix + ".ps")
    end

    $stderr.puts "Parameter exploration finished."
    $stderr.puts "Text output was written to #{@outfile_prefix}.txt"
    $stderr.puts "Gnuplot output was written to #{@outfile_prefix}.ps" unless @slide.empty?

    return if max_setting.empty?

    $stderr.puts "-----------------------"
    $stderr.puts "Maximum score:"
    $stderr.puts max_setting
  end
end
402
-
@@ -1,84 +0,0 @@
1
- # FredSplit
2
- # Katrin Erk April 05
3
- #
4
- # Frame disambiguation system:
5
- # make random split of the training data
6
- #
7
- # The split is computed on the basis of the Fred format
8
- # feature data.
9
- # The split is recorded in a separate split directory
10
- # with a very simple system:
11
- # - one file per feature file, same filename
12
- # - one line per instance line in feature file
13
- # - entry in that line is either 'train' or 'test'
14
-
15
- # Fred packages
16
- require "fred/FredSplitPkg"
17
-
18
# FredSplit: task object that makes a random split of the training data
# by delegating to a FredSplitPkg object.
class FredSplit
  # Evaluate runtime options and announce the task.
  #
  # exp_obj::            experiment configuration object
  # options::            hash: runtime option name (string) => value (string).
  #                      Recognized: "--logID" (required; used as split ID)
  #                      and "--trainpercent" (default 90). Other options
  #                      are ignored here.
  # ignore_unambiguous:: handed on to FredSplitPkg#make_new_split
  #
  # Raises RuntimeError if no log ID is given or if the training percentage
  # is not strictly between 0 and 100.
  def initialize(exp_obj, options, ignore_unambiguous = false)
    in_enduser_mode_unavailable()

    @exp = exp_obj
    @ignore_unambiguous = ignore_unambiguous

    # evaluate runtime options
    @split_id = nil
    percent = 90.0

    options.each_pair do |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      when "--trainpercent"
        percent = arg.to_f
      end
    end
    @trainpercent = percent / 100.0

    # Display the percentage as given. Recomputing it from the fraction
    # truncates float error: (0.29 * 100.0).to_i is 28.
    percent_text = percent.to_i.to_s

    # sanity check: need a log ID
    raise "I need a log ID, parameter --logID" if @split_id.nil?

    if @trainpercent <= 0.0 || @trainpercent >= 1.0
      raise "Training percentage needs to be between 1 and 99. I got "+
            percent_text
    end

    # make a splitting object
    @split_obj = FredSplitPkg.new(@exp)

    # announce the task
    $stderr.puts "---------"
    $stderr.puts "Fred experiment #{@exp.get("experiment_ID")}: Making split, using " + percent_text + "% as training data."
    $stderr.puts "---------"
  end

  # Remove the split with the given ID (delegates to FredSplitPkg).
  #
  # exp::     experiment configuration object
  # splitID:: string: split ID
  def FredSplit.remove_split(exp, splitID)
    FredSplitPkg.remove_split(exp, splitID)
  end

  # Do the splitting: remove any existing split with this ID, then make a
  # new one.
  def compute
    FredSplit.remove_split(@exp, @split_id)
    @split_obj.make_new_split(@split_id, @trainpercent,
                              @ignore_unambiguous)
  end
end