shalmaneser 0.0.1.alpha → 1.2.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +2 -2
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +49 -0
  6. data/bin/fred +18 -0
  7. data/bin/frprep +34 -0
  8. data/bin/rosy +17 -0
  9. data/lib/common/AbstractSynInterface.rb +35 -33
  10. data/lib/common/Mallet.rb +236 -0
  11. data/lib/common/Maxent.rb +26 -12
  12. data/lib/common/Parser.rb +5 -5
  13. data/lib/common/SynInterfaces.rb +13 -6
  14. data/lib/common/TabFormat.rb +7 -6
  15. data/lib/common/Tiger.rb +4 -4
  16. data/lib/common/Timbl.rb +144 -0
  17. data/lib/common/{FrprepHelper.rb → frprep_helper.rb} +14 -8
  18. data/lib/common/headz.rb +1 -1
  19. data/lib/common/ruby_class_extensions.rb +3 -3
  20. data/lib/fred/FredBOWContext.rb +14 -2
  21. data/lib/fred/FredDetermineTargets.rb +4 -9
  22. data/lib/fred/FredEval.rb +1 -1
  23. data/lib/fred/FredFeatureExtractors.rb +4 -3
  24. data/lib/fred/FredFeaturize.rb +1 -1
  25. data/lib/frprep/CollinsInterface.rb +6 -6
  26. data/lib/frprep/MiniparInterface.rb +5 -5
  27. data/lib/frprep/SleepyInterface.rb +7 -7
  28. data/lib/frprep/TntInterface.rb +1 -1
  29. data/lib/frprep/TreetaggerInterface.rb +29 -5
  30. data/lib/frprep/do_parses.rb +1 -0
  31. data/lib/frprep/frprep.rb +36 -32
  32. data/lib/{common/BerkeleyInterface.rb → frprep/interfaces/berkeley_interface.rb} +69 -95
  33. data/lib/frprep/interfaces/stanford_interface.rb +353 -0
  34. data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
  35. data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
  36. data/lib/frprep/opt_parser.rb +2 -2
  37. data/lib/rosy/AbstractFeatureAndExternal.rb +5 -3
  38. data/lib/rosy/RosyIterator.rb +11 -10
  39. data/lib/rosy/rosy.rb +1 -0
  40. data/lib/shalmaneser/version.rb +1 -1
  41. data/test/functional/sample_experiment_files/fred_test.salsa.erb +1 -1
  42. data/test/functional/sample_experiment_files/fred_train.salsa.erb +1 -1
  43. data/test/functional/sample_experiment_files/prp_test.salsa.erb +2 -2
  44. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +2 -2
  45. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +2 -2
  46. data/test/functional/sample_experiment_files/prp_train.salsa.erb +2 -2
  47. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +2 -2
  48. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +2 -2
  49. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +1 -1
  50. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +7 -7
  51. data/test/functional/test_frprep.rb +3 -3
  52. data/test/functional/test_rosy.rb +20 -0
  53. metadata +215 -224
  54. data/CHANGELOG.rdoc +0 -0
  55. data/LICENSE.rdoc +0 -0
  56. data/README.rdoc +0 -0
  57. data/lib/common/CollinsInterface.rb +0 -1165
  58. data/lib/common/MiniparInterface.rb +0 -1388
  59. data/lib/common/SleepyInterface.rb +0 -384
  60. data/lib/common/TntInterface.rb +0 -44
  61. data/lib/common/TreetaggerInterface.rb +0 -303
  62. data/lib/frprep/AbstractSynInterface.rb +0 -1227
  63. data/lib/frprep/BerkeleyInterface.rb +0 -375
  64. data/lib/frprep/ConfigData.rb +0 -694
  65. data/lib/frprep/FixSynSemMapping.rb +0 -196
  66. data/lib/frprep/FrPrepConfigData.rb +0 -66
  67. data/lib/frprep/FrprepHelper.rb +0 -1324
  68. data/lib/frprep/ISO-8859-1.rb +0 -24
  69. data/lib/frprep/Parser.rb +0 -213
  70. data/lib/frprep/SalsaTigerRegXML.rb +0 -2347
  71. data/lib/frprep/SalsaTigerXMLHelper.rb +0 -99
  72. data/lib/frprep/SynInterfaces.rb +0 -275
  73. data/lib/frprep/TabFormat.rb +0 -720
  74. data/lib/frprep/Tiger.rb +0 -1448
  75. data/lib/frprep/Tree.rb +0 -61
  76. data/lib/frprep/headz.rb +0 -338
@@ -1,375 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- ####
3
- # sp 21 07 05
4
- #
5
- # modified ke 30 10 05: adapted to fit into SynInterface
6
- #
7
- # represents a file containing Berkeley parses
8
- #
9
- # underlying data structure for individual sentences: SalsaTigerSentence
10
- require "tempfile"
11
-
12
- require "frprep/SalsaTigerRegXML"
13
- require "frprep/SalsaTigerXMLHelper"
14
- require "frprep/TabFormat"
15
- require "frprep/Counter"
16
-
17
- require "frprep/AbstractSynInterface"
18
- require "frprep/Tiger.rb"
19
-
20
- ################################################
21
- # Interface class
22
- class BerkeleyInterface < SynInterfaceSTXML
23
- $stderr.puts 'Announcing Berkeley Interface' if $DEBUG
24
- BerkeleyInterface.announce_me()
25
-
26
- ###
27
- def BerkeleyInterface.system()
28
- return "berkeley"
29
- end
30
-
31
- ###
32
- def BerkeleyInterface.service()
33
- return "parser"
34
- end
35
-
36
- ###
37
- # initialize to set values for all subsequent processing
38
- def initialize(program_path, # string: path to system
39
- insuffix, # string: suffix of tab files
40
- outsuffix, # string: suffix for parsed files
41
- stsuffix, # string: suffix for Salsa/TIGER XML files
42
- var_hash = {}) # optional arguments in a hash
43
-
44
- super(program_path, insuffix, outsuffix, stsuffix, var_hash)
45
- unless @program_path =~ /\/$/
46
- @program_path = @program_path + "/"
47
- end
48
-
49
- # new: evaluate var hash
50
- @pos_suffix = var_hash["pos_suffix"]
51
- @lemma_suffix = var_hash["lemma_suffix"]
52
- @tab_dir = var_hash["tab_dir"]
53
- end
54
-
55
- ####
56
- # parse a directory with TabFormat files and write the parse trees to outputdir
57
- # I assume that the files in inputdir are smaller than
58
- # the maximum number of sentences that
59
- # Berkeley can parse in one go (i.e. that they are split)
60
- def process_dir(in_dir, # string: input directory name
61
- out_dir) # string: output directory name
62
-
63
- # not using x64 arch, adjusting for 32 bit
64
- # berkeley_prog = "java -d64 -Xmx10000m -jar #{@program_path}berkeley-parser.jar -gr #{@program_path}gerNegra.01.utf8 "
65
- berkeley_prog = "java -Xmx2000m -jar #{@program_path}berkeleyParser.jar -gr #{@program_path}ger_sm5.gr"
66
-
67
- berkeley_prog = "java -jar #{@program_path}berkeley-parser.jar -gr #{@program_path}gerNegra.01.utf8 "
68
- Dir[in_dir + "*" + @insuffix].each {|inputfilename|
69
- STDERR.puts "*** Parsing #{inputfilename} with Berkeley"
70
- corpusfilename = File.basename(inputfilename, @insuffix)
71
- parsefilename = out_dir + corpusfilename + @outsuffix
72
- tempfile = Tempfile.new(corpusfilename)
73
-
74
- # we need neither lemmata nor POS tags; berkeley can do with the words
75
- corpusfile = FNTabFormatFile.new(inputfilename,nil, nil)
76
- corpusfile.each_sentence {|sentence|
77
- #puts sentence.to_s
78
- tempfile.puts sentence.to_s
79
- }
80
- tempfile.close
81
- # parse and remove comments in the parser output
82
- STDERR.puts "#{berkeley_prog} < #{tempfile.path} > #{parsefilename}"
83
-
84
- # AB: for testing we leave this step out, it takes too much time.
85
- # Please keep the <parsefile> intact!!!
86
- # Kernel.system("#{berkeley_prog} < #{tempfile.path} > #{parsefilename}")
87
- FileUtils.cp tempfile.path, '/home/arbox/input.txt'
88
- }
89
- end
90
-
91
- ###
92
- # for a given parsed file:
93
- # yield each sentence as a pair
94
- # [SalsaTigerSentence object, FNTabFormatSentence object]
95
- # of the sentence in SalsaTigerXML and the matching tab format sentence
96
- #
97
- # If a parse has failed, returns
98
- # [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
99
- # to allow more detailed accounting for failed parses
100
- # (basically just a flat structure with a failed=true attribute
101
- # at the sentence node)
102
- def each_sentence(parsefilename)
103
- # sanity checks
104
- unless @tab_dir
105
- raise "Need to set tab directory on initialization"
106
- end
107
-
108
- # get matching tab file for this parser output file
109
- parsefile = File.new(parsefilename)
110
- tabfilename = @tab_dir+File.basename(parsefilename, @outsuffix)+ @insuffix
111
- tabfile = FNTabFormatFile.new(tabfilename, @postag_suffix, @lemma_suffix)
112
-
113
- sentid = 0
114
- tabfile.each_sentence {|tab_sent| # iterate over corpus sentences
115
-
116
- sentence_str = ""
117
- status = true # error encountered?
118
- # assemble next sentence in Berkeley file by reading lines from parsefile
119
- # for berkeley:
120
- while true
121
- line = parsefile.gets
122
-
123
- # search for the next "relevant" line or the end of the file
124
- if line.nil? or line=~/^\( *\(TOP/ or line=~/^\(\(\)/
125
- break
126
- end
127
- sentid +=1
128
-
129
- end
130
-
131
-
132
- if line.nil? # while we search a parse, the parse file is over...
133
- raise "Error: premature end of parser file!"
134
- end
135
-
136
-
137
- # berkeley parser output: remove brackets /(.*)/
138
- line.sub!(/^\( */, '')
139
- line.sub!(/ *\) *$/, '')
140
- line.gsub!(/\)\)/, ') )')
141
- line.gsub!(/\)\)/, ') )')
142
- line.gsub!(/(\([A-Z]+)_/, '\1-')
143
-
144
- sentence_str = line.chomp!
145
-
146
- # if we are here, we have a sentence_str to work on
147
- # hopefully, our status is OK
148
- case status
149
- when true
150
- if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
151
- my_sent_id = tab_sent.get_sent_id()
152
- else
153
- my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
154
- end
155
-
156
- st_sent = build_salsatiger(" " + sentence_str + " ", 0,
157
- Array.new, Counter.new(0),
158
- Counter.new(500),
159
- SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
160
- if st_sent.nil?
161
- next
162
- end
163
- yield [st_sent, tab_sent, BerkeleyInterface.standard_mapping(st_sent, tab_sent)]
164
- else # i.e. when "failed"
165
- #raise "Hunh? This is a failed parse, but still we have a parse tree? Look again."
166
- end
167
-
168
- }
169
-
170
- # we don't have a sentence: hopefully, this is because parsing has failed
171
-
172
-
173
- # all TabFile sentences are consumed:
174
- # now we may just encounter comments, garbage, empty lines etc.
175
-
176
- while not parsefile.eof?
177
-
178
- case parsefile.gets
179
- when nil, /^%/, /^\s*$/ # empty lines, comments, end of input indicate end of current parse
180
- else
181
- raise "Error: premature end of tab file!"
182
- end
183
- end
184
- end
185
-
186
-
187
- ###
188
- # write Salsa/TIGER XML output to file
189
- def to_stxml_file(infilename, # string: name of parse file
190
- outfilename) # string: name of output stxml file
191
-
192
- outfile = File.new(outfilename, "w")
193
-
194
- outfile.puts SalsaTigerXMLHelper.get_header()
195
- each_sentence(infilename) { |st_sent, tabsent|
196
- outfile.puts st_sent.get()
197
- }
198
- outfile.puts SalsaTigerXMLHelper.get_footer()
199
- outfile.close()
200
- end
201
-
202
-
203
-
204
- ########################
205
- private
206
-
207
- ###
208
- # Recursive function for parsing a Berkeley parse tree and
209
- # building a SalsaTigerSentence recursively
210
- #
211
- # Algorithm: manage stack which contains, for the current constituent,
212
- # child constituents (if a nonterminal), and the category label.
213
- # When the end of a constituent is reached, a new SynNode (TigerSalsa node) is created.
214
- # All children and the category label are popped from the stack and integrated into the
215
- # TigerSalsa data structure. The new node is re-pushed onto the stack.
216
- def build_salsatiger(sentence, # string
217
- pos, # position in string (index): integer
218
- stack, # stack with incomplete nodes: Array
219
- termc, # terminal counter
220
- nontc, # nonterminal counter
221
- sent_obj) # SalsaTigerSentence
222
-
223
-
224
-
225
- if sentence =~ /\(\)/
226
- return nil
227
- end
228
-
229
- # main case distinction: match the beginning of our string
230
- # (i.e. what follows our current position in the string)
231
- case sentence[pos..-1]
232
-
233
- when /^ *$/ # nothing -> whole sentence parsed
234
- if stack.length == 1
235
- # sleepy always delivers one "top" node; if we don't get just one
236
- # node, something has gone wrong
237
- node = stack.pop
238
- node.del_attribute("gf")
239
- return sent_obj
240
- else
241
- raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
242
- end
243
-
244
- when /^\s*\(([^ )]+) /
245
- # match the beginning of a new constituent
246
- # (opening bracket + category + space, may not contain closing bracket)
247
- cat = $1
248
- if cat.nil? or cat == ""
249
- raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
250
- end
251
- # STDERR.puts "new const #{cat}"
252
- stack.push cat # throw the category label on the stack
253
- return build_salsatiger(sentence,pos+$&.length,stack,termc,nontc,sent_obj)
254
-
255
- when /^\s*(\S+)\) /
256
- # match the end of a terminal constituent (something before a closing bracket + space)
257
- word = $1
258
-
259
- comb_cat = stack.pop
260
- if comb_cat.to_s == ""
261
- raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
262
- end
263
-
264
- cat,gf = split_cat(comb_cat)
265
- node = sent_obj.add_syn("t",
266
- nil, # cat (doesn't matter here)
267
- SalsaTigerXMLHelper.escape(word), # word
268
- cat, # pos
269
- termc.next.to_s)
270
- node.set_attribute("gf",gf)
271
- # STDERR.puts "completed terminal #{cat}, #{word}"
272
- stack.push node
273
- return build_salsatiger(sentence,pos+$&.length,stack,termc,nontc,sent_obj)
274
-
275
- when /^\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
276
- # now collect children:
277
- # pop items from the stack until you find the category
278
- children = Array.new
279
- while true
280
- if stack.empty?
281
- raise "Error: stack empty; cannot find more children"
282
- end
283
- item = stack.pop
284
- case item.class.to_s
285
- when "SynNode" # this is a child
286
- children.push item
287
- when "String" # this is the category label
288
- if item.to_s == ""
289
- raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
290
- end
291
- cat,gf = split_cat(item)
292
- break
293
- else
294
- raise "Error: unknown item class #{item.class.to_s}"
295
- end
296
- end
297
- # now add a nonterminal node to the sentence object and
298
- # register the children nodes
299
- node = sent_obj.add_syn("nt",
300
- cat, # cat
301
- nil, # word (doesn't matter)
302
- nil, # pos (doesn't matter)
303
- nontc.next.to_s)
304
- children.each {|child|
305
- child_gf = child.get_attribute("gf")
306
- child.del_attribute("gf")
307
- node.add_child(child,child_gf)
308
- child.add_parent(node, child_gf)
309
- }
310
- node.set_attribute("gf",gf)
311
- # STDERR.puts "Completed nonterm #{cat}, #{children.length} children."
312
- stack.push node
313
-
314
- return build_salsatiger(sentence,pos+$&.length, stack,termc,nontc,sent_obj)
315
- else
316
- raise "Error: cannot analyse sentence at pos #{pos}: #{sentence[pos..-1]}. Complete sentence: \n#{sentence}"
317
- end
318
- end
319
-
320
-
321
-
322
-
323
- ###
324
- # Berkeley delivers node labels as "phrase type"-"grammatical function"
325
- # but the GF may not be present.
326
-
327
- def split_cat(cat)
328
-
329
- cat =~ /^([^-]*)(-([^-]*))?$/
330
- unless $1
331
- raise "Error: could not identify category in #{cat}"
332
- end
333
-
334
- proper_cat = $1
335
-
336
- if $3
337
- gf = $3
338
- else
339
- gf = ""
340
- end
341
-
342
- return [proper_cat,gf]
343
-
344
- end
345
- end
346
-
347
-
348
-
349
- ################################################
350
- # Interpreter class
351
- class BerkeleyInterpreter < Tiger
352
- BerkeleyInterpreter.announce_me()
353
-
354
- ###
355
- # names of the systems interpreted by this class:
356
- # returns a hash service(string) -> system name (string),
357
- # e.g.
358
- # { "parser" => "collins", "lemmatizer" => "treetagger" }
359
- def BerkeleyInterpreter.systems()
360
- return {
361
- "parser" => "berkeley"
362
- }
363
- end
364
-
365
- ###
366
- # names of additional systems that may be interpreted by this class
367
- # returns a hash service(string) -> system name(string)
368
- # same format as systems()
369
- def BerkeleyInterpreter.optional_systems()
370
- return {
371
- "lemmatizer" => "treetagger"
372
- }
373
- end
374
-
375
- end
@@ -1,694 +0,0 @@
1
- # class ConfigData:
2
- #
3
- # reads config data file,
4
- # matches it against feature declarations given in its new() method,
5
- # offers access methods for different kinds of features
6
- #
7
- # In the config file, all feature specifications have the form
8
- #
9
- # feature_name = feature_value
10
- #
11
- # where feature_name is a string without spaces. feature_value
12
- # may include spaces, depending on the feature type (see below).
13
- #
14
- # To include a comment in a config file, start the comment line with
15
- # '#'.
16
- #
17
- # Features are typed. The following types are supported:
18
- #
19
- # - normal types:
20
- # "bool", "float", "integer", "string"
21
- # For the get() function with which features in the ConfigData object
22
- # are accessed, the values are transformed from the strings in the
23
- # config file to the appropriate class: Boolean, Float, Integer, String
24
- #
25
- # - other types:
26
- # pattern: This is a feature that may include variables in
27
- # <> brackets. When this feature is accessed,
28
- # values for these variables are given, i.e. this
29
- # pattern has to be instantiated.
30
- # For example, given a feature
31
- #
32
- # fileformat = features.<type>.train
33
- #
34
- # and method call
35
- # instantiate("fileformat", "type" => "path")
36
- #
37
- # what is returned is a string "features.path.train"
38
- #
39
- # Variables used in a pattern have to be declared to
40
- # the new() method.
41
- #
42
- # list: This is the only feature type where more than one
43
- # feature specification with the same feature_name is allowed.
44
- # The right-hand sides of a list feature are stored in an array.
45
- #
46
- # Given a 'list' feature 'bla', if the config file contains
47
- #
48
- # bla = blupp 1 2
49
- # bla = la di da
50
- #
51
- # the list feature 'bla' is represented as follows:
52
- # @features['bla'] = [['blupp', 1,2], ['la', 'di', 'da']]
53
- #
54
- # For comfortable access to a list feature, arbitrary
55
- # access functions for list features can be defined.
56
- #
57
- #
58
-
59
- require 'frprep/ruby_class_extensions'
60
-
61
-
62
- #####################################################
63
- ####################################################
64
- # ConfigData is the main class in this package.
65
- # It manages config files.
66
- #
67
- # To use it, inherit from it and just make a new new() method
68
- # that only takes as input the name of the config file
69
- # and that declares all the feature types and variable names
70
- # needed for the given application.
71
-
72
- class ConfigData
73
-
74
- ###########
75
- # new()
76
- #
77
- # reads the config file
78
- #
79
- # Input parameters: the name of the config file, a hash declaring all
80
- # features by mapping feature names to their types,
81
- # and an array of all variables that may occur in pattern type features
82
- #
83
- def initialize(filename, # string: name of config file
84
- feature_types, # hash: feature_name => feature_type
85
- variables) # array of strings: list of variables used in pattern features
86
-
87
- @test_print = false
88
- @variables = variables
89
- @original_filename = filename
90
-
91
- ##
92
- # open config file
93
- begin
94
- file = File.new(filename)
95
- rescue
96
- $stderr.puts "Error: I could not open the experiment file " + filename
97
- exit 1
98
- end
99
-
100
- # feature_types: hash: feature_name => feature_type
101
- # features: hash: feature_name => value
102
- @feature_types = feature_types
103
- @features = Hash.new
104
-
105
- # @list_feature_access: hash feature_name => Proc
106
- # access method for list features
107
- @list_feature_access = Hash.new
108
-
109
- # pre-initialize list features to an empty array
110
- @feature_types.each_pair { |feature_name, feature_type|
111
- if feature_type == "list"
112
- @features[feature_name] = Array.new
113
- end
114
- }
115
-
116
- ##
117
- # examine the config file contents
118
-
119
- while (line = file.gets())
120
- line = line.chomp().strip()
121
- if line =~ /^#/ # comment
122
- next
123
- end
124
-
125
- if line.empty? # nothing to be seen here
126
- next
127
- end
128
-
129
- feature_name, rhs = extract_def(line)
130
- set_entry(feature_name, rhs)
131
- end
132
- end
133
-
134
- #####
135
- # set_entry
136
- #
137
- # set an entry in the experiment file, either an existing or a new one
138
- # but it must conform to the feature types declared in the new() method
139
- def set_entry(feature_name, rhs)
140
-
141
- unless @feature_types[feature_name]
142
- $stderr.puts "Error in experiment file:"
143
- $stderr.puts "Unknown parameter #{feature_name} in #{@original_filename}."
144
- $stderr.puts "Expected features for this type of experiment file:"
145
- $stderr.puts @feature_types.keys().join(", ")
146
- exit 1
147
- end
148
-
149
- case @feature_types[feature_name]
150
- when "pattern"
151
- # file format specification
152
-
153
- @features[feature_name] = ConfigFormatElement.new(rhs, @variables)
154
-
155
- when "list"
156
-
157
- # rhs is a string of space-separated words
158
- # the first of them is the key, the rest is the value, to be
159
- # stored as an array of words
160
-
161
- # split rhs into words
162
- if rhs.empty?
163
- $stderr.puts "WARNING: I got an empty value for list feature #{feature_name}."
164
- $stderr.puts "I'll ignore it."
165
- else
166
- unless @features[feature_name].include? rhs.split()
167
- @features[feature_name] << rhs.split()
168
- end
169
- end
170
-
171
- when "bool"
172
- # boolean value
173
- unless ["true", "false"].include? rhs
174
- $stderr.puts "Error in experiment file:"
175
- $stderr.puts "Value for #{feature_name} must be either 'true' or 'false'."
176
- $stderr.puts "I got: "+ rhs.to_s
177
- exit 1
178
- end
179
- @features[feature_name] = (rhs == "true")
180
-
181
- when "float"
182
- # float value
183
- @features[feature_name] = rhs.to_f
184
-
185
- when "integer"
186
- # integer value
187
- @features[feature_name] = rhs.to_i
188
-
189
- when "string"
190
- # string value
191
- @features[feature_name] = rhs
192
-
193
- else
194
- raise "Unknown feature type for feature #{feature_name}: #{@feature_types[feature_name]}"
195
- end
196
- end
197
-
198
- ####
199
- # remove list entry in this config data structure:
200
- # the lhs argument is the list feature name
201
- # the rhs argument can be a string or a regexp.
202
- # - string: each entry exactly matching the string is removed
203
- # - regexp: each entry matching the regexp is removed
204
- def unset_list_entry(lhs, #string: feature name
205
- rhs) # string/regexp: righthand side
206
- unless @feature_types[lhs] == "list"
207
- $stderr.puts "Error in experiment file: "
208
- $stderr.puts "Feature #{lhs} unknown or not of type list."
209
- exit 1
210
- end
211
-
212
- case rhs.class.to_s
213
- when "String"
214
- rhs_match = Regexp.new("^" + Regexp.escape(rhs) + "$")
215
- when "Regexp"
216
- rhs_match = rhs
217
- else
218
- raise "Shouldn't be here: " + rhs.class.to_s
219
- end
220
-
221
- to_delete = @features[lhs].select { |entry| entry.join(" ") =~ rhs_match }
222
- to_delete.each { |entry| @features[lhs].delete(entry) }
223
- end
224
-
225
-
226
- #####
227
- # adjoin
228
- #
229
- # adds the information from a second ConfigData object
230
- # to this one.
231
- # Disjointness of feature names is assumed.
232
- def adjoin(config_obj) # ConfigData object
233
-
234
- ##
235
- # sanity checks:
236
- # the other object must be a ConfigData object
237
- unless config_obj.kind_of? ConfigData
238
- raise "I can only adjoin another ConfigData object"
239
- end
240
-
241
- # if feature name sets are not disjoint,
242
- # ignore the feature names that I already have
243
- other_features, other_feature_types, other_list_feature_access = config_obj.get_contents()
244
- unless (@feature_types.keys & other_feature_types.keys).empty?
245
- other_features = other_features.clone()
246
- other_feature_types = other_feature_types.clone()
247
- other_list_feature_access = other_list_feature_access.clone()
248
-
249
- (@feature_types.keys() & other_feature_types.keys()).each { |overlap_feature|
250
- other_features.delete(overlap_feature)
251
- other_feature_types.delete(overlap_feature)
252
- other_list_feature_access.delete(overlap_feature)
253
- }
254
- end
255
-
256
- # now adjoin the contents of the other config objects to mine
257
- @features.update(other_features)
258
- @feature_types.update(other_feature_types)
259
- @list_feature_access.update(other_list_feature_access)
260
- end
261
-
262
- #####
263
- # get()
264
- #
265
- # returns the value of a given feature
266
- # raises an error if no feature of this name
267
- # has been declared to the new() method
268
- #
269
- # returns: a feature value. the type of the return value
270
- # depends on the type of the feature.
271
- # returns nil if the feature has not been set in the config file.
272
- def get(name) # string: name of the feature to access
273
- if @feature_types[name].nil?
274
- raise "Unknown feature " + name
275
- end
276
-
277
- # may return nil if something has not been set
278
- return @features[name]
279
- end
280
-
281
- ####
282
- # get_type
283
- #
284
- # returns the type of a given feature,
285
- # or nil if it is undefined
286
- def get_type(feature_name)
287
- return @feature_types[feature_name]
288
- end
289
-
290
- #####
291
- # is_defined
292
- #
293
- # returns: true if a feature by this name has been set in the config file,
294
- # false else
295
- def is_defined(feature) # string: name of the feature
296
- if @features[feature]
297
- return true
298
- else
299
- return false
300
- end
301
- end
302
-
303
- #####
304
- # instantiate
305
- #
306
- # given a pattern type feature, and a hash
307
- # mapping all variables occurring in the pattern to
308
- # values, instantiate the pattern
309
- #
310
- # returns: string, the pattern with all variables
311
- # instantiated with their values
312
- def instantiate(key, # string: feature name
313
- var_hash={}) # hash: variable name(string) => value(string)
314
-
315
- unless @feature_types[key] == "pattern"
316
- raise "Nothing known about pattern " + key
317
- end
318
- unless @features[key]
319
- raise "Please define pattern in configuration file: " + key
320
- end
321
-
322
- # piece together the file name
323
- # expand in case it is a filename/directory
324
- return @features[key].instantiate(var_hash)
325
- end
326
-
327
- #####
328
- # get_filename:
329
- #
330
- # synonym for instantiate()
331
- def get_filename(key, var_hash={})
332
- return instantiate(key, var_hash)
333
- end
334
-
335
- #####
336
- # set_test_print
337
- #
338
- # set test output to on (true) or off (false)
339
- def set_test_print(tf) # boolean
340
- unless [true, false].include? tf
341
- raise "Shouldn't be here"
342
- end
343
- @test_print = tf
344
- end
345
-
346
-
347
- #####
348
- # get_all_filenames
349
- #
350
- # given a directory, a pattern type feature,
351
- # and a hash mapping some of the pattern's variables
352
- # to values, return all filenames in the given directory
353
- # that match the partially instantiated pattern
354
- #
355
- # returns: an array of pairs [filename(string), matches(hash)]
356
- # where the matches hash maps all variables of the pattern to
357
- # their values as instantiated in the given filename
358
- # The filename doesn't include the directory.
359
- def get_all_filenames(dir, #string: directory name
360
- key, # string: name of pattern type feature
361
- var_hash={}) # hash: variable name(string) => value(string)
362
-
363
- unless @feature_types[key] == "pattern"
364
- raise "Nothing known about file format " + key
365
- end
366
-
367
- # array of pairs [filename(string), matches(hash)]
368
- filenames = Array.new
369
-
370
- # iterate through all files of this directory
371
- Dir.foreach(dir) { |filename|
372
- # does the filename match the pattern of the feature "key"?
373
- if (matches = @features[key].match(filename, var_hash))
374
- # do the variable values for this filename conform
375
- # to the variable values given in var_hash?
376
- if @test_print
377
- $stderr.puts "got " + filename
378
- end
379
- if var_hash.keys.select { |var|
380
- matches[var] != var_hash[var]
381
- }.empty?
382
- filenames << [filename, matches]
383
- else
384
- # mismatch for given variables
385
- if @test_print
386
- var_hash.keys.each { |var|
387
- if matches[var] != var_hash[var]
388
- $stderr.puts "Mismatch for " + var + ": " +
389
- matches[var].to_s + " vs. " + var_hash[var]
390
- end
391
- }
392
- end
393
- end
394
- end
395
- }
396
-
397
- return filenames
398
- end
399
-
400
- #####
401
- # set list feature access:
402
- #
403
- # for a given list type feature, set a method that should
404
- # be used for accessing the feature.
405
- #
406
- # method signature: first parameter is an array of tuples of strings.
407
- # for each experiment file entry
408
- # feature = rhs
409
- # there will be a tuple rhs.split() in the list.
410
- #
411
- # The other parameters are not checked by ConfigData, there
412
- # may be arbitrarily many
413
- def set_list_feature_access(feature_name, # string: name of the feature
414
- proc) # proc: access method for list feature
415
- unless @feature_types[feature_name] == 'list'
416
- raise "Cannot set list feature access to non-list feature #{feature_name}"
417
- end
418
-
419
- @list_feature_access[feature_name] = proc
420
- end
421
-
422
- #####
423
- # get_lf
424
- #
425
- # access a list type feature for which an access function
426
- # has been set using set_list_feature_access
427
- #
428
- # returns: whatever the access function returns
429
- def get_lf(feature_name, # string: name of list feature
430
- *parameters) # parameters for access function, collapsed into an array here
431
-
432
- unless @list_feature_access[feature_name]
433
- raise "I have no list feature access method for #{feature_name}."
434
- end
435
-
436
- # call access function, re-exploding the collapsed parameters and
437
- # adding the list of values for the list feature as first parameter
438
- return @list_feature_access[feature_name].call(@features[feature_name], *parameters)
439
- end
440
-
441
-
442
- protected
443
-
444
- #####
445
- # extract_def
446
- #
447
- # given a line of the config file,
448
- # it is assumed that it has the structure
449
- # [white space] string [white space] = [white space] stuff
450
- # 'stuff' may include further white space, 'string' may not.
451
- #
452
- # returns: a pair of strings, the left-hand side and the right-hand side
453
- # of the =, minus the [white space] in the places shown above
454
-
455
- def extract_def(line) # string: line from config file
456
- unless line =~ /^\s*(\w+)\s*=\s*([^\s].*)$/
457
- $stderr.puts "Error in experiment file: "
458
- $stderr.puts "I couldn't analyze the following line: "
459
- $stderr.puts line
460
- exit 1
461
- end
462
- return [$1, $2]
463
- end
464
-
465
- ####
466
- # access to the object variables
467
- def get_contents()
468
- return [@features, @feature_types, @list_feature_access]
469
- end
470
-
471
- end
472
-
473
-
474
- ##############################
475
- # ConfigFormatElement is an auxiliary class
476
- # of ConfigData.
477
- # It keeps track of feature patterns with variables in them
478
- # that can be instantiated.
479
-
480
class ConfigFormatElement
  # new()
  #
  # given a pattern and a list of variable names,
  # analyze the pattern and remember the variable names
  #
  # string:    feature name; may include names of variables, enclosed in <>
  # variables: list of variable names that can occur in 'string'
  #
  # raises: RuntimeError on a "<" inside <...>, a ">" outside <...>,
  #         an unclosed "<", or a variable name not listed in 'variables'
  def initialize(string, variables)
    @variables = variables

    # pattern: this is what the 'string' is split into,
    # an array of elements that are either fixed parts or variables.
    #   fixed part: pair [item:string, "string"]
    #   variable:   pair [variable_name:string, "variable"]
    @pattern = parse_pattern(string)

    # regexp for matching this pattern when no fillers are given
    @regexp = make_regexp(@pattern)
  end

  # instantiate: given pairs of variable names and variable values,
  # instantiate @pattern to a string in which var names are replaced
  # by their values
  #
  # var_hash: hash variable name(string) => variable value(string)
  #
  # returns: string
  # raises:  RuntimeError if var_hash has no value for a variable of @pattern
  def instantiate(var_hash)
    @pattern.map do |item, string_or_var|
      case string_or_var
      when "string"
        item
      when "variable"
        value = var_hash[item]
        raise "Missing variable instantiation: " + item if value.nil?

        value
      else
        raise "Shouldn't be here"
      end
    end.join
  end

  # match()
  #
  # given a string, try to match it against the @pattern
  # while setting the variables given in 'fillers' to
  # the values given in that hash.
  #
  # string:  a string
  # fillers: hash variable name(string) => value(string), or nil
  #
  # returns: if the string matches, a hash variable name => value
  # that includes the fillers given as a parameter as well as
  # values for all other variables mentioned in @pattern,
  # or false if no match.
  def match(string, fillers = nil)
    # with partial info about variables, a specialized regexp is needed
    regexp = fillers ? make_regexp(@pattern, fillers) : @regexp
    match_data = regexp.match(string)
    return false if match_data.nil?

    # retv: variable name(string) => value(string),
    # starting out with the fillers we were given
    retv = {}
    fillers.each_pair { |name, val| retv[name] = val } if fillers

    # Capture groups exist only for variables that were not pre-filled,
    # in pattern order, so walk those variables and the groups in step.
    index = 1
    @pattern.each do |item, string_or_var|
      next unless string_or_var == "variable"
      next unless fillers.nil? || fillers[item].nil?

      if match_data[index].nil?
        raise "Match, but not enough matched elements? Strange."
      end

      if retv[item].nil?
        retv[item] = match_data[index]
      elsif retv[item] != match_data[index]
        # same variable used twice in the pattern, with different values
        return false
      end

      index += 1
    end

    retv
  end

  # used_variables
  #
  # returns: an array of variable names used in @pattern
  def used_variables
    @pattern.select { |_item, string_or_var| string_or_var == "variable" }
            .map { |item, _string_or_var| item }
  end

  ####################
  private

  # parse_pattern:
  # split 'string' into fixed parts and <variable> parts
  #
  # returns: array of pairs [string, "string"] or [string, "variable"]
  def parse_pattern(string)
    pattern = []
    in_variable = false # true while between "<" and ">"
    item = "".dup

    string.each_char do |char|
      if in_variable
        case char
        when "<"
          raise "Duplicate < in " + string
        when ">"
          raise "Unknown variable " + item unless @variables.include?(item)

          pattern << [item, "variable"]
          item = "".dup
          in_variable = false
        else
          item << char
        end
      else
        case char
        when "<"
          # record the fixed part collected so far, if any
          pattern << [item, "string"] unless item.empty?
          item = "".dup
          in_variable = true
        when ">"
          raise "Unexpected > in " + string
        else
          item << char
        end
      end
    end

    # read through the whole of "string": must not end inside <...>
    raise "Unclosed < in " + string if in_variable

    # last bit still to be recorded?
    pattern << [item, "string"] unless item.empty?

    pattern
  end

  # make_regexp:
  # make regular expression from a pattern
  # together with some variable fillers
  #
  # pattern: array of pairs [string, "string"] or [string, "variable"]
  # fillers: hash variable name(string) => value(string), or nil
  #
  # returns: Regexp object. Variables that have a filler are matched
  # literally; every other variable becomes a capture group.
  def make_regexp(pattern, fillers = nil)
    body = pattern.map do |item, string_or_var|
      case string_or_var
      when "variable"
        if fillers && fillers[item]
          Regexp.escape(fillers[item])
        else
          "(.+)"
        end
      when "string"
        Regexp.escape(item)
      else
        raise "Shouldn't be here"
      end
    end.join

    # \A and \Z anchor to the whole string; ^ and $ would anchor per line
    # and let any single line of a multi-line string match. \Z (rather
    # than \z) still accepts one trailing newline.
    Regexp.new('\A' + body + '\Z')
  end
end
694
-