shalmaneser 1.2.0.rc3 → 1.2.0.rc4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +26 -7
  3. data/bin/fred +2 -4
  4. data/doc/exp_files.md +6 -5
  5. data/lib/common/{ConfigData.rb → config_data.rb} +46 -270
  6. data/lib/common/config_format_element.rb +220 -0
  7. data/lib/common/prep_config_data.rb +62 -0
  8. data/lib/common/{frprep_helper.rb → prep_helper.rb} +0 -0
  9. data/lib/{common/DBInterface.rb → db/db_interface.rb} +2 -2
  10. data/lib/{rosy/DBMySQL.rb → db/db_mysql.rb} +1 -2
  11. data/lib/{rosy/DBSQLite.rb → db/db_sqlite.rb} +1 -1
  12. data/lib/{rosy/DBTable.rb → db/db_table.rb} +1 -1
  13. data/lib/{rosy/DBWrapper.rb → db/db_wrapper.rb} +0 -0
  14. data/lib/{common/SQLQuery.rb → db/sql_query.rb} +0 -0
  15. data/lib/fred/FredBOWContext.rb +8 -6
  16. data/lib/fred/FredDetermineTargets.rb +1 -1
  17. data/lib/fred/FredEval.rb +1 -1
  18. data/lib/fred/FredFeaturize.rb +22 -16
  19. data/lib/fred/FredTest.rb +0 -1
  20. data/lib/fred/fred.rb +2 -0
  21. data/lib/fred/{FredConfigData.rb → fred_config_data.rb} +70 -67
  22. data/lib/fred/opt_parser.rb +1 -1
  23. data/lib/frprep/frprep.rb +1 -1
  24. data/lib/frprep/interfaces/berkeley_interface.rb +7 -9
  25. data/lib/frprep/opt_parser.rb +1 -1
  26. data/lib/rosy/ExternalConfigData.rb +1 -1
  27. data/lib/rosy/RosyEval.rb +1 -1
  28. data/lib/rosy/RosyFeaturize.rb +21 -20
  29. data/lib/rosy/RosyInspect.rb +1 -1
  30. data/lib/rosy/RosyPruning.rb +1 -1
  31. data/lib/rosy/RosyServices.rb +1 -1
  32. data/lib/rosy/RosySplit.rb +1 -1
  33. data/lib/rosy/RosyTest.rb +23 -20
  34. data/lib/rosy/RosyTrain.rb +15 -13
  35. data/lib/rosy/RosyTrainingTestTable.rb +2 -1
  36. data/lib/rosy/View.rb +1 -1
  37. data/lib/rosy/opt_parser.rb +1 -1
  38. data/lib/rosy/rosy.rb +1 -1
  39. data/lib/rosy/rosy_config_data.rb +121 -0
  40. data/lib/shalmaneser/opt_parser.rb +32 -2
  41. data/lib/shalmaneser/version.rb +1 -1
  42. metadata +23 -114
  43. checksums.yaml.gz.sig +0 -0
  44. data.tar.gz.sig +0 -0
  45. data/lib/common/FrPrepConfigData.rb +0 -66
  46. data/lib/rosy/RosyConfigData.rb +0 -115
  47. metadata.gz.sig +0 -0
@@ -0,0 +1,220 @@
1
+
2
+ ##############################
3
+ # ConfigFormatElement is an auxiliary class
4
+ # of ConfigData.
5
+ # It keeps track of feature patterns with variables in them
6
+ # that can be instantiated.
7
+ # @author Andrei Beliankou
8
+ #
9
+ class ConfigFormatElement
10
+
11
+ # given a pattern and a list of variable names,
12
+ # analyze the pattern and remember the variable names
13
+ #
14
+ def initialize(string, # string: feature name, may include names of variables.
15
+ # they are included in <>
16
+ variables) # list of variable names that can occur
17
+
18
+ @variables = variables
19
+
20
+ # pattern: this is what the 'string' is split into,
21
+ # an array of elements that are either fixed parts or variables.
22
+ # fixed part: pair [item:string, "string"]
23
+ # variable: pair [variable_name:string, "variable"]
24
+ @pattern = Array.new
25
+ state = "out"
26
+ item = ""
27
+
28
+ # analyze string,
29
+ # split into variables and fixed parts
30
+ string.split(//).each { |char|
31
+
32
+ case state
33
+ when "in"
34
+ case char
35
+ when "<"
36
+ raise "Duplicate < in " + string
37
+ when ">"
38
+ unless @variables.include? item
39
+ raise "Unknown variable " + item
40
+ end
41
+ @pattern << [item, "variable"]
42
+ item = ""
43
+ state = "out"
44
+ else
45
+ item << char
46
+ state = "in"
47
+ end
48
+
49
+ when "out"
50
+ case char
51
+ when "<"
52
+ unless item.empty?
53
+ @pattern << [item, "string"]
54
+ item = ""
55
+ end
56
+ state = "in"
57
+ when ">"
58
+ raise "Unexpected > in " + string
59
+ else
60
+ item << char
61
+ state = "out"
62
+ end
63
+
64
+ else
65
+ raise "Shouldn't be here"
66
+ end
67
+ }
68
+
69
+ # read through the whole of "string"
70
+ # end state has to be "out"
71
+ unless state == "out"
72
+ raise "Unclosed < in " + string
73
+ end
74
+
75
+ # last bit still to be recorded?
76
+ unless item.empty?
77
+ @pattern << [item, "string"]
78
+ end
79
+
80
+ # make regexp for matching this pattern
81
+ @regexp = make_regexp(@pattern)
82
+ end
83
+
84
+ # instantiate: given pairs of variable names and variable values,
85
+ # instantiate @pattern to a string in which var names are replaced
86
+ # by their values
87
+ #
88
+ # returns: string
89
+ def instantiate(var_hash) # hash variable name(string) => variable value(string)
90
+
91
+ # instantiate the pattern
92
+ return @pattern.map { |item, string_or_var|
93
+
94
+ case string_or_var
95
+ when "string"
96
+ item
97
+
98
+ when "variable"
99
+
100
+ if var_hash[item].nil?
101
+ raise "Missing variable instantiation: " + item
102
+ end
103
+ var_hash[item]
104
+
105
+ else
106
+ raise "Shouldn't be here"
107
+ end
108
+ }.join
109
+ end
110
+
111
+ # match()
112
+ #
113
+ # given a string, try to match it against the @pattern
114
+ # while setting the variables given in 'fillers' to
115
+ # the values given in that hash.
116
+ #
117
+ # returns: if the string matches, a hash variable name => value
118
+ # that includes the fillers given as a parameter as well as
119
+ # values for all other variables mentioned in @pattern,
120
+ # or false if no match.
121
+ def match(string, # a string
122
+ fillers = nil) # hash variable name(string) => value(string)
123
+
124
+ # have we been given partial info about variables?
125
+ if fillers
126
+ match = make_regexp(@pattern, fillers).match(string)
127
+ # $stderr.print "matching " + make_regexp(@pattern, fillers).source +
128
+ # " against " + string + " "
129
+ # if match.nil?
130
+ # $stderr.puts "no"
131
+ # else
132
+ # $stderr.puts "yes"
133
+ # end
134
+ else
135
+ match = @regexp.match(string)
136
+ end
137
+
138
+ if match.nil?
139
+ # no match via the regular expression
140
+ return false
141
+ end
142
+
143
+ # regular expression matched.
144
+ # construct return value in hash
145
+ # retv: variable name(string) => value(string)
146
+ retv = Hash.new()
147
+ if fillers
148
+ # include given fillers in retv hash
149
+ fillers.each_pair { |name, val| retv[name] = val }
150
+ end
151
+
152
+ # now put values for other variables in @pattern into retv
153
+ index = 1
154
+ @pattern.to_a.select { |item, string_or_var|
155
+ string_or_var == "variable"
156
+ }.select { |item, string_or_var|
157
+ fillers.nil? or
158
+ fillers[item].nil?
159
+ }.each { |item, string_or_var|
160
+ # for all items on the pattern list
161
+ # that are variables and
162
+ # haven't been filled by the "fillers" list already:
163
+ # fill from matches
164
+
165
+ if match[index].nil?
166
+ raise "Match, but not enough matched elements? Strange."
167
+ end
168
+
169
+ if retv[item].nil?
170
+ retv[item] = match[index]
171
+ else
172
+ unless retv[item] == match[index]
173
+ return false
174
+ end
175
+ end
176
+
177
+ index += 1
178
+ }
179
+
180
+ return retv
181
+ end
182
+
183
+ # used_variables
184
+ #
185
+ # returns: an array of variable names used in @pattern
186
+ def used_variables()
187
+ return @pattern.select { |item, string_or_var|
188
+ string_or_var == "variable"
189
+ }.map { |item, string_or_var| item}
190
+ end
191
+
192
+ ####################
193
+ private
194
+
195
+ # make_regexp:
196
+ # make regular expression from a pattern
197
+ # together with some variable fillers
198
+ #
199
+ # returns: Regexp object
200
+ def make_regexp(pattern, # array of pairs [string, "string"] or [string, "variable"]
201
+ fillers = nil) # hash variable name(string) => value(string)
202
+ return (Regexp.new "^" +
203
+ pattern.map { |item, string_or_var|
204
+ case string_or_var
205
+ when "variable"
206
+ if fillers and
207
+ fillers[item]
208
+ Regexp.escape(fillers[item])
209
+ else
210
+ "(.+)"
211
+ end
212
+ when "string"
213
+ Regexp.escape(item)
214
+ else
215
+ raise "Shouldn't be here"
216
+ end
217
+ }.join + "$")
218
+ end
219
+
220
+ end
@@ -0,0 +1,62 @@
1
+ # FrPrepConfigData
2
+ # Katrin Erk July 05
3
+ #
4
+ # Preprocessing for Fred and Rosy:
5
+ # access to a configuration and experiment description file
6
+
7
+ require "common/config_data"
8
+
9
+ ##############################
10
+ # Class FrPrepConfigData
11
+ #
12
+ # inherits from ConfigData,
13
+ # sets variable names appropriate to preprocessing task
14
+
15
+ class FrPrepConfigData < ConfigData
16
+
17
+ CONFIG_DEFS = {"prep_experiment_ID" => "string", # experiment identifier
18
+ "frprep_directory" => "string", # dir for frprep internal data
19
+ # information about the dataset
20
+ "language" => "string", # en, de
21
+ "origin"=> "string", # FrameNet, Salsa, or nothing
22
+ "format" => "string", # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
23
+ "encoding" => "string", # utf8, iso, hex, or nothing
24
+
25
+ # directories
26
+ "directory_input" => "string", # dir with input data
27
+ "directory_preprocessed" => "string", # dir with output Salsa/Tiger XML data
28
+ "directory_parserout" => "string", # dir with parser output for the parser named below
29
+
30
+ # syntactic processing
31
+ "pos_tagger" => "string", # name of POS tagger
32
+ "lemmatizer" => "string", # name of lemmatizer
33
+ "parser" => "string", # name of parser
34
+ "pos_tagger_path" => "string", # path to POS tagger
35
+ "lemmatizer_path" => "string", # path to lemmatizer
36
+ "parser_path" => "string", # path to parser
37
+ "parser_max_sent_num" => "integer", # max number of sentences per parser input file
38
+ "parser_max_sent_len" => "integer", # max sentence length the parser handles
39
+
40
+ "do_parse" => "bool", # use parser?
41
+ "do_lemmatize" => "bool",# use lemmatizer?
42
+ "do_postag" => "bool", # use POS tagger?
43
+
44
+ # output format: if tabformat_output == true,
45
+ # output in Tab format rather than Salsa/Tiger XML
46
+ # (this will not work if do_parse == true)
47
+ "tabformat_output" => "bool",
48
+
49
+ # syntactic repairs, dependent on existing semantic role annotation
50
+ "fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
51
+ "fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
52
+ }
53
+
54
+ def initialize(filename)
55
+ # @param filename [String] path to a config file
56
+ # @param CONFIG_DEFS [Hash] a list of configuration definitions
57
+ super(filename, CONFIG_DEFS, [])
58
+ end
59
+ end
60
+
61
+
62
+
File without changes
@@ -20,7 +20,7 @@ def get_db_interface(exp, # experiment file object with 'dbtype' entry
20
20
  case exp.get("dbtype")
21
21
  when "mysql"
22
22
  begin
23
- require 'rosy/DBMySQL'
23
+ require 'db/db_mysql'
24
24
  rescue
25
25
  $stderr.puts "Error loading DB interface."
26
26
  $stderr.puts "Make sure you have the Ruby MySQL package installed."
@@ -30,7 +30,7 @@ def get_db_interface(exp, # experiment file object with 'dbtype' entry
30
30
 
31
31
  when "sqlite"
32
32
  begin
33
- require 'rosy/DBSQLite'
33
+ require 'db/db_sqlite'
34
34
  rescue
35
35
  $stderr.puts "Error loading DB interface."
36
36
  $stderr.puts "Make sure you have the Ruby SQLite package installed."
@@ -6,8 +6,7 @@
6
6
 
7
7
  require 'mysql'
8
8
 
9
-
10
- require 'rosy/DBWrapper'
9
+ require 'db/db_wrapper'
11
10
 
12
11
  #################
13
12
  class DBMySQLResult < DBResult
@@ -6,7 +6,7 @@
6
6
  require 'sqlite3'
7
7
  require 'tempfile'
8
8
 
9
- require "DBWrapper"
9
+ require 'db/db_wrapper'
10
10
 
11
11
  #################
12
12
  class DBSQLiteResult < DBResult
@@ -6,7 +6,7 @@
6
6
  # Just creating a table, changing the table, and accessing it.
7
7
  #
8
8
 
9
- require "common/SQLQuery"
9
+ require 'db/sql_query'
10
10
  require "common/RosyConventions"
11
11
 
12
12
  class DBTable
File without changes
File without changes
@@ -6,14 +6,15 @@ require "common/SynInterfaces"
6
6
  require "common/TabFormat"
7
7
  require "common/SalsaTigerRegXML"
8
8
  require "common/SalsaTigerXMLHelper"
9
+ require "common/RosyConventions"
9
10
 
10
11
  require 'fred/md5'
11
- require "fred/FredConfigData"
12
+ require "fred/fred_config_data"
12
13
  require "fred/FredConventions"
13
14
  require "fred/FredDetermineTargets"
14
- require "common/DBInterface"
15
- require "common/RosyConventions"
16
- require "common/SQLQuery"
15
+
16
+ require 'db/db_interface'
17
+ require 'db/sql_query'
17
18
 
18
19
  ########################################
19
20
  # Context Provider classes:
@@ -394,6 +395,7 @@ class NoncontiguousContextProvider < AbstractContextProvider
394
395
  # yielding contexts.
395
396
  def each_window(dir) # string: directory containing Salsa/Tiger XML data
396
397
 
398
+ # @todo AB: Move this chunk to OptionParser.
397
399
  # sanity check: do we know where the larger corpus is?
398
400
  unless @exp.get("larger_corpus_dir")
399
401
  $stderr.puts "Error: 'noncontiguous_input' has been set in the experiment file"
@@ -436,7 +438,7 @@ class NoncontiguousContextProvider < AbstractContextProvider
436
438
  # We will need an FrPrep instance and an options object.
437
439
  base_dir_path = File.expand_path(File.dirname(__FILE__) + '/../..')
438
440
 
439
- # Remove this
441
+ # @todo AB: Remove this
440
442
  FileUtils.cp(tf_exp_frprep.path, '/tmp/frprep.exp')
441
443
  # after debugging
442
444
 
@@ -479,7 +481,7 @@ class NoncontiguousContextProvider < AbstractContextProvider
479
481
  # remove temporary data
480
482
  temptable_obj.drop_temp_table()
481
483
 
482
- # AB: TODO Rewrite this passage using pure Ruby.
484
+ # @todo AB: Rewrite this passage using pure Ruby.
483
485
  %x{rm -rf #{frprep_in}}
484
486
  %x{rm -rf #{frprep_out}}
485
487
  %x{rm -rf #{frprep_dir}}
@@ -1,6 +1,6 @@
1
1
  require "fred/FileZipped"
2
2
 
3
- require "fred/FredConfigData"
3
+ require "fred/fred_config_data"
4
4
  require "common/SynInterfaces"
5
5
  require "fred/FredConventions"
6
6
 
data/lib/fred/FredEval.rb CHANGED
@@ -18,7 +18,7 @@ require "common/Eval"
18
18
  require "common/ruby_class_extensions"
19
19
 
20
20
  # Fred packages
21
- require "fred/FredConfigData"
21
+ require "fred/fred_config_data"
22
22
  require "fred/FredConventions"
23
23
  require "fred/FredFeatures"
24
24
  require "fred/FredDetermineTargets"
@@ -29,10 +29,9 @@ require "common/RegXML"
29
29
  require "common/SalsaTigerRegXML"
30
30
  require "common/SalsaTigerXMLHelper"
31
31
 
32
- require "fred/FredConfigData"
32
+ require "fred/fred_config_data"
33
33
  require "fred/FredConventions"
34
- require "common/FrPrepConfigData"
35
- require "common/frprep_helper"
34
+ require "common/prep_helper"
36
35
  require "common/SynInterfaces"
37
36
 
38
37
  require "fred/FredBOWContext"
@@ -169,18 +168,24 @@ class FredFeaturize < DelegateClass(GrammaticalFunctionAccess)
169
168
  # prepare experiment file: add preprocessing experiment file data
170
169
  @exp = exp_obj
171
170
 
172
- preproc_expname = @exp.get("preproc_descr_file_" + @dataset)
173
- if not(preproc_expname)
174
- $stderr.puts "Please set the name of the preprocessing exp. file name"
175
- $stderr.puts "in the experiment file, feature preproc_descr_file_#{@dataset}"
176
- exit 1
177
- elsif not(File.readable?(preproc_expname))
178
- $stderr.puts "Error in the experiment file:"
179
- $stderr.puts "Parameter preproc_descr_file_#{@dataset} has to be a readable file."
180
- exit 1
181
- end
182
- preproc_exp = FrPrepConfigData.new(preproc_expname)
183
- @exp.adjoin(preproc_exp)
171
+ # @note AB: The following is disabled because we don't want to use
172
+ # the dependence on {PrepConfigData}. We duplicate options:
173
+ # <do_postag>, <pos_tagger>, <do_lemmatize>, <lemmatizer>,
174
+ # <do_parse>, <parser>, <directory_preprocessed>
175
+ # in the experiment file of Fred.
176
+ #
177
+ # preproc_expname = @exp.get("preproc_descr_file_" + @dataset)
178
+ # if not(preproc_expname)
179
+ # $stderr.puts "Please set the name of the preprocessing exp. file name"
180
+ # $stderr.puts "in the experiment file, feature preproc_descr_file_#{@dataset}"
181
+ # exit 1
182
+ # elsif not(File.readable?(preproc_expname))
183
+ # $stderr.puts "Error in the experiment file:"
184
+ # $stderr.puts "Parameter preproc_descr_file_#{@dataset} has to be a readable file."
185
+ # exit 1
186
+ # end
187
+ # preproc_exp = FrPrepConfigData.new(preproc_expname)
188
+ # @exp.adjoin(preproc_exp)
184
189
 
185
190
  # get the right syntactic interface
186
191
  SynInterfaces.check_interfaces_abort_if_missing(@exp)
@@ -190,7 +195,6 @@ class FredFeaturize < DelegateClass(GrammaticalFunctionAccess)
190
195
  grf_obj = GrammaticalFunctionAccess.new(@interpreter_class)
191
196
  super(grf_obj)
192
197
 
193
-
194
198
  # announce the task
195
199
  $stderr.puts "---------"
196
200
  $stderr.puts "Fred experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
@@ -267,6 +271,8 @@ class FredFeaturize < DelegateClass(GrammaticalFunctionAccess)
267
271
  context_obj = SingleSentContextProvider.new(max_context_size, @exp,
268
272
  @interpreter_class, target_obj,
269
273
  @dataset)
274
+ # @todo AB: Put it to the OptionParser, two options are not
275
+ # compatible, don't do the check here!
270
276
  if @exp.get("noncontiguous_input")
271
277
  $stderr.puts "Warning: 'single_sent_context' has been set in the experiment file."
272
278
  $stderr.puts "So I'm ignoring the 'noncontiguous_input = true' setting."