shalmaneser 1.2.0.rc3 → 1.2.0.rc4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/README.md +26 -7
  3. data/bin/fred +2 -4
  4. data/doc/exp_files.md +6 -5
  5. data/lib/common/{ConfigData.rb → config_data.rb} +46 -270
  6. data/lib/common/config_format_element.rb +220 -0
  7. data/lib/common/prep_config_data.rb +62 -0
  8. data/lib/common/{frprep_helper.rb → prep_helper.rb} +0 -0
  9. data/lib/{common/DBInterface.rb → db/db_interface.rb} +2 -2
  10. data/lib/{rosy/DBMySQL.rb → db/db_mysql.rb} +1 -2
  11. data/lib/{rosy/DBSQLite.rb → db/db_sqlite.rb} +1 -1
  12. data/lib/{rosy/DBTable.rb → db/db_table.rb} +1 -1
  13. data/lib/{rosy/DBWrapper.rb → db/db_wrapper.rb} +0 -0
  14. data/lib/{common/SQLQuery.rb → db/sql_query.rb} +0 -0
  15. data/lib/fred/FredBOWContext.rb +8 -6
  16. data/lib/fred/FredDetermineTargets.rb +1 -1
  17. data/lib/fred/FredEval.rb +1 -1
  18. data/lib/fred/FredFeaturize.rb +22 -16
  19. data/lib/fred/FredTest.rb +0 -1
  20. data/lib/fred/fred.rb +2 -0
  21. data/lib/fred/{FredConfigData.rb → fred_config_data.rb} +70 -67
  22. data/lib/fred/opt_parser.rb +1 -1
  23. data/lib/frprep/frprep.rb +1 -1
  24. data/lib/frprep/interfaces/berkeley_interface.rb +7 -9
  25. data/lib/frprep/opt_parser.rb +1 -1
  26. data/lib/rosy/ExternalConfigData.rb +1 -1
  27. data/lib/rosy/RosyEval.rb +1 -1
  28. data/lib/rosy/RosyFeaturize.rb +21 -20
  29. data/lib/rosy/RosyInspect.rb +1 -1
  30. data/lib/rosy/RosyPruning.rb +1 -1
  31. data/lib/rosy/RosyServices.rb +1 -1
  32. data/lib/rosy/RosySplit.rb +1 -1
  33. data/lib/rosy/RosyTest.rb +23 -20
  34. data/lib/rosy/RosyTrain.rb +15 -13
  35. data/lib/rosy/RosyTrainingTestTable.rb +2 -1
  36. data/lib/rosy/View.rb +1 -1
  37. data/lib/rosy/opt_parser.rb +1 -1
  38. data/lib/rosy/rosy.rb +1 -1
  39. data/lib/rosy/rosy_config_data.rb +121 -0
  40. data/lib/shalmaneser/opt_parser.rb +32 -2
  41. data/lib/shalmaneser/version.rb +1 -1
  42. metadata +23 -114
  43. checksums.yaml.gz.sig +0 -0
  44. data.tar.gz.sig +0 -0
  45. data/lib/common/FrPrepConfigData.rb +0 -66
  46. data/lib/rosy/RosyConfigData.rb +0 -115
  47. metadata.gz.sig +0 -0
data/lib/fred/FredTest.rb CHANGED
@@ -16,7 +16,6 @@ require "common/SalsaTigerRegXML"
16
16
  require "common/ruby_class_extensions"
17
17
 
18
18
  # Shalmaneser packages
19
- require "common/FrPrepConfigData"
20
19
  require "common/ML"
21
20
  require "fred/Baseline"
22
21
  require "fred/FredConventions"
data/lib/fred/fred.rb CHANGED
@@ -34,6 +34,8 @@ module Fred
34
34
  task_obj = FredEval.new(@exp, @opts)
35
35
  else
36
36
  raise "Shouldn't be here"
37
+ # @todo AB: this <else> condition should be impossible
38
+ # do in OptionParser
37
39
  end
38
40
 
39
41
  task_obj.compute
@@ -4,7 +4,7 @@
4
4
  # Frame disambiguation system:
5
5
  # access to a configuration and experiment description file
6
6
 
7
- require "common/ConfigData"
7
+ require "common/config_data"
8
8
 
9
9
  ##############################
10
10
  # Class FredConfigData
@@ -13,71 +13,73 @@ require "common/ConfigData"
13
13
  # sets variable names appropriate to WSD task
14
14
 
15
15
  class FredConfigData < ConfigData
16
- def initialize(filename)
17
-
18
- # initialize config data object
19
- super(filename, # config file
20
- {
21
- "experiment_ID" => "string", # experiment ID
22
- "enduser_mode" => "bool", # work in enduser mode? (disallowing many things)
23
-
24
- "preproc_descr_file_train" => "string", # path to preprocessing files
25
- "preproc_descr_file_test" => "string",
26
- "directory_output" => "string", # path to Salsa/Tiger XML output directory
27
-
28
- "verbose" => "bool" , # print diagnostic messages?
29
- "apply_to_all_known_targets" => "bool", # apply to all known targets rather than the ones with a frame?
30
-
31
- "fred_directory" => "string",# directory for internal info
32
- "classifier_dir" => "string", # write classifiers here
33
-
34
- "classifier" => "list", # classifiers
35
-
36
- "dbtype" => "string", # "mysql" or "sqlite"
37
-
38
- "host" => "string", # DB access: sqlite only
39
- "user" => "string",
40
- "passwd" => "string",
41
- "dbname" => "string",
16
+ CONFIG_DEFS = {
17
+ "experiment_ID" => "string", # experiment ID
18
+ "enduser_mode" => "bool", # work in enduser mode? (disallowing many things)
19
+
20
+ "preproc_descr_file_train" => "string", # path to preprocessing files
21
+ "preproc_descr_file_test" => "string",
22
+ "directory_output" => "string", # path to Salsa/Tiger XML output directory
23
+
24
+ "verbose" => "bool" , # print diagnostic messages?
25
+ "apply_to_all_known_targets" => "bool", # apply to all known targets rather than the ones with a frame?
26
+
27
+ "fred_directory" => "string",# directory for internal info
28
+ "classifier_dir" => "string", # write classifiers here
29
+
30
+ "classifier" => "list", # classifiers
31
+
32
+ "dbtype" => "string", # "mysql" or "sqlite"
33
+
34
+ "host" => "string", # DB access: sqlite only
35
+ "user" => "string",
36
+ "passwd" => "string",
37
+ "dbname" => "string",
38
+
39
+ # featurization info
40
+ "feature" => "list", # which features to use for the classifier?
41
+ "binary_classifiers" => "bool",# make binary rather than n-ary clasifiers?
42
+ "negsense" => "string", # binary classifier: negative sense is..?
43
+ "numerical_features" => "string", # do what with numerical features?
44
+
45
+ # what to do with items that have multiple senses?
46
+ # 'binarize': binary classifiers, and consider positive
47
+ # if the sense is among the gold senses
48
+ # 'join' : make one joint sense
49
+ # 'repeat' : make multiple occurrences of the item, one sense per occ
50
+ # 'keep' : keep as separate labels
51
+ #
52
+ # multilabel: consider as assigned all labels
53
+ # above a certain confidence threshold?
54
+ "handle_multilabel" => "string",
55
+ "assignment_confidence_threshold" => "float",
56
+
57
+ # single-sentence context?
58
+ "single_sent_context" => "bool",
59
+
60
+ # noncontiguous input? then we need access to a larger corpus
61
+ "noncontiguous_input" => "bool",
62
+ "larger_corpus_dir" => "string",
63
+ "larger_corpus_format" => "string",
64
+ "larger_corpus_encoding" => "string",
65
+ # Imported from PrepConfigData
66
+ 'do_postag' => 'bool',
67
+ 'do_lemmatize' => 'bool',
68
+ 'do_parse' => 'bool',
69
+ 'pos_tagger' => 'string',
70
+ 'lemmatizer' => 'string',
71
+ 'parser' => 'string',
72
+ 'directory_preprocessed' => 'string',
73
+ 'language' => 'string'
74
+ }
42
75
 
43
- # featurization info
44
- "feature" => "list", # which features to use for the classifier?
45
- "binary_classifiers" => "bool",# make binary rather than n-ary clasifiers?
46
- "negsense" => "string", # binary classifier: negative sense is..?
47
- "numerical_features" => "string", # do what with numerical features?
48
-
49
- # what to do with items that have multiple senses?
50
- # 'binarize': binary classifiers, and consider positive
51
- # if the sense is among the gold senses
52
- # 'join' : make one joint sense
53
- # 'repeat' : make multiple occurrences of the item, one sense per occ
54
- # 'keep' : keep as separate labels
55
- #
56
- # multilabel: consider as assigned all labels
57
- # above a certain confidence threshold?
58
- "handle_multilabel" => "string",
59
- "assignment_confidence_threshold" => "float",
60
-
61
- # single-sentence context?
62
- "single_sent_context" => "bool",
76
+ def initialize(filename)
63
77
 
64
- # noncontiguous input? then we need access to a larger corpus
65
- "noncontiguous_input" => "bool",
66
- "larger_corpus_dir" => "string",
67
- "larger_corpus_format" => "string",
68
- "larger_corpus_encoding" => "string"
69
- },
70
- [ # variables
71
- "train",
72
- "exp_ID"
73
- ]
74
- )
78
+ super(filename, CONFIG_DEFS, ["train", "exp_ID"])
75
79
 
76
80
  # set access functions for list features
77
- set_list_feature_access("classifier",
78
- method("access_classifier"))
79
- set_list_feature_access("feature",
80
- method("access_feature"))
81
+ set_list_feature_access("classifier", method("access_classifier"))
82
+ set_list_feature_access("feature", method("access_feature"))
81
83
  end
82
84
 
83
85
  ###
@@ -165,14 +167,15 @@ class FredConfigData < ConfigData
165
167
  #
166
168
  # returns: a list of pairs [feature_name(string), options(array:string)]
167
169
  # of defined features
168
- def access_classifier(val_list) # array:array:string: list of tuples defined in config file
169
- # for feature 'feature'
170
+ # @param val_list [Array] array:array:string: list of tuples defined
171
+ # in config file for feature 'feature'
172
+ def access_classifier(val_list)
170
173
  if val_list.nil?
171
- return []
174
+ []
172
175
  else
173
- return val_list.map { |cl_descr_tuple|
176
+ val_list.map do |cl_descr_tuple|
174
177
  [cl_descr_tuple.first, cl_descr_tuple[1..-1]]
175
- }
178
+ end
176
179
  end
177
180
  end
178
181
 
@@ -4,7 +4,7 @@
4
4
 
5
5
  #require 'optparse' # for reimplementation
6
6
  require 'getoptlong'
7
- require "fred/FredConfigData"
7
+ require "fred/fred_config_data"
8
8
 
9
9
  module Fred
10
10
 
data/lib/frprep/frprep.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'frprep/do_parses'
2
- require 'common/frprep_helper'
2
+ require 'common/prep_helper'
3
3
  require 'common/FixSynSemMapping'
4
4
  # For FN input.
5
5
  require 'frprep/FNCorpusXML'
@@ -63,12 +63,9 @@ class BerkeleyInterface < SynInterfaceSTXML
63
63
 
64
64
  parser = ENV['SHALM_BERKELEY_BIN'] || 'berkeleyParser.jar'
65
65
  grammar = ENV['SHALM_BERKELEY_MODEL'] || 'grammar.gr'
66
+ options = ENV['SHALM_BERKELEY_OPTIONS']
66
67
 
67
- #berkeley_prog = "java -Xmx2000m -jar #{@program_path}berkeleyParser.jar -gr #{@program_path}ger_sm5.gr"
68
-
69
- #berkeley_prog = "java -d64 -Xmx10000m -jar #{@program_path}berkeley-parser.jar -gr #{@program_path}gerNegra.01.utf8 "
70
-
71
- berkeley_prog = "java -jar #{@program_path}#{parser} -gr #{@program_path}#{grammar}"
68
+ berkeley_prog = "java -jar #{@program_path}#{parser} #{options} -gr #{@program_path}#{grammar}"
72
69
 
73
70
  Dir[in_dir + "*" + @insuffix].each do |inputfilename|
74
71
 
@@ -139,10 +136,10 @@ class BerkeleyInterface < SynInterfaceSTXML
139
136
  # - a parse beginning with <( (>, <( (TOP>, <( (VROOT> etc.
140
137
  # TOP - Negra Grammars
141
138
  # VROOT - Tiger Grammars
142
- # PSEUDE - Original BP Grammars
139
+ # PSEUDO - Original BP Grammars
143
140
  # ROOT - some english grammars
144
141
  # empty identifiers for older Tiger grammars
145
- if line.nil? or line=~/^\( *\((PSEUDO|TOP|ROOT|VROOT)? / or line=~/^\(\(\)/
142
+ if line.nil? or line=~/^(\( *)?\((PSEUDO|TOP|ROOT|VROOT)? / or line=~/^\(\(\)/
146
143
  break
147
144
  end
148
145
  sentid +=1
@@ -157,8 +154,9 @@ class BerkeleyInterface < SynInterfaceSTXML
157
154
  # Insert a top node <VROOT> if missing.
158
155
  # Some grammars trained on older Tiger Versions
159
156
  # expose this problem.
160
- line.sub!(/^(\(\s+\(\s+)/, '\1VROOT')
161
-
157
+ #STDERR.puts "@@@1 <#{line}>"
158
+ line.sub!(/^(\(\s+\()(\s+)/, '\1VROOT\2')
159
+ #STDERR.puts "@@@2 <#{line}>"
162
160
  # berkeley parser output: remove brackets /(.*)/
163
161
  # Remove leading and trailing top level brackets.
164
162
  line.sub!(/^\( */, '')
@@ -3,7 +3,7 @@
3
3
  # AB, 2010-11-25
4
4
 
5
5
  require 'optparse'
6
- require 'common/FrPrepConfigData'
6
+ require 'common/prep_config_data'
7
7
  require 'common/SynInterfaces'
8
8
  module FrPrep
9
9
 
@@ -5,7 +5,7 @@
5
5
  # for Fred and Rosy:
6
6
  # access to configuration and experiment description file
7
7
 
8
- require 'common/ConfigData'
8
+ require 'common/config_data'
9
9
 
10
10
  ##############################
11
11
  # Class ExternalConfigData
data/lib/rosy/RosyEval.rb CHANGED
@@ -19,7 +19,7 @@ require "rosy/RosyTask"
19
19
  require "rosy/RosyPruning"
20
20
 
21
21
  # Frprep packages
22
- require "common/FrPrepConfigData"
22
+ require "common/prep_config_data"
23
23
 
24
24
  #######################################################################
25
25
  # This class is a subclass of the general evaluation class
@@ -9,13 +9,13 @@ require "common/SynInterfaces"
9
9
  require "common/ruby_class_extensions"
10
10
 
11
11
  # Frprep packages
12
- require "common/FrPrepConfigData"
12
+ #require "common/prep_config_data"
13
13
 
14
14
  # Rosy packages
15
15
  require "rosy/FailedParses"
16
16
  require "rosy/FeatureInfo"
17
17
  require "rosy/InputData"
18
- require "rosy/RosyConfigData"
18
+ require "rosy/rosy_config_data"
19
19
  require "common/RosyConventions"
20
20
  require "rosy/RosySplit"
21
21
  require "rosy/RosyTask"
@@ -81,24 +81,25 @@ class RosyFeaturize < RosyTask
81
81
 
82
82
  ##
83
83
  # add preprocessing information to the experiment file object
84
- if @dataset
85
- preproc_parameter = "preproc_descr_file_" + @dataset
86
- else
87
- # split data
88
- preproc_parameter = "preproc_descr_file_train"
89
- end
90
- preproc_expname = @exp.get(preproc_parameter)
91
- if not(preproc_expname)
92
- $stderr.puts "Please set the name of the preprocessing exp. file name"
93
- $stderr.puts "in the experiment file, parameter #{preproc_parameter}"
94
- exit 1
95
- elsif not(File.readable?(preproc_expname))
96
- $stderr.puts "Error in the experiment file:"
97
- $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
98
- exit 1
99
- end
100
- preproc_exp = FrPrepConfigData.new(preproc_expname)
101
- @exp.adjoin(preproc_exp)
84
+ # @note AB: Commented out due to separation of PrepConfigData.
85
+ # if @dataset
86
+ # preproc_parameter = "preproc_descr_file_" + @dataset
87
+ # else
88
+ # # split data
89
+ # preproc_parameter = "preproc_descr_file_train"
90
+ # end
91
+ # preproc_expname = @exp.get(preproc_parameter)
92
+ # if not(preproc_expname)
93
+ # $stderr.puts "Please set the name of the preprocessing exp. file name"
94
+ # $stderr.puts "in the experiment file, parameter #{preproc_parameter}"
95
+ # exit 1
96
+ # elsif not(File.readable?(preproc_expname))
97
+ # $stderr.puts "Error in the experiment file:"
98
+ # $stderr.puts "Parameter #{preproc_parameter} has to be a readable file."
99
+ # exit 1
100
+ # end
101
+ # preproc_exp = FrPrepConfigData.new(preproc_expname)
102
+ # @exp.adjoin(preproc_exp)
102
103
 
103
104
  ###
104
105
  # find appropriate class for interpreting syntactic structures
@@ -12,7 +12,7 @@ require "rosy/RosyTrainingTestTable"
12
12
  require "rosy/View"
13
13
 
14
14
  # Frprep packages
15
- require "common/FrPrepConfigData"
15
+ require "common/prep_config_data"
16
16
 
17
17
  class RosyInspect < RosyTask
18
18
 
@@ -12,7 +12,7 @@ require "common/ruby_class_extensions"
12
12
 
13
13
  require "rosy/RosyFeatureExtractors"
14
14
  require "common/RosyConventions"
15
- require "rosy/RosyConfigData"
15
+ require "rosy/rosy_config_data"
16
16
  require "rosy/RosyIterator"
17
17
 
18
18
  ###
@@ -16,7 +16,7 @@ require "rosy/RosyTrainingTestTable"
16
16
  require "rosy/View"
17
17
 
18
18
  # Frprep packages
19
- require "common/FrPrepConfigData"
19
+ require "common/prep_config_data"
20
20
 
21
21
  ###################################################
22
22
  class RosyServices < RosyTask
@@ -16,7 +16,7 @@
16
16
  require "common/ruby_class_extensions"
17
17
 
18
18
  # Frprep packages
19
- require "common/FrPrepConfigData"
19
+ require "common/prep_config_data"
20
20
 
21
21
  # Rosy packages
22
22
  require "rosy/FailedParses"
data/lib/rosy/RosyTest.rb CHANGED
@@ -24,7 +24,7 @@ require "rosy/RosyTrainingTestTable"
24
24
  require "rosy/View"
25
25
 
26
26
  # Frprep packages
27
- require "common/FrPrepConfigData" # AB: what the fuck???
27
+ #require "common/prep_config_data" # AB: what the fuck???
28
28
 
29
29
  ##########################################################################
30
30
  # classifier combination class
@@ -156,25 +156,28 @@ class RosyTest < RosyTask
156
156
 
157
157
  ##
158
158
  # add preprocessing information to the experiment file object
159
- if @splitID
160
- # use split data
161
- preproc_param = "preproc_descr_file_train"
162
- else
163
- # use test data
164
- preproc_param = "preproc_descr_file_test"
165
- end
166
- preproc_expname = @exp.get(preproc_param)
167
- if not(preproc_expname)
168
- $stderr.puts "Please set the name of the preprocessing exp. file name"
169
- $stderr.puts "in the experiment file, parameter #{preproc_param}."
170
- exit 1
171
- elsif not(File.readable?(preproc_expname))
172
- $stderr.puts "Error in the experiment file:"
173
- $stderr.puts "Parameter #{preproc_param} has to be a readable file."
174
- exit 1
175
- end
176
- preproc_exp = FrPrepConfigData.new(preproc_expname)
177
- @exp.adjoin(preproc_exp)
159
+ # @note AB: Commented out due to separation of PrepConfigData:
160
+ # information for SynInteraces required.
161
+ # if @splitID
162
+ # # use split data
163
+ # preproc_param = "preproc_descr_file_train"
164
+ # else
165
+ # # use test data
166
+ # preproc_param = "preproc_descr_file_test"
167
+ # end
168
+
169
+ # preproc_expname = @exp.get(preproc_param)
170
+ # if not(preproc_expname)
171
+ # $stderr.puts "Please set the name of the preprocessing exp. file name"
172
+ # $stderr.puts "in the experiment file, parameter #{preproc_param}."
173
+ # exit 1
174
+ # elsif not(File.readable?(preproc_expname))
175
+ # $stderr.puts "Error in the experiment file:"
176
+ # $stderr.puts "Parameter #{preproc_param} has to be a readable file."
177
+ # exit 1
178
+ # end
179
+ # preproc_exp = FrPrepConfigData.new(preproc_expname)
180
+ # @exp.adjoin(preproc_exp)
178
181
 
179
182
  # announce the task
180
183
  $stderr.puts "---------"
@@ -18,7 +18,7 @@ require "rosy/RosyPruning"
18
18
  require "common/ML"
19
19
 
20
20
  # Frprep packages
21
- require "common/FrPrepConfigData"
21
+ #require "common/prep_config_data"
22
22
 
23
23
  class RosyTrain < RosyTask
24
24
 
@@ -68,18 +68,20 @@ class RosyTrain < RosyTask
68
68
 
69
69
  ##
70
70
  # add preprocessing information to the experiment file object
71
- preproc_expname = @exp.get("preproc_descr_file_train")
72
- if not(preproc_expname)
73
- $stderr.puts "Please set the name of the preprocessing exp. file name"
74
- $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
75
- exit 1
76
- elsif not(File.readable?(preproc_expname))
77
- $stderr.puts "Error in the experiment file:"
78
- $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
79
- exit 1
80
- end
81
- preproc_exp = FrPrepConfigData.new(preproc_expname)
82
- @exp.adjoin(preproc_exp)
71
+ # @note AB: Commented out due to separation of PrepConfigData.
72
+ # No information seems to be required.
73
+ # preproc_expname = @exp.get("preproc_descr_file_train")
74
+ # if not(preproc_expname)
75
+ # $stderr.puts "Please set the name of the preprocessing exp. file name"
76
+ # $stderr.puts "in the experiment file, parameter preproc_descr_file_train."
77
+ # exit 1
78
+ # elsif not(File.readable?(preproc_expname))
79
+ # $stderr.puts "Error in the experiment file:"
80
+ # $stderr.puts "Parameter preproc_descr_file_train has to be a readable file."
81
+ # exit 1
82
+ # end
83
+ # preproc_exp = FrPrepConfigData.new(preproc_expname)
84
+ # @exp.adjoin(preproc_exp)
83
85
 
84
86
 
85
87
  # get_lf returns: array of pairs [classifier_name, options[array]]