shalmaneser-rosy 1.2.0.rc4 → 1.2.0.rc5

Files changed (41)
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/rosy +14 -7
  4. data/lib/rosy/FailedParses.rb +22 -20
  5. data/lib/rosy/FeatureInfo.rb +35 -31
  6. data/lib/rosy/GfInduce.rb +132 -130
  7. data/lib/rosy/GfInduceFeature.rb +86 -68
  8. data/lib/rosy/InputData.rb +59 -55
  9. data/lib/rosy/RosyConfusability.rb +47 -40
  10. data/lib/rosy/RosyEval.rb +55 -55
  11. data/lib/rosy/RosyFeatureExtractors.rb +295 -290
  12. data/lib/rosy/RosyFeaturize.rb +54 -67
  13. data/lib/rosy/RosyInspect.rb +52 -50
  14. data/lib/rosy/RosyIterator.rb +73 -67
  15. data/lib/rosy/RosyPhase2FeatureExtractors.rb +48 -48
  16. data/lib/rosy/RosyPruning.rb +39 -31
  17. data/lib/rosy/RosyServices.rb +116 -115
  18. data/lib/rosy/RosySplit.rb +55 -53
  19. data/lib/rosy/RosyTask.rb +7 -3
  20. data/lib/rosy/RosyTest.rb +174 -191
  21. data/lib/rosy/RosyTrain.rb +46 -50
  22. data/lib/rosy/RosyTrainingTestTable.rb +101 -99
  23. data/lib/rosy/TargetsMostFrequentFrame.rb +13 -9
  24. data/lib/rosy/{AbstractFeatureAndExternal.rb → abstract_feature_extractor.rb} +22 -97
  25. data/lib/rosy/abstract_single_feature_extractor.rb +52 -0
  26. data/lib/rosy/external_feature_extractor.rb +35 -0
  27. data/lib/rosy/opt_parser.rb +231 -201
  28. data/lib/rosy/rosy.rb +63 -64
  29. data/lib/rosy/rosy_conventions.rb +66 -0
  30. data/lib/rosy/rosy_error.rb +15 -0
  31. data/lib/rosy/var_var_restriction.rb +16 -0
  32. data/lib/shalmaneser/rosy.rb +1 -0
  33. metadata +26 -19
  34. data/lib/rosy/ExternalConfigData.rb +0 -58
  35. data/lib/rosy/View.rb +0 -418
  36. data/lib/rosy/rosy_config_data.rb +0 -121
  37. data/test/frprep/test_opt_parser.rb +0 -94
  38. data/test/functional/functional_test_helper.rb +0 -58
  39. data/test/functional/test_fred.rb +0 -47
  40. data/test/functional/test_frprep.rb +0 -99
  41. data/test/functional/test_rosy.rb +0 -40
data/lib/rosy/GfInduceFeature.rb
@@ -5,61 +5,60 @@
  # feature for Rosy

  require "rosy/GfInduce"
- require "rosy/AbstractFeatureAndExternal"
- require "common/ruby_class_extensions"
-
- ###
- # make filename for GfInduce picle file
- def filename_gfmap(exp, # ExternalConfigData object
- interpreter) # SynInterpreter class
-
- # output dir as given in my experiment file
- # If there is an experiment ID, make subdirectory
- # named after the experiment ID and place the data there.
- output_dir = File.new_dir(exp.get("directory"))
- if exp.get("experiment_id")
- output_dir = File.new_dir(output_dir, exp.get("experiment_id"))
- end
-
- # output file name:
- # Gfmap.{<service>=<system_name>.}*{OPT<service>=<system_name>.}*pkl
- return output_dir +
- "Gfmap." +
- interpreter.systems().to_a.map { |service, system_name|
- service.to_s+ "=" + system_name.to_s
- }.sort.join(".") + "." +
- interpreter.optional_systems().to_a.map { |service, system_name|
- "OPT" + service.to_s + "=" + system_name.to_s
- }.sort.join(".") + ".pkl"
- end
+ require 'rosy/external_feature_extractor'
+ require 'monkey_patching/file'

+ module Shalmaneser
+ module Rosy
  ################################
  # base class for all following feature extractors
  class GfInduceFeatureExtractor < ExternalFeatureExtractor
- GfInduceFeatureExtractor.announce_me()
+ GfInduceFeatureExtractor.announce_me

  @@okay = true # external experiment file present?
  @@gf_obj = nil # GfInduce object
  @@node_to_gf = nil # Hash: SynNodes of a sentence -> Gf label

- def GfInduceFeatureExtractor.designator()
- return "gf_fn"
+ ###
+ # Initialize: read GFInduce pickle
+ def initialize(exp, # experiment file object
+ interpreter_class) # SynInterpreter class
+
+ super(exp, interpreter_class)
+
+ if @exp_external
+ pickle_filename = filename_gfmap(@exp_external, @@interpreter_class)
+ @@gf_obj = GfInduce.from_file(pickle_filename)
+ @@okay = true
+
+ else
+ # signal that you cannot compute anything
+ @@okay = false
+ end
+ end
+
+ def self.designator
+ "gf_fn"
  end
- def GfInduceFeatureExtractor.feature_names()
- return ["gf_fn"]
+
+ def self.feature_names
+ ["gf_fn"]
  end
- def GfInduceFeatureExtractor.sql_type()
- return "VARCHAR(25)"
+
+ def self.sql_type
+ "VARCHAR(25)"
  end
- def GfInduceFeatureExtractor.feature_type()
- return "syn"
+
+ def self.feature_type
+ "syn"
  end
- def GfInduceFeatureExtractor.phase()
- return "phase 1"
+
+ def self.phase
+ "phase 1"
  end

  ###
- # set sentence, set node, set other settings:
+ # set sentence, set node, set other settings:
  # this is done prior to
  # feature computation using compute_feature()
  # such that computations that stay the same for
@@ -68,9 +67,9 @@ class GfInduceFeatureExtractor < ExternalFeatureExtractor
  # This is just relevant for Phase 1
  #
  # returns: false/nil if there was a problem
- def GfInduceFeatureExtractor.set_sentence(sent, # SalsaTigerSentence object
- frame) # FrameNode object
-
+ # @param sent [SalsaTigerSentence]
+ # @param frame [FrameNode]
+ def self.set_sentence(sent, frame)
  super(sent, frame)

  if @@okay
@@ -78,7 +77,7 @@ class GfInduceFeatureExtractor < ExternalFeatureExtractor

  # let the GF object compute all subcat frames
  # for the target of this frame
- subcatframes_of_current_target = @@gf_obj.apply(frame.target.children())
+ subcatframes_of_current_target = @@gf_obj.apply(frame.target.children)

  # keep the most frequent one of the
  # subcat frames returned by the GF object:
@@ -86,7 +85,7 @@ class GfInduceFeatureExtractor < ExternalFeatureExtractor
  # no subcat frames returned
  subcatframe = []
  else
- # we have at least one subcat frame:
+ # we have at least one subcat frame:
  # keep the most frequent one of them
  #
  # Also, subcatframes_of_current_target
@@ -98,9 +97,9 @@ class GfInduceFeatureExtractor < ExternalFeatureExtractor
  b.last <=> a.last
  }.first[1]
  end
-
+
  # change into a mapping node(SynNode) -> GF(string)
- @@node_to_gf = Hash.new
+ @@node_to_gf = {}
  subcatframe.each { |gf, prep, fe, synnodes|
  synnodes.each { |node|
  @@node_to_gf[node] = "#{gf} #{prep}"
@@ -109,25 +108,6 @@ class GfInduceFeatureExtractor < ExternalFeatureExtractor
  end
  end

-
- ###
- # Initialize: read GFInduce pickle
- def initialize(exp, # experiment file object
- interpreter_class) # SynInterpreter class
-
- super(exp, interpreter_class)
-
- if @exp_external
- pickle_filename = filename_gfmap(@exp_external, @@interpreter_class)
- @@gf_obj = GfInduce.from_file(pickle_filename)
- @@okay = true
-
- else
- # signal that you cannot compute anything
- @@okay = false
- end
- end
-
  ###
  # compute: compute features
  #
@@ -135,14 +115,52 @@ class GfInduceFeatureExtractor < ExternalFeatureExtractor
  length of feature_names()
  #
  # here: array of length one, content either a string or nil
- def compute_features()
+ def compute_features
  # current node: @@node
  # check whether the current node has been assigned a slot
  # in the subcat frame
  if @@okay
- return [ @@node_to_gf[@@node] ]
+ return [@@node_to_gf[@@node]]
  else
- return [ nil ]
+ return [nil]
  end
  end
+
+ private
+
+ ###
+ # make filename for GfInduce picle file
+ # @param exp [ExternalConfigData]
+ # @param interpreter [SynInterpreter]
+ def filename_gfmap(exp, interpreter)
+ # output dir as given in my experiment file
+ # If there is an experiment ID, make subdirectory
+ # named after the experiment ID and place the data there.
+ output_dir = File.new_dir(exp.get("directory"))
+
+ if exp.get("experiment_id")
+ output_dir = File.new_dir(output_dir, exp.get("experiment_id"))
+ end
+
+ # output file name:
+ # Gfmap.{<service>=<system_name>.}*{OPT<service>=<system_name>.}*pkl
+ output_dir = output_dir + 'Gfmap.' + interpreter.systems.to_a
+
+ output_dir = output_dir.map do |service, system_name|
+ "#{service}=#{system_name}"
+ end
+
+ output_dir = output_dir.sort.join('.') + '.' +
+ interpreter.optional_systems.to_a
+
+ output_dir = output_dir.map do |service, system_name|
+ "OPT#{service}=#{system_name}"
+ end
+
+ output_dir = output_dir.sort.join('.') + '.pkl'
+
+ output_dir
+ end
+ end
+ end
  end
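For readers skimming the refactor above: the pickle that filename_gfmap loads is named after the parsing services configured for the experiment. Below is a minimal, self-contained sketch of that naming scheme only; the systems/optional_systems hashes and their contents are made-up stand-ins for what the SynInterpreter class would report, not the gem's own API.

# Illustration only (assumed stand-in data): the Gfmap pickle name follows
#   Gfmap.{<service>=<system_name>.}*{OPT<service>=<system_name>.}*pkl
systems          = { 'parser' => 'berkeley', 'lemmatizer' => 'treetagger' }
optional_systems = { 'pos_tagger' => 'treetagger' }

mandatory = systems.map          { |service, name| "#{service}=#{name}" }.sort
optional  = optional_systems.map { |service, name| "OPT#{service}=#{name}" }.sort

pickle_name = (['Gfmap'] + mandatory + optional).join('.') + '.pkl'
# => "Gfmap.lemmatizer=treetagger.parser=berkeley.OPTpos_tagger=treetagger.pkl"

GfInduce.from_file then reads this pickle from the experiment's output directory, or from an experiment-ID subdirectory when one is configured.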
data/lib/rosy/InputData.rb
@@ -7,37 +7,41 @@
  # featurization

  # Salsa packages
- require "common/Parser"
- require "common/SalsaTigerRegXML"
- require "common/ruby_class_extensions"
+ require 'salsa_tiger_xml/file_parts_parser'
+ # require "SalsaTigerRegXML"
+ require 'salsa_tiger_xml/salsa_tiger_sentence'
+ require "ruby_class_extensions"

  # Fred/Rosy packages
  require "rosy/FailedParses"
- require "common/RosyConventions"
+ require 'rosy/rosy_conventions'
  require "rosy/RosyFeatureExtractors"
  require "rosy/RosyPhase2FeatureExtractors"
  require "rosy/RosyPruning"
  require "rosy/GfInduceFeature"
- require "common/FixSynSemMapping"
+ require 'frappe/fix_syn_sem_mapping'

+ module Shalmaneser
+ module Rosy
  class InputData

  ###
  def initialize(exp_object, # RosyConfigData object
  dataset, # train/test
- feature_info_object, # FeatureInfo object
+ feature_info_object, # FeatureInfo object
  interpreter_class, # SynInterpreter class
- input_dir) # Directory with input files
+ input_dir) # Directory with input files

  @exp = exp_object
  @dataset = dataset
  @interpreter_class = interpreter_class
+ raise 'BumBamBim!!!' if @interpreter_class.nil?
  @input_dir = input_dir
  # store information about failed parses here
- @failed_parses = FailedParses.new()
+ @failed_parses = FailedParses.new

  # feature_extractors_phase1: array of AbstractFeatureExtractor objects
- @extractors_p1_rosy, @extractors_p1_other = feature_info_object.get_extractor_objects("phase 1",
+ @extractors_p1_rosy, @extractors_p1_other = feature_info_object.get_extractor_objects("phase 1",
  @interpreter_class)

  # global settings
@@ -47,18 +51,18 @@ class InputData

  # # nothing to set here for now, so deactivated
  # @extractors_p1_other.each { |extractor_obj|
- # unless extractor_obj.class.set()
+ # unless extractor_obj.class.set
  # raise "Some grave problem during feature extractor initialization"
  # end
  # }


  # feature_extractors_phase2: array of AbstractFeatureExtractor objects
- extractors_p2_rosy, extractors_p2_other = feature_info_object.get_extractor_objects("phase 2",
+ extractors_p2_rosy, extractors_p2_other = feature_info_object.get_extractor_objects("phase 2",
  @interpreter_class)
  @feature_extractors_phase2 = extractors_p2_rosy + extractors_p2_other
  end
-
+
  ###
  # each_instance_phase1()
  #
@@ -68,14 +72,14 @@ class InputData
  # and yields one feature vector per instance
  #
  # yields: pairs [feature_name(string), feature_value(object)]
-
- def each_instance_phase1()
+
+ def each_instance_phase1
  Dir[@input_dir+"*.xml"]. each {|parsefilename|

- xmlFile = FilePartsParser.new(parsefilename)
+ xmlFile = STXML::FilePartsParser.new(parsefilename)
  $stderr.puts "Processing #{parsefilename}"
  xmlFile.scan_s {|sent_string|
- sent = SalsaTigerSentence.new(sent_string)
+ sent = STXML::SalsaTigerSentence.new(sent_string)

  # preprocessing: possibly change the SalsaTigerSentence object
  # before featurization
@@ -105,9 +109,9 @@ class InputData
  if skip_frame
  next
  end
-
+
  sent.each_syn_node { |syn_node|
-
+
  # Tell feature extractors about the current node:
  # first Rosy feature extractors, then the others
  # if there is a problem, skip this node
@@ -126,34 +130,34 @@ class InputData
  end

  # features: array of pairs: [feature_name(string), feature_value(object)]
- features = Array.new
+ features = []
  (@extractors_p1_rosy + @extractors_p1_other).each { |extractor|
  # compute features
- feature_names = extractor.class.feature_names()
+ feature_names = extractor.class.feature_names
  feature_index = 0
-
+
  # append new features to features array
- features.concat extractor.compute_features().map { |feature_value|
+ features.concat extractor.compute_features.map { |feature_value|
  feature_name = feature_names[feature_index]
  feature_index += 1
-
+
  # sanity check: feature value longer than the allotted space in the DB?
  check_feature_length(feature_name, feature_value, extractor)

- [feature_name, nonnil_feature(feature_value, extractor.class.sql_type()) ]
+ [feature_name, nonnil_feature(feature_value, extractor.class.sql_type) ]
  }
- }
+ }
  yield features
  } # each syn node
  } # each frame
  } # each sentence
  }
  end
-
+
  ###
  # each_phase2_column
  #
- # This method implements the application of the
+ # This method implements the application of the
  # phase 2 extractors to data.
  #
  # Given a database view (of either training or test data),
@@ -169,10 +173,10 @@ class InputData
  feature_columns = extractor.compute_features_on_view(view)
  # interleave with feature values and yield
  feature_index = 0
- feature_names = extractor.class.feature_names()
+ feature_names = extractor.class.feature_names
  feature_columns.each { |feature_values|
  yield [
- feature_names[feature_index],
+ feature_names[feature_index],
  feature_values.map { |feature_val| nonnil_feature(feature_val, extractor.class.sql_type) }
  ]
  feature_index += 1
@@ -184,22 +188,18 @@ class InputData
  # get_failed_parses
  #
  # returns the FailedParses object in which the info about failed parses has been stored
- def get_failed_parses()
- return @failed_parses
+ def get_failed_parses
+ @failed_parses
  end

- #################################
  private

-
  ###
- def nonnil_feature(feature_value,
- sql_type)
-
+ def nonnil_feature(feature_value, sql_type)
  # feature value nil? then change to noval
- if feature_value.nil? and sql_type =~ /CHAR/
+ if feature_value.nil? && sql_type =~ /CHAR/
  return @exp.get("noval")
- elsif feature_value.class.to_s == "String" and feature_value.empty?
+ elsif feature_value.is_a?(String) && feature_value.empty?
  return @exp.get("noval")
  elsif feature_value.nil?
  return 0
@@ -209,12 +209,14 @@ class InputData
  end

  ###
- # preprocess: possibly change the given SalsaTigerSentence
+ # preprocess: possibly change the given SalsaTigerSentence
  # to enable better learning
  def preprocess(sent) # SalsaTigerSentence object

-
- if @dataset == "train" and
+ # @todo AB: [2015-12-16 Wed 15:39]
+ # Don't think it should be done by Rosy, do it only in Frappe.
+ # This module will be moved to Frappe's lib.
+ if @dataset == "train" and
  (@exp.get("fe_syn_repair") or @exp.get("fe_rel_repair"))
  FixSynSemMapping.fixit(sent, @exp, @interpreter_class)
  end
@@ -226,8 +228,8 @@ class InputData
  frame) # FrameNode

  # target POS
- if frame.target()
- main_target = @interpreter_class.main_node_of_expr(frame.target.children(), "no_mwe")
+ if frame.target
+ main_target = @interpreter_class.main_node_of_expr(frame.target.children, "no_mwe")
  else
  main_target = nil
  end
@@ -236,8 +238,8 @@ class InputData
  else
  target_pos = nil
  end
- if frame.target()
- target_str = frame.target().yield_nodes_ordered().map { |t_node|
+ if frame.target
+ target_str = frame.target.yield_nodes_ordered.map { |t_node|
  if t_node.is_syntactic?
  @interpreter_class.lemma_backoff(t_node)
  else
@@ -248,9 +250,9 @@ class InputData
  else
  target_str = ""
  end
-
- @failed_parses.register(construct_instance_id(sent.id(), frame.id()),
- frame.name(),
+
+ @failed_parses.register(Rosy::construct_instance_id(sent.id, frame.id),
+ frame.name,
  target_str,
  target_pos,
  frame.children.map { |fe| fe.name })
@@ -263,28 +265,28 @@ class InputData
  feature_value, # object
  extractor_obj) # AbstractFeatureExtractor object

- if extractor_obj.class.sql_type() =~ /(\d+)/
+ if extractor_obj.class.sql_type =~ /(\d+)/
  # sql type contains some statement about the length.
  # just crudely compare to feature length
  length = $1.to_i
  if feature_value.class == String and
- feature_value.length() > length
+ feature_value.length > length

  if feature_name == "sentid"
- print length;
+ print length;
  print feature_value;
- print feature_value.length();
- # if the sentence (instance) ID is too long, we cannot go on.
+ print feature_value.length;
+ # if the sentence (instance) ID is too long, we cannot go on.
  $stderr.puts "Error: Instance ID is longer than its DB column."
  $stderr.puts "Please increase the DB column size in {Tiger,Collins}FeatureExtractors.rb"
  raise "SQL entry length surpassed"

  elsif @exp.get("verbose")
- # KE Feb 07: don't print warning,
+ # KE Feb 07: don't print warning,
  # this is just too frequent
  # for other features, we just issue a warning, and only if we are verbose

- # $stderr.puts "Warning: feature #{feature_name} longer than its DB column (#{length.to_s} vs #{feature_value.length()}): #{feature_value}"
+ # $stderr.puts "Warning: feature #{feature_name} longer than its DB column (#{length.to_s} vs #{feature_value.length}): #{feature_value}"
  end # feature name check
  end # length surpassed
  end # length found in sql type
@@ -292,3 +294,5 @@ class InputData
  end

  end
+ end
+ end
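As a quick reference for the tightened nonnil_feature helper in the InputData.rb diff above, here is a minimal standalone sketch of the same nil/empty handling. The noval keyword argument stands in for the experiment file's "noval" setting, and the final pass-through branch is an assumption, since the hunk ends before the method's closing clause.

# Sketch of the nil/empty handling in InputData#nonnil_feature (assumptions noted above).
def nonnil_feature(feature_value, sql_type, noval: 'NONE')
  if feature_value.nil? && sql_type =~ /CHAR/
    noval            # textual column, missing value: store the placeholder
  elsif feature_value.is_a?(String) && feature_value.empty?
    noval            # empty string: also the placeholder
  elsif feature_value.nil?
    0                # non-textual column, missing value: store 0
  else
    feature_value    # assumed: everything else is passed through unchanged
  end
end

nonnil_feature(nil, 'VARCHAR(25)')      # => "NONE"
nonnil_feature('',  'VARCHAR(25)')      # => "NONE"
nonnil_feature(nil, 'INT')              # => 0
nonnil_feature('Agent', 'VARCHAR(25)')  # => "Agent"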