shalmaneser-fred 1.2.0.rc4 → 1.2.rc5

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/fred +8 -3
  4. data/lib/fred/FredConventions.rb +190 -189
  5. data/lib/fred/abstract_context_provider.rb +246 -0
  6. data/lib/fred/abstract_fred_feature_access.rb +43 -0
  7. data/lib/fred/answer_key_access.rb +130 -0
  8. data/lib/fred/aux_keep_writers.rb +94 -0
  9. data/lib/fred/baseline.rb +153 -0
  10. data/lib/fred/context_provider.rb +55 -0
  11. data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
  12. data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
  13. data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
  14. data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
  15. data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
  16. data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
  17. data/lib/fred/feature_extractors.rb +5 -0
  18. data/lib/fred/file_zipped.rb +43 -0
  19. data/lib/fred/find_all_targets.rb +94 -0
  20. data/lib/fred/find_targets_from_frames.rb +92 -0
  21. data/lib/fred/fred.rb +43 -40
  22. data/lib/fred/fred_error.rb +15 -0
  23. data/lib/fred/fred_eval.rb +311 -0
  24. data/lib/fred/fred_feature_access.rb +420 -0
  25. data/lib/fred/fred_feature_info.rb +56 -0
  26. data/lib/fred/fred_featurize.rb +525 -0
  27. data/lib/fred/fred_parameters.rb +190 -0
  28. data/lib/fred/fred_split.rb +86 -0
  29. data/lib/fred/fred_split_pkg.rb +189 -0
  30. data/lib/fred/fred_test.rb +571 -0
  31. data/lib/fred/fred_train.rb +125 -0
  32. data/lib/fred/grammatical_function_access.rb +63 -0
  33. data/lib/fred/md5.rb +6 -0
  34. data/lib/fred/meta_feature_access.rb +185 -0
  35. data/lib/fred/non_contiguous_context_provider.rb +532 -0
  36. data/lib/fred/opt_parser.rb +182 -161
  37. data/lib/fred/plot_and_r_eval.rb +486 -0
  38. data/lib/fred/single_sent_context_provider.rb +76 -0
  39. data/lib/fred/slide_var.rb +148 -0
  40. data/lib/fred/targets.rb +136 -0
  41. data/lib/fred/toggle_var.rb +61 -0
  42. data/lib/fred/word_lemma_pos_ne.rb +51 -0
  43. data/lib/fred/write_features_binary.rb +95 -0
  44. data/lib/fred/write_features_nary.rb +51 -0
  45. data/lib/fred/write_features_nary_or_binary.rb +51 -0
  46. data/lib/shalmaneser/fred.rb +1 -0
  47. metadata +57 -30
  48. data/lib/fred/Baseline.rb +0 -150
  49. data/lib/fred/FileZipped.rb +0 -31
  50. data/lib/fred/FredBOWContext.rb +0 -877
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred_config_data.rb +0 -185
  64. data/test/frprep/test_opt_parser.rb +0 -94
  65. data/test/functional/functional_test_helper.rb +0 -58
  66. data/test/functional/test_fred.rb +0 -47
  67. data/test/functional/test_frprep.rb +0 -99
  68. data/test/functional/test_rosy.rb +0 -40
@@ -0,0 +1,190 @@
1
+ # FredParameters
2
+ # Katrin Erk, April 05
3
+ #
4
+ # Frame disambiguation system:
5
+ # test different values for system parameters,
6
+ # construct text and graphical output
7
+
8
+ # Salsa packages
9
+ require 'fred/plot_and_r_eval'
10
+ require 'fred/FredConventions' # !
11
+ require 'fred/fred_split'
12
+ require 'fred/fred_train'
13
+ require 'fred/fred_test'
14
+ require 'fred/fred_eval'
15
+ require 'fred/toggle_var'
16
+ require 'fred/slide_var'
17
+
18
+ require 'logging'
19
+
20
+ module Shalmaneser
21
+ module Fred
22
+ ##########################################
23
+ # main class of this package:
24
+ # try out different values for system parameters,
25
+ # and record the result.
26
+ #
27
+ # One value can be a slide variable, taking on several numerical values.
28
+ # 0 or more values can be toggle variables, taking on the values true and false.
29
+ # @todo AB: Reintroduce this task!!!
30
+ class FredParameters
31
+ #####
32
+ # @param [FredConfigData] exp
33
+ # @param [Hash] options hash: runtime option name (string) => value(string)
34
+ def initialize(exp, options) # stores exp and reads --slide, --toggle and --output_to from options
35
+ @exp = exp
36
+
37
+ # Evaluate runtime options:
38
+ # record the slide variable (if any) plus all toggle variables.
39
+ @slide = SlideVar.new("", @exp) # default slide variable, built from an empty description
40
+ @toggle = [] # ToggleVar objects, one per entry given with --toggle
41
+ @outfile_prefix = "fred_parameters" # default prefix; compute appends ".txt" and ".ps" to it
42
+
43
+ options.each_pair do |opt, arg|
44
+ case opt
45
+ when "--slide"
46
+ @slide = SlideVar.new(arg, @exp)
47
+
48
+ when "--toggle"
49
+ arg.split(":").each { |toggle_var| # the argument is a colon-separated list
50
+ @toggle << ToggleVar.new(toggle_var, @exp)
51
+ }
52
+
53
+ when "--output_to"
54
+ @outfile_prefix = arg
55
+
56
+ else
57
+ # Unknown arguments are deliberately ignored here; they have been dealt with by fred.rb.
58
+ end
59
+ end
60
+ end
61
+
62
+ ####
63
+ def compute # runs train/test/eval for every toggle/slide setting on one 80/20 split and reports the scores
64
+ LOGGER.info "Fred parameter exploration, experiment #{@exp.get("experiment_ID")}"
65
+ ##
66
+ # Make a split of the training data; this needs featurized training data.
67
+ begin
68
+ feature_dir = ::Shalmaneser::Fred.fred_dirname(@exp, "train", "features") # NOTE(review): feature_dir is not used again in this method; the call only triggers the rescue below on failure
69
+ rescue
70
+ $stderr.puts "To experiment with system parameters, please first featurize training data."
71
+ exit 1
72
+ end
73
+ # make new split ID from system time, and make a split with 80% training, 20% test data
74
+ splitID = Time.new.to_f.to_s
75
+ task_obj = FredSplit.new(@exp,
76
+ { "--logID" => splitID,
77
+ "--trainpercent" => "80",
78
+ },
79
+ true # ignore unambiguous
80
+ )
81
+ task_obj.compute
82
+
83
+ ##
84
+ # start recording results:
85
+
86
+ # Text output file, named <outfile_prefix>.txt.
87
+ begin
88
+ textout_file = File.new(@outfile_prefix + ".txt", "w") # NOTE(review): only flushed below, never closed in this method
89
+ rescue
90
+ raise "Could not write to output file #{@outfile_prefix}.txt"
91
+ end
92
+
93
+ # values_to_score: hash toggle_values_descr(string) =>
94
+ # hash slide_value(float) => score(float)
95
+ values_to_score = {}
96
+
97
+ # max_score: float, describing maximum score achieved
98
+ # max_setting: string, describing values for maximum score
99
+ max_score = 0.0
100
+ max_setting = ""
101
+
102
+ ##
103
+ # For each combination of toggle values: 'binary' is a bit mask with one bit per toggle variable.
104
+ 0.upto(2**@toggle.length - 1) { |binary| # with no toggle variables this runs exactly once (binary == 0)
105
+
106
+ textout_line = "" # description of the current toggle setting; also the key into values_to_score
107
+
108
+ # re-set toggle values according to 'binary':
109
+ @toggle.each_index { |i|
110
+ # if the i-th bit is set in binary, set this
111
+ # boolean to true, else set it to false
112
+ if (binary & (2**i)) > 0
113
+ textout_line << @toggle[i].set_value_to(true, @exp) + " "
114
+ else
115
+ textout_line << @toggle[i].set_value_to(false, @exp) + " "
116
+ end
117
+ }
118
+
119
+ values_to_score[textout_line] = {}
120
+
121
+
122
+ ##
123
+ # for each value of the slide variable
124
+ @slide.each_slide_value(@exp) { |slide_value, slide_value_description|
125
+
126
+ ##
127
+ # Progress message on stderr.
128
+ $stderr.puts "Parameter exploration: #{textout_line} #{slide_value_description}"
129
+
130
+ ##
131
+ # @exp has been modified to fit the current values of the
132
+ # slide and toggle variables.
133
+ # Now train, test, evaluate on the split we have constructed
134
+ task_obj = FredTrain.new(@exp, { "--logID" => splitID})
135
+ task_obj.compute
136
+ task_obj = FredTest.new(@exp,
137
+ { "--logID" => splitID,
138
+ "--nooutput"=> true
139
+ })
140
+ task_obj.compute
141
+ task_obj = FredEval.new(@exp, {"--logID" => splitID})
142
+ task_obj.compute(false) # don't print evaluation results to file
143
+
144
+ ##
145
+ # read off F-score, record result
146
+ score = task_obj.f
147
+
148
+ textout_file.puts textout_line + slide_value_description + " : " + score.to_s
149
+ textout_file.flush
150
+ values_to_score[textout_line][slide_value] = score
151
+
152
+ if score > max_score # strict comparison: a score of exactly 0.0 never becomes the recorded maximum
153
+ max_score = score
154
+ max_setting = textout_line + slide_value_description + " : " + score.to_s
155
+ end
156
+ }
157
+ }
158
+
159
+ ##
160
+ # Remove the temporary split created at the start of this method.
161
+ FredSplit.remove_split(@exp, splitID)
162
+
163
+ ##
164
+ # plot outcome, report overall maximum
165
+
166
+ unless @slide.empty?
167
+ # gnuplot output only if some slide variable has been used
168
+ title = "Exploring #{@slide.var_name}, " + @toggle.map { |toggle_obj| toggle_obj.var_name }.join(", ")
169
+ PlotAndREval.gnuplot_direct(values_to_score,
170
+ title,
171
+ @slide.var_name,
172
+ "F-score",
173
+ @outfile_prefix + ".ps")
174
+ end
175
+
176
+ $stderr.puts "Parameter exploration finished."
177
+ $stderr.puts "Text output was written to #{@outfile_prefix}.txt"
178
+ unless @slide.empty?
179
+ $stderr.puts "Gnuplot output was written to #{@outfile_prefix}.ps"
180
+ end
181
+
182
+ unless max_setting.empty? # stays empty when no setting scored above 0.0
183
+ $stderr.puts "-----------------------"
184
+ $stderr.puts "Maximum score:"
185
+ $stderr.puts max_setting
186
+ end
187
+ end
188
+ end
189
+ end
190
+ end
@@ -0,0 +1,86 @@
1
+ # FredSplit
2
+ # Katrin Erk April 05
3
+ #
4
+ # Frame disambiguation system:
5
+ # make random split of the training data
6
+ #
7
+ # The split is computed on the basis of the Fred format
8
+ # feature data.
9
+ # The split is recorded in a separate split directory
10
+ # with a very simple system:
11
+ # - one file per feature file, same filename
12
+ # - one line per instance line in feature file
13
+ # - entry in that line is either 'train' or 'test'
14
+
15
+ # Fred packages
16
+ require 'fred/fred_split_pkg'
17
+ require 'logging'
18
+
19
+ module Shalmaneser
20
+ module Fred
21
+ class FredSplit
22
+ # @param [FredConfigData] exp
23
+ # @param [String] split_id
24
+ def self.remove_split(exp, split_id) # removes the split with the given ID by delegating to FredSplitPkg
25
+ FredSplitPkg.remove_split(exp, split_id)
26
+ end
27
+
28
+ ###
29
+ # new
30
+ #
31
+ # evaluate runtime options and announce the task
32
+ def initialize(exp_obj, # FredConfigData object
33
+ options, # hash: runtime option name (string) => value(string)
34
+ ignore_unambiguous = false) # boolean: handed on to FredSplitPkg#make_new_split by compute
35
+
36
+ @exp = exp_obj
37
+ @ignore_unambiguous = ignore_unambiguous
38
+
39
+ # Evaluate runtime options; the defaults are no split ID and 90% training data.
40
+ @split_id = nil
41
+ @trainpercent = 0.9 # stored as a fraction, not as a percentage
42
+
43
+ options.each_pair do |opt, arg|
44
+ case opt
45
+ when "--logID"
46
+ @split_id = arg
47
+
48
+ # @todo AB: Should be prepared in the ConfigData/OptParser.
49
+ when "--trainpercent"
50
+ @trainpercent = arg.to_f / 100.0 # the option is given in percent; convert it to a fraction
51
+ end
52
+ end
53
+
54
+ # sanity check: need a log ID
55
+ # @todo AB: Move it to OptParser
56
+ if @split_id.nil?
57
+ raise "I need a log ID, parameter --logID"
58
+ end
59
+
60
+ # The fraction must lie strictly between 0 and 1.
61
+ # @todo AB: Move it to OptParser
62
+ if @trainpercent <= 0.0 or @trainpercent >= 1.0
63
+ raise "Training percentage needs to be between 1 and 99. I got "+
64
+ (@trainpercent * 100.0).to_i.to_s
65
+ end
66
+
67
+ ##
68
+ # make a splitting object
69
+ @split_obj = FredSplitPkg.new(@exp)
70
+ end
70
+
71
+ ###
72
+ # compute
73
+ #
74
+ # do the splitting
75
+ def compute # logs the task, then replaces any split stored under @split_id with a new random split
76
+ # announce the task
77
+ LOGGER.info "Fred experiment #{@exp.get("experiment_ID")}: "\
78
+ "Making split, using #{(@trainpercent * 100.0).to_i}% as training data."
79
+ # Remove a previous split with the same ID first, then compute the new one.
80
+ FredSplitPkg.remove_split(@exp, @split_id)
81
+ @split_obj.make_new_split(@split_id, @trainpercent,
82
+ @ignore_unambiguous)
83
+ end
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,189 @@
1
+ require "tempfile"
2
+ require 'fileutils'
3
+ require 'fred/targets'
4
+ require 'fred/FredConventions' # !
5
+ require 'fred/fred_error'
6
+
7
+ require 'logging'
8
+
9
+ module Shalmaneser
10
+ module Fred
11
+ # splitting package for WSD:
12
+ # compute a split for feature files (one item a line, CSV),
13
+ # and apply pre-computed split
14
+ # to produce new feature files accordingly
15
+ class FredSplitPkg
16
+ ###
17
+ # remove an old split
18
+ # @param [FredConfigData] exp object
19
+ # @param [String] split_id
20
+ def self.remove_split(exp, split_id) # deletes the split directory for split_id; does nothing if it cannot be resolved
21
+ begin
22
+ # The directory is resolved via fred_dirname directly: split_dir is a private instance method and not callable here.
23
+ split_dir = ::Shalmaneser::Fred.fred_dirname(exp, 'split', split_id, 'new') # NOTE(review): semantics of mode 'new' are defined in fred_dirname - confirm there
24
+ rescue
25
+ # no split to be removed
26
+ return
27
+ end
28
+ # Recursively remove the split directory and everything in it.
29
+ FileUtils.rm_rf(split_dir)
30
+ end
31
+
32
+ def initialize(exp) # exp: experiment configuration object, used to resolve split and feature directories
33
+ @exp = exp
34
+ end
35
+
36
+ # make a new split
37
+ def make_new_split(split_id, # string: ID
38
+ trainpercent, # float: fraction of training data; a line becomes "train" when rand < trainpercent
39
+ ignore_unambiguous = false) # boolean: if true, lemmas with fewer than 2 senses get "ignore" lines
40
+ # Writes one split file per lemma, with one line ("train", "test" or "ignore") per feature-file line.
41
+ # where to store the split?
42
+ split_dir = split_dir(@exp, split_id, "new") # local variable shadows the private method of the same name from here on
43
+
44
+ lemmas_and_senses = Targets.new(@exp, nil, "r")
45
+ unless lemmas_and_senses.targets_okay
46
+ # error during initialization
47
+ raise FredError, "FredSplitPkg: Error: Could not read list of known targets, bailing out."
48
+ end
49
+
50
+ # Iterate through lemmas,
51
+ # split training feature files.
52
+ #
53
+ # Do the split only once per lemma,
54
+ # even if we have sense-specific feature files
55
+ feature_dir = ::Shalmaneser::Fred.fred_dirname(@exp, "train", "features")
56
+
57
+ lemmas_and_senses.get_lemmas.each { |lemma|
58
+ # construct split file
59
+ splitfilename = split_dir + fred_split_filename(lemma) # plain string concatenation; assumes split_dir ends with a path separator - TODO confirm
60
+ begin
61
+ splitfile = File.new(splitfilename, "w")
62
+ rescue
63
+ raise "Error: Couldn't write to file " + splitfilename
64
+ end
65
+
66
+ # find lemma-specific feature file
67
+
68
+ filename = feature_dir + ::Shalmaneser::Fred.fred_feature_filename(lemma)
69
+
70
+ unless File.exist?(filename)
71
+ # try lemma+sense-specific feature file
72
+ file_pattern = ::Shalmaneser::Fred.fred_feature_filename(lemma, "*", true)
73
+ filename = Dir[feature_dir + file_pattern].first # only the first matching file is used
74
+
75
+ unless filename
76
+ # no lemma+sense-specific feature file
77
+ LOGGER.warn "Warning: split: no feature file found for #{lemma}, skipping."
78
+ splitfile.close # the split file for this lemma stays empty
79
+ next
80
+ end
81
+ end
82
+
83
+ # open feature file for reading
84
+ begin
85
+ file = File.new(filename) # NOTE(review): this handle is never closed in this method
86
+ rescue
87
+ raise "Couldn't read feature file " + filename
88
+ end
89
+
90
+ if ignore_unambiguous and
91
+ lemmas_and_senses.get_senses(lemma).length < 2
92
+ # unambiguous: ignore
93
+ # Write one "ignore" line per feature-file line so both files keep the same length.
94
+ while file.gets
95
+ splitfile.puts "ignore"
96
+ end
97
+
98
+ else
99
+ # read from feature file, classify at random
100
+ # as train or test,
101
+ # write result to splitfile
102
+
103
+ while file.gets
104
+ if rand < trainpercent
105
+ splitfile.puts "train"
106
+ else
107
+ splitfile.puts "test"
108
+ end
109
+ end
110
+ end
111
+
112
+ splitfile.close
113
+ }
114
+ end
115
+
116
+ # change feature files according to
117
+ # pre-computed split
118
+ #
119
+ #
120
+ # returns: tempfile containing featurized items,
121
+ # according to split,
122
+ # or nil if the split file wouldn't contain any data
123
+ def apply_split(filename, # feature file
124
+ lemma, # string: lemma that filename is about
125
+ dataset, # string: train, test
126
+ split_id) # string: split ID
127
+ # Returns a closed Tempfile holding the feature lines whose split entry equals dataset, or nil if there are none.
128
+ split_filename = split_dir(@exp, split_id) + fred_split_filename(lemma)
129
+
130
+ # read feature file and split file at the same time
131
+ # write to tempfile.
132
+ f_feat = File.new(filename)
133
+ f_split = File.new(split_filename)
134
+ f_out = Tempfile.new("fred_split")
135
+
136
+ num_yes = 0 # number of feature lines written to f_out
137
+
138
+ f_feat.each do |line|
139
+ begin
140
+ split_part = f_split.readline.chomp # readline raises at end of file, i.e. when the split file is shorter than the feature file
141
+ rescue
142
+ $stderr.puts "FredSplit error: split file too short."
143
+ $stderr.puts "skipping rest of featurization data."
144
+ $stderr.puts "Split file: #{split_filename}"
145
+ $stderr.puts "Feature file: #{filename}"
146
+ # @todo AB: FIXME
147
+ raise "HIER" # NOTE(review): this raise makes the rest of the rescue branch unreachable; f_feat and f_split are not closed on this path
148
+
149
+ f_out.close
150
+ if num_yes > 0
151
+ return f_out
152
+ else
153
+ return nil
154
+ end
155
+ end
156
+ # Entries that differ from dataset (the other part, or "ignore") are skipped.
157
+ if split_part == dataset
158
+ # write training data, and this item is in the training
159
+ # part of the split,
160
+ # or write test data, and item is in test part
161
+ f_out.puts line
162
+ num_yes += 1
163
+ end
164
+ end
165
+
166
+ f_out.close
167
+ f_feat.close
168
+ f_split.close
169
+ # Hand back the tempfile only if at least one line was written.
170
+ if num_yes > 0
171
+ return f_out
172
+ else
173
+ return nil
174
+ end
175
+
176
+ end
177
+
178
+ private
179
+
180
+ def fred_split_filename(lemma) # returns the name (without directory) of the split file for lemma
181
+ "fred.split.#{lemma}"
182
+ end
183
+
184
+ def split_dir(exp, split_id, mode = "existing") # resolves the directory of the given split via fred_dirname; mode is passed through unchanged
185
+ ::Shalmaneser::Fred.fred_dirname(exp, "split", split_id, mode)
186
+ end
187
+ end
188
+ end
189
+ end