shalmaneser-fred 1.2.0.rc4 → 1.2.rc5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/fred +8 -3
- data/lib/fred/FredConventions.rb +190 -189
- data/lib/fred/abstract_context_provider.rb +246 -0
- data/lib/fred/abstract_fred_feature_access.rb +43 -0
- data/lib/fred/answer_key_access.rb +130 -0
- data/lib/fred/aux_keep_writers.rb +94 -0
- data/lib/fred/baseline.rb +153 -0
- data/lib/fred/context_provider.rb +55 -0
- data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
- data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
- data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
- data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
- data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
- data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
- data/lib/fred/feature_extractors.rb +5 -0
- data/lib/fred/file_zipped.rb +43 -0
- data/lib/fred/find_all_targets.rb +94 -0
- data/lib/fred/find_targets_from_frames.rb +92 -0
- data/lib/fred/fred.rb +43 -40
- data/lib/fred/fred_error.rb +15 -0
- data/lib/fred/fred_eval.rb +311 -0
- data/lib/fred/fred_feature_access.rb +420 -0
- data/lib/fred/fred_feature_info.rb +56 -0
- data/lib/fred/fred_featurize.rb +525 -0
- data/lib/fred/fred_parameters.rb +190 -0
- data/lib/fred/fred_split.rb +86 -0
- data/lib/fred/fred_split_pkg.rb +189 -0
- data/lib/fred/fred_test.rb +571 -0
- data/lib/fred/fred_train.rb +125 -0
- data/lib/fred/grammatical_function_access.rb +63 -0
- data/lib/fred/md5.rb +6 -0
- data/lib/fred/meta_feature_access.rb +185 -0
- data/lib/fred/non_contiguous_context_provider.rb +532 -0
- data/lib/fred/opt_parser.rb +182 -161
- data/lib/fred/plot_and_r_eval.rb +486 -0
- data/lib/fred/single_sent_context_provider.rb +76 -0
- data/lib/fred/slide_var.rb +148 -0
- data/lib/fred/targets.rb +136 -0
- data/lib/fred/toggle_var.rb +61 -0
- data/lib/fred/word_lemma_pos_ne.rb +51 -0
- data/lib/fred/write_features_binary.rb +95 -0
- data/lib/fred/write_features_nary.rb +51 -0
- data/lib/fred/write_features_nary_or_binary.rb +51 -0
- data/lib/shalmaneser/fred.rb +1 -0
- metadata +57 -30
- data/lib/fred/Baseline.rb +0 -150
- data/lib/fred/FileZipped.rb +0 -31
- data/lib/fred/FredBOWContext.rb +0 -877
- data/lib/fred/FredDetermineTargets.rb +0 -319
- data/lib/fred/FredEval.rb +0 -312
- data/lib/fred/FredFeatureExtractors.rb +0 -322
- data/lib/fred/FredFeatures.rb +0 -1061
- data/lib/fred/FredFeaturize.rb +0 -602
- data/lib/fred/FredNumTrainingSenses.rb +0 -27
- data/lib/fred/FredParameters.rb +0 -402
- data/lib/fred/FredSplit.rb +0 -84
- data/lib/fred/FredSplitPkg.rb +0 -180
- data/lib/fred/FredTest.rb +0 -606
- data/lib/fred/FredTrain.rb +0 -144
- data/lib/fred/PlotAndREval.rb +0 -480
- data/lib/fred/fred_config_data.rb +0 -185
- data/test/frprep/test_opt_parser.rb +0 -94
- data/test/functional/functional_test_helper.rb +0 -58
- data/test/functional/test_fred.rb +0 -47
- data/test/functional/test_frprep.rb +0 -99
- data/test/functional/test_rosy.rb +0 -40
@@ -0,0 +1,190 @@
|
|
1
|
+
# FredParameters
|
2
|
+
# Katrin Erk, April 05
|
3
|
+
#
|
4
|
+
# Frame disambiguation system:
|
5
|
+
# test different values for system parameters,
|
6
|
+
# construct text and graphical output
|
7
|
+
|
8
|
+
# Salsa packages
|
9
|
+
require 'fred/plot_and_r_eval'
|
10
|
+
require 'fred/FredConventions' # !
|
11
|
+
require 'fred/fred_split'
|
12
|
+
require 'fred/fred_train'
|
13
|
+
require 'fred/fred_test'
|
14
|
+
require 'fred/fred_eval'
|
15
|
+
require 'fred/toggle_var'
|
16
|
+
require 'fred/slide_var'
|
17
|
+
|
18
|
+
require 'logging'
|
19
|
+
|
20
|
+
module Shalmaneser
|
21
|
+
module Fred
|
22
|
+
##########################################
|
23
|
+
# main class of this package:
|
24
|
+
# try out different values for system parameters,
|
25
|
+
# and record the result.
|
26
|
+
#
|
27
|
+
# One value can be a slide variable, taking on several numerical values.
|
28
|
+
# 0 or more values can be toggle variables, taking on the values true and false.
|
29
|
+
# @todo AB: Reintroduce this task!!!
|
30
|
+
class FredParameters
  # Evaluate the runtime options and remember which variables to explore.
  #
  # Recognised options:
  # * "--slide"     => description string handed to SlideVar.new
  # * "--toggle"    => colon-separated list, one ToggleVar per entry
  # * "--output_to" => prefix for the ".txt" and ".ps" output files
  #   (defaults to "fred_parameters")
  #
  # Any other option is ignored here; unknown arguments have already been
  # dealt with by fred.rb.
  #
  # @param [FredConfigData] exp
  # @param [Hash] options hash: runtime option name (string) => value(string)
  def initialize(exp, options)
    @exp = exp

    # Defaults: a slide variable built from an empty description,
    # no toggle variables.
    @slide = SlideVar.new("", @exp)
    @toggle = []
    @outfile_prefix = "fred_parameters"

    options.each_pair do |opt, arg|
      case opt
      when "--slide"
        @slide = SlideVar.new(arg, @exp)
      when "--toggle"
        arg.split(":").each do |toggle_var|
          @toggle << ToggleVar.new(toggle_var, @exp)
        end
      when "--output_to"
        @outfile_prefix = arg
      end
    end
  end

  # Run the parameter exploration.
  #
  # Makes an 80/20 split of the training data, then for every combination of
  # toggle values and every slide value trains, tests and evaluates on that
  # split. Each score is appended to "<prefix>.txt"; if a slide variable is in
  # use, the scores are also handed to PlotAndREval.gnuplot_direct for
  # "<prefix>.ps". Progress and the best setting are reported on $stderr.
  #
  # Exits the process with status 1 if the training feature directory cannot
  # be determined.
  #
  # @raise [RuntimeError] if the text output file cannot be opened for writing
  def compute
    LOGGER.info "Fred parameter exploration, experiment #{@exp.get("experiment_ID")}"

    ##
    # make a split of the training data

    # Only the side effect matters here: the lookup fails when no
    # featurized training data is available.
    begin
      ::Shalmaneser::Fred.fred_dirname(@exp, "train", "features")
    rescue
      $stderr.puts "To experiment with system parameters, please first featurize training data."
      exit 1
    end

    # make new split ID from system time, and make a split with 80% training, 20% test data
    splitID = Time.new.to_f.to_s
    split_task = FredSplit.new(@exp,
                               { "--logID" => splitID, "--trainpercent" => "80" },
                               true) # ignore unambiguous
    split_task.compute

    ##
    # start recording results: text output file
    begin
      textout_file = File.new(@outfile_prefix + ".txt", "w")
    rescue
      raise "Could not write to output file #{@outfile_prefix}.txt"
    end

    # values_to_score: hash toggle_values_descr(string) =>
    #                    hash slide_value(float) => score(float)
    values_to_score = {}

    # max_score: float, describing maximum score achieved
    # max_setting: string, describing values for maximum score
    max_score = 0.0
    max_setting = ""

    begin
      # for each value of the toggle variables:
      # bit i of 'binary' holds the value of the i-th toggle variable
      0.upto(2**@toggle.length - 1) do |binary|
        textout_line = apply_toggle_values(binary)
        values_to_score[textout_line] = {}

        # for each value of the slide variable
        @slide.each_slide_value(@exp) do |slide_value, slide_value_description|
          # progress bar
          $stderr.puts "Parameter exploration: #{textout_line} #{slide_value_description}"

          score = evaluate_on_split(splitID)
          result_line = textout_line + slide_value_description + " : " + score.to_s

          # flush after every line so partial results survive an abort
          textout_file.puts result_line
          textout_file.flush
          values_to_score[textout_line][slide_value] = score

          if score > max_score
            max_score = score
            max_setting = result_line
          end
        end
      end
    ensure
      # the result file used to stay open for the rest of the process
      textout_file.close
    end

    ##
    # remove split
    FredSplit.remove_split(@exp, splitID)

    ##
    # plot outcome, report overall maximum
    unless @slide.empty?
      # gnuplot output only if some slide variable has been used.
      # Joining all names avoids a dangling ", " when there are no toggles.
      title = "Exploring " + [@slide.var_name, *@toggle.map { |toggle_obj| toggle_obj.var_name }].join(", ")
      PlotAndREval.gnuplot_direct(values_to_score,
                                  title,
                                  @slide.var_name,
                                  "F-score",
                                  @outfile_prefix + ".ps")
    end

    $stderr.puts "Parameter exploration finished."
    $stderr.puts "Text output was written to #{@outfile_prefix}.txt"
    unless @slide.empty?
      $stderr.puts "Gnuplot output was written to #{@outfile_prefix}.ps"
    end

    unless max_setting.empty?
      $stderr.puts "-----------------------"
      $stderr.puts "Maximum score:"
      $stderr.puts max_setting
    end
  end

  private

  # Set every toggle variable according to the bits of +binary+ and return
  # the concatenated descriptions, each followed by a single space.
  #
  # @param [Integer] binary bit i set => i-th toggle variable is true
  # @return [String]
  def apply_toggle_values(binary)
    descriptions = @toggle.each_with_index.map do |toggle_obj, i|
      toggle_obj.set_value_to((binary & (2**i)) > 0, @exp) + " "
    end
    descriptions.join
  end

  # Train, test and evaluate on the split +split_id+ with the current state
  # of @exp (already adjusted to the current slide and toggle values).
  #
  # @param [String] split_id
  # @return the value of FredEval#f, used by the caller as the F-score
  def evaluate_on_split(split_id)
    FredTrain.new(@exp, { "--logID" => split_id }).compute
    FredTest.new(@exp, { "--logID" => split_id, "--nooutput" => true }).compute

    eval_task = FredEval.new(@exp, { "--logID" => split_id })
    eval_task.compute(false) # don't print evaluation results to file
    eval_task.f
  end
end
|
189
|
+
end
|
190
|
+
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# FredSplit
|
2
|
+
# Katrin Erk April 05
|
3
|
+
#
|
4
|
+
# Frame disambiguation system:
|
5
|
+
# make random split of the training data
|
6
|
+
#
|
7
|
+
# The split is computed on the basis of the Fred format
|
8
|
+
# feature data.
|
9
|
+
# The split is recorded in a separate split directory
|
10
|
+
# with a very simple system:
|
11
|
+
# - one file per feature file, same filename
|
12
|
+
# - one line per instance line in feature file
|
13
|
+
# - entry in that line is either 'train' or 'test'
|
14
|
+
|
15
|
+
# Fred packages
|
16
|
+
require 'fred/fred_split_pkg'
|
17
|
+
require 'logging'
|
18
|
+
|
19
|
+
module Shalmaneser
|
20
|
+
module Fred
|
21
|
+
class FredSplit
  # Remove a previously stored split; delegates to FredSplitPkg.
  #
  # @param [FredConfigData] exp
  # @param [String] split_id
  def self.remove_split(exp, split_id)
    FredSplitPkg.remove_split(exp, split_id)
  end

  # Evaluate the runtime options and prepare the splitting object.
  #
  # Recognised options: "--logID" (the split ID, mandatory) and
  # "--trainpercent" (share of training data in percent, default 90).
  #
  # @param [FredConfigData] exp_obj
  # @param [Hash] options hash: runtime option name (string) => value(string)
  # @param [Boolean] ignore_unambiguous passed through to
  #   FredSplitPkg#make_new_split
  # @raise [RuntimeError] if no split ID was given or the training
  #   percentage is not strictly between 0 and 100
  def initialize(exp_obj, options, ignore_unambiguous = false)
    @exp = exp_obj
    @ignore_unambiguous = ignore_unambiguous

    # defaults, overridden by the runtime options below
    @split_id = nil
    @trainpercent = 0.9

    options.each_pair do |name, value|
      if "--logID" == name
        @split_id = value
      elsif "--trainpercent" == name
        # @todo AB: Should be prepared in the ConfigData/OptParser.
        @trainpercent = value.to_f / 100.0
      end
    end

    # sanity checks
    # @todo AB: Move them to OptParser
    raise "I need a log ID, parameter --logID" if @split_id.nil?

    if @trainpercent <= 0.0 || @trainpercent >= 1.0
      raise "Training percentage needs to be between 1 and 99. I got #{(@trainpercent * 100.0).to_i}"
    end

    # the object that does the actual splitting
    @split_obj = FredSplitPkg.new(@exp)
  end

  # Do the splitting: announce the task, discard any split stored under
  # the same ID, then create the new one.
  def compute
    percent = (@trainpercent * 100.0).to_i
    LOGGER.info "Fred experiment #{@exp.get("experiment_ID")}: Making split, using #{percent}% as training data."

    FredSplitPkg.remove_split(@exp, @split_id)
    @split_obj.make_new_split(@split_id, @trainpercent, @ignore_unambiguous)
  end
end
|
85
|
+
end
|
86
|
+
end
|
@@ -0,0 +1,189 @@
|
|
1
|
+
require "tempfile"
|
2
|
+
require 'fileutils'
|
3
|
+
require 'fred/targets'
|
4
|
+
require 'fred/FredConventions' # !
|
5
|
+
require 'fred/fred_error'
|
6
|
+
|
7
|
+
require 'logging'
|
8
|
+
|
9
|
+
module Shalmaneser
|
10
|
+
module Fred
|
11
|
+
# splitting package for WSD:
|
12
|
+
# compute a split for feature files (one item a line, CSV),
|
13
|
+
# and apply pre-computed split
|
14
|
+
# to produce new feature files accordingly
|
15
|
+
class FredSplitPkg
  ###
  # Remove an old split.
  #
  # Does nothing if the split directory cannot be determined.
  #
  # @param [FredConfigData] exp object
  # @param [String] split_id
  def self.remove_split(exp, split_id)
    begin
      split_dir = ::Shalmaneser::Fred.fred_dirname(exp, 'split', split_id, 'new')
    rescue
      # no split to be removed
      return
    end

    FileUtils.rm_rf(split_dir)
  end

  # @param [FredConfigData] exp
  def initialize(exp)
    @exp = exp
  end

  # Make a new split.
  #
  # For every known lemma a split file is written with one line per line of
  # the lemma's training feature file: "train" or "test", chosen at random,
  # or "ignore" for every line when +ignore_unambiguous+ is set and the lemma
  # has fewer than two senses. The split is done only once per lemma, even if
  # there are sense-specific feature files. A lemma without any feature file
  # is skipped with a warning (its split file stays empty).
  #
  # @param [String] split_id ID of the split
  # @param [Float] trainpercent a line is labelled "train" when
  #   rand < trainpercent
  # @param [Boolean] ignore_unambiguous
  # @raise [FredError] if the list of known targets could not be read
  # @raise [RuntimeError] if a split file cannot be written or a feature
  #   file cannot be read
  def make_new_split(split_id, trainpercent, ignore_unambiguous = false)
    # where to store the split?
    target_dir = split_dir(@exp, split_id, "new")

    lemmas_and_senses = Targets.new(@exp, nil, "r")
    unless lemmas_and_senses.targets_okay
      # error during initialization
      raise FredError, "FredSplitPkg: Error: Could not read list of known targets, bailing out."
    end

    feature_dir = ::Shalmaneser::Fred.fred_dirname(@exp, "train", "features")

    lemmas_and_senses.get_lemmas.each do |lemma|
      # construct split file
      splitfilename = target_dir + fred_split_filename(lemma)
      begin
        splitfile = File.new(splitfilename, "w")
      rescue
        raise "Error: Couldn't write to file " + splitfilename
      end

      begin
        filename = find_feature_file(feature_dir, lemma)
        unless filename
          LOGGER.warn "Warning: split: no feature file found for #{lemma}, skipping."
          next
        end

        # open feature file for reading
        begin
          file = File.new(filename)
        rescue
          raise "Couldn't read feature file " + filename
        end

        begin
          unambiguous = ignore_unambiguous &&
                        lemmas_and_senses.get_senses(lemma).length < 2

          # one label per feature line; the line content itself is irrelevant
          while file.gets
            splitfile.puts split_label(unambiguous, trainpercent)
          end
        ensure
          # the feature file used to stay open for every lemma
          file.close
        end
      ensure
        # also runs on 'next' and when an exception is raised
        splitfile.close
      end
    end
  end

  # Change feature files according to a pre-computed split.
  #
  # Reads the feature file and the lemma's split file in parallel and copies
  # every feature line whose split entry equals +dataset+ to a tempfile.
  #
  # @param [String] filename feature file
  # @param [String] lemma lemma that filename is about
  # @param [String] dataset train, test
  # @param [String] split_id split ID
  # @return [Tempfile, nil] closed tempfile containing featurized items
  #   according to split, or nil if it wouldn't contain any data
  # @raise [RuntimeError] if the split file has fewer lines than the
  #   feature file
  def apply_split(filename, lemma, dataset, split_id)
    split_filename = split_dir(@exp, split_id) + fred_split_filename(lemma)

    f_out = nil
    num_yes = 0

    # block form: both input files are closed on every exit path
    File.open(filename) do |f_feat|
      File.open(split_filename) do |f_split|
        f_out = Tempfile.new("fred_split")

        f_feat.each do |line|
          split_part = f_split.gets
          if split_part.nil?
            # @todo AB: FIXME -- decide whether this should skip the rest of
            # the data instead; for now it stays a hard error, as before.
            raise "FredSplit error: split file too short. " \
                  "Split file: #{split_filename} Feature file: #{filename}"
          end

          # write training data, and this item is in the training
          # part of the split,
          # or write test data, and item is in test part
          next unless split_part.chomp == dataset

          f_out.puts line
          num_yes += 1
        end
      end
    end

    if num_yes > 0
      f_out.close
      f_out
    else
      # nothing selected: drop the empty tempfile right away
      f_out.close!
      nil
    end
  rescue StandardError
    # do not leave a half-written tempfile behind
    f_out.close! if f_out
    raise
  end

  private

  # Name of the split file for +lemma+, without directory.
  def fred_split_filename(lemma)
    "fred.split.#{lemma}"
  end

  # Directory of the split +split_id+, as determined by
  # Shalmaneser::Fred.fred_dirname; +mode+ is passed through unchanged.
  def split_dir(exp, split_id, mode = "existing")
    ::Shalmaneser::Fred.fred_dirname(exp, "split", split_id, mode)
  end

  # Find the training feature file for +lemma+: the lemma-specific file if
  # it exists, otherwise the first lemma+sense-specific one.
  #
  # @return [String, nil] path of the feature file, or nil if there is none
  def find_feature_file(feature_dir, lemma)
    filename = feature_dir + ::Shalmaneser::Fred.fred_feature_filename(lemma)
    return filename if File.exist?(filename)

    # try lemma+sense-specific feature file
    file_pattern = ::Shalmaneser::Fred.fred_feature_filename(lemma, "*", true)
    Dir[feature_dir + file_pattern].first
  end

  # Label for one feature line: "ignore" for unambiguous lemmas, otherwise
  # "train" or "test" at random. rand is only drawn for ambiguous lemmas.
  def split_label(unambiguous, trainpercent)
    return "ignore" if unambiguous

    rand < trainpercent ? "train" : "test"
  end
end
|
188
|
+
end
|
189
|
+
end
|