shalmaneser 1.2.0.rc4 → 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/shalmaneser +8 -2
- data/doc/index.md +1 -0
- data/lib/shalmaneser/opt_parser.rb +68 -67
- metadata +49 -119
- data/bin/fred +0 -16
- data/bin/frprep +0 -34
- data/bin/rosy +0 -17
- data/lib/common/AbstractSynInterface.rb +0 -1229
- data/lib/common/Counter.rb +0 -18
- data/lib/common/EnduserMode.rb +0 -27
- data/lib/common/Eval.rb +0 -480
- data/lib/common/FixSynSemMapping.rb +0 -196
- data/lib/common/Graph.rb +0 -345
- data/lib/common/ISO-8859-1.rb +0 -24
- data/lib/common/ML.rb +0 -186
- data/lib/common/Mallet.rb +0 -236
- data/lib/common/Maxent.rb +0 -229
- data/lib/common/Optimise.rb +0 -195
- data/lib/common/Parser.rb +0 -213
- data/lib/common/RegXML.rb +0 -269
- data/lib/common/RosyConventions.rb +0 -171
- data/lib/common/STXmlTerminalOrder.rb +0 -194
- data/lib/common/SalsaTigerRegXML.rb +0 -2347
- data/lib/common/SalsaTigerXMLHelper.rb +0 -99
- data/lib/common/SynInterfaces.rb +0 -282
- data/lib/common/TabFormat.rb +0 -721
- data/lib/common/Tiger.rb +0 -1448
- data/lib/common/Timbl.rb +0 -144
- data/lib/common/Tree.rb +0 -61
- data/lib/common/config_data.rb +0 -470
- data/lib/common/config_format_element.rb +0 -220
- data/lib/common/headz.rb +0 -338
- data/lib/common/option_parser.rb +0 -13
- data/lib/common/prep_config_data.rb +0 -62
- data/lib/common/prep_helper.rb +0 -1330
- data/lib/common/ruby_class_extensions.rb +0 -310
- data/lib/db/db_interface.rb +0 -48
- data/lib/db/db_mysql.rb +0 -145
- data/lib/db/db_sqlite.rb +0 -280
- data/lib/db/db_table.rb +0 -239
- data/lib/db/db_wrapper.rb +0 -176
- data/lib/db/sql_query.rb +0 -243
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/fred/Baseline.rb +0 -150
- data/lib/fred/FileZipped.rb +0 -31
- data/lib/fred/FredBOWContext.rb +0 -877
- data/lib/fred/FredConventions.rb +0 -232
- data/lib/fred/FredDetermineTargets.rb +0 -319
- data/lib/fred/FredEval.rb +0 -312
- data/lib/fred/FredFeatureExtractors.rb +0 -322
- data/lib/fred/FredFeatures.rb +0 -1061
- data/lib/fred/FredFeaturize.rb +0 -602
- data/lib/fred/FredNumTrainingSenses.rb +0 -27
- data/lib/fred/FredParameters.rb +0 -402
- data/lib/fred/FredSplit.rb +0 -84
- data/lib/fred/FredSplitPkg.rb +0 -180
- data/lib/fred/FredTest.rb +0 -606
- data/lib/fred/FredTrain.rb +0 -144
- data/lib/fred/PlotAndREval.rb +0 -480
- data/lib/fred/fred.rb +0 -47
- data/lib/fred/fred_config_data.rb +0 -185
- data/lib/fred/md5.rb +0 -23
- data/lib/fred/opt_parser.rb +0 -250
- data/lib/frprep/Ampersand.rb +0 -39
- data/lib/frprep/CollinsInterface.rb +0 -1165
- data/lib/frprep/Counter.rb +0 -18
- data/lib/frprep/FNCorpusXML.rb +0 -643
- data/lib/frprep/FNDatabase.rb +0 -144
- data/lib/frprep/FrameXML.rb +0 -513
- data/lib/frprep/Graph.rb +0 -345
- data/lib/frprep/MiniparInterface.rb +0 -1388
- data/lib/frprep/RegXML.rb +0 -269
- data/lib/frprep/STXmlTerminalOrder.rb +0 -194
- data/lib/frprep/SleepyInterface.rb +0 -384
- data/lib/frprep/TntInterface.rb +0 -44
- data/lib/frprep/TreetaggerInterface.rb +0 -327
- data/lib/frprep/do_parses.rb +0 -143
- data/lib/frprep/frprep.rb +0 -693
- data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
- data/lib/frprep/interfaces/stanford_interface.rb +0 -353
- data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
- data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
- data/lib/frprep/one_parsed_file.rb +0 -28
- data/lib/frprep/opt_parser.rb +0 -94
- data/lib/frprep/ruby_class_extensions.rb +0 -310
- data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
- data/lib/rosy/ExternalConfigData.rb +0 -58
- data/lib/rosy/FailedParses.rb +0 -130
- data/lib/rosy/FeatureInfo.rb +0 -242
- data/lib/rosy/GfInduce.rb +0 -1115
- data/lib/rosy/GfInduceFeature.rb +0 -148
- data/lib/rosy/InputData.rb +0 -294
- data/lib/rosy/RosyConfusability.rb +0 -338
- data/lib/rosy/RosyEval.rb +0 -465
- data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
- data/lib/rosy/RosyFeaturize.rb +0 -281
- data/lib/rosy/RosyInspect.rb +0 -336
- data/lib/rosy/RosyIterator.rb +0 -478
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
- data/lib/rosy/RosyPruning.rb +0 -165
- data/lib/rosy/RosyServices.rb +0 -744
- data/lib/rosy/RosySplit.rb +0 -232
- data/lib/rosy/RosyTask.rb +0 -19
- data/lib/rosy/RosyTest.rb +0 -829
- data/lib/rosy/RosyTrain.rb +0 -234
- data/lib/rosy/RosyTrainingTestTable.rb +0 -787
- data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
- data/lib/rosy/View.rb +0 -418
- data/lib/rosy/opt_parser.rb +0 -379
- data/lib/rosy/rosy.rb +0 -78
- data/lib/rosy/rosy_config_data.rb +0 -121
- data/lib/shalmaneser/version.rb +0 -3
data/lib/rosy/GfInduceFeature.rb
DELETED
@@ -1,148 +0,0 @@
|
|
1
|
-
# GfInduceFeature
|
2
|
-
# Katrin Erk Jan 06
|
3
|
-
#
|
4
|
-
# use result of GfInduce.rb as
|
5
|
-
# feature for Rosy
|
6
|
-
|
7
|
-
require "rosy/GfInduce"
|
8
|
-
require "rosy/AbstractFeatureAndExternal"
|
9
|
-
require "common/ruby_class_extensions"
|
10
|
-
|
11
|
-
###
|
12
|
-
# make filename for GfInduce pickle file
|
13
|
-
def filename_gfmap(exp, # ExternalConfigData object
|
14
|
-
interpreter) # SynInterpreter class
|
15
|
-
|
16
|
-
# output dir as given in my experiment file
|
17
|
-
# If there is an experiment ID, make subdirectory
|
18
|
-
# named after the experiment ID and place the data there.
|
19
|
-
output_dir = File.new_dir(exp.get("directory"))
|
20
|
-
if exp.get("experiment_id")
|
21
|
-
output_dir = File.new_dir(output_dir, exp.get("experiment_id"))
|
22
|
-
end
|
23
|
-
|
24
|
-
# output file name:
|
25
|
-
# Gfmap.{<service>=<system_name>.}*{OPT<service>=<system_name>.}*pkl
|
26
|
-
return output_dir +
|
27
|
-
"Gfmap." +
|
28
|
-
interpreter.systems().to_a.map { |service, system_name|
|
29
|
-
service.to_s+ "=" + system_name.to_s
|
30
|
-
}.sort.join(".") + "." +
|
31
|
-
interpreter.optional_systems().to_a.map { |service, system_name|
|
32
|
-
"OPT" + service.to_s + "=" + system_name.to_s
|
33
|
-
}.sort.join(".") + ".pkl"
|
34
|
-
end
|
35
|
-
|
36
|
-
################################
|
37
|
-
# base class for all following feature extractors
|
38
|
-
class GfInduceFeatureExtractor < ExternalFeatureExtractor
|
39
|
-
GfInduceFeatureExtractor.announce_me()
|
40
|
-
|
41
|
-
@@okay = true # external experiment file present?
|
42
|
-
@@gf_obj = nil # GfInduce object
|
43
|
-
@@node_to_gf = nil # Hash: SynNodes of a sentence -> Gf label
|
44
|
-
|
45
|
-
def GfInduceFeatureExtractor.designator()
|
46
|
-
return "gf_fn"
|
47
|
-
end
|
48
|
-
def GfInduceFeatureExtractor.feature_names()
|
49
|
-
return ["gf_fn"]
|
50
|
-
end
|
51
|
-
def GfInduceFeatureExtractor.sql_type()
|
52
|
-
return "VARCHAR(25)"
|
53
|
-
end
|
54
|
-
def GfInduceFeatureExtractor.feature_type()
|
55
|
-
return "syn"
|
56
|
-
end
|
57
|
-
def GfInduceFeatureExtractor.phase()
|
58
|
-
return "phase 1"
|
59
|
-
end
|
60
|
-
|
61
|
-
###
|
62
|
-
# set sentence, set node, set other settings:
|
63
|
-
# this is done prior to
|
64
|
-
# feature computation using compute_feature()
|
65
|
-
# such that computations that stay the same for
|
66
|
-
# several features can be done in advance
|
67
|
-
#
|
68
|
-
# This is just relevant for Phase 1
|
69
|
-
#
|
70
|
-
# returns: false/nil if there was a problem
|
71
|
-
def GfInduceFeatureExtractor.set_sentence(sent, # SalsaTigerSentence object
|
72
|
-
frame) # FrameNode object
|
73
|
-
|
74
|
-
super(sent, frame)
|
75
|
-
|
76
|
-
if @@okay
|
77
|
-
# we can actually compute something
|
78
|
-
|
79
|
-
# let the GF object compute all subcat frames
|
80
|
-
# for the target of this frame
|
81
|
-
subcatframes_of_current_target = @@gf_obj.apply(frame.target.children())
|
82
|
-
|
83
|
-
# keep the most frequent one of the
|
84
|
-
# subcat frames returned by the GF object:
|
85
|
-
if subcatframes_of_current_target.empty?
|
86
|
-
# no subcat frames returned
|
87
|
-
subcatframe = []
|
88
|
-
else
|
89
|
-
# we have at least one subcat frame:
|
90
|
-
# keep the most frequent one of them
|
91
|
-
#
|
92
|
-
# Also, subcatframes_of_current_target
|
93
|
-
# contains triples [frame, actual_subcatframe, frequency]
|
94
|
-
# Of these, keep just the actual_subcatframe
|
95
|
-
|
96
|
-
subcatframe = subcatframes_of_current_target.sort { |a, b|
|
97
|
-
# sort by frequency
|
98
|
-
b.last <=> a.last
|
99
|
-
}.first[1]
|
100
|
-
end
|
101
|
-
|
102
|
-
# change into a mapping node(SynNode) -> GF(string)
|
103
|
-
@@node_to_gf = Hash.new
|
104
|
-
subcatframe.each { |gf, prep, fe, synnodes|
|
105
|
-
synnodes.each { |node|
|
106
|
-
@@node_to_gf[node] = "#{gf} #{prep}"
|
107
|
-
}
|
108
|
-
}
|
109
|
-
end
|
110
|
-
end
|
111
|
-
|
112
|
-
|
113
|
-
###
|
114
|
-
# Initialize: read GFInduce pickle
|
115
|
-
def initialize(exp, # experiment file object
|
116
|
-
interpreter_class) # SynInterpreter class
|
117
|
-
|
118
|
-
super(exp, interpreter_class)
|
119
|
-
|
120
|
-
if @exp_external
|
121
|
-
pickle_filename = filename_gfmap(@exp_external, @@interpreter_class)
|
122
|
-
@@gf_obj = GfInduce.from_file(pickle_filename)
|
123
|
-
@@okay = true
|
124
|
-
|
125
|
-
else
|
126
|
-
# signal that you cannot compute anything
|
127
|
-
@@okay = false
|
128
|
-
end
|
129
|
-
end
|
130
|
-
|
131
|
-
###
|
132
|
-
# compute: compute features
|
133
|
-
#
|
134
|
-
# returns an array of features (strings), length the same as the
|
135
|
-
# length of feature_names()
|
136
|
-
#
|
137
|
-
# here: array of length one, content either a string or nil
|
138
|
-
def compute_features()
|
139
|
-
# current node: @@node
|
140
|
-
# check whether the current node has been assigned a slot
|
141
|
-
# in the subcat frame
|
142
|
-
if @@okay
|
143
|
-
return [ @@node_to_gf[@@node] ]
|
144
|
-
else
|
145
|
-
return [ nil ]
|
146
|
-
end
|
147
|
-
end
|
148
|
-
end
|
data/lib/rosy/InputData.rb
DELETED
@@ -1,294 +0,0 @@
|
|
1
|
-
###########
|
2
|
-
#
|
3
|
-
# ke / sp 12 04 05
|
4
|
-
#
|
5
|
-
# class for input data object
|
6
|
-
# offers methods for preprocessing and
|
7
|
-
# featurization
|
8
|
-
|
9
|
-
# Salsa packages
|
10
|
-
require "common/Parser"
|
11
|
-
require "common/SalsaTigerRegXML"
|
12
|
-
require "common/ruby_class_extensions"
|
13
|
-
|
14
|
-
# Fred/Rosy packages
|
15
|
-
require "rosy/FailedParses"
|
16
|
-
require "common/RosyConventions"
|
17
|
-
require "rosy/RosyFeatureExtractors"
|
18
|
-
require "rosy/RosyPhase2FeatureExtractors"
|
19
|
-
require "rosy/RosyPruning"
|
20
|
-
require "rosy/GfInduceFeature"
|
21
|
-
require "common/FixSynSemMapping"
|
22
|
-
|
23
|
-
class InputData
|
24
|
-
|
25
|
-
###
|
26
|
-
def initialize(exp_object, # RosyConfigData object
|
27
|
-
dataset, # train/test
|
28
|
-
feature_info_object, # FeatureInfo object
|
29
|
-
interpreter_class, # SynInterpreter class
|
30
|
-
input_dir) # Directory with input files
|
31
|
-
|
32
|
-
@exp = exp_object
|
33
|
-
@dataset = dataset
|
34
|
-
@interpreter_class = interpreter_class
|
35
|
-
@input_dir = input_dir
|
36
|
-
# store information about failed parses here
|
37
|
-
@failed_parses = FailedParses.new()
|
38
|
-
|
39
|
-
# feature_extractors_phase1: array of AbstractFeatureExtractor objects
|
40
|
-
@extractors_p1_rosy, @extractors_p1_other = feature_info_object.get_extractor_objects("phase 1",
|
41
|
-
@interpreter_class)
|
42
|
-
|
43
|
-
# global settings
|
44
|
-
unless RosyFeatureExtractor.set("split_nones" => @exp.get("split_nones"))
|
45
|
-
raise "Some grave problem during feature extractor initialization"
|
46
|
-
end
|
47
|
-
|
48
|
-
# # nothing to set here for now, so deactivated
|
49
|
-
# @extractors_p1_other.each { |extractor_obj|
|
50
|
-
# unless extractor_obj.class.set()
|
51
|
-
# raise "Some grave problem during feature extractor initialization"
|
52
|
-
# end
|
53
|
-
# }
|
54
|
-
|
55
|
-
|
56
|
-
# feature_extractors_phase2: array of AbstractFeatureExtractor objects
|
57
|
-
extractors_p2_rosy, extractors_p2_other = feature_info_object.get_extractor_objects("phase 2",
|
58
|
-
@interpreter_class)
|
59
|
-
@feature_extractors_phase2 = extractors_p2_rosy + extractors_p2_other
|
60
|
-
end
|
61
|
-
|
62
|
-
###
|
63
|
-
# each_instance_phase1()
|
64
|
-
#
|
65
|
-
# reads the input data from file(s), in the specific input format,
|
66
|
-
# separates it into instances,
|
67
|
-
# threads it through all phase 1 feature extractors
|
68
|
-
# and yields one feature vector per instance
|
69
|
-
#
|
70
|
-
# yields: pairs [feature_name(string), feature_value(object)]
|
71
|
-
|
72
|
-
def each_instance_phase1()
|
73
|
-
Dir[@input_dir+"*.xml"]. each {|parsefilename|
|
74
|
-
|
75
|
-
xmlFile = FilePartsParser.new(parsefilename)
|
76
|
-
$stderr.puts "Processing #{parsefilename}"
|
77
|
-
xmlFile.scan_s {|sent_string|
|
78
|
-
sent = SalsaTigerSentence.new(sent_string)
|
79
|
-
|
80
|
-
# preprocessing: possibly change the SalsaTigerSentence object
|
81
|
-
# before featurization
|
82
|
-
preprocess(sent)
|
83
|
-
|
84
|
-
sent.each_frame{ |frame|
|
85
|
-
|
86
|
-
# skip failed parses
|
87
|
-
if sent.get_attribute("failed")
|
88
|
-
handle_failed_parse(sent, frame)
|
89
|
-
next
|
90
|
-
end
|
91
|
-
|
92
|
-
# Tell feature extractors about the sentence and frame:
|
93
|
-
# first Rosy feature extractors, then the others
|
94
|
-
# if there is a problem, skip this frame
|
95
|
-
unless RosyFeatureExtractor.set_sentence(sent, frame)
|
96
|
-
next
|
97
|
-
end
|
98
|
-
skip_frame = false
|
99
|
-
@extractors_p1_other.each { |extractor_obj|
|
100
|
-
unless extractor_obj.class.set_sentence(sent, frame)
|
101
|
-
skip_frame = true
|
102
|
-
break
|
103
|
-
end
|
104
|
-
}
|
105
|
-
if skip_frame
|
106
|
-
next
|
107
|
-
end
|
108
|
-
|
109
|
-
sent.each_syn_node { |syn_node|
|
110
|
-
|
111
|
-
# Tell feature extractors about the current node:
|
112
|
-
# first Rosy feature extractors, then the others
|
113
|
-
# if there is a problem, skip this node
|
114
|
-
unless RosyFeatureExtractor.set_node(syn_node)
|
115
|
-
next
|
116
|
-
end
|
117
|
-
skip_node = false
|
118
|
-
@extractors_p1_other.each { |extractor_obj|
|
119
|
-
unless extractor_obj.class.set_node(syn_node)
|
120
|
-
skip_node = true
|
121
|
-
break
|
122
|
-
end
|
123
|
-
}
|
124
|
-
if skip_node
|
125
|
-
next
|
126
|
-
end
|
127
|
-
|
128
|
-
# features: array of pairs: [feature_name(string), feature_value(object)]
|
129
|
-
features = Array.new
|
130
|
-
(@extractors_p1_rosy + @extractors_p1_other).each { |extractor|
|
131
|
-
# compute features
|
132
|
-
feature_names = extractor.class.feature_names()
|
133
|
-
feature_index = 0
|
134
|
-
|
135
|
-
# append new features to features array
|
136
|
-
features.concat extractor.compute_features().map { |feature_value|
|
137
|
-
feature_name = feature_names[feature_index]
|
138
|
-
feature_index += 1
|
139
|
-
|
140
|
-
# sanity check: feature value longer than the allotted space in the DB?
|
141
|
-
check_feature_length(feature_name, feature_value, extractor)
|
142
|
-
|
143
|
-
[feature_name, nonnil_feature(feature_value, extractor.class.sql_type()) ]
|
144
|
-
}
|
145
|
-
}
|
146
|
-
yield features
|
147
|
-
} # each syn node
|
148
|
-
} # each frame
|
149
|
-
} # each sentence
|
150
|
-
}
|
151
|
-
end
|
152
|
-
|
153
|
-
###
|
154
|
-
# each_phase2_column
|
155
|
-
#
|
156
|
-
# This method implements the application of the
|
157
|
-
# phase 2 extractors to data.
|
158
|
-
#
|
159
|
-
# Given a database view (of either training or test data),
|
160
|
-
# assign a new feature value to each instance
|
161
|
-
#
|
162
|
-
# yields pairs [feature_name(string), feature_values(array)]
|
163
|
-
# The feature_values array has as many lines as the view has instances
|
164
|
-
# so the yield of this method can be fed directly into view.update_column()
|
165
|
-
def each_phase2_column(view) # View object: training or test data
|
166
|
-
|
167
|
-
@feature_extractors_phase2.each { |extractor|
|
168
|
-
# apply the extractor
|
169
|
-
feature_columns = extractor.compute_features_on_view(view)
|
170
|
-
# interleave with feature values and yield
|
171
|
-
feature_index = 0
|
172
|
-
feature_names = extractor.class.feature_names()
|
173
|
-
feature_columns.each { |feature_values|
|
174
|
-
yield [
|
175
|
-
feature_names[feature_index],
|
176
|
-
feature_values.map { |feature_val| nonnil_feature(feature_val, extractor.class.sql_type) }
|
177
|
-
]
|
178
|
-
feature_index += 1
|
179
|
-
}
|
180
|
-
}
|
181
|
-
end
|
182
|
-
|
183
|
-
###
|
184
|
-
# get_failed_parses
|
185
|
-
#
|
186
|
-
# returns the FailedParses object in which the info about failed parses has been stored
|
187
|
-
def get_failed_parses()
|
188
|
-
return @failed_parses
|
189
|
-
end
|
190
|
-
|
191
|
-
#################################
|
192
|
-
private
|
193
|
-
|
194
|
-
|
195
|
-
###
|
196
|
-
def nonnil_feature(feature_value,
|
197
|
-
sql_type)
|
198
|
-
|
199
|
-
# feature value nil? then change to noval
|
200
|
-
if feature_value.nil? and sql_type =~ /CHAR/
|
201
|
-
return @exp.get("noval")
|
202
|
-
elsif feature_value.class.to_s == "String" and feature_value.empty?
|
203
|
-
return @exp.get("noval")
|
204
|
-
elsif feature_value.nil?
|
205
|
-
return 0
|
206
|
-
else
|
207
|
-
return feature_value
|
208
|
-
end
|
209
|
-
end
|
210
|
-
|
211
|
-
###
|
212
|
-
# preprocess: possibly change the given SalsaTigerSentence
|
213
|
-
# to enable better learning
|
214
|
-
def preprocess(sent) # SalsaTigerSentence object
|
215
|
-
|
216
|
-
|
217
|
-
if @dataset == "train" and
|
218
|
-
(@exp.get("fe_syn_repair") or @exp.get("fe_rel_repair"))
|
219
|
-
FixSynSemMapping.fixit(sent, @exp, @interpreter_class)
|
220
|
-
end
|
221
|
-
end
|
222
|
-
|
223
|
-
###
|
224
|
-
# register failed parses
|
225
|
-
def handle_failed_parse(sent, # SalsaTigerSentence object
|
226
|
-
frame) # FrameNode
|
227
|
-
|
228
|
-
# target POS
|
229
|
-
if frame.target()
|
230
|
-
main_target = @interpreter_class.main_node_of_expr(frame.target.children(), "no_mwe")
|
231
|
-
else
|
232
|
-
main_target = nil
|
233
|
-
end
|
234
|
-
if main_target
|
235
|
-
target_pos = @interpreter_class.category(main_target)
|
236
|
-
else
|
237
|
-
target_pos = nil
|
238
|
-
end
|
239
|
-
if frame.target()
|
240
|
-
target_str = frame.target().yield_nodes_ordered().map { |t_node|
|
241
|
-
if t_node.is_syntactic?
|
242
|
-
@interpreter_class.lemma_backoff(t_node)
|
243
|
-
else
|
244
|
-
# not a syntactic node: maybe an unassigned target?
|
245
|
-
""
|
246
|
-
end
|
247
|
-
}.join(" ")
|
248
|
-
else
|
249
|
-
target_str = ""
|
250
|
-
end
|
251
|
-
|
252
|
-
@failed_parses.register(construct_instance_id(sent.id(), frame.id()),
|
253
|
-
frame.name(),
|
254
|
-
target_str,
|
255
|
-
target_pos,
|
256
|
-
frame.children.map { |fe| fe.name })
|
257
|
-
|
258
|
-
end
|
259
|
-
|
260
|
-
###
|
261
|
-
# sanity check: feature value longer than the allotted space in the DB?
|
262
|
-
def check_feature_length(feature_name, # string
|
263
|
-
feature_value, # object
|
264
|
-
extractor_obj) # AbstractFeatureExtractor object
|
265
|
-
|
266
|
-
if extractor_obj.class.sql_type() =~ /(\d+)/
|
267
|
-
# sql type contains some statement about the length.
|
268
|
-
# just crudely compare to feature length
|
269
|
-
length = $1.to_i
|
270
|
-
if feature_value.class == String and
|
271
|
-
feature_value.length() > length
|
272
|
-
|
273
|
-
if feature_name == "sentid"
|
274
|
-
print length;
|
275
|
-
print feature_value;
|
276
|
-
print feature_value.length();
|
277
|
-
# if the sentence (instance) ID is too long, we cannot go on.
|
278
|
-
$stderr.puts "Error: Instance ID is longer than its DB column."
|
279
|
-
$stderr.puts "Please increase the DB column size in {Tiger,Collins}FeatureExtractors.rb"
|
280
|
-
raise "SQL entry length surpassed"
|
281
|
-
|
282
|
-
elsif @exp.get("verbose")
|
283
|
-
# KE Feb 07: don't print warning,
|
284
|
-
# this is just too frequent
|
285
|
-
# for other features, we just issue a warning, and only if we are verbose
|
286
|
-
|
287
|
-
# $stderr.puts "Warning: feature #{feature_name} longer than its DB column (#{length.to_s} vs #{feature_value.length()}): #{feature_value}"
|
288
|
-
end # feature name check
|
289
|
-
end # length surpassed
|
290
|
-
end # length found in sql type
|
291
|
-
|
292
|
-
end
|
293
|
-
|
294
|
-
end
|
@@ -1,338 +0,0 @@
|
|
1
|
-
# RosyConfusability
|
2
|
-
# KE May 05
|
3
|
-
#
|
4
|
-
# Access instance database created by the Rosy role assignment system
|
5
|
-
# and compute the confusability of target categories
|
6
|
-
# for the data in the (training) database there.
|
7
|
-
#
|
8
|
-
# We define confusability as follows:
|
9
|
-
# Given a frame fr, let
|
10
|
-
# - fes(fr) the FEs of fr (a set)
|
11
|
-
# - gfs(fe) the grammatical functions realizing the FE fe in the data
|
12
|
-
# - gfs(fr) = U_{fe \in fes(fr)} gfs(fe) the grammatical functions realizing roles of fr
|
13
|
-
#
|
14
|
-
# Then the entropy of a grammatical function gf within fr is
|
15
|
-
#
|
16
|
-
# gfe_{fr}(gf) = \sum_{fe \in fes(fr)} -p(fe|gf) log p(fe|gf)
|
17
|
-
#
|
18
|
-
# where p(fe|gf) = f(gf, fe) / f(gf)
|
19
|
-
#
|
20
|
-
# And the confusability of a frame element fe of fr is
|
21
|
-
#
|
22
|
-
# c_{fr}(fe) = \sum_{gf \in gfs(fr)} p(gf|fe) gfe_{fr}(gf)
|
23
|
-
#
|
24
|
-
# where p(gf|fe) = f(gf, fe) / f(fe)
|
25
|
-
|
26
|
-
require "RosyConfigData"
|
27
|
-
require "RosyIterator"
|
28
|
-
require "RosyConventions"
|
29
|
-
require "TargetsMostFrequentFrame"
|
30
|
-
|
31
|
-
require "mysql"
|
32
|
-
|
33
|
-
class RosyConfusability
|
34
|
-
include TargetsMostFrequentSc
|
35
|
-
|
36
|
-
attr_reader :confusability, :counts_fe_glob, :frame_confusability, :overall_confusability
|
37
|
-
|
38
|
-
def initialize(exp) # RosyConfigData object
|
39
|
-
@exp = exp
|
40
|
-
|
41
|
-
@confusability = Hash.new(0.0)
|
42
|
-
@counts_fe_glob = Hash.new(0)
|
43
|
-
@counts_gffe_glob = Hash.new(0)
|
44
|
-
@frame_confusability = Hash.new(0.0)
|
45
|
-
@overall_confusability = 0.0
|
46
|
-
|
47
|
-
@frequent_gframes = [
|
48
|
-
# NO DUPLICATES
|
49
|
-
"Ext_Comp", "Mod", "Comp", "Gen",
|
50
|
-
"Ext_Obj", "Ext", "Ext_Obj_Comp", "Head",
|
51
|
-
"Ext_Mod", "Gen_Mod", "Mod_Comp", "Comp_Ext",
|
52
|
-
"Gen_Comp", "Ext_Gen", "Ext_Mod_Comp", "Head_Comp",
|
53
|
-
"Obj_Comp", "Obj", "Mod_Head", "Ext_Comp_Obj",
|
54
|
-
"Gen_Head", "Ext_Gen_Mod"
|
55
|
-
# with duplicates
|
56
|
-
# "Ext_Comp", "Mod", "Comp", "Gen",
|
57
|
-
# "Ext_Obj", "Ext", "", "Ext_Obj_Comp",
|
58
|
-
# "Ext_Comp_Comp", "Head", "Mod_Mod", "Gen_Mod",
|
59
|
-
# "Ext_Mod", "Comp_Comp", "Mod_Comp", "Ext_Gen",
|
60
|
-
# "Gen_Comp", "Head_Head", "Ext_Comp_Comp_Comp", "Head_Comp",
|
61
|
-
# # "Ext_Ext_Comp",
|
62
|
-
# # "Ext_Obj_Comp_Comp", "Obj_Comp",
|
63
|
-
# # "Ext_Mod_Mod", "Comp_Comp_Comp",
|
64
|
-
# # "Ext_Ext_Obj", "Ext_Mod_Comp", "Comp_Ext", "Obj",
|
65
|
-
# # "Ext_Ext", "Ext_Obj_Obj", "Mod_Mod_Mod", "Gen_Mod_Mod",
|
66
|
-
# # "Ext_Comp_Comp_Comp_Comp", "Gen_Head", "Mod_Head",
|
67
|
-
# # "Ext_Ext_Ext_Comp"
|
68
|
-
].map { |string|
|
69
|
-
string.split("_")
|
70
|
-
}
|
71
|
-
end
|
72
|
-
|
73
|
-
def compute(splitID, # string: split ID, may be nil
|
74
|
-
additionals) # array:string: "target", "target_pos", "gframe", "fgframe"
|
75
|
-
###
|
76
|
-
# open and initialize stuff:
|
77
|
-
|
78
|
-
# open database
|
79
|
-
database = Mysql.real_connect(@exp.get('host'), @exp.get('user'),
|
80
|
-
@exp.get('passwd'), @exp.get('dbname'))
|
81
|
-
# make an object that creates views.
|
82
|
-
# read one frame at a time.
|
83
|
-
iterator = RosyIterator.new(database, @exp, "train",
|
84
|
-
"splitID" => splitID,
|
85
|
-
"xwise" => "frame")
|
86
|
-
# get value for "no val"
|
87
|
-
noval = @exp.get("noval")
|
88
|
-
|
89
|
-
counts_frame = Hash.new(0)
|
90
|
-
|
91
|
-
# iterate through all frames and compute confusability of each FE
|
92
|
-
iterator.each_group { |group_descr_hash, frame|
|
93
|
-
|
94
|
-
$stderr.puts "Computing confusability for #{frame}"
|
95
|
-
|
96
|
-
# read all instances of the frame, columns: FE and GF
|
97
|
-
view = iterator.get_a_view_for_current_group(["sentid","gold", "fn_gf",
|
98
|
-
"target","target_pos", "frame"])
|
99
|
-
|
100
|
-
if additionals.include? "tmfframe"
|
101
|
-
# find most frequent gframe for each target
|
102
|
-
tmfframe = determine_target_most_frequent_sc(view, noval)
|
103
|
-
end
|
104
|
-
|
105
|
-
# count occurrences
|
106
|
-
counts_gf = Hash.new(0)
|
107
|
-
counts_fe = Hash.new(0)
|
108
|
-
counts_gffe = Hash.new(0)
|
109
|
-
|
110
|
-
view.each_sentence { |sentence|
|
111
|
-
|
112
|
-
# make string consisting of all FN GFs of this sentence
|
113
|
-
allgfs = Array.new()
|
114
|
-
sentence.each { |inst|
|
115
|
-
if inst["fn_gf"] != noval
|
116
|
-
allgfs << inst["fn_gf"]
|
117
|
-
end
|
118
|
-
}
|
119
|
-
|
120
|
-
# assume uniqueness of GFs
|
121
|
-
# design decision, could also be done differently.
|
122
|
-
# rationale: if a GF occurs more than once,
|
123
|
-
# it's probable that this is because we get more than
|
124
|
-
# one constituent for this GF, not because
|
125
|
-
# it actually occurred more than once in the
|
126
|
-
# original FrameNet annotation.
|
127
|
-
allgfs.uniq!
|
128
|
-
|
129
|
-
# now count each instance
|
130
|
-
sentence.each { |row|
|
131
|
-
if row["gold"] == "target"
|
132
|
-
# don't count target among the FEs
|
133
|
-
next
|
134
|
-
end
|
135
|
-
|
136
|
-
if row["gold"] != noval
|
137
|
-
counts_fe[row["gold"]] += 1
|
138
|
-
end
|
139
|
-
if row["fn_gf"] != noval and row["fn_gf"] != "target"
|
140
|
-
gf = row["fn_gf"]
|
141
|
-
|
142
|
-
additionals.each { |additional|
|
143
|
-
case additional
|
144
|
-
when "target"
|
145
|
-
gf << "_" + row["target"]
|
146
|
-
when "target_pos"
|
147
|
-
gf << "_" + row["target_pos"]
|
148
|
-
when "gframe"
|
149
|
-
gf << "_" + allgfs.join("_")
|
150
|
-
|
151
|
-
when "fgframe"
|
152
|
-
# find the maximal frequent frame subsuming allgfs
|
153
|
-
maxfgf = nil
|
154
|
-
@frequent_gframes.each { |fgframe|
|
155
|
-
if fgframe.subsumed_by?(allgfs)
|
156
|
-
# fgframe is a subset of allgfs
|
157
|
-
if maxfgf.nil? or fgframe.length() > maxfgf.length()
|
158
|
-
maxfgf = fgframe
|
159
|
-
end
|
160
|
-
end
|
161
|
-
}
|
162
|
-
if maxfgf.nil?
|
163
|
-
# nothing there that fits
|
164
|
-
# leave GF as is
|
165
|
-
else
|
166
|
-
gf << "_" + maxfgf.join("_")
|
167
|
-
end
|
168
|
-
|
169
|
-
when "tmfframe"
|
170
|
-
gf << "_" + tmfframe[tmf_target_key(row)]
|
171
|
-
|
172
|
-
else
|
173
|
-
raise "Don't know how to compute #{additional}"
|
174
|
-
end
|
175
|
-
}
|
176
|
-
|
177
|
-
counts_gf[gf] += 1
|
178
|
-
end
|
179
|
-
|
180
|
-
if row["gold"] != noval and gf
|
181
|
-
counts_gffe[gf + " " + row["gold"]] += 1
|
182
|
-
end
|
183
|
-
} # each row of sentence
|
184
|
-
} # each sentence of view
|
185
|
-
|
186
|
-
# compute gf entropy
|
187
|
-
# gfe_{fr}(gf) = \sum_{fe \in fes(fr)} -p(fe|gf) log_2 p(fe|gf)
|
188
|
-
#
|
189
|
-
# where p(fe|gf) = f(gf, fe) / f(gf)
|
190
|
-
gf_entropy = Hash.new
|
191
|
-
|
192
|
-
counts_gf.keys.each { |gf|
|
193
|
-
gf_entropy[gf] = 0.0
|
194
|
-
|
195
|
-
counts_fe.keys.each { |fe|
|
196
|
-
if counts_gf[gf] > 0
|
197
|
-
p_gf_fe = counts_gffe[gf + " " + fe].to_f / counts_gf[gf].to_f
|
198
|
-
|
199
|
-
# get log_2 via log_10
|
200
|
-
if p_gf_fe > 0.0
|
201
|
-
gf_entropy[gf] -= p_gf_fe * Math.log10(p_gf_fe) * 3.32193
|
202
|
-
end
|
203
|
-
end
|
204
|
-
} # each FE for this GF
|
205
|
-
} # each GF (gf entropy)
|
206
|
-
|
207
|
-
# compute FE confusability
|
208
|
-
# c_{fr}(fe) = \sum_{gf \in gfs(fr)} p(gf|fe) gfe_{fr}(gf)
|
209
|
-
#
|
210
|
-
# where p(gf|fe) = f(gf, fe) / f(fe)
|
211
|
-
counts_fe.keys.each { |fe|
|
212
|
-
@confusability[frame + " " + fe] = 0.0
|
213
|
-
|
214
|
-
counts_gf.keys.each { |gf|
|
215
|
-
if counts_fe[fe] > 0
|
216
|
-
p_fe_gf = counts_gffe[gf + " " + fe].to_f / counts_fe[fe].to_f
|
217
|
-
|
218
|
-
@confusability[frame + " " + fe] += p_fe_gf * gf_entropy[gf]
|
219
|
-
end
|
220
|
-
} # each GF for this FE
|
221
|
-
} # each FE (fe confusability)
|
222
|
-
|
223
|
-
|
224
|
-
# remember counts for FEs and GF/FE pairs
|
225
|
-
counts_fe.keys.each { |fe|
|
226
|
-
@counts_fe_glob[frame + " " + fe] = counts_fe[fe]
|
227
|
-
}
|
228
|
-
counts_gffe.each_pair {|event,freq|
|
229
|
-
@counts_gffe_glob[frame+" " +event] = freq
|
230
|
-
}
|
231
|
-
|
232
|
-
# omit rare FEs:
|
233
|
-
# anything below 5 occurrences
|
234
|
-
counts_fe.each_key { |fe|
|
235
|
-
if counts_fe[fe] < 5
|
236
|
-
@confusability.delete(frame + " " + fe)
|
237
|
-
end
|
238
|
-
}
|
239
|
-
|
240
|
-
# compute overall frame confusability
|
241
|
-
# omitting rare FEs with below 5 occurrences:
|
242
|
-
#
|
243
|
-
# c(fr) = sum_{fe \in fes(fr)} f(fe)/f(fr) * c_{fr}(fe)
|
244
|
-
# = \sum_{gf \in gfs(fr)} p(gf|fr) gfe_{fr}(gf)
|
245
|
-
#
|
246
|
-
# where p(gf|fr) = (sum_{fe\in fes(fr)} f(gf, fe)) / f(fr)
|
247
|
-
counts_frame[frame] = 0
|
248
|
-
counts_fe.each_value { |count|
|
249
|
-
if count >= 5
|
250
|
-
counts_frame[frame] += count
|
251
|
-
end
|
252
|
-
}
|
253
|
-
@frame_confusability[frame] = 0.0
|
254
|
-
counts_fe.each_pair { |fe, count|
|
255
|
-
if count >= 5
|
256
|
-
@frame_confusability[frame] += (count.to_f / counts_frame[frame].to_f) * @confusability[frame + " " + fe]
|
257
|
-
end
|
258
|
-
}
|
259
|
-
} # each frame
|
260
|
-
|
261
|
-
# compute overall confusability
|
262
|
-
# c = \sum{fr \in frames} f(fr)/N * c(fr)
|
263
|
-
#
|
264
|
-
# where N is the number of FE occurrences overall
|
265
|
-
counts_overall = 0
|
266
|
-
counts_frame.each_value { |count|
|
267
|
-
counts_overall += count
|
268
|
-
}
|
269
|
-
@overall_confusability = 0.0
|
270
|
-
counts_frame.each_pair { |frame, count|
|
271
|
-
@overall_confusability += (count.to_f / counts_overall.to_f) * @frame_confusability[frame]
|
272
|
-
}
|
273
|
-
end
|
274
|
-
|
275
|
-
|
276
|
-
# return a copy of @counts_fe_glob, from which all fes with less than 5 occurrences are deleted
|
277
|
-
def get_global_counts
|
278
|
-
global_counts = @counts_fe_glob.clone
|
279
|
-
global_counts.delete_if {|key, value| value < 5}
|
280
|
-
return global_counts
|
281
|
-
end
|
282
|
-
|
283
|
-
###
|
284
|
-
#
|
285
|
-
# compute sparseness statistics over the set of
|
286
|
-
# base events used for computing the confusability
|
287
|
-
# returns an array of length 4:
|
288
|
-
# - number of events with freq 1
|
289
|
-
# - number of events with freq 2
|
290
|
-
# - number of events with freq 3-5
|
291
|
-
# - number of events with freq > 5
|
292
|
-
|
293
|
-
def counts()
|
294
|
-
counts = [0,0,0,0]
|
295
|
-
@counts_gffe_glob.each_value {|freq|
|
296
|
-
case freq
|
297
|
-
when 1
|
298
|
-
counts[0] += 1
|
299
|
-
when 2
|
300
|
-
counts[1] += 1
|
301
|
-
when 3..5
|
302
|
-
counts[2] += 1
|
303
|
-
else
|
304
|
-
counts[3] += 1
|
305
|
-
end
|
306
|
-
}
|
307
|
-
return counts
|
308
|
-
end
|
309
|
-
|
310
|
-
def to_file(filename)
|
311
|
-
begin
|
312
|
-
file = File.new(filename,"w")
|
313
|
-
rescue
|
314
|
-
raise "Couldn't open file #{filename} for writing."
|
315
|
-
end
|
316
|
-
Marshal.dump({"confusability" => @confusability,
|
317
|
-
"counts_fe_glob" => @counts_fe_glob,
|
318
|
-
"counts_gffe_glob" => @counts_gffe_glob,
|
319
|
-
"frame_confusability" => @frame_confusability,
|
320
|
-
"overall_confusability" => @overall_confusability
|
321
|
-
},
|
322
|
-
file)
|
323
|
-
end
|
324
|
-
|
325
|
-
def from_file(filename)
|
326
|
-
begin
|
327
|
-
file = File.new(filename)
|
328
|
-
rescue
|
329
|
-
raise "Couldn't open file #{filename} for reading."
|
330
|
-
end
|
331
|
-
hash = Marshal.load(file)
|
332
|
-
@confusability = hash["confusability"]
|
333
|
-
@counts_fe_glob = hash["counts_fe_glob"]
|
334
|
-
@counts_gffe_glob = hash["counts_gffe_glob"]
|
335
|
-
@frame_confusability = hash["frame_confusability"]
|
336
|
-
@overall_confusability = hash["overall_confusability"]
|
337
|
-
end
|
338
|
-
end
|