frprep 0.0.1.prealpha
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,148 @@
|
|
1
|
+
# GfInduceFeature
|
2
|
+
# Katrin Erk Jan 06
|
3
|
+
#
|
4
|
+
# use result of GfInduce.rb as
|
5
|
+
# feature for Rosy
|
6
|
+
|
7
|
+
require "rosy/GfInduce"
|
8
|
+
require "rosy/AbstractFeatureAndExternal"
|
9
|
+
require "common/ruby_class_extensions"
|
10
|
+
|
11
|
+
###
|
12
|
+
# make filename for GfInduce pickle file
|
13
|
+
def filename_gfmap(exp, # ExternalConfigData object
|
14
|
+
interpreter) # SynInterpreter class
|
15
|
+
|
16
|
+
# output dir as given in my experiment file
|
17
|
+
# If there is an experiment ID, make subdirectory
|
18
|
+
# named after the experiment ID and place the data there.
|
19
|
+
output_dir = File.new_dir(exp.get("directory"))
|
20
|
+
if exp.get("experiment_id")
|
21
|
+
output_dir = File.new_dir(output_dir, exp.get("experiment_id"))
|
22
|
+
end
|
23
|
+
|
24
|
+
# output file name:
|
25
|
+
# Gfmap.{<service>=<system_name>.}*{OPT<service>=<system_name>.}*pkl
|
26
|
+
return output_dir +
|
27
|
+
"Gfmap." +
|
28
|
+
interpreter.systems().to_a.map { |service, system_name|
|
29
|
+
service.to_s+ "=" + system_name.to_s
|
30
|
+
}.sort.join(".") + "." +
|
31
|
+
interpreter.optional_systems().to_a.map { |service, system_name|
|
32
|
+
"OPT" + service.to_s + "=" + system_name.to_s
|
33
|
+
}.sort.join(".") + ".pkl"
|
34
|
+
end
|
35
|
+
|
36
|
+
################################
|
37
|
+
# base class for all following feature extractors
|
38
|
+
class GfInduceFeatureExtractor < ExternalFeatureExtractor
|
39
|
+
GfInduceFeatureExtractor.announce_me()
|
40
|
+
|
41
|
+
@@okay = true # external experiment file present?
|
42
|
+
@@gf_obj = nil # GfInduce object
|
43
|
+
@@node_to_gf = nil # Hash: SynNodes of a sentence -> Gf label
|
44
|
+
|
45
|
+
def GfInduceFeatureExtractor.designator()
|
46
|
+
return "gf_fn"
|
47
|
+
end
|
48
|
+
def GfInduceFeatureExtractor.feature_names()
|
49
|
+
return ["gf_fn"]
|
50
|
+
end
|
51
|
+
def GfInduceFeatureExtractor.sql_type()
|
52
|
+
return "VARCHAR(25)"
|
53
|
+
end
|
54
|
+
def GfInduceFeatureExtractor.feature_type()
|
55
|
+
return "syn"
|
56
|
+
end
|
57
|
+
def GfInduceFeatureExtractor.phase()
|
58
|
+
return "phase 1"
|
59
|
+
end
|
60
|
+
|
61
|
+
###
|
62
|
+
# set sentence, set node, set other settings:
|
63
|
+
# this is done prior to
|
64
|
+
# feature computation using compute_feature()
|
65
|
+
# such that computations that stay the same for
|
66
|
+
# several features can be done in advance
|
67
|
+
#
|
68
|
+
# This is just relevant for Phase 1
|
69
|
+
#
|
70
|
+
# returns: false/nil if there was a problem
|
71
|
+
def GfInduceFeatureExtractor.set_sentence(sent, # SalsaTigerSentence object
|
72
|
+
frame) # FrameNode object
|
73
|
+
|
74
|
+
super(sent, frame)
|
75
|
+
|
76
|
+
if @@okay
|
77
|
+
# we can actually compute something
|
78
|
+
|
79
|
+
# let the GF object compute all subcat frames
|
80
|
+
# for the target of this frame
|
81
|
+
subcatframes_of_current_target = @@gf_obj.apply(frame.target.children())
|
82
|
+
|
83
|
+
# keep the most frequent one of the
|
84
|
+
# subcat frames returned by the GF object:
|
85
|
+
if subcatframes_of_current_target.empty?
|
86
|
+
# no subcat frames returned
|
87
|
+
subcatframe = []
|
88
|
+
else
|
89
|
+
# we have at least one subcat frame:
|
90
|
+
# keep the most frequent one of them
|
91
|
+
#
|
92
|
+
# Also, subcatframes_of_current_target
|
93
|
+
# contains triples [frame, actual_subcatframe, frequency]
|
94
|
+
# Of these, keep just the actual_subcatframe
|
95
|
+
|
96
|
+
subcatframe = subcatframes_of_current_target.sort { |a, b|
|
97
|
+
# sort by frequency
|
98
|
+
b.last <=> a.last
|
99
|
+
}.first[1]
|
100
|
+
end
|
101
|
+
|
102
|
+
# change into a mapping node(SynNode) -> GF(string)
|
103
|
+
@@node_to_gf = Hash.new
|
104
|
+
subcatframe.each { |gf, prep, fe, synnodes|
|
105
|
+
synnodes.each { |node|
|
106
|
+
@@node_to_gf[node] = "#{gf} #{prep}"
|
107
|
+
}
|
108
|
+
}
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
|
113
|
+
###
|
114
|
+
# Initialize: read GFInduce pickle
|
115
|
+
def initialize(exp, # experiment file object
|
116
|
+
interpreter_class) # SynInterpreter class
|
117
|
+
|
118
|
+
super(exp, interpreter_class)
|
119
|
+
|
120
|
+
if @exp_external
|
121
|
+
pickle_filename = filename_gfmap(@exp_external, @@interpreter_class)
|
122
|
+
@@gf_obj = GfInduce.from_file(pickle_filename)
|
123
|
+
@@okay = true
|
124
|
+
|
125
|
+
else
|
126
|
+
# signal that you cannot compute anything
|
127
|
+
@@okay = false
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
###
|
132
|
+
# compute: compute features
|
133
|
+
#
|
134
|
+
# returns an array of features (strings), length the same as the
|
135
|
+
# length of feature_names()
|
136
|
+
#
|
137
|
+
# here: array of length one, content either a string or nil
|
138
|
+
def compute_features()
|
139
|
+
# current node: @@node
|
140
|
+
# check whether the current node has been assigned a slot
|
141
|
+
# in the subcat frame
|
142
|
+
if @@okay
|
143
|
+
return [ @@node_to_gf[@@node] ]
|
144
|
+
else
|
145
|
+
return [ nil ]
|
146
|
+
end
|
147
|
+
end
|
148
|
+
end
|
@@ -0,0 +1,294 @@
|
|
1
|
+
###########
|
2
|
+
#
|
3
|
+
# ke / sp 12 04 05
|
4
|
+
#
|
5
|
+
# class for input data object
|
6
|
+
# offers methods for preprocessing and
|
7
|
+
# featurization
|
8
|
+
|
9
|
+
# Salsa packages
|
10
|
+
require "common/Parser"
|
11
|
+
require "common/SalsaTigerRegXML"
|
12
|
+
require "common/ruby_class_extensions"
|
13
|
+
|
14
|
+
# Fred/Rosy packages
|
15
|
+
require "rosy/FailedParses"
|
16
|
+
require "common/RosyConventions"
|
17
|
+
require "rosy/RosyFeatureExtractors"
|
18
|
+
require "rosy/RosyPhase2FeatureExtractors"
|
19
|
+
require "rosy/RosyPruning"
|
20
|
+
require "rosy/GfInduceFeature"
|
21
|
+
require "common/FixSynSemMapping"
|
22
|
+
|
23
|
+
class InputData
|
24
|
+
|
25
|
+
###
|
26
|
+
def initialize(exp_object, # RosyConfigData object
|
27
|
+
dataset, # train/test
|
28
|
+
feature_info_object, # FeatureInfo object
|
29
|
+
interpreter_class, # SynInterpreter class
|
30
|
+
input_dir) # Directory with input files
|
31
|
+
|
32
|
+
@exp = exp_object
|
33
|
+
@dataset = dataset
|
34
|
+
@interpreter_class = interpreter_class
|
35
|
+
@input_dir = input_dir
|
36
|
+
# store information about failed parses here
|
37
|
+
@failed_parses = FailedParses.new()
|
38
|
+
|
39
|
+
# feature_extractors_phase1: array of AbstractFeatureExtractor objects
|
40
|
+
@extractors_p1_rosy, @extractors_p1_other = feature_info_object.get_extractor_objects("phase 1",
|
41
|
+
@interpreter_class)
|
42
|
+
|
43
|
+
# global settings
|
44
|
+
unless RosyFeatureExtractor.set("split_nones" => @exp.get("split_nones"))
|
45
|
+
raise "Some grave problem during feature extractor initialization"
|
46
|
+
end
|
47
|
+
|
48
|
+
# # nothing to set here for now, so deactivated
|
49
|
+
# @extractors_p1_other.each { |extractor_obj|
|
50
|
+
# unless extractor_obj.class.set()
|
51
|
+
# raise "Some grave problem during feature extractor initialization"
|
52
|
+
# end
|
53
|
+
# }
|
54
|
+
|
55
|
+
|
56
|
+
# feature_extractors_phase2: array of AbstractFeatureExtractor objects
|
57
|
+
extractors_p2_rosy, extractors_p2_other = feature_info_object.get_extractor_objects("phase 2",
|
58
|
+
@interpreter_class)
|
59
|
+
@feature_extractors_phase2 = extractors_p2_rosy + extractors_p2_other
|
60
|
+
end
|
61
|
+
|
62
|
+
###
|
63
|
+
# each_instance_phase1()
|
64
|
+
#
|
65
|
+
# reads the input data from file(s), in the specific input format,
|
66
|
+
# separates it into instances,
|
67
|
+
# threads it through all phase 1 feature extractors
|
68
|
+
# and yields one feature vector per instance
|
69
|
+
#
|
70
|
+
# yields: pairs [feature_name(string), feature_value(object)]
|
71
|
+
|
72
|
+
def each_instance_phase1()
|
73
|
+
Dir[@input_dir+"*.xml"]. each {|parsefilename|
|
74
|
+
|
75
|
+
xmlFile = FilePartsParser.new(parsefilename)
|
76
|
+
$stderr.puts "Processing #{parsefilename}"
|
77
|
+
xmlFile.scan_s {|sent_string|
|
78
|
+
sent = SalsaTigerSentence.new(sent_string)
|
79
|
+
|
80
|
+
# preprocessing: possibly change the SalsaTigerSentence object
|
81
|
+
# before featurization
|
82
|
+
preprocess(sent)
|
83
|
+
|
84
|
+
sent.each_frame{ |frame|
|
85
|
+
|
86
|
+
# skip failed parses
|
87
|
+
if sent.get_attribute("failed")
|
88
|
+
handle_failed_parse(sent, frame)
|
89
|
+
next
|
90
|
+
end
|
91
|
+
|
92
|
+
# Tell feature extractors about the sentence and frame:
|
93
|
+
# first Rosy feature extractors, then the others
|
94
|
+
# if there is a problem, skip this frame
|
95
|
+
unless RosyFeatureExtractor.set_sentence(sent, frame)
|
96
|
+
next
|
97
|
+
end
|
98
|
+
skip_frame = false
|
99
|
+
@extractors_p1_other.each { |extractor_obj|
|
100
|
+
unless extractor_obj.class.set_sentence(sent, frame)
|
101
|
+
skip_frame = true
|
102
|
+
break
|
103
|
+
end
|
104
|
+
}
|
105
|
+
if skip_frame
|
106
|
+
next
|
107
|
+
end
|
108
|
+
|
109
|
+
sent.each_syn_node { |syn_node|
|
110
|
+
|
111
|
+
# Tell feature extractors about the current node:
|
112
|
+
# first Rosy feature extractors, then the others
|
113
|
+
# if there is a problem, skip this node
|
114
|
+
unless RosyFeatureExtractor.set_node(syn_node)
|
115
|
+
next
|
116
|
+
end
|
117
|
+
skip_node = false
|
118
|
+
@extractors_p1_other.each { |extractor_obj|
|
119
|
+
unless extractor_obj.class.set_node(syn_node)
|
120
|
+
skip_node = true
|
121
|
+
break
|
122
|
+
end
|
123
|
+
}
|
124
|
+
if skip_node
|
125
|
+
next
|
126
|
+
end
|
127
|
+
|
128
|
+
# features: array of pairs: [feature_name(string), feature_value(object)]
|
129
|
+
features = Array.new
|
130
|
+
(@extractors_p1_rosy + @extractors_p1_other).each { |extractor|
|
131
|
+
# compute features
|
132
|
+
feature_names = extractor.class.feature_names()
|
133
|
+
feature_index = 0
|
134
|
+
|
135
|
+
# append new features to features array
|
136
|
+
features.concat extractor.compute_features().map { |feature_value|
|
137
|
+
feature_name = feature_names[feature_index]
|
138
|
+
feature_index += 1
|
139
|
+
|
140
|
+
# sanity check: feature value longer than the allotted space in the DB?
|
141
|
+
check_feature_length(feature_name, feature_value, extractor)
|
142
|
+
|
143
|
+
[feature_name, nonnil_feature(feature_value, extractor.class.sql_type()) ]
|
144
|
+
}
|
145
|
+
}
|
146
|
+
yield features
|
147
|
+
} # each syn node
|
148
|
+
} # each frame
|
149
|
+
} # each sentence
|
150
|
+
}
|
151
|
+
end
|
152
|
+
|
153
|
+
###
|
154
|
+
# each_phase2_column
|
155
|
+
#
|
156
|
+
# This method implements the application of the
|
157
|
+
# phase 2 extractors to data.
|
158
|
+
#
|
159
|
+
# Given a database view (of either training or test data),
|
160
|
+
# assign a new feature value to each instance
|
161
|
+
#
|
162
|
+
# yields pairs [feature_name(string), feature_values(array)]
|
163
|
+
# The feature_values array has as many lines as the view has instances
|
164
|
+
# so the yield of this method can be fed directly into view.update_column()
|
165
|
+
def each_phase2_column(view) # View object: training or test data
|
166
|
+
|
167
|
+
@feature_extractors_phase2.each { |extractor|
|
168
|
+
# apply the extractor
|
169
|
+
feature_columns = extractor.compute_features_on_view(view)
|
170
|
+
# interleave with feature values and yield
|
171
|
+
feature_index = 0
|
172
|
+
feature_names = extractor.class.feature_names()
|
173
|
+
feature_columns.each { |feature_values|
|
174
|
+
yield [
|
175
|
+
feature_names[feature_index],
|
176
|
+
feature_values.map { |feature_val| nonnil_feature(feature_val, extractor.class.sql_type) }
|
177
|
+
]
|
178
|
+
feature_index += 1
|
179
|
+
}
|
180
|
+
}
|
181
|
+
end
|
182
|
+
|
183
|
+
###
|
184
|
+
# get_failed_parses
|
185
|
+
#
|
186
|
+
# returns the FailedParses object in which the info about failed parses has been stored
|
187
|
+
def get_failed_parses()
|
188
|
+
return @failed_parses
|
189
|
+
end
|
190
|
+
|
191
|
+
#################################
|
192
|
+
private
|
193
|
+
|
194
|
+
|
195
|
+
###
|
196
|
+
def nonnil_feature(feature_value,
|
197
|
+
sql_type)
|
198
|
+
|
199
|
+
# feature value nil? then change to noval
|
200
|
+
if feature_value.nil? and sql_type =~ /CHAR/
|
201
|
+
return @exp.get("noval")
|
202
|
+
elsif feature_value.class.to_s == "String" and feature_value.empty?
|
203
|
+
return @exp.get("noval")
|
204
|
+
elsif feature_value.nil?
|
205
|
+
return 0
|
206
|
+
else
|
207
|
+
return feature_value
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
211
|
+
###
|
212
|
+
# preprocess: possibly change the given SalsaTigerSentence
|
213
|
+
# to enable better learning
|
214
|
+
def preprocess(sent) # SalsaTigerSentence object
|
215
|
+
|
216
|
+
|
217
|
+
if @dataset == "train" and
|
218
|
+
(@exp.get("fe_syn_repair") or @exp.get("fe_rel_repair"))
|
219
|
+
FixSynSemMapping.fixit(sent, @exp, @interpreter_class)
|
220
|
+
end
|
221
|
+
end
|
222
|
+
|
223
|
+
###
|
224
|
+
# register failed parses
|
225
|
+
def handle_failed_parse(sent, # SalsaTigerSentence object
|
226
|
+
frame) # FrameNode
|
227
|
+
|
228
|
+
# target POS
|
229
|
+
if frame.target()
|
230
|
+
main_target = @interpreter_class.main_node_of_expr(frame.target.children(), "no_mwe")
|
231
|
+
else
|
232
|
+
main_target = nil
|
233
|
+
end
|
234
|
+
if main_target
|
235
|
+
target_pos = @interpreter_class.category(main_target)
|
236
|
+
else
|
237
|
+
target_pos = nil
|
238
|
+
end
|
239
|
+
if frame.target()
|
240
|
+
target_str = frame.target().yield_nodes_ordered().map { |t_node|
|
241
|
+
if t_node.is_syntactic?
|
242
|
+
@interpreter_class.lemma_backoff(t_node)
|
243
|
+
else
|
244
|
+
# not a syntactic node: maybe an unassigned target?
|
245
|
+
""
|
246
|
+
end
|
247
|
+
}.join(" ")
|
248
|
+
else
|
249
|
+
target_str = ""
|
250
|
+
end
|
251
|
+
|
252
|
+
@failed_parses.register(construct_instance_id(sent.id(), frame.id()),
|
253
|
+
frame.name(),
|
254
|
+
target_str,
|
255
|
+
target_pos,
|
256
|
+
frame.children.map { |fe| fe.name })
|
257
|
+
|
258
|
+
end
|
259
|
+
|
260
|
+
###
|
261
|
+
# sanity check: feature value longer than the allotted space in the DB?
|
262
|
+
def check_feature_length(feature_name, # string
|
263
|
+
feature_value, # object
|
264
|
+
extractor_obj) # AbstractFeatureExtractor object
|
265
|
+
|
266
|
+
if extractor_obj.class.sql_type() =~ /(\d+)/
|
267
|
+
# sql type contains some statement about the length.
|
268
|
+
# just crudely compare to feature length
|
269
|
+
length = $1.to_i
|
270
|
+
if feature_value.class == String and
|
271
|
+
feature_value.length() > length
|
272
|
+
|
273
|
+
if feature_name == "sentid"
|
274
|
+
print length;
|
275
|
+
print feature_value;
|
276
|
+
print feature_value.length();
|
277
|
+
# if the sentence (instance) ID is too long, we cannot go on.
|
278
|
+
$stderr.puts "Error: Instance ID is longer than its DB column."
|
279
|
+
$stderr.puts "Please increase the DB column size in {Tiger,Collins}FeatureExtractors.rb"
|
280
|
+
raise "SQL entry length surpassed"
|
281
|
+
|
282
|
+
elsif @exp.get("verbose")
|
283
|
+
# KE Feb 07: don't print warning,
|
284
|
+
# this is just too frequent
|
285
|
+
# for other features, we just issue a warning, and only if we are verbose
|
286
|
+
|
287
|
+
# $stderr.puts "Warning: feature #{feature_name} longer than its DB column (#{length.to_s} vs #{feature_value.length()}): #{feature_value}"
|
288
|
+
end # feature name check
|
289
|
+
end # length surpassed
|
290
|
+
end # length found in sql type
|
291
|
+
|
292
|
+
end
|
293
|
+
|
294
|
+
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
require 'common/ConfigData'
|
2
|
+
|
3
|
+
##############################
|
4
|
+
# Class RosyConfigData
|
5
|
+
#
|
6
|
+
# inherits from ConfigData,
|
7
|
+
# sets features for ROSY
|
8
|
+
|
9
|
+
class RosyConfigData < ConfigData
|
10
|
+
def initialize(filename)
|
11
|
+
super(filename, # config file
|
12
|
+
{ # features
|
13
|
+
"feature" => "list",
|
14
|
+
"classifier" => "list",
|
15
|
+
|
16
|
+
"verbose" => "bool" ,
|
17
|
+
"enduser_mode" => "bool",
|
18
|
+
|
19
|
+
"experiment_ID" => "string",
|
20
|
+
|
21
|
+
"directory_input_train" => "string",
|
22
|
+
"directory_input_test" => "string",
|
23
|
+
"directory_output" => "string",
|
24
|
+
|
25
|
+
"preproc_descr_file_train" => "string",
|
26
|
+
"preproc_descr_file_test" => "string",
|
27
|
+
"external_descr_file" => "string",
|
28
|
+
|
29
|
+
"dbtype" => "string", # "mysql" or "sqlite"
|
30
|
+
|
31
|
+
"host" => "string", # DB access: sqlite only
|
32
|
+
"user" => "string",
|
33
|
+
"passwd" => "string",
|
34
|
+
"dbname" => "string",
|
35
|
+
|
36
|
+
"data_dir" => "string", # for external use
|
37
|
+
"rosy_dir" => "pattern", # for internal use only, set by rosy.rb
|
38
|
+
|
39
|
+
"classifier_dir" => "string", # if present, special directory for classifiers
|
40
|
+
|
41
|
+
"classif_column_name" => "string",
|
42
|
+
"main_table_name" => "pattern",
|
43
|
+
"test_table_name" => "pattern",
|
44
|
+
|
45
|
+
"eval_file" => "pattern",
|
46
|
+
"log_file" => "pattern",
|
47
|
+
"failed_file" => "pattern",
|
48
|
+
"classifier_file" => "pattern",
|
49
|
+
"classifier_output_file" => "pattern",
|
50
|
+
"noval" => "string",
|
51
|
+
|
52
|
+
|
53
|
+
"split_nones" => "bool",
|
54
|
+
"print_eval_log" => "bool",
|
55
|
+
"assume_argrec_perfect" => "bool",
|
56
|
+
"xwise_argrec" => "string",
|
57
|
+
"xwise_arglab" => "string",
|
58
|
+
"xwise_onestep" => "string",
|
59
|
+
|
60
|
+
"fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
|
61
|
+
"fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
|
62
|
+
|
63
|
+
"prune" => "string", # pruning prior to argrec?
|
64
|
+
|
65
|
+
},
|
66
|
+
["exp_ID", "test_ID", "split_ID", "feature_name", "classif", "step",
|
67
|
+
"group", "dataset","mode"] # variables
|
68
|
+
)
|
69
|
+
|
70
|
+
# set access functions for list features
|
71
|
+
set_list_feature_access("feature",
|
72
|
+
method("access_feature"))
|
73
|
+
|
74
|
+
# set access functions for list features
|
75
|
+
set_list_feature_access("classifier",
|
76
|
+
method("access_feature"))
|
77
|
+
|
78
|
+
end
|
79
|
+
|
80
|
+
###
|
81
|
+
# protected
|
82
|
+
|
83
|
+
#####
|
84
|
+
# access_feature
|
85
|
+
#
|
86
|
+
# access function for feature 'feature'
|
87
|
+
#
|
88
|
+
# assumed format in the config file:
|
89
|
+
#
|
90
|
+
# feature = path [option]*
|
91
|
+
#
|
92
|
+
# i.e. first the name of the feature type to use, then
|
93
|
+
# optionally options associated with that feature,
|
94
|
+
# e.g. 'argrec': use that feature only when computing argrec
|
95
|
+
#
|
96
|
+
# the access function is called with parameter val_list, an array of
|
97
|
+
# string tuples, one string tuple for each feature defined.
|
98
|
+
# the first string in the tuple is the feature name, the rest are the options
|
99
|
+
#
|
100
|
+
# returns: a list of pairs [feature_name(string), options(array:string)]
|
101
|
+
# of defined features
|
102
|
+
def access_feature(val_list) # array:array:string: list of tuples defined in config file
|
103
|
+
# for feature 'feature'
|
104
|
+
if val_list.nil?
|
105
|
+
return []
|
106
|
+
else
|
107
|
+
return val_list.map { |feature_descr_tuple|
|
108
|
+
[feature_descr_tuple.first, feature_descr_tuple[1..-1]]
|
109
|
+
}
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
|
115
|
+
|