shalmaneser-rosy 1.2.0.rc4 → 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/rosy +14 -7
- data/lib/rosy/FailedParses.rb +22 -20
- data/lib/rosy/FeatureInfo.rb +35 -31
- data/lib/rosy/GfInduce.rb +132 -130
- data/lib/rosy/GfInduceFeature.rb +86 -68
- data/lib/rosy/InputData.rb +59 -55
- data/lib/rosy/RosyConfusability.rb +47 -40
- data/lib/rosy/RosyEval.rb +55 -55
- data/lib/rosy/RosyFeatureExtractors.rb +295 -290
- data/lib/rosy/RosyFeaturize.rb +54 -67
- data/lib/rosy/RosyInspect.rb +52 -50
- data/lib/rosy/RosyIterator.rb +73 -67
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +48 -48
- data/lib/rosy/RosyPruning.rb +39 -31
- data/lib/rosy/RosyServices.rb +116 -115
- data/lib/rosy/RosySplit.rb +55 -53
- data/lib/rosy/RosyTask.rb +7 -3
- data/lib/rosy/RosyTest.rb +174 -191
- data/lib/rosy/RosyTrain.rb +46 -50
- data/lib/rosy/RosyTrainingTestTable.rb +101 -99
- data/lib/rosy/TargetsMostFrequentFrame.rb +13 -9
- data/lib/rosy/{AbstractFeatureAndExternal.rb → abstract_feature_extractor.rb} +22 -97
- data/lib/rosy/abstract_single_feature_extractor.rb +52 -0
- data/lib/rosy/external_feature_extractor.rb +35 -0
- data/lib/rosy/opt_parser.rb +231 -201
- data/lib/rosy/rosy.rb +63 -64
- data/lib/rosy/rosy_conventions.rb +66 -0
- data/lib/rosy/rosy_error.rb +15 -0
- data/lib/rosy/var_var_restriction.rb +16 -0
- data/lib/shalmaneser/rosy.rb +1 -0
- metadata +26 -19
- data/lib/rosy/ExternalConfigData.rb +0 -58
- data/lib/rosy/View.rb +0 -418
- data/lib/rosy/rosy_config_data.rb +0 -121
- data/test/frprep/test_opt_parser.rb +0 -94
- data/test/functional/functional_test_helper.rb +0 -58
- data/test/functional/test_fred.rb +0 -47
- data/test/functional/test_frprep.rb +0 -99
- data/test/functional/test_rosy.rb +0 -40
data/lib/rosy/RosyIterator.rb
CHANGED
@@ -1,50 +1,54 @@
|
|
1
1
|
# RosyIterator
|
2
2
|
# KE May 2005
|
3
3
|
#
|
4
|
-
# RosyIterator is a class that
|
5
|
-
# * reads the "xwise" parameters in the experiment file to
|
4
|
+
# RosyIterator is a class that
|
5
|
+
# * reads the "xwise" parameters in the experiment file to
|
6
6
|
# determine the portions in which data is to be fed to classifiers,
|
7
|
-
# and offers an iterator that iterates through every group to
|
7
|
+
# and offers an iterator that iterates through every group to
|
8
8
|
# be trained/tested on
|
9
9
|
# * constructs views matching the given "xwise" group.
|
10
|
-
#
|
10
|
+
#
|
11
11
|
# RosyIterator incorporates the following services:
|
12
|
-
# - choosing the right DB table, depending on
|
12
|
+
# - choosing the right DB table, depending on
|
13
13
|
# whether training/test data is being accessed,
|
14
14
|
# and with or without a splitlog
|
15
15
|
# - making and adding all currently available Dynamic Gold objects
|
16
|
-
# (i.e. objects that are capable of mapping the gold column to
|
16
|
+
# (i.e. objects that are capable of mapping the gold column to
|
17
17
|
# something else)
|
18
18
|
# - initializing a view, potentially modified depending on the assignment step:
|
19
19
|
# argrec -> use dynamic gold, mapping gold labels to "FE" or "NONE"
|
20
20
|
# arglab -> use only those rows that have "FE" assigned from the argrec step
|
21
21
|
#
|
22
22
|
# Setting "xwise": An "xwise" entry in the hash passed on to RosyIterator.new()
|
23
|
-
# overrides all other settings. If that isn't given, the "xwise_" + step
|
23
|
+
# overrides all other settings. If that isn't given, the "xwise_" + step
|
24
24
|
# (xwise_argrec, xwise_arglab, xwise_onestep) from the experiment file is read.
|
25
25
|
# If that hasn't been set either, the default is frame-wise.
|
26
26
|
|
27
|
-
require '
|
27
|
+
require 'ruby_class_extensions'
|
28
28
|
|
29
|
-
require 'rosy/View'
|
30
|
-
require "
|
31
|
-
require
|
29
|
+
# require 'rosy/View'
|
30
|
+
# require "RosyConventions"
|
31
|
+
require 'value_restriction'
|
32
|
+
require 'db/select_table_and_columns'
|
33
|
+
require 'db/db_view'
|
32
34
|
require "rosy/RosySplit"
|
33
35
|
require "rosy/RosyTrainingTestTable"
|
34
36
|
|
37
|
+
module Shalmaneser
|
38
|
+
module Rosy
|
35
39
|
class RosyIterator
|
36
40
|
|
37
41
|
###
|
38
42
|
# new
|
39
43
|
#
|
40
|
-
# open the correct database table,
|
44
|
+
# open the correct database table,
|
41
45
|
# initialize Dynamic Gold objects
|
42
46
|
|
43
47
|
|
44
|
-
def initialize(ttt_obj, # RosyTrainingTestTable object
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
+
def initialize(ttt_obj, # RosyTrainingTestTable object
|
49
|
+
exp, # RosyConfigData object: experiment file
|
50
|
+
dataset, # string: train/test
|
51
|
+
var_hash = {}) # further arguments:
|
48
52
|
# step: string: argrec/arglab/onestep, or nil (= no manipulation of the view)
|
49
53
|
# testID: string: ID of test set, or nil
|
50
54
|
# splitID: string: splitlog ID, or nil if no split is to be used
|
@@ -59,7 +63,7 @@ class RosyIterator
|
|
59
63
|
@splitID = var_hash["splitID"]
|
60
64
|
@step = var_hash["step"]
|
61
65
|
@testID = var_hash["testID"]
|
62
|
-
|
66
|
+
|
63
67
|
# object variables we are going to use below
|
64
68
|
@db_table = nil # DB table we are working on
|
65
69
|
@allcolnames = nil # names of all columns of first and potentially second table
|
@@ -80,19 +84,19 @@ class RosyIterator
|
|
80
84
|
##
|
81
85
|
# open the right database table
|
82
86
|
if @dataset == "train" or @splitID
|
83
|
-
@db_table = @ttt_obj.existing_train_table
|
87
|
+
@db_table = @ttt_obj.existing_train_table
|
84
88
|
|
85
89
|
else
|
86
90
|
unless @testID
|
87
|
-
|
91
|
+
raise "cannot open the test table without test ID"
|
88
92
|
end
|
89
93
|
@db_table = @ttt_obj.existing_test_table(@testID)
|
90
94
|
end
|
91
|
-
@allcolnames = @db_table.list_column_names
|
95
|
+
@allcolnames = @db_table.list_column_names
|
92
96
|
|
93
97
|
##
|
94
98
|
# make dynamic gold objects
|
95
|
-
@dyn_gold_objects =
|
99
|
+
@dyn_gold_objects = []
|
96
100
|
@dyn_gold_objects << DynGoldBinary.new(@exp.get("noval"))
|
97
101
|
|
98
102
|
###
|
@@ -101,38 +105,38 @@ class RosyIterator
|
|
101
105
|
# argument recognition: distinguish just "FE", "NONE" as gold
|
102
106
|
@standard_dyngold_id = "binary_gold"
|
103
107
|
end
|
104
|
-
|
108
|
+
|
105
109
|
##
|
106
|
-
# if splitID has been set,
|
110
|
+
# if splitID has been set,
|
107
111
|
# make additional restrictions on the column values
|
108
112
|
if @splitID
|
109
113
|
# get split table name
|
110
|
-
@second_table = @ttt_obj.existing_split_table(@splitID, @dataset, RosySplit.split_index_colname
|
114
|
+
@second_table = @ttt_obj.existing_split_table(@splitID, @dataset, RosySplit.split_index_colname)
|
111
115
|
|
112
116
|
# additional value restriction:
|
113
117
|
# only use rows whose sentence ID also appears in the split table
|
114
118
|
# (i.e. rows included in the split)
|
115
|
-
@standard_value_restrictions << RosySplit.make_join_restriction(@splitID,
|
119
|
+
@standard_value_restrictions << RosySplit.make_join_restriction(@splitID,
|
116
120
|
@db_table,
|
117
121
|
@dataset,
|
118
122
|
@ttt_obj)
|
119
123
|
|
120
124
|
# additional column names:
|
121
125
|
# those of the second table (but remove duplicates)
|
122
|
-
@allcolnames.concat @ttt_obj.existing_split_table(@splitID, @dataset, RosySplit.split_index_colname
|
126
|
+
@allcolnames.concat @ttt_obj.existing_split_table(@splitID, @dataset, RosySplit.split_index_colname).list_column_names
|
123
127
|
@allcolnames.uniq!
|
124
128
|
|
125
129
|
|
126
130
|
# if we're using a split, read the phase 2 features and the classification results
|
127
131
|
# from the split table rather than from the main table:
|
128
|
-
# @use_cols_from_second_table is a list of column names (strings)
|
132
|
+
# @use_cols_from_second_table is a list of column names (strings)
|
129
133
|
# to take from the 2nd table
|
130
134
|
# @second_table_colprefix is a string: all columns starting with this prefix
|
131
135
|
# are taken from the 2nd table
|
132
|
-
@use_cols_from_second_table = [ RosySplit.split_index_colname
|
136
|
+
@use_cols_from_second_table = [ RosySplit.split_index_colname ]
|
133
137
|
@second_table_colprefix = @exp.get("classif_column_name")
|
134
138
|
end
|
135
|
-
|
139
|
+
|
136
140
|
###
|
137
141
|
# Any (row) value restrictions to be imposed
|
138
142
|
# on all views we generate?
|
@@ -141,14 +145,14 @@ class RosyIterator
|
|
141
145
|
# for which argrec-label is "FE"
|
142
146
|
|
143
147
|
if @exp.get("assume_argrec_perfect")
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
@exp.get("noval"),
|
148
|
+
# assume perfect argrec step:
|
149
|
+
# take all rows where gold is not "noval"
|
150
|
+
@standard_value_restrictions << ValueRestriction.new(@db_table.table_name + ".gold",
|
151
|
+
@exp.get("noval"),
|
148
152
|
"posneg" => "!=")
|
149
153
|
else
|
150
|
-
|
151
|
-
|
154
|
+
# use argrec step as is:
|
155
|
+
# take all rows where the argrec result is "FE"
|
152
156
|
|
153
157
|
case @dataset
|
154
158
|
when "train"
|
@@ -159,10 +163,10 @@ class RosyIterator
|
|
159
163
|
raise "Shouldn't be here"
|
160
164
|
end
|
161
165
|
|
162
|
-
if run_column_name.nil?
|
166
|
+
if run_column_name.nil?
|
163
167
|
$stderr.puts "Missing: argrec classification results on #{@dataset} data."
|
164
168
|
$stderr.puts "I have logs of the following runs: "
|
165
|
-
$stderr.puts @ttt_obj.runlog_to_s
|
169
|
+
$stderr.puts @ttt_obj.runlog_to_s
|
166
170
|
raise "Problem"
|
167
171
|
end
|
168
172
|
|
@@ -173,7 +177,7 @@ class RosyIterator
|
|
173
177
|
run_column_name = @db_table.table_name + "." + run_column_name
|
174
178
|
end
|
175
179
|
|
176
|
-
|
180
|
+
@standard_value_restrictions << ValueRestriction.new(run_column_name, "FE")
|
177
181
|
end
|
178
182
|
end
|
179
183
|
|
@@ -192,9 +196,9 @@ class RosyIterator
|
|
192
196
|
@xwise = var_hash["xwise"]
|
193
197
|
unless @xwise
|
194
198
|
if @step
|
195
|
-
|
196
|
-
|
197
|
-
|
199
|
+
# read xwise from experiment file,
|
200
|
+
# if we know what training/test step we're in
|
201
|
+
@xwise = @exp.get("xwise_" + @step)
|
198
202
|
end
|
199
203
|
end
|
200
204
|
if @xwise.nil?
|
@@ -202,10 +206,10 @@ class RosyIterator
|
|
202
206
|
@xwise = "frame"
|
203
207
|
end
|
204
208
|
|
205
|
-
# xwise is a string consisting of any subset of
|
209
|
+
# xwise is a string consisting of any subset of
|
206
210
|
# "frame", "target_pos", "target" joined by spaces.
|
207
211
|
# transform to an array by splitting at spaces
|
208
|
-
@xwise = @xwise.split
|
212
|
+
@xwise = @xwise.split
|
209
213
|
@xwise.each { |xwise_entry|
|
210
214
|
unless @ttt_obj.feature_names.include? xwise_entry
|
211
215
|
# sanity check: valid xwise value?
|
@@ -226,17 +230,17 @@ class RosyIterator
|
|
226
230
|
#
|
227
231
|
# get the column names used for determining the groups
|
228
232
|
#
|
229
|
-
# returns: an array of strings, ["frame"] or ["frame", "target"],
|
233
|
+
# returns: an array of strings, ["frame"] or ["frame", "target"],
|
230
234
|
# or ["target_pos"]
|
231
|
-
def get_xwise_column_names
|
235
|
+
def get_xwise_column_names
|
232
236
|
return @xwise
|
233
237
|
end
|
234
238
|
|
235
239
|
####
|
236
240
|
# num_groups
|
237
241
|
# returns: integer
|
238
|
-
def num_groups
|
239
|
-
return @groups.length
|
242
|
+
def num_groups
|
243
|
+
return @groups.length
|
240
244
|
end
|
241
245
|
|
242
246
|
####
|
@@ -250,7 +254,7 @@ class RosyIterator
|
|
250
254
|
# - the hash describing the group, as returned by unique_values_of_column
|
251
255
|
# - plus an ID for the group, made up of its hash values concatenated into a string
|
252
256
|
# (values are connected by spaces)
|
253
|
-
def each_group
|
257
|
+
def each_group
|
254
258
|
@groups.each { |hash|
|
255
259
|
# hash is a hash column_name(string)-> value(object)
|
256
260
|
# this is the unique description of the current group
|
@@ -269,12 +273,12 @@ class RosyIterator
|
|
269
273
|
# (or "*" for all columns) and a list of value restrictions
|
270
274
|
# on the rows (ValueRestriction objects, equalities or inequalities
|
271
275
|
# column_name = value, columnb_name != value), which may be omitted
|
272
|
-
#
|
276
|
+
#
|
273
277
|
# returns: DBView object
|
274
278
|
# @param columns [Array] array:string, column names to include
|
275
279
|
# or string: "*" for all columns
|
276
280
|
# @param value_restrictions [Array] array:ValueRestriction objects
|
277
|
-
def get_a_view_for_current_group(columns, value_restrictions = [])
|
281
|
+
def get_a_view_for_current_group(columns, value_restrictions = [])
|
278
282
|
get_a_view_for_group(@current_group, columns, value_restrictions)
|
279
283
|
end
|
280
284
|
|
@@ -290,7 +294,7 @@ class RosyIterator
|
|
290
294
|
# (or "*" for all columns) and a list of value restrictions
|
291
295
|
# on the rows (ValueRestriction objects, equalities or inequalities
|
292
296
|
# column_name = value, columnb_name != value), which may be omitted
|
293
|
-
#
|
297
|
+
#
|
294
298
|
# returns: DBView object
|
295
299
|
# @param group [Hash] column(string)->value(object)
|
296
300
|
# describing the group
|
@@ -311,7 +315,7 @@ class RosyIterator
|
|
311
315
|
# the second table
|
312
316
|
|
313
317
|
# separate group column names into two groups
|
314
|
-
first_columns, second_columns =
|
318
|
+
first_columns, second_columns =
|
315
319
|
separate_into_1st_and_2nd_table_cols(group.keys)
|
316
320
|
|
317
321
|
# make separate value restrictions for the two groups
|
@@ -323,12 +327,12 @@ class RosyIterator
|
|
323
327
|
raise "Cannot use second table columns without second table"
|
324
328
|
end
|
325
329
|
value_restrictions.concat second_columns.map { |column_name|
|
326
|
-
ValueRestriction.new(@second_table.table_name + "." + column_name,
|
330
|
+
ValueRestriction.new(@second_table.table_name + "." + column_name,
|
327
331
|
group[column_name],
|
328
332
|
"table_name_included" => true)
|
329
333
|
}
|
330
334
|
end
|
331
|
-
|
335
|
+
|
332
336
|
# get a view with the given columns, given value restrictions
|
333
337
|
# plus add more value restrictions: must be the current group
|
334
338
|
return get_a_view(columns,value_restrictions)
|
@@ -345,18 +349,18 @@ class RosyIterator
|
|
345
349
|
# (or "*" for all columns) and a list of value restrictions
|
346
350
|
# on the rows (ValueRestriction objects, equalities or inequalities
|
347
351
|
# column_name = value, columnb_name != value), which may be omitted
|
348
|
-
#
|
352
|
+
#
|
349
353
|
# returns: DBView object
|
350
354
|
def get_a_view(columns, # array:strings, list of column names
|
351
|
-
|
352
|
-
|
355
|
+
# or string "*" (all columns)
|
356
|
+
value_restrictions = []) # array: ValueRestriction objects
|
353
357
|
# or [], nil for no restrictions
|
354
358
|
|
355
359
|
if value_restrictions.nil?
|
356
360
|
value_restrictions = []
|
357
361
|
end
|
358
|
-
return get_a_view_aux(columns, value_restrictions,
|
359
|
-
"gold" => "gold",
|
362
|
+
return get_a_view_aux(columns, value_restrictions,
|
363
|
+
"gold" => "gold",
|
360
364
|
"dynamic_feature_list" => @dyn_gold_objects,
|
361
365
|
"standard_dyngold_id" => @standard_dyngold_id,
|
362
366
|
"sentence_id_feature" => "sentid")
|
@@ -371,15 +375,15 @@ class RosyIterator
|
|
371
375
|
#
|
372
376
|
# returns: a list of hashes, one for each unique set of values
|
373
377
|
def unique_values_of_columns(columns) # array:string, several column names
|
374
|
-
retv =
|
378
|
+
retv = []
|
375
379
|
|
376
380
|
view = get_a_view_aux(columns, [],
|
377
381
|
"distinct" => true)
|
378
382
|
|
379
|
-
view.each_hash
|
383
|
+
view.each_hash { |row|
|
380
384
|
retv << row
|
381
385
|
}
|
382
|
-
view.close
|
386
|
+
view.close
|
383
387
|
return retv
|
384
388
|
end
|
385
389
|
|
@@ -387,7 +391,7 @@ class RosyIterator
|
|
387
391
|
private
|
388
392
|
|
389
393
|
###
|
390
|
-
# given a list of column names,
|
394
|
+
# given a list of column names,
|
391
395
|
# separate them into first table and second table columns
|
392
396
|
#
|
393
397
|
# columns may be either an array of string (column names)
|
@@ -440,10 +444,10 @@ class RosyIterator
|
|
440
444
|
|
441
445
|
|
442
446
|
# and get a view
|
443
|
-
return DBView.new(tables_and_cols,
|
444
|
-
|
447
|
+
return DBView.new(tables_and_cols,
|
448
|
+
value_restrictions + @standard_value_restrictions,
|
445
449
|
@ttt_obj.database,
|
446
|
-
|
450
|
+
var_hash)
|
447
451
|
end
|
448
452
|
|
449
453
|
end
|
@@ -472,7 +476,9 @@ class DynGoldBinary
|
|
472
476
|
end
|
473
477
|
end
|
474
478
|
|
475
|
-
def id
|
479
|
+
def id
|
476
480
|
return "binary_gold"
|
477
481
|
end
|
478
482
|
end
|
483
|
+
end
|
484
|
+
end
|
data/lib/rosy/RosyPhase2FeatureExtractors.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
####
|
2
|
-
# ke & sp
|
2
|
+
# ke & sp
|
3
3
|
# adapted to new feature extractor class,
|
4
4
|
# Collins and Tiger features combined:
|
5
5
|
# SP November 2005
|
@@ -9,7 +9,7 @@
|
|
9
9
|
# These are features that are computed on the basis of the Phase 1 feature set
|
10
10
|
#
|
11
11
|
# This consists of all features which have to know feature values for other nodes
|
12
|
-
# (e.g. am I the nearest node to the target?) or similar.
|
12
|
+
# (e.g. am I the nearest node to the target?) or similar.
|
13
13
|
#
|
14
14
|
# Contract: each feature extractor inherits from the RosyPhase2FeatureExtractor class
|
15
15
|
#
|
@@ -17,16 +17,17 @@
|
|
17
17
|
|
18
18
|
|
19
19
|
# Salsa packages
|
20
|
-
require 'rosy/
|
21
|
-
require '
|
20
|
+
require 'rosy/abstract_feature_extractor'
|
21
|
+
# require 'SalsaTigerRegXML'
|
22
22
|
|
23
23
|
# Fred and Rosy packages
|
24
|
-
require "
|
24
|
+
# require "RosyConventions"
|
25
25
|
|
26
26
|
|
27
27
|
################################
|
28
28
|
# base class for all following feature extractors
|
29
|
-
|
29
|
+
module Shalmaneser
|
30
|
+
module Rosy
|
30
31
|
class RosyPhase2FeatureExtractor < AbstractFeatureExtractor
|
31
32
|
|
32
33
|
###
|
@@ -41,15 +42,15 @@ class RosyPhase2FeatureExtractor < AbstractFeatureExtractor
|
|
41
42
|
# computed for the training set
|
42
43
|
#
|
43
44
|
# Here: all features in this package are phase 2
|
44
|
-
def
|
45
|
-
|
45
|
+
def self.phase
|
46
|
+
"phase 2"
|
46
47
|
end
|
47
48
|
|
48
49
|
###
|
49
50
|
# returns an array of strings, providing information about
|
50
51
|
# the feature extractor
|
51
|
-
def
|
52
|
-
|
52
|
+
def self.info
|
53
|
+
super().concat(["rosy"])
|
53
54
|
end
|
54
55
|
|
55
56
|
###
|
@@ -57,23 +58,23 @@ class RosyPhase2FeatureExtractor < AbstractFeatureExtractor
|
|
57
58
|
# feature computation using compute_feature_value()
|
58
59
|
# such that computations that stay the same for
|
59
60
|
# several features can be done in advance
|
60
|
-
def
|
61
|
+
def self.set(var_hash)
|
61
62
|
@@split_nones = var_hash["split_nones"]
|
62
63
|
return true
|
63
64
|
end
|
64
65
|
|
65
|
-
# check if the current feature is computable, i.e. if all the necessary
|
66
|
+
# check if the current feature is computable, i.e. if all the necessary
|
66
67
|
# Phase 1 features are in the present model.
|
67
68
|
def RosyPhase2FeatureExtractor.is_computable(given_extractor_list)
|
68
|
-
|
69
|
+
(extractor_list - given_extractor_list).empty?
|
69
70
|
end
|
70
|
-
|
71
|
+
|
71
72
|
# this probably has to be done for each feature:
|
72
|
-
# identify sentences and the target, and recombine into a large array
|
73
|
+
# identify sentences and the target, and recombine into a large array
|
73
74
|
def compute_features_on_view(view)
|
74
|
-
result = Array.new(
|
75
|
+
result = Array.new(self.class.feature_names.length)
|
75
76
|
result.each_index {|i|
|
76
|
-
result[i] =
|
77
|
+
result[i] = []
|
77
78
|
}
|
78
79
|
view.each_sentence {|instance_features|
|
79
80
|
sentence_result = compute_features_for_sentence(instance_features)
|
@@ -94,7 +95,7 @@ class RosyPhase2FeatureExtractor < AbstractFeatureExtractor
|
|
94
95
|
private
|
95
96
|
|
96
97
|
# list of all the Phase 1 extractors that a particular feature extractor presupposes
|
97
|
-
def RosyPhase2FeatureExtractor.extractor_list
|
98
|
+
def RosyPhase2FeatureExtractor.extractor_list
|
98
99
|
return []
|
99
100
|
end
|
100
101
|
|
@@ -105,8 +106,6 @@ class RosyPhase2FeatureExtractor < AbstractFeatureExtractor
|
|
105
106
|
def compute_features_for_sentence(instance_features) # array of hashes features -> values
|
106
107
|
raise "Overwrite me"
|
107
108
|
end
|
108
|
-
|
109
|
-
|
110
109
|
end
|
111
110
|
|
112
111
|
|
@@ -117,65 +116,65 @@ end
|
|
117
116
|
####################
|
118
117
|
# nearestNode
|
119
118
|
#
|
120
|
-
# compute whether my head word is the nearest word to the target,
|
119
|
+
# compute whether my head word is the nearest word to the target,
|
121
120
|
# according to some criterion
|
122
121
|
|
123
122
|
class NearestNodeFeature < RosyPhase2FeatureExtractor
|
124
|
-
NearestNodeFeature.announce_me
|
125
|
-
|
126
|
-
def NearestNodeFeature.designator
|
123
|
+
NearestNodeFeature.announce_me
|
124
|
+
|
125
|
+
def NearestNodeFeature.designator
|
127
126
|
return "nearest_node"
|
128
127
|
end
|
129
|
-
def NearestNodeFeature.feature_names
|
130
|
-
return ["nearest_pt_path", # the nearest node with a specific pt_path
|
131
|
-
"neareststring_pt",# the nearest pt (string distance)
|
128
|
+
def NearestNodeFeature.feature_names
|
129
|
+
return ["nearest_pt_path", # the nearest node with a specific pt_path
|
130
|
+
"neareststring_pt",# the nearest pt (string distance)
|
132
131
|
"nearestpath_pt"] # the nearest pt (path length) ]
|
133
132
|
end
|
134
|
-
def NearestNodeFeature.sql_type
|
133
|
+
def NearestNodeFeature.sql_type
|
135
134
|
return "TINYINT"
|
136
135
|
end
|
137
|
-
def NearestNodeFeature.feature_type
|
136
|
+
def NearestNodeFeature.feature_type
|
138
137
|
return "syn"
|
139
138
|
end
|
140
139
|
|
141
140
|
#####
|
142
141
|
private
|
143
142
|
|
144
|
-
def NearestNodeFeature.extractor_list
|
143
|
+
def NearestNodeFeature.extractor_list
|
145
144
|
return ["worddistance","pt_path","pt","path_length"]
|
146
145
|
end
|
147
|
-
|
146
|
+
|
148
147
|
def compute_features_for_sentence(instance_features)
|
149
|
-
|
148
|
+
|
150
149
|
# for each "interesting" feature, compute a hash map value -> index
|
151
150
|
# also compute a hashmap index -> distance
|
152
|
-
# so we efficiently compute, for each feature value, the index with min distance
|
153
|
-
|
154
|
-
dist_hash =
|
155
|
-
pl_hash =
|
156
|
-
path_hash =
|
157
|
-
pt_hash =
|
158
|
-
|
151
|
+
# so we efficiently compute, for each feature value, the index with min distance
|
152
|
+
|
153
|
+
dist_hash = {} # node id -> word distance
|
154
|
+
pl_hash = {} # node id -> path length
|
155
|
+
path_hash = {} # path -> node id array
|
156
|
+
pt_hash = {} # pt -> node id array
|
157
|
+
|
159
158
|
result = [Array.new(instance_features.length),
|
160
159
|
Array.new(instance_features.length),
|
161
160
|
Array.new(instance_features.length)]
|
162
|
-
|
161
|
+
|
163
162
|
instance_features.each_index {|inst_id|
|
164
163
|
instance_hash = instance_features[inst_id]
|
165
164
|
dist_hash[inst_id] = instance_hash["worddistance"]
|
166
165
|
pl_hash[inst_id] = instance_hash["path_length"]
|
167
|
-
|
166
|
+
|
168
167
|
# record paths
|
169
168
|
pt_path = instance_hash["pt_path"]
|
170
169
|
unless path_hash.key? pt_path
|
171
|
-
path_hash[pt_path] =
|
170
|
+
path_hash[pt_path] = []
|
172
171
|
end
|
173
172
|
path_hash[pt_path] << inst_id
|
174
173
|
|
175
174
|
# record pts
|
176
175
|
pt = instance_hash["pt"]
|
177
176
|
unless pt_hash.key? pt
|
178
|
-
pt_hash[pt] =
|
177
|
+
pt_hash[pt] = []
|
179
178
|
end
|
180
179
|
pt_hash[pt] << inst_id
|
181
180
|
|
@@ -208,8 +207,8 @@ class NearestNodeFeature < RosyPhase2FeatureExtractor
|
|
208
207
|
result[1][inst_id] = 0
|
209
208
|
end
|
210
209
|
}
|
211
|
-
}
|
212
|
-
|
210
|
+
}
|
211
|
+
|
213
212
|
# nearest-pt (path length) feature is feature 2 of the extractor
|
214
213
|
pt_hash.each{|pt,inst_ids|
|
215
214
|
path_lengths = inst_ids.map {|inst_id| pl_hash[inst_id]}
|
@@ -222,9 +221,10 @@ class NearestNodeFeature < RosyPhase2FeatureExtractor
|
|
222
221
|
result[2][inst_id] = 0
|
223
222
|
end
|
224
223
|
}
|
225
|
-
}
|
224
|
+
}
|
226
225
|
|
227
226
|
return result
|
228
|
-
end
|
227
|
+
end
|
228
|
+
end
|
229
|
+
end
|
229
230
|
end
|
230
|
-
|