shalmaneser 1.2.0.rc3 → 1.2.0.rc4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +26 -7
- data/bin/fred +2 -4
- data/doc/exp_files.md +6 -5
- data/lib/common/{ConfigData.rb → config_data.rb} +46 -270
- data/lib/common/config_format_element.rb +220 -0
- data/lib/common/prep_config_data.rb +62 -0
- data/lib/common/{frprep_helper.rb → prep_helper.rb} +0 -0
- data/lib/{common/DBInterface.rb → db/db_interface.rb} +2 -2
- data/lib/{rosy/DBMySQL.rb → db/db_mysql.rb} +1 -2
- data/lib/{rosy/DBSQLite.rb → db/db_sqlite.rb} +1 -1
- data/lib/{rosy/DBTable.rb → db/db_table.rb} +1 -1
- data/lib/{rosy/DBWrapper.rb → db/db_wrapper.rb} +0 -0
- data/lib/{common/SQLQuery.rb → db/sql_query.rb} +0 -0
- data/lib/fred/FredBOWContext.rb +8 -6
- data/lib/fred/FredDetermineTargets.rb +1 -1
- data/lib/fred/FredEval.rb +1 -1
- data/lib/fred/FredFeaturize.rb +22 -16
- data/lib/fred/FredTest.rb +0 -1
- data/lib/fred/fred.rb +2 -0
- data/lib/fred/{FredConfigData.rb → fred_config_data.rb} +70 -67
- data/lib/fred/opt_parser.rb +1 -1
- data/lib/frprep/frprep.rb +1 -1
- data/lib/frprep/interfaces/berkeley_interface.rb +7 -9
- data/lib/frprep/opt_parser.rb +1 -1
- data/lib/rosy/ExternalConfigData.rb +1 -1
- data/lib/rosy/RosyEval.rb +1 -1
- data/lib/rosy/RosyFeaturize.rb +21 -20
- data/lib/rosy/RosyInspect.rb +1 -1
- data/lib/rosy/RosyPruning.rb +1 -1
- data/lib/rosy/RosyServices.rb +1 -1
- data/lib/rosy/RosySplit.rb +1 -1
- data/lib/rosy/RosyTest.rb +23 -20
- data/lib/rosy/RosyTrain.rb +15 -13
- data/lib/rosy/RosyTrainingTestTable.rb +2 -1
- data/lib/rosy/View.rb +1 -1
- data/lib/rosy/opt_parser.rb +1 -1
- data/lib/rosy/rosy.rb +1 -1
- data/lib/rosy/rosy_config_data.rb +121 -0
- data/lib/shalmaneser/opt_parser.rb +32 -2
- data/lib/shalmaneser/version.rb +1 -1
- metadata +23 -114
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/common/FrPrepConfigData.rb +0 -66
- data/lib/rosy/RosyConfigData.rb +0 -115
- metadata.gz.sig +0 -0
@@ -0,0 +1,220 @@
|
|
1
|
+
|
2
|
+
##############################
|
3
|
+
# ConfigFormatElement is an auxiliary class
|
4
|
+
# of ConfigData.
|
5
|
+
# It keeps track of feature patterns with variables in them
|
6
|
+
# that can be instantiated.
|
7
|
+
# @author Andrei Beliankou
|
8
|
+
#
|
9
|
+
class ConfigFormatElement

  # Analyzes a feature-name pattern and remembers its structure.
  #
  # The pattern is split into an array of pairs, each being either
  #   [text, "string"]            for a fixed part, or
  #   [variable_name, "variable"] for a part written as <variable_name>.
  #
  # @param string [String] feature name, may include variable names in <>
  # @param variables [Array<String>] variable names that may occur in +string+
  # @raise [RuntimeError] on a nested "<", a ">" without an opening "<",
  #   an unclosed "<", or a variable name not listed in +variables+
  def initialize(string, variables)
    @variables = variables
    @pattern = parse_pattern(string)

    # regexp for matching this pattern when no fillers are given
    @regexp = make_regexp(@pattern)
  end

  # Replaces every variable of the pattern by its value.
  #
  # @param var_hash [Hash{String => String}] variable name => variable value
  # @return [String] the pattern with all variables filled in
  # @raise [RuntimeError] if +var_hash+ has no value for a used variable
  def instantiate(var_hash)
    parts = @pattern.map do |item, kind|
      case kind
      when "string"
        item
      when "variable"
        value = var_hash[item]
        raise "Missing variable instantiation: " + item if value.nil?
        value
      else
        raise "Shouldn't be here"
      end
    end

    parts.join
  end

  # Tries to match a string against the pattern, optionally with some
  # variables already fixed to the values given in +fillers+.
  #
  # The whole string has to match: a match on just one line of a
  # multi-line string does not count.
  #
  # @param string [String] the string to match
  # @param fillers [Hash{String => String}, nil] variable name => value
  # @return [Hash{String => String}, false] on success a hash holding the
  #   given fillers plus values for all other variables of the pattern;
  #   false if the string does not match
  def match(string, fillers = nil)
    # have we been given partial info about variables?
    regexp = fillers ? make_regexp(@pattern, fillers) : @regexp
    matchdata = regexp.match(string)

    # no match via the regular expression
    return false if matchdata.nil?

    # retv: variable name(string) => value(string)
    retv = {}
    fillers.each_pair { |name, val| retv[name] = val } if fillers

    # Variables fixed by 'fillers' were inserted literally into the regexp
    # and have no capture group, so only the remaining variables consume
    # a capture index.
    index = 1
    @pattern.each do |item, kind|
      next unless kind == "variable"
      next unless fillers.nil? || fillers[item].nil?

      if matchdata[index].nil?
        raise "Match, but not enough matched elements? Strange."
      end

      if retv[item].nil?
        retv[item] = matchdata[index]
      elsif retv[item] != matchdata[index]
        # the same variable occurs twice with different values
        return false
      end

      index += 1
    end

    retv
  end

  # @return [Array<String>] names of the variables used in the pattern,
  #   in order of occurrence (repeated if a variable occurs repeatedly)
  def used_variables
    @pattern.select { |_item, kind| kind == "variable" }.map { |item, _kind| item }
  end

  ####################
  private

  # Splits +string+ into fixed parts and variables.
  #
  # @param string [String] feature name, variables are enclosed in <>
  # @return [Array<Array(String, String)>] list of pairs
  #   [text, "string"] or [variable_name, "variable"]
  # @raise [RuntimeError] see #initialize
  def parse_pattern(string)
    pattern = []
    inside = false # are we between "<" and ">"?
    item = ""

    string.each_char do |char|
      case char
      when "<"
        raise "Duplicate < in " + string if inside
        # record the fixed part collected so far
        unless item.empty?
          pattern << [item, "string"]
          item = ""
        end
        inside = true
      when ">"
        raise "Unexpected > in " + string unless inside
        raise "Unknown variable " + item unless @variables.include?(item)
        pattern << [item, "variable"]
        item = ""
        inside = false
      else
        item << char
      end
    end

    # read through the whole of "string": we must be outside of <> now
    raise "Unclosed < in " + string if inside

    # last bit still to be recorded?
    pattern << [item, "string"] unless item.empty?

    pattern
  end

  # Makes a regular expression from a pattern together with some
  # variable fillers. Fixed parts and given fillers are matched
  # literally; every other variable becomes a capture group.
  #
  # The expression is anchored with \A and \z rather than ^ and $,
  # because ^ and $ match at every line boundary in Ruby.
  #
  # @param pattern [Array<Array(String, String)>] pairs [string, "string"]
  #   or [string, "variable"]
  # @param fillers [Hash{String => String}, nil] variable name => value
  # @return [Regexp]
  def make_regexp(pattern, fillers = nil)
    body = pattern.map do |item, kind|
      case kind
      when "variable"
        if fillers && fillers[item]
          Regexp.escape(fillers[item])
        else
          "(.+)"
        end
      when "string"
        Regexp.escape(item)
      else
        raise "Shouldn't be here"
      end
    end

    Regexp.new("\\A" + body.join + "\\z")
  end

end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# FrPrepConfigData
|
2
|
+
# Katrin Erk July 05
|
3
|
+
#
|
4
|
+
# Preprocessing for Fred and Rosy:
|
5
|
+
# access to a configuration and experiment description file
|
6
|
+
|
7
|
+
require "common/config_data"
|
8
|
+
|
9
|
+
##############################
|
10
|
+
# Class FrPrepConfigData
|
11
|
+
#
|
12
|
+
# inherits from ConfigData,
|
13
|
+
# sets variable names appropriate to preprocessing task
|
14
|
+
|
15
|
+
class FrPrepConfigData < ConfigData

  # Options understood in a preprocessing experiment file:
  # option name => name of the option's value type
  # ("string", "integer" or "bool").
  CONFIG_DEFS = {
    # --- general ---
    "prep_experiment_ID" => "string",     # experiment identifier
    "frprep_directory" => "string",       # dir for frprep internal data

    # --- information about the dataset ---
    "language" => "string",               # en, de
    "origin" => "string",                 # FrameNet, Salsa, or nothing
    "format" => "string",                 # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
    "encoding" => "string",               # utf8, iso, hex, or nothing

    # --- directories ---
    "directory_input" => "string",        # dir with input data
    "directory_preprocessed" => "string", # dir with output Salsa/Tiger XML data
    "directory_parserout" => "string",    # dir with parser output for the parser named below

    # --- syntactic processing ---
    "pos_tagger" => "string",             # name of POS tagger
    "lemmatizer" => "string",             # name of lemmatizer
    "parser" => "string",                 # name of parser
    "pos_tagger_path" => "string",        # path to POS tagger
    "lemmatizer_path" => "string",        # path to lemmatizer
    "parser_path" => "string",            # path to parser
    "parser_max_sent_num" => "integer",   # max number of sentences per parser input file
    "parser_max_sent_len" => "integer",   # max sentence length the parser handles

    "do_parse" => "bool",                 # use parser?
    "do_lemmatize" => "bool",             # use lemmatizer?
    "do_postag" => "bool",                # use POS tagger?

    # --- output format ---
    # if tabformat_output == true, output in Tab format rather than
    # Salsa/Tiger XML (this will not work if do_parse == true)
    "tabformat_output" => "bool",

    # --- syntactic repairs, dependent on existing semantic role annotation ---
    "fe_syn_repair" => "bool",            # map words to constituents for FEs: idealize?
    "fe_rel_repair" => "bool"             # FEs: include non-included relative clauses into FEs
  }

  # Hands the config file over to ConfigData together with the
  # preprocessing option definitions.
  #
  # @param filename [String] path to a config file
  def initialize(filename)
    # The empty list is the third argument of ConfigData#initialize;
    # presumably the names of pattern variables, of which preprocessing
    # has none -- TODO confirm against ConfigData.
    super(filename, CONFIG_DEFS, [])
  end
end
|
60
|
+
|
61
|
+
|
62
|
+
|
File without changes
|
@@ -20,7 +20,7 @@ def get_db_interface(exp, # experiment file object with 'dbtype' entry
|
|
20
20
|
case exp.get("dbtype")
|
21
21
|
when "mysql"
|
22
22
|
begin
|
23
|
-
require '
|
23
|
+
require 'db/db_mysql'
|
24
24
|
rescue
|
25
25
|
$stderr.puts "Error loading DB interface."
|
26
26
|
$stderr.puts "Make sure you have the Ruby MySQL package installed."
|
@@ -30,7 +30,7 @@ def get_db_interface(exp, # experiment file object with 'dbtype' entry
|
|
30
30
|
|
31
31
|
when "sqlite"
|
32
32
|
begin
|
33
|
-
require '
|
33
|
+
require 'db/db_sqlite'
|
34
34
|
rescue
|
35
35
|
$stderr.puts "Error loading DB interface."
|
36
36
|
$stderr.puts "Make sure you have the Ruby SQLite package installed."
|
File without changes
|
File without changes
|
data/lib/fred/FredBOWContext.rb
CHANGED
@@ -6,14 +6,15 @@ require "common/SynInterfaces"
|
|
6
6
|
require "common/TabFormat"
|
7
7
|
require "common/SalsaTigerRegXML"
|
8
8
|
require "common/SalsaTigerXMLHelper"
|
9
|
+
require "common/RosyConventions"
|
9
10
|
|
10
11
|
require 'fred/md5'
|
11
|
-
require "fred/
|
12
|
+
require "fred/fred_config_data"
|
12
13
|
require "fred/FredConventions"
|
13
14
|
require "fred/FredDetermineTargets"
|
14
|
-
|
15
|
-
require
|
16
|
-
require
|
15
|
+
|
16
|
+
require 'db/db_interface'
|
17
|
+
require 'db/sql_query'
|
17
18
|
|
18
19
|
########################################
|
19
20
|
# Context Provider classes:
|
@@ -394,6 +395,7 @@ class NoncontiguousContextProvider < AbstractContextProvider
|
|
394
395
|
# yielding contexts.
|
395
396
|
def each_window(dir) # string: directory containing Salsa/Tiger XML data
|
396
397
|
|
398
|
+
# @todo AB: Move this chunk to OptionParser.
|
397
399
|
# sanity check: do we know where the larger corpus is?
|
398
400
|
unless @exp.get("larger_corpus_dir")
|
399
401
|
$stderr.puts "Error: 'noncontiguous_input' has been set in the experiment file"
|
@@ -436,7 +438,7 @@ class NoncontiguousContextProvider < AbstractContextProvider
|
|
436
438
|
# We will need an FrPrep instance and an options object.
|
437
439
|
base_dir_path = File.expand_path(File.dirname(__FILE__) + '/../..')
|
438
440
|
|
439
|
-
# Remove this
|
441
|
+
# @todo AB: Remove this
|
440
442
|
FileUtils.cp(tf_exp_frprep.path, '/tmp/frprep.exp')
|
441
443
|
# after debugging
|
442
444
|
|
@@ -479,7 +481,7 @@ class NoncontiguousContextProvider < AbstractContextProvider
|
|
479
481
|
# remove temporary data
|
480
482
|
temptable_obj.drop_temp_table()
|
481
483
|
|
482
|
-
# AB: TODO Rewrite this passage using pure Ruby.
|
484
|
+
# @todo AB: TODO Rewrite this passage using pure Ruby.
|
483
485
|
%x{rm -rf #{frprep_in}}
|
484
486
|
%x{rm -rf #{frprep_out}}
|
485
487
|
%x{rm -rf #{frprep_dir}}
|
data/lib/fred/FredEval.rb
CHANGED
data/lib/fred/FredFeaturize.rb
CHANGED
@@ -29,10 +29,9 @@ require "common/RegXML"
|
|
29
29
|
require "common/SalsaTigerRegXML"
|
30
30
|
require "common/SalsaTigerXMLHelper"
|
31
31
|
|
32
|
-
require "fred/
|
32
|
+
require "fred/fred_config_data"
|
33
33
|
require "fred/FredConventions"
|
34
|
-
require "common/
|
35
|
-
require "common/frprep_helper"
|
34
|
+
require "common/prep_helper"
|
36
35
|
require "common/SynInterfaces"
|
37
36
|
|
38
37
|
require "fred/FredBOWContext"
|
@@ -169,18 +168,24 @@ class FredFeaturize < DelegateClass(GrammaticalFunctionAccess)
|
|
169
168
|
# prepare experiment file: add preprocessing experiment file data
|
170
169
|
@exp = exp_obj
|
171
170
|
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
171
|
+
# @note AB: The following is disabled because we don't want to use
|
172
|
+
# the dependence on {PrepConfigData}. We duplicate options:
|
173
|
+
# <do_postag>, <pos_tagger>, <do_lemmatize>, <lemmatizer>,
|
174
|
+
# <do_parse>, <parser>, <directory_preprocessed>
|
175
|
+
# in the experiment file of Fred.
|
176
|
+
#
|
177
|
+
# preproc_expname = @exp.get("preproc_descr_file_" + @dataset)
|
178
|
+
# if not(preproc_expname)
|
179
|
+
# $stderr.puts "Please set the name of the preprocessing exp. file name"
|
180
|
+
# $stderr.puts "in the experiment file, feature preproc_descr_file_#{@dataset}"
|
181
|
+
# exit 1
|
182
|
+
# elsif not(File.readable?(preproc_expname))
|
183
|
+
# $stderr.puts "Error in the experiment file:"
|
184
|
+
# $stderr.puts "Parameter preproc_descr_file_#{@dataset} has to be a readable file."
|
185
|
+
# exit 1
|
186
|
+
# end
|
187
|
+
# preproc_exp = FrPrepConfigData.new(preproc_expname)
|
188
|
+
# @exp.adjoin(preproc_exp)
|
184
189
|
|
185
190
|
# get the right syntactic interface
|
186
191
|
SynInterfaces.check_interfaces_abort_if_missing(@exp)
|
@@ -190,7 +195,6 @@ class FredFeaturize < DelegateClass(GrammaticalFunctionAccess)
|
|
190
195
|
grf_obj = GrammaticalFunctionAccess.new(@interpreter_class)
|
191
196
|
super(grf_obj)
|
192
197
|
|
193
|
-
|
194
198
|
# announce the task
|
195
199
|
$stderr.puts "---------"
|
196
200
|
$stderr.puts "Fred experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
|
@@ -267,6 +271,8 @@ class FredFeaturize < DelegateClass(GrammaticalFunctionAccess)
|
|
267
271
|
context_obj = SingleSentContextProvider.new(max_context_size, @exp,
|
268
272
|
@interpreter_class, target_obj,
|
269
273
|
@dataset)
|
274
|
+
# @todo AB: Move this to the OptionParser; the two options are not
|
275
|
+
# compatible, don't do the check here!
|
270
276
|
if @exp.get("noncontiguous_input")
|
271
277
|
$stderr.puts "Warning: 'single_sent_context' has been set in the experiment file."
|
272
278
|
$stderr.puts "So I'm ignoring the 'noncontiguous_input = true' setting."
|