shalmaneser-lib 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/configuration/config_data.rb +457 -0
- data/lib/configuration/config_format_element.rb +210 -0
- data/lib/configuration/configuration_error.rb +15 -0
- data/lib/configuration/external_config_data.rb +56 -0
- data/lib/configuration/frappe_config_data.rb +134 -0
- data/lib/configuration/fred_config_data.rb +199 -0
- data/lib/configuration/rosy_config_data.rb +126 -0
- data/lib/db/db_interface.rb +50 -0
- data/lib/db/db_mysql.rb +141 -0
- data/lib/db/db_sqlite.rb +280 -0
- data/lib/db/db_table.rb +237 -0
- data/lib/db/db_view.rb +416 -0
- data/lib/db/db_wrapper.rb +175 -0
- data/lib/db/select_table_and_columns.rb +10 -0
- data/lib/db/sql_query.rb +243 -0
- data/lib/definitions.rb +19 -0
- data/lib/eval.rb +482 -0
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/external_systems.rb +251 -0
- data/lib/framenet_format/fn_corpus_aset.rb +209 -0
- data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
- data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
- data/lib/framenet_format/fn_database.rb +143 -0
- data/lib/framenet_format/frame_xml_file.rb +104 -0
- data/lib/framenet_format/frame_xml_sentence.rb +411 -0
- data/lib/logging.rb +25 -0
- data/lib/ml/classifier.rb +189 -0
- data/lib/ml/mallet.rb +236 -0
- data/lib/ml/maxent.rb +229 -0
- data/lib/ml/optimize.rb +195 -0
- data/lib/ml/timbl.rb +140 -0
- data/lib/monkey_patching/array.rb +82 -0
- data/lib/monkey_patching/enumerable_bool.rb +24 -0
- data/lib/monkey_patching/enumerable_distribute.rb +18 -0
- data/lib/monkey_patching/file.rb +131 -0
- data/lib/monkey_patching/subsumed.rb +24 -0
- data/lib/ruby_class_extensions.rb +4 -0
- data/lib/salsa_tiger_xml/corpus.rb +24 -0
- data/lib/salsa_tiger_xml/fe_node.rb +98 -0
- data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
- data/lib/salsa_tiger_xml/frame_node.rb +145 -0
- data/lib/salsa_tiger_xml/graph_node.rb +347 -0
- data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
- data/lib/salsa_tiger_xml/sem_node.rb +58 -0
- data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
- data/lib/salsa_tiger_xml/syn_node.rb +169 -0
- data/lib/salsa_tiger_xml/tree_node.rb +59 -0
- data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
- data/lib/salsa_tiger_xml/usp_node.rb +72 -0
- data/lib/salsa_tiger_xml/xml_node.rb +163 -0
- data/lib/shalmaneser/lib.rb +1 -0
- data/lib/tabular_format/fn_tab_format_file.rb +38 -0
- data/lib/tabular_format/fn_tab_frame.rb +67 -0
- data/lib/tabular_format/fn_tab_sentence.rb +169 -0
- data/lib/tabular_format/tab_format_file.rb +91 -0
- data/lib/tabular_format/tab_format_named_args.rb +184 -0
- data/lib/tabular_format/tab_format_sentence.rb +119 -0
- data/lib/value_restriction.rb +49 -0
- metadata +131 -0
data/lib/logging.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'logger'
|
2
|
+
require 'pastel'
|
3
|
+
|
4
|
+
# A general logger for all instances.
|
5
|
+
module Shalmaneser
  # @todo AB: [2015-12-31 Thu 13:15]
  #   Check if the constant is defined.
  #   Do not rely only on the require order.
  #
  # Process-wide logger; everything is written to standard error.
  LOGGER = Logger.new($stderr)

  # The threshold is taken from the LOG_LEVEL environment variable and
  # defaults to INFO. The name is upcased so that e.g. LOG_LEVEL=debug is
  # accepted as well as LOG_LEVEL=DEBUG.
  LOGGER.level = Logger.const_get(ENV.fetch('LOG_LEVEL', 'INFO').upcase)

  pastel = Pastel.new
  colors = {
    'FATAL' => pastel.red.bold.detach,
    'ERROR' => pastel.red.detach,
    'WARN' => pastel.yellow.detach,
    'INFO' => pastel.green.detach,
    'DEBUG' => pastel.white.detach
  }

  # Pass-through used when no colouring applies.
  plain = ->(text) { text }

  # Output format: "<SEVERITY>: <message>\n". The severity label is coloured
  # only when standard error is a terminal. Labels without an entry in
  # +colors+ (Logger reports unknown-severity messages as "ANY") are printed
  # uncoloured instead of failing on a missing colorizer.
  LOGGER.formatter = lambda do |severity, _datetime, _progname, message|
    colorizer = $stderr.tty? ? colors.fetch(severity, plain) : plain
    "#{colorizer.call(severity)}: #{message}\n"
  end
end
|
@@ -0,0 +1,189 @@
|
|
1
|
+
# sp 24 08 04
|
2
|
+
|
3
|
+
# this file provides a very simple wrapper for using different ML systems
|
4
|
+
# all you need to do is to write the appropriate learner class
|
5
|
+
# and insert them in the initialize routine here in ML()
|
6
|
+
#
|
7
|
+
# available at the moment:
|
8
|
+
# * timbl (memory-based learner)
|
9
|
+
# * mallet-maxent (another maxent system)
|
10
|
+
# * maxent (the OpenNLP maxent system)
|
11
|
+
|
12
|
+
# part of contract: learner is not initialised unless it is either trained or read
|
13
|
+
|
14
|
+
# @note AB: This is only a remark about dynamic requirement below.
|
15
|
+
# require_relative 'timbl'
|
16
|
+
# require_relative 'mallet'
|
17
|
+
# require_relative 'maxent'
|
18
|
+
|
19
|
+
require_relative 'optimize'
|
20
|
+
|
21
|
+
# Uniform front end for the supported machine-learning back ends.
#
# A Classifier delegates to one learner object (see the learner table below)
# and optionally runs a feature optimisation step (Optimise) on the data
# before it is handed to the learner.
#
# Contract: a classifier is not ready for #apply until it has either been
# trained (#train) or loaded (#read).
#
# NOTE: all file name arguments are sanitised IN PLACE ("<" and ">" removed,
# spaces replaced by "_"), i.e. the caller's string objects are modified.
class Classifier
  # Known learners: [name used by callers, file to require, class name].
  @@learners = [
    ["timbl", "timbl", "Timbl"],
    ["mallet", "mallet", "Mallet"],
    ["maxent", "maxent", "Maxent"]
  ]

  # @param learner [String] name of the learner, one of the names in the
  #   learner table.
  # @param params [Array] consumed from the front: an optional "optimise"
  #   flag, then the path of the classifier program; whatever remains is
  #   passed on to the learner's constructor. The array is modified.
  #
  # Terminates the process with status 1 if the program path does not exist
  # or the learner is unknown. A missing program path is only reported.
  def initialize(learner, params)
    @ready = false

    # An optional leading "optimise" switches feature optimisation on.
    @optimise = params[0] == "optimise"
    params.shift if @optimise

    program_path = ""
    raw_path = params.shift
    if raw_path.respond_to?(:chomp)
      program_path = raw_path.chomp
      unless File.exist?(program_path)
        $stderr.puts "Error: Could not find classifier system at " + program_path
        $stderr.puts "Perhaps an erroneous entry in your experiment file?"
        exit 1
      end
    else
      # Best effort, as before: report the problem and carry on with an
      # empty program path.
      $stderr.puts "Error: No program path provided for classifier system."
    end

    # try to find our learner in the pre-set list of learners
    learner_tuple = @@learners.assoc(learner)
    unless learner_tuple
      $stderr.puts "Error: I don't know the learner " + learner.to_s
      $stderr.puts "Perhaps an erroneous entry in your experiment file?"
      exit 1
    end

    # @todo AB: Investigate, why this dynamic require is necessary.
    _learner_name, learner_filename, learner_classname = learner_tuple
    require_relative learner_filename
    @learner = Classifier.const_get(learner_classname).new(program_path, params)
  end

  # Train the learner on the data in +trainfile+.
  # A classifier can (and has to be) either trained or read.
  #
  # @param trainfile [String] training data file (name sanitised in place).
  # @param classifier_file [String, nil] optional file name for storing the
  #   classifier; passed through to the learner unchanged.
  # @return [true]
  def train(trainfile, classifier_file = nil)
    sanitize_filename!(trainfile)

    if @optimise
      STDERR.puts "[ML] using feature optimisation"
      @optimiser = Optimise.new
      @optimiser.init_from_data(trainfile)
      optimisedfile = trainfile + ".opted"
      begin
        @optimiser.apply(trainfile, optimisedfile)
        @learner.train(optimisedfile, classifier_file)
      ensure
        # The optimised copy is only an intermediate file; remove it even
        # if the learner raised.
        File.delete(optimisedfile) if File.exist?(optimisedfile)
      end
    else
      STDERR.puts "[ML] no feature optimisation"
      @learner.train(trainfile, classifier_file)
    end

    @ready = true
  end

  # Load a stored classifier (and, if optimisation is on, the stored
  # optimisation that belongs to it).
  #
  # @param classifier_file [String] base name of the stored classifier
  #   (sanitised in place).
  # @return [Boolean] true iff reading the classifier has had success; a
  #   failed learner read returns the learner's own falsy status.
  def read(classifier_file)
    sanitize_filename!(classifier_file)

    status = @learner.read(classifier_file)
    unless status
      STDERR.puts "reading from #{classifier_file} did not succeed"
      return status
    end

    if @optimise
      optimisations_filename = Optimise.recommended_filename(classifier_file)
      unless File.exist?(optimisations_filename)
        STDERR.puts "[ML] Error: attempted to read stored optimisation, but file does not exist"
        return false
      end
      @optimiser = Optimise.new
      @optimiser.init_from_file(optimisations_filename)
    end

    @ready = true
    true
  end

  # Store the classifier. This can be more than one file
  # (classifier-specific), but all files start with +classifier_file+.
  #
  # @param classifier_file [String] base name (sanitised in place).
  def write(classifier_file)
    sanitize_filename!(classifier_file)
    @learner.write(classifier_file)
    @optimiser.store(Optimise.recommended_filename(classifier_file)) if @optimise
  end

  # Check if a classifier is living at some particular path.
  #
  # @param classifier_file [String] base name (sanitised in place).
  # @return whatever the learner's exists? reports.
  def exists?(classifier_file)
    sanitize_filename!(classifier_file)
    @learner.exists?(classifier_file)
  end

  # Apply the classifier to +testfile+ and have the learner write its
  # output to +outfile+.
  #
  # @param testfile [String] data to classify (name sanitised in place).
  # @param outfile [String] result file (name sanitised in place).
  # @return [Boolean] false if the classifier is not ready or the test file
  #   is missing; otherwise the learner's apply status.
  def apply(testfile, outfile)
    sanitize_filename!(testfile)
    sanitize_filename!(outfile)

    unless @ready
      STDERR.puts "[ML] Warning: learner not ready for testing! Must be trained or read."
      return false
    end

    # do we have a testfile?
    unless File.exist?(testfile)
      STDERR.puts "[ML] Warning: could not find testfile (maybe empty test set?)."
      return false
    end

    return @learner.apply(testfile, outfile) unless @optimise

    optimisedfile = testfile + ".opted"
    begin
      @optimiser.apply(testfile, optimisedfile)
      @learner.apply(optimisedfile, outfile)
    ensure
      # Remove the intermediate file on every path; the begin-body value
      # (the learner's status) is still what this method returns.
      File.delete(optimisedfile) if File.exist?(optimisedfile)
    end
  end

  # Read a classifier result file.
  #
  # Returns a list of instance_results, where an instance_result is a list
  # of pairs [label, confidence], and where the pairs are sorted by
  # confidence. The parsing itself is done by the learner.
  def read_resultfile(file)
    @learner.read_resultfile(file)
  end

  private

  # Make sure we produce a valid file name: strips "<" and ">" and replaces
  # spaces by "_". Works in place on purpose, so the caller's string ends up
  # holding the name that is actually used on disk.
  def sanitize_filename!(filename)
    filename.gsub!(/[<>]/, "")
    filename.gsub!(/ /, "_")
    filename
  end
end
|
data/lib/ml/mallet.rb
ADDED
@@ -0,0 +1,236 @@
|
|
1
|
+
# wrapper script for the Mallet toolkit Maxent classifier
|
2
|
+
|
3
|
+
# Problem with Winnow: cannot be serialised (written to file). Support dropped.
|
4
|
+
|
5
|
+
# sp 27 10 04
|
6
|
+
|
7
|
+
|
8
|
+
require "fileutils"
require "tempfile"
# NOTE(review): "ftools" was dropped from the Ruby standard library in 1.9,
# so this require raises LoadError on current interpreters -- confirm whether
# it can be removed.
require "ftools"
|
10
|
+
|
11
|
+
# Learner back end that drives the Mallet toolkit's MaxEnt classifier through
# external commands (Mallet's csv2vectors plus the Java Train/Classify
# interface classes).
#
# NOTE(review): all paths are interpolated into shell command lines without
# escaping; paths containing shell metacharacters will break the commands.
class Mallet
  ###
  # @param program_path [String] location of mallet itself.
  # @param parameters [Array<String>] first element: location of the
  #   interface (usually program/tools/mallet).
  #
  # Terminates the process with status 1 if the interface path is missing.
  def initialize(program_path, parameters)
    if parameters.empty?
      $stderr.puts "Error: Mallet needs two paths (first the location of mallet itself and then the location of the interface, usually program/tools/mallet)."
      $stderr.puts "I got only the program path."
      exit 1
    end

    @malletpath = program_path
    @interface_path = parameters.first
    @malletpath = @malletpath + "/" unless @malletpath =~ %r{/$}

    @learner = "MaxEnt,gaussianPriorVariance=1.0"

    # classpath for mallet
    @cp = "#{ENV["CLASSPATH"]}:#{@malletpath}class:#{@malletpath}lib/bsh.jar"
  end

  ###
  # Encode the training data and train a classifier on it.
  #
  # @param infilename [String] training data, one comma-separated instance
  #   per line with the label last.
  # @param classifier_location [String, nil] where to write the classifier;
  #   defaults to infilename + ".classifier".
  # @return truthy iff both external commands ran successfully. Training is
  #   attempted even when encoding failed (failures are only reported).
  def train(infilename, classifier_location)
    csvfile = Tempfile.new(File.basename(infilename) + ".csvtrain")
    begin
      # training data in csv format
      File.open(infilename) { |infile| c45_to_csv(infile, csvfile) }
      csvfile.close

      @mallet_train_vectors = infilename + ".trainvectors" # training data in mallet format
      @classifier_mallet_path = classifier_location || (infilename + ".classifier")

      command1 = [@malletpath + "bin/csv2vectors ",
                  " --input ", csvfile.path,
                  " --output ", @mallet_train_vectors].join("")

      command2 = ["cd #{@interface_path}; ",
                  "java -cp #{@cp} -Xmx1000m Train ",
                  " --train ", @mallet_train_vectors,
                  " --out ", @classifier_mallet_path,
                  " --trainer ", @learner].join("")

      encoded = successfully_run(command1) # encode
      trained = successfully_run(command2) # train
      encoded && trained
    ensure
      # remove the temporary csv file on every path
      csvfile.close(true)
    end
  end

  ###
  # Copy the current classifier and train vectors to
  # classifier_file + ".classifier" / ".trainvectors". The train vectors are
  # stored as well because they are needed to recreate the pipe for testing
  # data. Copying is best effort (external cp; failures are not raised).
  def write(classifier_file)
    if @classifier_mallet_path
      %x{cp #{@classifier_mallet_path} #{classifier_file}.classifier} # store classifier
    end
    if @mallet_train_vectors
      %x{cp #{@mallet_train_vectors} #{classifier_file}.trainvectors} # store train vectors
    end
  end

  ###
  # @return [Boolean] true iff both files of a stored classifier exist.
  def exists?(classifier_file)
    File.exist?(classifier_file + ".trainvectors") &&
      File.exist?(classifier_file + ".classifier")
  end

  ###
  # Point this learner at a stored classifier.
  #
  # @return [Boolean] true iff reading the classifier has had success, i.e.
  #   both the train vectors and the classifier file exist.
  def read(classifier_file)
    @mallet_train_vectors = classifier_file + ".trainvectors" # training data in mallet format
    @classifier_mallet_path = classifier_file + ".classifier"

    [@mallet_train_vectors, @classifier_mallet_path].each do |path|
      next if File.exist?(path)

      $stderr.puts "No classifier file " + path
      return false
    end
    true
  end

  ###
  # Classify the instances in +infilename+ and write Mallet's output to
  # +outfilename+.
  #
  # @return [Boolean] true iff encoding and classification both succeeded
  #   and the output file exists; false if no classifier has been trained
  #   or read, or if any step failed.
  def apply(infilename, outfilename)
    return false unless @classifier_mallet_path && @mallet_train_vectors

    csvfile = Tempfile.new(File.basename(infilename) + ".csvtest")
    # Work on a copy of the train vectors.
    # reason: mallet in std edition reads _and writes_ this file;
    # if rosy is interrupted, corrupted (ie incomplete) train vector files
    # result.
    vectors_copy = Tempfile.new("mallet")
    begin
      # test data in csv format
      File.open(infilename) { |infile| c45_to_csv(infile, csvfile) }
      csvfile.close
      vectors_copy.close

      test_mallet_path = infilename + ".test.vectors" # test data in mallet format

      begin
        FileUtils.cp(@mallet_train_vectors, vectors_copy.path)
      rescue SystemCallError => e
        $stderr.puts "Mallet error: cannot copy train vectors: #{e.message}"
        return false
      end

      command1 = [@malletpath + "bin/csv2vectors", # encode testing data
                  " --input ", csvfile.path,
                  " --output ", test_mallet_path,
                  " --use-pipe-from ", vectors_copy.path].join("")
      return false unless successfully_run(command1)

      # some error in encoding?
      return false unless File.exist?(test_mallet_path)

      command2 = ["cd #{@interface_path}; ",
                  "java -cp #{@cp} -Xmx1000m Classify ",
                  @classifier_mallet_path, " ",
                  test_mallet_path, " ",
                  "> ", outfilename].join("")
      return false unless successfully_run(command2)

      # a missing output file means some error in classification
      File.exist?(outfilename)
    ensure
      # both temporary files are removed on every exit path
      csvfile.close(true)
      vectors_copy.close(true)
    end
  end

  #####
  # Parse a Mallet result file.
  #
  # format of Mallet result file:
  # <best label> <confidence> \t <secondbest_label> <confidence>....
  #
  # @return [Array<Array<Array(String, Float)>>, nil] one list of
  #   [label, confidence] pairs per line, in file order; nil if the file
  #   cannot be read. A label without a confidence is reported on stderr
  #   and gets confidence 0.0.
  def read_resultfile(filename)
    begin
      lines = File.readlines(filename)
    rescue StandardError
      $stderr.puts "Mallet error: cannot read Mallet result file #{filename}."
      return nil
    end

    lines.map do |line|
      line.split.each_slice(2).map do |label, confidence|
        if confidence.nil?
          $stderr.puts "Error reading mallet output: invalid line: #{line}"
        end
        # nil.to_f is 0.0, so an incomplete pair falls back to 0.0
        [label, confidence.to_f]
      end
    end
  end

  ###################################
  private

  ###
  # mallet needs "comma separated values"-file
  # input: features separated by comma, class label last (an optional
  #        trailing "." on the label is dropped)
  # output:
  # line_number classlabel features_joined_by_spaces
  #
  # Lines without any field (e.g. a blank line at the end of the file) are
  # skipped; line_number always is the line's position in the input.
  def c45_to_csv(inpipe, outpipe)
    idx = 0
    while (line = inpipe.gets)
      idx += 1
      fields = line.chomp.split(",")
      label = fields.pop
      next if label.nil?

      outpipe.puts [idx, label.chomp(".")].join(" ") + " " + fields.join(" ")
    end
  end

  ###
  # Run +command+ through the shell. A failure is reported on stderr but
  # does not abort the program.
  #
  # @return the result of Kernel.system: true on success, false on non-zero
  #   exit status, nil if the command could not be executed.
  def successfully_run(command)
    retv = Kernel.system(command)
    unless retv
      $stderr.puts "Error running classifier. Continuing."
      $stderr.puts "Offending command: " + command
    end
    retv
  end
end
|