shalmaneser-lib 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,25 @@
1
+ require 'logger'
2
+ require 'pastel'
3
+
4
+ # A general logger for all instances.
5
# A general logger for all instances.
module Shalmaneser
  # @todo AB: [2015-12-31 Thu 13:15]
  #   Check if the constant is defined.
  #   Do not rely only on the require order.
  LOGGER = Logger.new($stderr)

  # The threshold is taken from the LOG_LEVEL environment variable
  # (default: INFO). The name is matched case-insensitively against the
  # Logger::Severity constants; an unknown name is reported on $stderr and
  # replaced by INFO instead of aborting the load with a NameError.
  requested_level = ENV.fetch('LOG_LEVEL', 'INFO').to_s.strip.upcase
  known_levels = Logger::Severity.constants.map(&:to_s)
  unless known_levels.include?(requested_level)
    $stderr.puts "Unknown LOG_LEVEL #{requested_level.inspect}, falling back to INFO."
    requested_level = 'INFO'
  end
  LOGGER.level = Logger::Severity.const_get(requested_level)

  pastel = Pastel.new
  colors = {
    'FATAL' => pastel.red.bold.detach,
    'ERROR' => pastel.red.detach,
    'WARN' => pastel.yellow.detach,
    'INFO' => pastel.green.detach,
    'DEBUG' => pastel.white.detach
  }
  # Used when output is not a terminal and for severity labels without an
  # entry in the table above (Logger labels "unknown" messages as "ANY").
  plain = ->(text) { text }

  # Emits "<SEVERITY>: <message>"; the severity label is colorized only when
  # $stderr is a TTY. Timestamp and program name are deliberately omitted.
  LOGGER.formatter = lambda do |severity, _datetime, _progname, message|
    colorizer = $stderr.tty? ? colors.fetch(severity, plain) : plain
    "#{colorizer.call(severity)}: #{message}\n"
  end
end
@@ -0,0 +1,189 @@
1
+ # sp 24 08 04
2
+
3
+ # this file provides a very simple wrapper for using different ML systems
4
+ # all you need to do is to write the appropriate learner class
5
+ # and insert them in the initialize routine here in ML()
6
+ #
7
+ # available at the moment:
8
+ # * timbl (memory-based learner)
9
+ # * mallet-maxent (another maxent system)
10
+ # * maxent (the OpenNLP maxent system)
11
+
12
+ # part of contract: learner is not initialised unless it is either trained or read
13
+
14
+ # @note AB: This is only a remark about dynamic requirement below.
15
+ # require_relative 'timbl'
16
+ # require_relative 'mallet'
17
+ # require_relative 'maxent'
18
+
19
+ require_relative 'optimize'
20
+
21
# Thin wrapper that gives the supported machine-learning systems one common
# interface. The concrete learner (see @@learners) is required and
# instantiated on demand in #initialize.
#
# Part of the contract: the learner is not ready for #apply unless it has
# been trained (#train) or read from disk (#read).
#
# All methods that take a file name sanitise it IN PLACE (the caller's String
# is modified): '<' and '>' are removed and blanks become underscores.
# NOTE(review): callers may rely on seeing the sanitised name afterwards, so
# the in-place behaviour is kept; frozen strings are therefore not accepted.
class Classifier
  # Known learners as [name in experiment file, file to require, class name].
  @@learners = [
    ["timbl", "timbl", "Timbl"],
    ["mallet", "mallet", "Mallet"],
    ["maxent", "maxent", "Maxent"]
  ]

  # @param learner [String] name of the learner, one of the names in @@learners
  # @param params [Array<String>] optional leading "optimise" flag, then the
  #   path of the classifier program, then learner-specific parameters.
  #   The array is consumed (shifted) by this method; the remainder is handed
  #   to the learner's constructor.
  #
  # Exits the process with status 1 if the program path does not exist or the
  # learner is unknown. A missing program path is only reported on $stderr.
  def initialize(learner, params)
    @ready = false

    # A leading "optimise" flag is for us, not for the learner.
    @optimise = params[0] == "optimise"
    params.shift if @optimise

    program_path = ""
    raw_path = params.shift
    if raw_path.respond_to?(:chomp)
      program_path = raw_path.chomp
      unless File.exist?(program_path)
        $stderr.puts "Error: Could not find classifier system at " + program_path
        $stderr.puts "Perhaps an erroneous entry in your experiment file?"
        exit 1
      end
    else
      $stderr.puts "Error: No program path provided for classifier system."
    end

    # try to find our learner in the pre-set list of learners
    learner_tuple = @@learners.assoc(learner)
    unless learner_tuple
      $stderr.puts "Error: I don't know the learner " + learner.to_s
      $stderr.puts "Perhaps an erroneous entry in your experiment file?"
      exit 1
    end

    # @todo AB: Investigate, why this dynamic require is necessary.
    _learner_name, learner_filename, learner_classname = learner_tuple
    require_relative learner_filename
    # The class name comes from the fixed table above; a constant lookup is
    # sufficient and avoids evaluating a string as code.
    @learner = Classifier.const_get(learner_classname).new(program_path, params)
  end

  # Train the learner on the data in trainfile.
  # A classifier can (and has to be) either trained or read.
  #
  # @param trainfile [String] training data file (sanitised in place)
  # @param classifier_file [String, nil] optional location for storing the
  #   classifier, passed through to the learner
  # @return [true]
  def train(trainfile, classifier_file = nil)
    sanitize_filename!(trainfile)

    if @optimise
      STDERR.puts "[ML] using feature optimisation"
      @optimiser = Optimise.new
      @optimiser.init_from_data(trainfile)
      optimisedfile = trainfile + ".opted"
      @optimiser.apply(trainfile, optimisedfile)
      begin
        @learner.train(optimisedfile, classifier_file)
      ensure
        # the optimised copy is only an intermediate file
        File.delete(optimisedfile) if File.exist?(optimisedfile)
      end
    else
      STDERR.puts "[ML] no feature optimisation"
      @learner.train(trainfile, classifier_file)
    end

    @ready = true
  end

  # Read a stored classifier (and, if optimisation is switched on, the stored
  # optimisation that belongs to it).
  #
  # @param classifier_file [String] classifier location (sanitised in place)
  # @return true iff reading the classifier from the file has had success;
  #   otherwise the learner's falsy status or false
  def read(classifier_file)
    sanitize_filename!(classifier_file)

    status = @learner.read(classifier_file)
    unless status
      STDERR.puts "reading from #{classifier_file} did not succeed"
      return status
    end

    # read optimisation, if desired
    if @optimise
      optimisations_filename = Optimise.recommended_filename(classifier_file)
      unless File.exist?(optimisations_filename)
        STDERR.puts "[ML] Error: attempted to read stored optimisation, but file does not exist"
        return false
      end
      @optimiser = Optimise.new
      @optimiser.init_from_file(optimisations_filename)
    end

    @ready = true
    true
  end

  # Store the classifier. This can be more than one file
  # (classifier-specific), but all files start with "classifier_file".
  #
  # @param classifier_file [String] target location (sanitised in place)
  def write(classifier_file)
    sanitize_filename!(classifier_file)
    @learner.write(classifier_file)
    @optimiser.store(Optimise.recommended_filename(classifier_file)) if @optimise
  end

  # Check if a classifier is living at some particular path.
  #
  # @param classifier_file [String] classifier location (sanitised in place)
  # @return whatever the learner's exists? reports
  def exists?(classifier_file)
    sanitize_filename!(classifier_file)
    @learner.exists?(classifier_file)
  end

  # Apply the classifier to the data in testfile and write the learner's
  # output to outfile.
  #
  # @param testfile [String] test data file (sanitised in place)
  # @param outfile [String] result file (sanitised in place)
  # @return true iff application has had success; false if the learner is
  #   not ready or the test file is missing
  def apply(testfile, outfile)
    sanitize_filename!(testfile)
    sanitize_filename!(outfile)

    unless @ready
      STDERR.puts "[ML] Warning: learner not ready for testing! Must be trained or read."
      return false
    end

    # do we have a testfile?
    unless File.exist?(testfile)
      STDERR.puts "[ML] Warning: could not find testfile (maybe empty test set?)."
      return false
    end

    return @learner.apply(testfile, outfile) unless @optimise

    optimisedfile = testfile + ".opted"
    @optimiser.apply(testfile, optimisedfile)
    begin
      @learner.apply(optimisedfile, outfile)
    ensure
      # Remove the intermediate file whether or not the learner succeeded;
      # previously this statement was unreachable and the file was left behind.
      File.delete(optimisedfile) if File.exist?(optimisedfile)
    end
  end

  # Read a classifier result file.
  #
  # @return a list of instance_results, where an instance_result is a list of
  #   pairs [label, confidence] sorted by confidence (as produced by the
  #   learner's read_resultfile)
  def read_resultfile(file)
    @learner.read_resultfile(file)
  end

  private

  # Make sure we produce a valid file name: drop '<' and '>' and replace
  # blanks by underscores. Modifies and returns the given String.
  def sanitize_filename!(name)
    name.gsub!(/[<>]/, "")
    name.gsub!(/ /, "_")
    name
  end
end
@@ -0,0 +1,236 @@
1
+ # wrapper script for the Mallet toolkit Maxent classifier
2
+
3
+ # Problem with Winnow: cannot be serialised (written to file). Support dropped.
4
+
5
+ # sp 27 10 04
6
+
7
+
8
require "tempfile"
require "fileutils"
begin
  # ftools only exists on Ruby 1.8; on later versions the class below relies
  # on fileutils instead, so a missing ftools must not prevent loading.
  require "ftools"
rescue LoadError
  nil
end
10
+
11
# Wrapper for the Mallet toolkit MaxEnt classifier.
#
# Mallet is driven through external commands: bin/csv2vectors of the Mallet
# installation encodes the data, and the Java classes Train / Classify found
# in the interface directory do the learning and the classification.
#
# NOTE(review): all paths are interpolated into shell command lines without
# escaping; they are assumed to come from a trusted experiment file and to
# contain no shell metacharacters.
class Mallet
  # @param program_path [String] location of the Mallet installation
  # @param parameters [Array<String>] first element: location of the
  #   interface (usually program/tools/mallet)
  #
  # Exits with status 1 if the interface path is missing.
  def initialize(program_path, parameters)
    if parameters.empty?
      $stderr.puts "Error: Mallet needs two paths (first the location of mallet itself and then the location of the interface, usually program/tools/mallet)."
      $stderr.puts "I got only the program path."
      exit 1
    end

    @malletpath = program_path
    @interface_path = parameters.first
    @malletpath += "/" unless @malletpath.end_with?("/")

    @learner = "MaxEnt,gaussianPriorVariance=1.0"

    # classpath for mallet
    @cp = "#{ENV["CLASSPATH"]}:#{@malletpath}class:#{@malletpath}lib/bsh.jar"
  end

  # Train a classifier on the C4.5-style data in infilename.
  #
  # Side effects: writes "<infilename>.trainvectors" and the classifier file
  # (classifier_location, or "<infilename>.classifier" if that is nil) and
  # remembers both locations for #write and #apply.
  #
  # @return truthy iff both the encoding and the training command succeeded;
  #   training is not attempted when encoding failed
  def train(infilename, classifier_location)
    csvfile = Tempfile.new(File.basename(infilename) + ".csvtrain")
    begin
      # training data in csv format
      File.open(infilename) { |infile| c45_to_csv(infile, csvfile) }
      csvfile.close

      @mallet_train_vectors = infilename + ".trainvectors" # training data in mallet format
      @classifier_mallet_path = classifier_location || (infilename + ".classifier")

      encode_command = [@malletpath + "bin/csv2vectors ",
                        " --input ", csvfile.path,
                        " --output ", @mallet_train_vectors].join("")

      train_command = ["cd #{@interface_path}; ",
                       "java -cp #{@cp} -Xmx1000m Train ",
                       " --train ", @mallet_train_vectors,
                       " --out ", @classifier_mallet_path,
                       " --trainer ", @learner].join("")

      successfully_run(encode_command) && successfully_run(train_command)
    ensure
      # remove the temporary csv file even if something above went wrong
      csvfile.close(true)
    end
  end

  # Store the current classifier as "<classifier_file>.classifier" and the
  # train vectors as "<classifier_file>.trainvectors" (the latter are needed
  # to recreate the pipe for testing data). Copy problems are reported on
  # $stderr but do not raise.
  def write(classifier_file)
    copy_file(@classifier_mallet_path, classifier_file + ".classifier") if @classifier_mallet_path
    copy_file(@mallet_train_vectors, classifier_file + ".trainvectors") if @mallet_train_vectors
  end

  # @return [Boolean] true iff both files of a stored classifier exist
  def exists?(classifier_file)
    File.exist?(classifier_file + ".trainvectors") &&
      File.exist?(classifier_file + ".classifier")
  end

  # Use the classifier stored under classifier_file.
  #
  # @return [Boolean] true iff reading the classifier has had success,
  #   i.e. both the train vectors and the classifier file exist
  def read(classifier_file)
    @mallet_train_vectors = classifier_file + ".trainvectors" # training data in mallet format
    @classifier_mallet_path = classifier_file + ".classifier"

    [@mallet_train_vectors, @classifier_mallet_path].each do |path|
      next if File.exist?(path)

      $stderr.puts "No classifier file " + path
      return false
    end
    true
  end

  # Classify the C4.5-style data in infilename and write Mallet's output to
  # outfilename. Also writes "<infilename>.test.vectors".
  #
  # @return [Boolean] true iff every step succeeded and outfilename exists;
  #   false if no classifier has been trained or read
  def apply(infilename, outfilename)
    return false unless @classifier_mallet_path && @mallet_train_vectors

    csvfile = Tempfile.new(File.basename(infilename) + ".csvtest")
    # Work on a copy of the train vectors.
    # reason: mallet in std edition reads _and writes_ this file;
    # if rosy is interrupted, corrupted (ie incomplete) train vector files
    # result
    pipe_copy = Tempfile.new("mallet")
    begin
      # testing data in csv format
      File.open(infilename) { |infile| c45_to_csv(infile, csvfile) }
      csvfile.close
      pipe_copy.close

      test_mallet_path = infilename + ".test.vectors" # testing data in mallet format

      return false unless copy_file(@mallet_train_vectors, pipe_copy.path)

      encode_command = [@malletpath + "bin/csv2vectors", # encode testing data
                        " --input ", csvfile.path,
                        " --output ", test_mallet_path,
                        " --use-pipe-from ", pipe_copy.path].join("")
      return false unless successfully_run(encode_command)

      # some error in encoding?
      return false unless File.exist?(test_mallet_path)

      classify_command = ["cd #{@interface_path}; ",
                          "java -cp #{@cp} -Xmx1000m Classify ",
                          @classifier_mallet_path, " ",
                          test_mallet_path, " ",
                          "> ", outfilename].join("")
      return false unless successfully_run(classify_command)

      # a missing result file means some error in classification
      File.exist?(outfilename)
    ensure
      # both temporary files are removed on every exit path
      csvfile.close(true)
      pipe_copy.close(true)
    end
  end

  # Read a Mallet result file. Format of each line:
  #   <best label> <confidence> \t <secondbest_label> <confidence>....
  #
  # @return [Array<Array<Array(String, Float)>>, nil] one list of
  #   [label, confidence] pairs per line, or nil if the file cannot be read.
  #   A label without a confidence is reported on $stderr and gets 0.0.
  def read_resultfile(filename)
    begin
      lines = File.readlines(filename)
    rescue StandardError
      $stderr.puts "Mallet error: cannot read Mallet result file #{filename}."
      return nil
    end

    lines.map do |line|
      line.split.each_slice(2).map do |label, confidence|
        if confidence.nil?
          $stderr.puts "Error reading mallet output: invalid line: #{line}"
        end
        [label, confidence.to_f] # nil.to_f is 0.0
      end
    end
  end

  ###################################
  private

  # Mallet needs a "comma separated values" file.
  # input: features separated by comma, class label last
  #        (an optional trailing "." on the label is removed)
  # output, one line per input line:
  #   line_number classlabel features_joined_by_spaces
  # Blank input lines are skipped but still counted in line_number.
  def c45_to_csv(inpipe, outpipe)
    inpipe.each_line.with_index(1) do |line, idx|
      features = line.chomp.split(",")
      next if features.empty?

      label = features.pop.chomp(".")
      outpipe.puts [idx, label].join(" ") + " " + features.join(" ")
    end
  end

  # Copy src to dst without going through a shell.
  # @return [Boolean] true on success (or if both are the same file),
  #   false after reporting the problem on $stderr
  def copy_file(src, dst)
    return true if File.identical?(src, dst)

    FileUtils.cp(src, dst)
    true
  rescue SystemCallError, ArgumentError => e
    $stderr.puts "Mallet error: could not copy #{src} to #{dst}: #{e.message}"
    false
  end

  # Run a shell command; a failure is reported but does not abort.
  # @return the result of Kernel.system (true on success)
  def successfully_run(command)
    retv = Kernel.system(command)
    unless retv
      $stderr.puts "Error running classifier. Continuing."
      $stderr.puts "Offending command: " + command
    end
    retv
  end
end