shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,25 @@
1
+ require 'logger'
2
+ require 'pastel'
3
+
4
# A general logger for all instances.
module Shalmaneser
  # @todo AB: [2015-12-31 Thu 13:15]
  #   Check if the constant is defined.
  #   Do not rely only on the require order.
  #
  # Process-wide logger writing to $stderr.
  LOGGER = Logger.new($stderr)

  # The threshold is taken from the LOG_LEVEL environment variable
  # (default: INFO). The value is normalised and validated so that
  # "debug" or a typo does not abort the program with a NameError
  # while this file is being loaded.
  level_names = %w[DEBUG INFO WARN ERROR FATAL UNKNOWN]
  requested_level = ENV.fetch('LOG_LEVEL', 'INFO').to_s.strip.upcase
  level_name = level_names.include?(requested_level) ? requested_level : 'INFO'
  LOGGER.level = Logger.const_get(level_name)

  # One detached Pastel style per severity label.
  pastel = Pastel.new
  colors = {
    'FATAL' => pastel.red.bold.detach,
    'ERROR' => pastel.red.detach,
    'WARN' => pastel.yellow.detach,
    'INFO' => pastel.green.detach,
    'DEBUG' => pastel.white.detach
  }

  # Identity "style": used when $stderr is not a terminal and for
  # severity labels without an entry in +colors+ (Logger#unknown
  # reports the label "ANY"), which previously crashed the formatter.
  plain = ->(text) { text }

  # Output format: "<SEVERITY>: <message>\n"; timestamp and program
  # name are deliberately not printed.
  LOGGER.formatter = lambda do |severity, _datetime, _progname, message|
    colorizer = $stderr.tty? ? colors.fetch(severity, plain) : plain
    "#{colorizer.call(severity)}: #{message}\n"
  end

  unless level_name == requested_level
    LOGGER.warn("Unknown LOG_LEVEL #{requested_level.inspect}, falling back to INFO.")
  end
end
@@ -0,0 +1,189 @@
1
+ # sp 24 08 04
2
+
3
+ # this file provides a very simple wrapper for using different ML systems
4
+ # all you need to do is to write the appropriate learner class
5
+ # and insert them in the initialize routine here in ML()
6
+ #
7
+ # available at the moment:
8
+ # * timbl (memory-based learner)
9
+ # * mallet-maxent (another maxent system)
10
+ # * maxent (the OpenNLP maxent system)
11
+
12
+ # part of contract: learner is not initialised unless it is either trained or read
13
+
14
+ # @note AB: This is only a remark about dynamic requirement below.
15
+ # require_relative 'timbl'
16
+ # require_relative 'mallet'
17
+ # require_relative 'maxent'
18
+
19
+ require_relative 'optimize'
20
+
21
# Thin wrapper that hides the concrete ML system (timbl, mallet, maxent)
# behind one interface and optionally runs feature optimisation
# (class Optimise) in front of the learner.
#
# Part of the contract: the learner is not ready for #apply unless it has
# either been trained (#train) or read from disk (#read).
#
# All file name arguments are cleaned up IN PLACE: "<" and ">" are removed
# and blanks are replaced by "_". The caller's string object is modified.
class Classifier
  # Known learners as [name, file to require, class name].
  @@learners = [
    ["timbl", "timbl", "Timbl"],
    ["mallet", "mallet", "Mallet"],
    ["maxent", "maxent", "Maxent"]
  ]

  # @param learner [String] name of the learner, one of the names in @@learners
  # @param params [Array<String>] optional leading "optimise" flag, then the
  #   path of the classifier system, then learner-specific parameters.
  #   The array is consumed (shifted) by this method.
  #
  # Terminates the program (exit 1) if the program path does not exist or
  # the learner is unknown.
  def initialize(learner, params)
    @ready = false

    # a leading "optimise" switches on feature optimisation
    @optimise = (params[0] == "optimise")
    params.shift if @optimise

    program_path = ""
    raw_path = params.shift
    if raw_path.respond_to?(:chomp)
      program_path = raw_path.chomp
      unless File.exist?(program_path)
        $stderr.puts "Error: Could not find classifier system at " + program_path
        $stderr.puts "Perhaps an erroneous entry in your experiment file?"
        exit 1
      end
    else
      # As before, a missing path is only reported; construction goes on
      # with an empty program path.
      $stderr.puts "Error: No program path provided for classifier system."
    end

    # try to find our learner in the pre-set list of learners
    learner_tuple = @@learners.assoc(learner)
    unless learner_tuple
      $stderr.puts "Error: I don't know the learner " + learner.to_s
      $stderr.puts "Perhaps an erroneous entry in your experiment file?"
      exit 1
    end

    # @todo AB: Investigate, why this dynamic require is necessary.
    _learner_name, learner_filename, learner_classname = learner_tuple
    require_relative learner_filename
    # plain constant lookup instead of eval on a string
    @learner = Object.const_get(learner_classname).new(program_path, params)
  end

  # A classifier can (and has to be) either trained or read.
  #
  # Trains the learner on the data in trainfile. With optimisation switched
  # on, the optimiser is initialised from the data and the learner is
  # trained on a temporary "<trainfile>.opted" file, which is removed again.
  #
  # @param trainfile [String] training data file (name is cleaned in place)
  # @param classifier_file [String, nil] optional file name for storing the
  #   classifier, handed to the learner unchanged
  # @return [true]
  def train(trainfile, classifier_file = nil)
    # make sure we produce a valid file name
    sanitize_filename!(trainfile)

    if @optimise
      $stderr.puts "[ML] using feature optimisation"
      @optimiser = Optimise.new
      @optimiser.init_from_data(trainfile)
      optimisedfile = trainfile + ".opted"
      begin
        @optimiser.apply(trainfile, optimisedfile)
        @learner.train(optimisedfile, classifier_file)
      ensure
        # remove the temporary file even if optimiser or learner raised
        File.delete(optimisedfile) if File.exist?(optimisedfile)
      end
    else
      $stderr.puts "[ML] no feature optimisation"
      @learner.train(trainfile, classifier_file)
    end
    @ready = true
  end

  # Reads a stored classifier (and, if optimisation is on, the stored
  # optimisation belonging to it).
  #
  # @param classifier_file [String] file name prefix (cleaned in place)
  # @return true iff reading the classifier from the file has had success;
  #   otherwise the learner's falsy status or false
  def read(classifier_file)
    # make sure we produce a valid file name
    sanitize_filename!(classifier_file)

    # read file, if present
    status = @learner.read(classifier_file)

    # if reading has failed, hand back the learner's (falsy) status
    unless status
      $stderr.puts "reading from #{classifier_file} did not succeed"
      return status
    end

    # read optimisation, if desired
    if @optimise
      optimisations_filename = Optimise.recommended_filename(classifier_file)
      unless File.exist?(optimisations_filename)
        $stderr.puts "[ML] Error: attempted to read stored optimisation, but file does not exist"
        return false
      end
      @optimiser = Optimise.new
      @optimiser.init_from_file(optimisations_filename)
    end

    @ready = true
    true
  end

  # A classifier can be stored somewhere. This can be more than one file
  # (classifier-specific), but all files start with "classifier_file".
  #
  # @param classifier_file [String] file name prefix (cleaned in place)
  def write(classifier_file)
    # make sure we produce a valid file name
    sanitize_filename!(classifier_file)
    @learner.write(classifier_file)
    # an optimiser only exists after #train or #read
    if @optimise && @optimiser
      @optimiser.store(Optimise.recommended_filename(classifier_file))
    end
  end

  ###
  # exists?
  # check if a classifier is living at some particular path
  #
  # @param classifier_file [String] file name prefix (cleaned in place)
  # @return whatever the learner's exists? reports
  def exists?(classifier_file)
    sanitize_filename!(classifier_file)
    @learner.exists?(classifier_file)
  end

  # A classifier can be applied: test either on the training or the test
  # data in the given file.
  #
  # @param testfile [String] file with test instances (cleaned in place)
  # @param outfile [String] file for the learner's output (cleaned in place)
  # @return true iff application has had success: false if the learner is
  #   not ready or testfile is missing, otherwise the learner's result
  def apply(testfile, outfile)
    # make sure we produce valid file names
    sanitize_filename!(testfile)
    sanitize_filename!(outfile)

    unless @ready
      $stderr.puts "[ML] Warning: learner not ready for testing! Must be trained or read."
      return false
    end

    # do we have a testfile?
    unless File.exist?(testfile)
      $stderr.puts "[ML] Warning: could not find testfile (maybe empty test set?)."
      return false
    end

    return @learner.apply(testfile, outfile) unless @optimise

    optimisedfile = testfile + ".opted"
    begin
      @optimiser.apply(testfile, optimisedfile)
      # value of this begin block (and thus of the method) is the
      # learner's result; the ensure clause does not change it
      @learner.apply(optimisedfile, outfile)
    ensure
      # formerly placed after a `return` and therefore never executed
      File.delete(optimisedfile) if File.exist?(optimisedfile)
    end
  end

  ###
  # read classifier result file,
  # returns a list of instance_results
  # where an instance_result is a list of pairs [label, confidence]
  # where the pairs are sorted by confidence
  def read_resultfile(file)
    @learner.read_resultfile(file)
  end

  private

  # Removes "<" and ">" and replaces blanks by "_" in the given string.
  # Works in place on purpose: callers keep using the same string object
  # as the cleaned-up file name afterwards.
  #
  # @param filename [String]
  # @return [String] the same (modified) string object
  def sanitize_filename!(filename)
    filename.gsub!(/[<>]/, "")
    filename.gsub!(/ /, "_")
    filename
  end
end
@@ -0,0 +1,236 @@
1
+ # wrapper script for the Mallet toolkit Maxent classifier
2
+
3
+ # Problem with Winnow: cannot be serialised (written to file). Support dropped.
4
+
5
+ # sp 27 10 04
6
+
7
+
8
require "tempfile"
require "fileutils"
begin
  # ftools is no longer part of the standard library (FileUtils replaces
  # it); only load it where it is still available.
  require "ftools"
rescue LoadError
  nil
end
10
+
11
# Wrapper around the Mallet toolkit's MaxEnt classifier.
#
# Training and test data arrive in C4.5-like format (comma-separated
# features, class label last), are converted to the format expected by
# Mallet's csv2vectors and are then handed to the external programs.
class Mallet
  ###
  # @param program_path [String] location of Mallet itself
  # @param parameters [Array<String>] first entry: location of the Mallet
  #   interface (the directory the Train/Classify Java classes are run from)
  #
  # Terminates the program (exit 1) if parameters is empty.
  def initialize(program_path, parameters)
    if parameters.empty?
      $stderr.puts "Error: Mallet needs two paths (first the location of mallet itself and then the location of the interface, usually program/tools/mallet)."
      $stderr.puts "I got only the program path."
      # an error must not look like success to the calling shell
      exit 1
    end

    @malletpath = program_path
    @interface_path = parameters.first
    unless @malletpath =~ /\/$/
      @malletpath = @malletpath + "/"
    end

    @learner = "MaxEnt,gaussianPriorVariance=1.0"

    # classpath for mallet
    @cp = "#{ENV["CLASSPATH"]}:#{@malletpath}class:#{@malletpath}lib/bsh.jar"
  end

  ###
  # Encodes the training data and trains a classifier with the external
  # programs. Failures of the external commands are reported on $stderr,
  # but do not stop processing.
  #
  # @param infilename [String] training data in C4.5-like format
  # @param classifier_location [String, nil] where to put the classifier;
  #   defaults to "<infilename>.classifier"
  # @return [nil]
  def train(infilename, classifier_location)
    csvfile = Tempfile.new(File.basename(infilename) + ".csvtrain")
    begin
      # training data in csv format
      File.open(infilename) { |infile| c45_to_csv(infile, csvfile) }
      csvfile.close

      @mallet_train_vectors = infilename + ".trainvectors" # training data in mallet format
      @classifier_mallet_path = classifier_location || (infilename + ".classifier")

      command1 = [@malletpath + "bin/csv2vectors ",
                  " --input ", csvfile.path,
                  " --output ", @mallet_train_vectors].join("")

      command2 = ["cd #{@interface_path}; ",
                  "java -cp #{@cp} -Xmx1000m Train ",
                  " --train ", @mallet_train_vectors,
                  " --out ", @classifier_mallet_path,
                  " --trainer ", @learner].join("")

      successfully_run(command1) # encode
      successfully_run(command2) # train
    ensure
      # remove the temporary csv file in any case
      csvfile.close(true)
    end
    nil
  end

  ###
  # Stores the classifier as "<classifier_file>.classifier" and the
  # training vectors as "<classifier_file>.trainvectors" (the latter are
  # needed to recreate the pipe for testing data). Does nothing for parts
  # that are not known yet; copy problems are reported on $stderr.
  def write(classifier_file)
    if @classifier_mallet_path
      copy_file(@classifier_mallet_path, classifier_file + ".classifier")
    end
    if @mallet_train_vectors
      copy_file(@mallet_train_vectors, classifier_file + ".trainvectors")
    end
  end

  ###
  # @return [Boolean] true iff both files of a stored classifier exist
  def exists?(classifier_file)
    File.exist?(classifier_file + ".trainvectors") &&
      File.exist?(classifier_file + ".classifier")
  end

  ###
  # return true iff reading the classifier has had success
  #
  # Nothing is loaded here: the two file names are remembered for #apply
  # and their existence is checked.
  def read(classifier_file)
    @mallet_train_vectors = classifier_file + ".trainvectors" # training data in mallet format
    @classifier_mallet_path = classifier_file + ".classifier"

    [@mallet_train_vectors, @classifier_mallet_path].each do |path|
      next if File.exist?(path)

      $stderr.puts "No classifier file " + path
      return false
    end
    true
  end

  ###
  # Encodes the test data with the pipe of the training vectors and
  # classifies it; the classifier output is redirected to outfilename.
  #
  # @param infilename [String] test data in C4.5-like format
  # @param outfilename [String] file for the classification result
  # @return [Boolean] true iff every step succeeded and outfilename exists
  def apply(infilename, outfilename)
    return false unless @classifier_mallet_path && @mallet_train_vectors

    csvfile = Tempfile.new(File.basename(infilename) + ".csvtest")
    # copy train vectors to temp file.
    # reason: mallet in std edition reads _and writes_ this file
    # if rosy is interrupted, corrupted (ie incomplete) train vector files
    # result
    vectors_copy = Tempfile.new("mallet")

    begin
      # test data in csv format
      File.open(infilename) { |infile| c45_to_csv(infile, csvfile) }
      csvfile.close
      vectors_copy.close

      test_mallet_path = infilename + ".test.vectors" # test data in mallet format

      return false unless copy_file(@mallet_train_vectors, vectors_copy.path)

      command1 = [@malletpath + "bin/csv2vectors", # encode testing data
                  " --input ", csvfile.path,
                  " --output ", test_mallet_path,
                  " --use-pipe-from ", vectors_copy.path].join("")
      return false unless successfully_run(command1)

      # some error in encoding?
      return false unless File.exist?(test_mallet_path)

      command2 = ["cd #{@interface_path}; ",
                  "java -cp #{@cp} -Xmx1000m Classify ",
                  @classifier_mallet_path, " ",
                  test_mallet_path, " ",
                  "> ", outfilename].join("")
      # classify
      return false unless successfully_run(command2)

      # no output file = some error in classification
      File.exist?(outfilename)
    ensure
      # both temporary files are removed on every way out
      csvfile.close(true)
      vectors_copy.close(true)
    end
  end

  #####
  # format of Mallet result file:
  # <best label> <confidence> \t <secondbest_label> <confidence>....
  #
  # @return [Array<Array<Array(String, Float)>>, nil] one list of
  #   [label, confidence] pairs per line of the file, or nil if the file
  #   cannot be read. A label without confidence is reported on $stderr
  #   and gets the confidence 0.0.
  def read_resultfile(filename)
    begin
      lines = File.readlines(filename)
    rescue StandardError
      $stderr.puts "Mallet error: cannot read Mallet result file #{filename}."
      return nil
    end

    lines.map do |line|
      line.split.each_slice(2).map do |label, raw_confidence|
        if raw_confidence.nil?
          $stderr.puts "Error reading mallet output: invalid line: #{line}"
          [label, 0.0]
        else
          [label, raw_confidence.to_f]
        end
      end
    end
  end

  ###################################
  private

  ###
  # mallet needs "comma separated values"-file
  # input: features separated by comma, class label last
  # output:
  # line_number classlabel features_joined_by_spaces
  #
  # A single trailing "." is removed from the label. Lines without any
  # content are skipped and not counted.
  def c45_to_csv(inpipe, outpipe)
    idx = 0
    while (line = inpipe.gets)
      features = line.chomp.split(",")
      label = features.pop
      next if label.nil? # blank line: nothing to convert

      idx += 1
      outpipe.puts [idx, label.chomp(".")].join(" ") + " " + features.join(" ")
    end
  end

  ###
  # Copies a file without going through a shell.
  #
  # @return [Boolean] true on success; false (plus a message on $stderr)
  #   if the copy failed
  def copy_file(source, target)
    FileUtils.cp(source, target)
    true
  rescue SystemCallError, ArgumentError => e
    $stderr.puts "Mallet error: cannot copy #{source} to #{target}: #{e.message}"
    false
  end

  ###
  # Runs an external command through Kernel.system.
  #
  # @return the result of Kernel.system; on a falsy result the command is
  #   reported on $stderr and processing continues
  def successfully_run(command)
    retv = Kernel.system(command)
    unless retv
      $stderr.puts "Error running classifier. Continuing."
      $stderr.puts "Offending command: " + command
      # exit 1
    end
    retv
  end
end