shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
data/lib/ml/maxent.rb
@@ -0,0 +1,229 @@
+ # wrapper script for the OpenNLP Maxent classifier
+
+ # sp July 2007
+
+
+ require "tempfile"
+ require 'fileutils'
+
+ class Maxent
+   ###
+   def initialize(program_path, parameters)
+
+     # @note AB: <parameters> is an Array with the last part of the
+     #   line from the experiment file; it should contain the path to our
+     #   java wrappers, but we don't want it.
+     #   Since the presence of this part is checked only here we
+     #   suppose it obsolete and set this path manually here.
+     # if parameters.empty?
+     #   puts "Error: The OpenNLP maxent system needs two paths (first the location of maxent itself and then the location of the interface, usually program/tools/maxent)."
+     #   puts "I got only the program path."
+     #   Kernel.exit
+     # end
+     # @interface_path = parameters.first
+
+     # @note AB: Setting the path manually.
+     #   It assumes <Maxent.rb> is in <lib/common> and
+     #   <Classify.class> is in <lib/ext/maxent>.
+     # @todo AB: This assumption should be changed. ENV[]???
+     @interface_path = File.expand_path('../ext/maxent', File.dirname(__FILE__))
+
+     @maxentpath = program_path
+
+     unless @maxentpath =~ /\/$/
+       @maxentpath = @maxentpath + "/"
+     end
+
+     # classpath for maxent
+
+     @cp = "#{@maxentpath}:#{@maxentpath}lib:#{@maxentpath}lib/trove.jar:#{@maxentpath}output/maxent-2.4.0.jar:#{ENV["CLASSPATH"]}"
+
+   end
+
+   ###
+   #
+   # write classifier to training directory...
+   def train(infilename,classifier_file)
+     trainfile = Tempfile.new(File.basename(infilename)+".maxenttrain")
+     infile = File.new(infilename)
+     c45_to_maxent(infile,trainfile) # training data in csv format
+     infile.close
+     trainfile.close
+
+     if classifier_file
+       @classifier_location = classifier_file
+     else
+       @classifier_location = trainfile.path+"Model.bin.gz"
+     end
+
+     @classifier_location = enforce_compact_storage(@classifier_location)
+
+     # store model in binary, gzipped form...
+     command = ["cd #{@interface_path}; ",
+                #"/usr/lib/jvm/java-1.7.0/bin/java -cp #{@cp} -Xmx1000m Train",
+                "java -cp #{@cp} -Xmx1000m Train",
+                trainfile.path,
+                @classifier_location].join(" ")
+     # remember location
+     unless successfully_run(command)
+       return false
+     end
+     trainfile.close(true)
+   end
+
+   def write(classifier_file)
+
+     classifier_file = enforce_compact_storage(classifier_file)
+
+     if @classifier_location
+       @classifier_location = enforce_compact_storage(@classifier_location)
+       %x{cp #{@classifier_location} #{classifier_file}} # store classifier
+       # File.chmod(0664,classifier_file+".classifier")
+     else
+       $stderr.puts "Maxent error: no classifier trained or read, cannot write classifier file #{classifier_file}."
+       return nil
+     end
+   end
+
+   ###
+   def exists?(classifier_file)
+     classifier_file = enforce_compact_storage(classifier_file)
+     return FileTest.exists?(classifier_file)
+   end
+
+   ###
+   # return true iff reading the classifier has had success
+   def read(classifier_file)
+
+     classifier_file = enforce_compact_storage(classifier_file)
+
+     if exists?(classifier_file)
+       @classifier_location = classifier_file
+       return true
+     else
+       $stderr.puts "No classifier file "+classifier_file
+       return false
+     end
+   end
+
+   ###
+   def apply(infilename,outfilename)
+
+     unless @classifier_location
+       return false
+     end
+     @classifier_location = enforce_compact_storage(@classifier_location)
+
+     testfile = Tempfile.new(File.basename(infilename)+".maxenttrain")
+
+     infile = File.new(infilename)
+     c45_to_maxent(infile,testfile) # test data in C4.5 (csv) format
+     infile.close
+     testfile.close
+
+     command = ["cd #{@interface_path}; ",
+                #"/usr/lib/jvm/java-1.7.0/bin/java -cp #{@cp} -Xmx1000m Classify ",
+                "java -cp #{@cp} -Xmx1000m Classify ",
+                testfile.path,
+                @classifier_location,
+                ">",
+                outfilename].join(" ")
+
+     # classify
+     unless successfully_run(command)
+       return false
+     end
+
+     # some error in classification
+     unless FileTest.exists?(outfilename)
+       return false
+     end
+
+     # no errors = success
+     testfile.close(true)
+     return true
+   end
+
+   #####
+   # format of Maxent result file:
+   # <best label>[<confidence>] <secondbest_label>[<confidence>] ....
+   #
+   # returns a list of instance_results
+   # where an instance_result is a list of pairs [label, confidence]
+   # where the pairs are sorted by confidence
+   def read_resultfile(filename)
+     begin
+       f = File.new(filename)
+     rescue
+       $stderr.puts "Maxent error: cannot read Maxent result file #{filename}."
+       return nil
+     end
+
+     retv = []
+
+     f.each do |line|
+       line_results = []
+       pieces = line.split # split at whitespace
+
+       pieces.each {|piece|
+         piece =~ /(\S+)\[(.+)\]/
+         label = $1
+         confidence = $2.to_f
+
+         line_results << [label, confidence]
+       }
+
+       # sort: most confident label first
+       retv << line_results.sort {|a,b| b[1] <=> a[1]}
+     end
+
+     f.close
+
+     retv
+   end
+
+
+   ###################################
+   private
+
+   ###
+   # produce input file for maxent learner: make attribute-value pairs
+   # where attribute == featureX=
+   def c45_to_maxent(inpipe,outpipe)
+     while (line = inpipe.gets)
+       line.chomp!
+       la = line.split(",")
+       label = la.pop
+       if label[-1,1] == "."
+         label.chop!
+       end
+       la.each_index {|i|
+         la[i] = i.to_s + "=" + la[i]
+       }
+       la.push(label)
+       outpipe.puts la.join(" ")
+     end
+   end
+
+   # since the OpenNLP MaxEnt system determines storage based on filename,
+   # make sure that all models are stored internally as binary, gzipped files.
+
+   def enforce_compact_storage(filename)
+     if filename =~ /Model.bin.gz/
+       return filename
+     else
+       return filename+"Model.bin.gz"
+     end
+   end
+
+   ###
+   def successfully_run(command)
+     retv = Kernel.system(command)
+     unless retv
+       $stderr.puts "Error running classifier. Continuing."
+       $stderr.puts "Offending command: "+command
+       # exit 1
+     end
+     return retv
+   end
+ end
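
For orientation, a minimal, hedged usage sketch of the Maxent wrapper above. The program path, file names and labels are hypothetical placeholders; inside the gem the wrapper is apparently driven through the classifier abstraction in data/lib/ml/classifier.rb rather than called directly.

    # Hypothetical paths and file names, for illustration only.
    maxent = Maxent.new("/opt/maxent-2.4.0", [])

    # c45_to_maxent (private) rewrites each C4.5 line "val1,val2,LabelA."
    # into "0=val1 1=val2 LabelA" before handing it to the Java wrapper.
    maxent.train("train.c45", "frameModel.bin.gz")   # model names get "Model.bin.gz" enforced
    maxent.apply("test.c45", "test.maxent.out")      # one result line per test instance
    maxent.read_resultfile("test.maxent.out")
    # => [[["LabelA", 0.83], ["LabelB", 0.17]], ...]
    #    one [label, confidence] list per instance, most confident label first
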
data/lib/ml/optimize.rb
@@ -0,0 +1,195 @@
+ # sp 29 07 04
+ # "optimise" c4.5 files by replacing all feature values which only
+ # occur with one label by a new, common value.
+ #
+ # two modes of operation:
+ # optimise <file>             -- optimise file and store optimisations in <file>.opts
+ # optimise <file> <file.opts> -- apply optimisation from file.opts to file
+
+ class Optimise
+
+   def initialize
+     @ready = false
+   end
+
+   def init_from_data(infile) # find new optimisation
+
+     STDERR.puts "[Optimise] computing new feature optimisation"
+
+     infile = File.new(infile)
+     labels = []
+     features = nil
+     @replacements = [] # for each feature, store the list of replacements
+
+     # read data from infile into hash and initialise replacements array
+     while (line = infile.gets)
+       f_l = line.chomp.split(",")
+
+       if features.nil? # first line: initialisation
+         features = [] # for each feature: array of feature values from file
+         f_l.each_index {|i|
+           features[i] = []
+           @replacements[i] = {}
+         }
+       end
+       labels << f_l.pop
+       f_l.each_index {|i|
+         features[i] << f_l[i]
+       }
+     end
+     infile.close
+
+     features.each_index {|findex| # traverse all features
+
+       # for each feature *value*, find all label indices
+
+       fvalues = features[findex]
+
+       fval_to_label = {} # record fval -> label mappings
+                          # no label : nil
+                          # one label: <label>
+                          # two labels: false
+
+       fvalues.each_index {|inst_idx|
+         label = labels[inst_idx] # current label
+         fval = fvalues[inst_idx] # current feature value
+         seen_label = fval_to_label[fval] # previously seen label
+         if seen_label.nil?
+           fval_to_label[fval] = label
+         elsif seen_label and seen_label != label
+           fval_to_label[fval] = false
+         end
+       } # at the end, all fvals should be mapped to either <label> or false
+
+       # construct new feature value names
+
+       new_fvals = {}
+       labels.each {|label|
+         new_fvals[label] = "f"+findex.to_s+"_"+label.gsub(/\./,"")
+       }
+
+       # record all feature values for which we have only seen one label in @replacements
+
+       fval_to_label.each_pair {|fval,label|
+         if fval == "[U]"
+           puts "[U]: "+label.to_s+" "+new_fvals[label].to_s
+         end
+         if label
+           # STDERR.puts "replacement of "+fval+" by "+new_fvals[label]
+           @replacements[findex][fval] = new_fvals[label]
+         end
+       }
+
+       # fvalues = features[findex]
+
+       # l_to_v = {} # label -> array of feature values
+       # v_to_l = {} # feature value -> array of labels
+
+       # fvalues.each_index {|inst| # traverse all instances
+       #   fval = fvalues[inst]
+       #   label = labels[inst]
+
+
+       #   unless v_to_l.key?(fval) # add entry to v_to_l
+       #     v_to_l[fval] = []
+       #   end
+       #   v_to_l[fval] << label
+
+       #   unless l_to_v.key?(label) # add entry to l_to_v
+       #     l_to_v[label] = []
+       #   end
+       #   l_to_v[label] << fval
+       # }
+
+       # l_to_v.each_pair {|label,values|
+       #   newvalue = "f"+findex.to_s+"_"+label.gsub(/\./,"")
+       #   values.each {|value|
+       #     if v_to_l[value].uniq.length == 1
+       #       @replacements[findex][value] = newvalue
+       #     end
+       #   }
+       # }
+     }
+     @ready = true
+   end
+
+   def init_from_file(optsfile) # use old optimisation
+     optsinfile = File.new(optsfile)
+     @replacements = read(optsinfile)
+     optsinfile.close
+     @ready = true
+   end
+
+   def store(outfilename) # store data necessary to recreate optimisation
+     unless @ready
+       raise "[Optimise] Error: Cannot store un-initialised optimisation"
+     end
+     outfile = File.new(outfilename,"w")
+     @replacements.each_index {|i| # for each feature
+       reps = @replacements[i]
+       outfile.puts "<"+i.to_s+">"
+       reps.each_pair{|old,new|
+         outfile.puts [old,new].join("\t")
+       }
+       outfile.puts "</"+i.to_s+">"
+     }
+     outfile.close
+   end
+
+   def apply(infilename,outfilename)
+     unless @ready
+       raise "[Optimise] Error: Cannot apply un-initialised optimisation"
+     end
+
+     STDERR.puts "[Optimise] applying feature optimisation"
+
+     infile = File.new(infilename)
+     outfile = File.new(outfilename,"w")
+     features = []
+     labels = []
+
+
+     while (line = infile.gets)
+       tokens = line.chomp.split(",")
+
+       unless tokens.length == @replacements.length
+         raise "[Optimise] Error: trying to optimise an incompatible feature file!\nFile has "+tokens.length.to_s+" columns, but we know replacements for "+@replacements.length.to_s+" columns."
+       end
+
+       label = tokens.pop
+       tokens.each_index {|f_idx|
+         fval = tokens[f_idx]
+         if @replacements[f_idx].key?(fval)
+           tokens[f_idx] = @replacements[f_idx][fval]
+         end
+       }
+       tokens.push label
+       outfile.puts tokens.join(",")
+     end
+     outfile.close
+   end
+
+   private
+
+   def read(infile)
+     @replacements = []
+     while line = infile.gets
+       line.chomp!
+       if line =~ /^<(\d+)>$/
+         reps = {}
+       elsif line =~ /^<\/(\d+)>$/
+         @replacements[$1.to_i] = reps
+       else
+         tokens = line.chomp.split("\t")
+         reps[tokens[0]] = tokens[1]
+       end
+     end
+     @replacements # return the replacements; the caller closes the file
+   end
+
+   # return recommended filename to store optimisation patterns for basefile
+   def Optimise.recommended_filename(basefile)
+     return basefile+".optimisations"
+   end
+
+ end
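
As a concrete illustration of the round trip this class implements, here is a hedged sketch. The file names are hypothetical; the on-disk format follows store and read above.

    # Hypothetical file names, for illustration only.
    opt = Optimise.new
    opt.init_from_data("train.c45")                          # learn value -> f<idx>_<label> replacements
    opt.store(Optimise.recommended_filename("train.c45"))    # writes "train.c45.optimisations"
    opt.apply("train.c45", "train.opt.c45")

    # The stored file groups tab-separated old/new value pairs per feature index:
    #   <0>
    #   oldvalue<TAB>f0_SomeLabel
    #   </0>

    # Later, reuse the same optimisation for test data:
    opt2 = Optimise.new
    opt2.init_from_file(Optimise.recommended_filename("train.c45"))
    opt2.apply("test.c45", "test.opt.c45")
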
data/lib/ml/timbl.rb
@@ -0,0 +1,140 @@
+ # wrapper script for timbl learner
+ # sp 24 08 04
+
+ # contract for Learner classes:
+
+ class Timbl
+   def initialize(program_path, parameters)
+
+     @timblpath = File.join(program_path, "Timbl")
+     unless @timblpath =~ /\s$/
+       # path must end in space so we can just attach parameters
+       @timblpath << " "
+     end
+
+     if parameters.empty?
+       # was: +vs
+       @params = "-mM -k5 +vs" # default parameters
+     else
+       @params = parameters.join(" ") + " "
+     end
+   end
+
+   def timbl_out_to_malouf_out(infilename,outfilename) # timbl: [all features], [gold standard label]
+     infile = File.new(infilename)
+     outfile = File.new(outfilename,"w")
+     while (line = infile.gets)
+       larray = line.chomp.split(",")
+       ml_label = larray.last
+       outfile.puts ml_label+"\t1"
+     end
+     infile.close
+     outfile.close
+   end
+
+   def train(infile,classifier_location) # lazy learning: for training, store the
+                                         # instancebase as a tree (TiMBL -I / -i option)
+     # figure out how many features we have
+     f = File.new(infile)
+     line = f.gets.chomp
+     num_features = line.split(",").length - 1
+
+     # and train
+     if classifier_location then
+       @instancebase = classifier_location
+     else
+       @instancebase = infile+".instancebase"
+     end
+     successfully_run(@timblpath+@params+" -N#{num_features} -f "+infile+" -I "+@instancebase)
+   end
+
+   # return true iff reading the classifier has had success
+   def read(classifierfile)
+     unless FileTest.exists?(classifierfile)
+       STDERR.puts "[Timbl] Cannot find instancebase at #{classifierfile}"
+       return false
+     end
+     @instancebase = classifierfile
+     return true
+   end
+
+   def exists?(classifierfile)
+     return FileTest.exists?(classifierfile)
+   end
+
+   def write(classifierfile)
+     %x{cp #{@instancebase} #{classifierfile}} # store training data as "modelfile"
+     File.chmod(0664,classifierfile)
+   end
+
+   def apply(infile,outfile)
+     temp_outfile = outfile+".temp"
+     successfully_run(@timblpath+@params+" -i "+@instancebase+" -t "+infile+" -o "+temp_outfile)
+
+     # if we have an empty input file, timbl will not produce an output file
+     unless FileTest.exists?(temp_outfile)
+       # STDERR.puts "[Timbl] Warning: Timbl failed to produce an outfile."
+       return false
+     end
+
+     # no error
+     timbl_out_to_malouf_out(temp_outfile,outfile)
+     File.unlink(temp_outfile)
+
+     # true iff outfile exists
+     if FileTest.exists?(outfile)
+       return true
+     else
+       # STDERR.puts "[Timbl] Warning: Final outfile could not be produced."
+       return false
+     end
+
+   end
+
+   #####
+   def read_resultfile(filename)
+     begin
+       f = File.new(filename)
+     rescue
+       $stderr.puts "TiMBL error: cannot read TiMBL result file #{filename}."
+       return nil
+     end
+
+     retv = []
+
+     f.each { |line|
+       line_results = []
+       pieces = line.split
+
+       while not(pieces.empty?)
+         label = pieces.shift
+
+         begin
+           confidence = pieces.shift.to_f
+         rescue
+           $stderr.puts "Error reading TiMBL output: invalid line: #{line}"
+           confidence = 0
+         end
+
+         line_results << [label, confidence]
+       end
+       retv << line_results
+     }
+
+     return retv
+   end
+
+   #########################
+   private
+
+   ###
+   def successfully_run(command)
+     retv = Kernel.system(command)
+     unless retv
+       $stderr.puts "Error running classifier. Exiting."
+       $stderr.puts "Offending command: "+command
+       exit 1
+     end
+     return retv
+   end
+ end
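
The bare "contract for Learner classes" comment at the top of this file is the only statement of the shared interface. Judging from the two wrappers in this diff, both Timbl and Maxent respond to the same set of calls, roughly as in this hedged sketch; the program path and file names are hypothetical placeholders, and Maxent additionally appends "Model.bin.gz" to model names that lack it.

    # Either wrapper can stand in for "learner"; arguments shown are placeholders.
    learner = Timbl.new("/usr/local/bin", [])   # or Maxent.new("/opt/maxent-2.4.0", [])

    learner.train("train.c45", "model.file")    # build and store a model / instance base
    learner.exists?("model.file")               # => true once the model file is on disk
    learner.read("model.file")                  # => true iff the model could be loaded
    learner.write("model.copy")                 # copy the stored model elsewhere
    learner.apply("test.c45", "results.txt")    # => false if classification produced no output
    learner.read_resultfile("results.txt")      # => one [label, confidence] list per test instance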