shalmaneser-lib 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,229 @@
1
+ # wrapper script for the OpenNLP Maxent classifier
2
+
3
+ # sp July 2007
4
+
5
+
6
+ require "tempfile"
7
+ require 'fileutils'
8
+
9
# Wrapper around the OpenNLP Maxent classifier. Training and classification
# are delegated to the Java helper classes <Train> and <Classify>, which are
# run as external `java` processes.
class Maxent
  # Since the OpenNLP MaxEnt system determines storage based on filename,
  # every model file name must end in this suffix so that the model is
  # stored as a binary, gzipped file.
  MODEL_SUFFIX = "Model.bin.gz"

  # One entry of a result line: <label>[<confidence>]
  RESULT_PIECE = /(\S+)\[(.+)\]/

  ###
  # @param program_path [String] installation directory of OpenNLP Maxent.
  # @param parameters [Array] last part of the classifier line from the
  #   experiment file. Kept for interface compatibility, but ignored.
  #
  # @note AB: <parameters> used to contain the path to our java wrappers,
  #   but we don't want it. Since the presence of this part was checked
  #   only here we suppose it obsolete and set this path manually.
  # @note AB: Setting path manually. It assumes <Maxent.rb> is in
  #   <lib/common> and <Classify.class> is in <lib/ext/maxent>.
  # @todo AB: This assumption should be changed. ENV[]???
  def initialize(program_path, parameters)
    @interface_path = File.expand_path('../ext/maxent', File.dirname(__FILE__))

    # the classpath below is built by plain concatenation, so the
    # program path has to end in a slash
    @maxentpath = program_path.end_with?("/") ? program_path : program_path + "/"

    # classpath for maxent
    @cp = "#{@maxentpath}:#{@maxentpath}lib:#{@maxentpath}lib/trove.jar:#{@maxentpath}output/maxent-2.4.0.jar:#{ENV["CLASSPATH"]}"
  end

  ###
  # Train a classifier on a C4.5-style (comma separated) feature file and
  # remember where the model was stored.
  #
  # @param infilename [String] training data in csv format.
  # @param classifier_file [String, nil] where to store the model; if nil,
  #   a location derived from a temporary file name is used.
  # @return [Boolean] true iff the external trainer ran successfully.
  def train(infilename, classifier_file)
    trainfile = Tempfile.new(File.basename(infilename) + ".maxenttrain")
    begin
      File.open(infilename) { |infile| c45_to_maxent(infile, trainfile) }
      trainfile.close

      # remember location; store model in binary, gzipped form...
      @classifier_location =
        enforce_compact_storage(classifier_file || trainfile.path + MODEL_SUFFIX)

      # argv form: no shell is involved, so paths containing blanks or
      # shell metacharacters are passed to java unchanged
      if successfully_run("java", "-cp", @cp, "-Xmx1000m", "Train",
                          trainfile.path, @classifier_location,
                          :chdir => @interface_path)
        true
      else
        false
      end
    ensure
      # remove the temporary training file on success and on failure
      trainfile.close(true)
    end
  end

  ###
  # Copy the current model to <classifier_file> (the compact storage suffix
  # is added if it is missing).
  #
  # @return [true, nil] true on success, nil if there is no model to copy
  #   or the copy failed (a message is written to stderr).
  def write(classifier_file)
    unless @classifier_location
      $stderr.puts "Maxent error: no classifier has been trained or read, cannot write #{classifier_file}."
      return nil
    end

    target = enforce_compact_storage(classifier_file)
    @classifier_location = enforce_compact_storage(@classifier_location)

    # train() may already have put the model at the target location;
    # copying a file onto itself would be an error
    unless File.identical?(@classifier_location, target)
      FileUtils.cp(@classifier_location, target) # store classifier
    end
    true
  rescue SystemCallError => e
    $stderr.puts "Maxent error: cannot write Maxent classifier file #{target}: #{e.message}"
    nil
  end

  ###
  # @return [Boolean] true iff a model file exists for <classifier_file>.
  def exists?(classifier_file)
    File.exist?(enforce_compact_storage(classifier_file))
  end

  ###
  # return true iff reading the classifier has had success
  def read(classifier_file)
    classifier_file = enforce_compact_storage(classifier_file)

    unless File.exist?(classifier_file)
      $stderr.puts "No classifier file " + classifier_file
      return false
    end

    @classifier_location = classifier_file
    true
  end

  ###
  # Classify the instances of a C4.5-style feature file with the current
  # model and write the classifier output to <outfilename>.
  #
  # @return [Boolean] false if no model is known, if the external
  #   classifier failed, or if no output file was produced.
  def apply(infilename, outfilename)
    # check before touching the location: there is nothing to normalise
    # as long as no classifier has been trained or read
    return false unless @classifier_location

    @classifier_location = enforce_compact_storage(@classifier_location)

    testfile = Tempfile.new(File.basename(infilename) + ".maxenttrain")
    begin
      File.open(infilename) { |infile| c45_to_maxent(infile, testfile) }
      testfile.close

      # classify; the output path is made absolute so that it names the
      # same file that is checked for existence below
      success = successfully_run("java", "-cp", @cp, "-Xmx1000m", "Classify",
                                 testfile.path, @classifier_location,
                                 :chdir => @interface_path,
                                 :out => File.expand_path(outfilename))

      # no output file = some error in classification
      (success && File.exist?(outfilename)) ? true : false
    ensure
      testfile.close(true)
    end
  end

  #####
  # format of Maxent result file:
  # <best label>[<confidence>] <secondbest_label>[<confidence>] ....
  #
  # returns a list of instance_results
  # where an instance_result is a list of pairs [label, confidence]
  # where the pairs are sorted by confidence
  #
  # Returns nil (after a message on stderr) if the file cannot be read.
  # Entries that do not have the <label>[<confidence>] form are reported
  # on stderr and skipped.
  def read_resultfile(filename)
    lines = begin
      File.readlines(filename)
    rescue StandardError
      $stderr.puts "Maxent error: cannot read Maxent result file #{filename}."
      return nil
    end

    lines.map do |line|
      line_results = []

      line.split.each do |piece| # split at whitespace
        match = RESULT_PIECE.match(piece)
        if match
          line_results << [match[1], match[2].to_f]
        else
          $stderr.puts "Maxent error: skipping malformed result entry #{piece}."
        end
      end

      # sort: most confident label first
      line_results.sort { |a, b| b[1] <=> a[1] }
    end
  end

  ###################################
  private

  ###
  # produce input file for maxent learner: make attribute-value pairs
  # where attribute == featureX=
  #
  # Each input line is "<f0>,<f1>,...,<label>"; a trailing "." on the
  # label is removed. Blank lines carry no instance and are skipped.
  def c45_to_maxent(inpipe, outpipe)
    inpipe.each_line do |line|
      values = line.chomp.split(",")
      next if values.empty?

      label = values.pop
      label = label.chop if label.end_with?(".")

      pairs = values.each_with_index.map { |value, i| "#{i}=#{value}" }
      outpipe.puts((pairs << label).join(" "))
    end
  end

  # since the OpenNLP MaxEnt system determines storage based on filename,
  # make sure that all models are stored internally as binary, gzipped files.
  def enforce_compact_storage(filename)
    filename.end_with?(MODEL_SUFFIX) ? filename : filename + MODEL_SUFFIX
  end

  ###
  # Run an external command. Takes the same arguments as Kernel.system:
  # either one shell command string, or program and arguments as separate
  # strings, optionally followed by an options hash (:chdir, :out, ...).
  #
  # Reports a failure on stderr but does not exit.
  # @return [true, false, nil] the result of Kernel.system.
  def successfully_run(*command)
    retv = Kernel.system(*command)
    unless retv
      $stderr.puts "Error running classifier. Continuing."
      $stderr.puts "Offending command: " + command.grep(String).join(" ")
    end
    retv
  end
end
@@ -0,0 +1,195 @@
1
+ # sp 29 07 04
2
+ # "optimise" c4.5 files by replacing all feature values which only
3
+ # occur with one label by a new, common value.
4
+ #
5
+ # two modes of operation:
6
+ # optimise <file> -- optimise file and store optimisations in <file>.opts
7
+ # optimise <file> <file.opts> -- apply optimisation from file.opts to file
8
+
9
# "Optimise" c4.5 files (comma separated feature values, label last) by
# replacing all values of a feature which only occur with one label by a
# new, common value "f<feature index>_<label>".
#
# An optimisation is either computed from data (init_from_data) or read
# back from a file written by store (init_from_file); afterwards it can be
# applied to feature files with apply.
class Optimise

  def initialize
    @ready = false
  end

  # return recommended filename to store optimisation patterns for basefile
  def self.recommended_filename(basefile)
    basefile + ".optimisations"
  end

  # find new optimisation
  #
  # @param infile [String] name of a c4.5 file.
  # @return [true]
  #
  # Blank lines are skipped. An empty file yields an optimisation without
  # any replacements.
  def init_from_data(infile)
    STDERR.puts "[Optimise] computing new feature optimisation"

    # for each column, record fval -> label mappings
    #   one label:  <label>
    #   two labels: false
    seen = nil

    File.foreach(infile) do |line|
      values = line.chomp.split(",")
      next if values.empty?

      # first line: initialisation. One slot per column *including* the
      # label column: apply() compares the untruncated column count of a
      # line against @replacements.length, and stored files use the same
      # numbering.
      seen ||= Array.new(values.length) { {} }

      label = values.pop
      values.each_with_index do |fval, findex|
        fval_to_label = (seen[findex] ||= {})
        if !fval_to_label.key?(fval)
          fval_to_label[fval] = label
        elsif fval_to_label[fval] && fval_to_label[fval] != label
          fval_to_label[fval] = false
        end
      end
    end

    # for each feature, store the list of replacements: all feature values
    # for which we have only seen one label
    @replacements = (seen || []).each_with_index.map do |fval_to_label, findex|
      reps = {}
      fval_to_label.each_pair do |fval, label|
        new_fval = label ? "f#{findex}_#{label.delete('.')}" : nil
        if fval == "[U]"
          # there is no new value when [U] was seen with several labels
          puts "[U]: #{label} #{new_fval}"
        end
        reps[fval] = new_fval if label
      end
      reps
    end

    @ready = true
  end

  # use old optimisation
  #
  # @param optsfile [String] name of a file written by store.
  # @return [true]
  def init_from_file(optsfile)
    @replacements = File.open(optsfile) { |optsinfile| read(optsinfile) }
    @ready = true
  end

  # store data necessary to recreate optimisation
  #
  # File format, for each feature index i:
  #   <i>
  #   <old value>TAB<new value>
  #   ...
  #   </i>
  def store(outfilename)
    unless @ready
      raise "[Optimise] Error: Cannot store un-initialised optimisation"
    end

    File.open(outfilename, "w") do |outfile|
      @replacements.each_with_index do |reps, i| # for each feature
        outfile.puts "<#{i}>"
        (reps || {}).each_pair { |old, new| outfile.puts [old, new].join("\t") }
        outfile.puts "</#{i}>"
      end
    end
  end

  # Apply the optimisation to <infilename> and write the result to
  # <outfilename>. The label (last column) is never replaced.
  #
  # Raises if the optimisation is not initialised or if a line does not
  # have the number of columns the optimisation was computed for.
  def apply(infilename, outfilename)
    unless @ready
      raise "[Optimise] Error: Cannot apply un-initialised optimisation"
    end

    STDERR.puts "[Optimise] applying feature optimisation"

    File.open(infilename) do |infile|
      File.open(outfilename, "w") do |outfile|
        infile.each_line do |line|
          tokens = line.chomp.split(",")

          # both counts include the label column
          unless tokens.length == @replacements.length
            raise "[Optimise] Error: trying to optimise incompatible feature file!\nFile has #{tokens.length} features, and we know replacements for #{@replacements.length} features."
          end

          label = tokens.pop
          optimised = tokens.each_with_index.map do |fval, f_idx|
            reps = @replacements[f_idx]
            (reps && reps.key?(fval)) ? reps[fval] : fval
          end
          outfile.puts((optimised << label).join(","))
        end
      end
    end
  end

  private

  # Parse the format written by store from the open stream <infile>.
  # The stream is left open; closing it is up to the caller.
  #
  # @return [Array<Hash>] for each feature index, old value -> new value.
  def read(infile)
    replacements = []
    reps = nil

    infile.each_line do |line|
      line = line.chomp
      case line
      when /\A<(\d+)>\z/
        reps = {}
      when /\A<\/(\d+)>\z/
        replacements[Regexp.last_match(1).to_i] = reps || {}
        reps = nil
      else
        tokens = line.split("\t")
        # ignore blank lines and lines outside of a <i> ... </i> section
        next if reps.nil? || tokens.empty?
        reps[tokens[0]] = tokens[1]
      end
    end

    replacements
  end

end
@@ -0,0 +1,140 @@
1
+ # wrapper script for timbl learner
2
+ # sp 24 08 04
3
+
4
+ # contract for Learner classes:
5
+
6
# Wrapper around the TiMBL learner, which is run as an external program.
class Timbl
  # @param program_path [String] directory containing the <Timbl> binary.
  # @param parameters [Array<String>] command line parameters for TiMBL;
  #   if empty, default parameters are used.
  def initialize(program_path, parameters)
    # path must end in space so we can just attach parameters
    @timblpath = File.join(program_path, "Timbl")
    @timblpath += " " unless @timblpath =~ /\s$/

    if parameters.empty?
      # was: +vs
      @params = "-mM -k5 +vs" # default parameters
    else
      @params = parameters.join(" ") + " "
    end
  end

  # Convert a TiMBL output file into the format read by read_resultfile.
  #
  # timbl: [all features], [gold standard label]
  # For every input line the last comma separated field is written,
  # followed by a tab and the confidence 1. Every input line yields exactly
  # one output line (an empty label for a blank line), so that the output
  # stays aligned with the instances.
  def timbl_out_to_malouf_out(infilename, outfilename)
    File.open(infilename) do |infile|
      File.open(outfilename, "w") do |outfile|
        infile.each_line do |line|
          ml_label = line.chomp.split(",").last
          outfile.puts "#{ml_label}\t1"
        end
      end
    end
  end

  # lazy learning: for training, store the instancebase as a tree
  # (TiMBL -I / -i option)
  #
  # @param infile [String] training data, comma separated, label last.
  # @param classifier_location [String, nil] where to store the
  #   instancebase; defaults to <infile>.instancebase.
  # @return [Boolean] true on success, false if <infile> is empty.
  #   The process exits if TiMBL itself fails.
  def train(infile, classifier_location)
    # figure out how many features we have
    first_line = File.open(infile) { |f| f.gets }
    if first_line.nil?
      STDERR.puts "[Timbl] Cannot train on empty file #{infile}"
      return false
    end
    num_features = first_line.chomp.split(",").length - 1

    # and train
    @instancebase = classifier_location || infile + ".instancebase"
    successfully_run(@timblpath + @params + " -N#{num_features} -f " + infile + " -I " + @instancebase)
  end

  # return true iff reading the classifier has had success
  def read(classifierfile)
    unless File.exist?(classifierfile)
      STDERR.puts "[Timbl] Cannot find instancebase at #{classifierfile}"
      return false
    end
    @instancebase = classifierfile
    true
  end

  # @return [Boolean] true iff <classifierfile> exists.
  def exists?(classifierfile)
    File.exist?(classifierfile)
  end

  # store training data as "modelfile"
  #
  # Copies the current instancebase to <classifierfile> and makes the copy
  # group-writable. Returns false if there is no instancebase yet.
  def write(classifierfile)
    unless @instancebase
      STDERR.puts "[Timbl] No instancebase to write to #{classifierfile}"
      return false
    end

    # train() may already have written to this location; copying a file
    # onto itself would truncate it
    unless File.identical?(@instancebase, classifierfile)
      IO.copy_stream(@instancebase, classifierfile)
    end
    File.chmod(0664, classifierfile)
  end

  # Classify the instances in <infile> and write one
  # "<label>TAB<confidence>" line per instance to <outfile>.
  #
  # @return [Boolean] true iff <outfile> exists afterwards.
  def apply(infile, outfile)
    unless @instancebase
      STDERR.puts "[Timbl] No instancebase, cannot classify #{infile}"
      return false
    end

    temp_outfile = outfile + ".temp"
    successfully_run(@timblpath + @params + " -i " + @instancebase + " -t " + infile + " -o " + temp_outfile)

    # if we have an empty input file, timbl will not produce an output file
    return false unless File.exist?(temp_outfile)

    # no error
    timbl_out_to_malouf_out(temp_outfile, outfile)
    File.unlink(temp_outfile)

    # true iff outfile exists
    File.exist?(outfile)
  end

  #####
  # Read a result file with lines of the form
  # <label> <confidence> <label> <confidence> ...
  #
  # returns a list of instance_results
  # where an instance_result is a list of pairs [label, confidence];
  # nil (after a message on stderr) if the file cannot be read.
  # A label without a confidence gets the confidence 0.0.
  def read_resultfile(filename)
    lines = begin
      File.readlines(filename)
    rescue StandardError
      $stderr.puts "TiMBL error: cannot read TiMBL result file #{filename}."
      return nil
    end

    lines.map do |line|
      line.split.each_slice(2).map do |label, confidence|
        if confidence.nil?
          $stderr.puts "Error reading TiMBL output: invalid line: #{line}"
        end
        [label, confidence.to_f]
      end
    end
  end

  #########################
  private

  ###
  # Run a shell command; exit the whole process if it fails.
  # @return [true] on success.
  def successfully_run(command)
    retv = Kernel.system(command)
    unless retv
      $stderr.puts "Error running classifier. Exiting."
      $stderr.puts "Offending command: " + command
      exit 1
    end
    retv
  end

end