shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,24 +0,0 @@
1
- # KE changed July 05: now no inclusion of modules required,
2
- # and names changed from REXML.Encoding to UtfIso
3
-
4
- module UtfIso
5
- # Convert from UTF-8
6
- def UtfIso.to_iso_8859_1(content)
7
- array_utf8 = content.unpack('U*')
8
- array_enc = []
9
- array_utf8.each do |num|
10
- if num <= 0xFF
11
- array_enc << num
12
- else
13
- # Numeric entity (&#nnnn;); shard by Stefan Scholl
14
- # array_enc += to_iso_8859("&\##{num};").unpack('C*')
15
- end
16
- end
17
- array_enc.pack('C*')
18
- end
19
-
20
- # Convert to UTF-8
21
- def UtfIso.from_iso_8859_1(str)
22
- str.unpack('C*').pack('U*')
23
- end
24
- end
@@ -1,186 +0,0 @@
1
- # sp 24 08 04
2
-
3
- # this file provides a very simple wrapper for using different ML systems
4
- # all you need to do is to write the appropriate learner class
5
- # and insert them in the initialize routine here in ML()
6
- #
7
- # available at the moment:
8
- # * timbl (memory-based learner)
9
- # * mallet-maxent (another maxent system)
10
- # * maxent (the OpenNLP maxent system)
11
-
12
- # part of contract: learner is not initialised unless it is either trained or read
13
-
14
- require "common/Optimise"
15
-
16
- class Classifier
17
-
18
- @@learners = [
19
- ["timbl", "Timbl", "Timbl"],
20
- # ["mallet", "Mallet", "Mallet"],
21
- ["maxent", "Maxent", "Maxent"]
22
- ]
23
-
24
- def initialize(learner,params)
25
-
26
- @ready = false
27
-
28
- if params[0] == "optimise"
29
- params.shift
30
- @optimise = true
31
- else
32
- @optimise = false
33
- end
34
-
35
- program_path = ""
36
- begin
37
- program_path = params.shift.chomp
38
- unless FileTest.exist? program_path
39
- $stderr.puts "Error: Could not find classifier system at " + program_path
40
- $stderr.puts "Perhaps an erroneous entry in your experiment file?"
41
- exit 1
42
- end
43
- rescue NoMethodError
44
- $stderr.puts "Error: No program path provided for classifier system."
45
- end
46
-
47
- # try to find our learner in the pre-set list of learners
48
- learner_tuple = @@learners.assoc(learner)
49
- unless learner_tuple
50
- $stderr.puts "Error: I don't know the learner " + learner.to_s
51
- $stderr.puts "Perhaps an erroneous entry in your experiment file?"
52
- exit 1
53
- end
54
-
55
- learner_name, learner_filename, learner_classname = learner_tuple
56
- require "common/#{learner_filename}"
57
- @learner = eval(learner_classname).new(program_path,params)
58
- end
59
-
60
- # a classifier can (and has to be) either trained or read
61
- def train(trainfile, classifier_file=nil)
62
- # train on the training data in trainfile
63
- # make sure we produce a valid file name
64
-
65
- # it is possible to directly specify a filename for storing the classifier
66
-
67
- trainfile.gsub!(/[<>]/,"")
68
- trainfile.gsub!(/ /,"_")
69
- if @optimise
70
- STDERR.puts "[ML] using feature optimisation"
71
- @optimiser = Optimise.new
72
- @optimiser.init_from_data(trainfile)
73
- optimisedfile = trainfile+".opted"
74
- @optimiser.apply(trainfile,optimisedfile)
75
- @learner.train(optimisedfile,classifier_file)
76
- File.delete(optimisedfile)
77
- else
78
- STDERR.puts "[ML] no feature optimisation"
79
- @learner.train(trainfile,classifier_file)
80
- end
81
- @ready = true
82
- end
83
-
84
-
85
- # returns true iff reading the classifier from the file succeeded
86
-
87
- def read(classifier_file)
88
- # make sure we produce a valid file name
89
- classifier_file.gsub!(/[<>]/,"")
90
- classifier_file.gsub!(/ /,"_")
91
-
92
- # read file, if present
93
-
94
- status = @learner.read(classifier_file)
95
-
96
- # if reading has failed, return "false"
97
- unless status
98
- STDERR.puts "reading from #{classifier_file} did not succeed"
99
- return status
100
- end
101
-
102
- # read optimisation, if desired
103
- if @optimise
104
- optimisations_filename = Optimise.recommended_filename(classifier_file)
105
- unless FileTest.exists? optimisations_filename
106
- STDERR.puts "[ML] Error: attempted to read stored optimisation, but file does not exist"
107
- return false
108
- else
109
- @optimiser = Optimise.new
110
- @optimiser.init_from_file(optimisations_filename)
111
- end
112
- end
113
-
114
- @ready = true
115
- return true
116
-
117
- end
118
-
119
- # a classifier can be stored somewhere. This can be more than one file (classifier-specific),
120
- # but all files start with "classifier_file"
121
-
122
- def write(classifier_file)
123
- # make sure we produce a valid file name
124
- classifier_file.gsub!(/[<>]/,"")
125
- classifier_file.gsub!(/ /,"_")
126
- @learner.write(classifier_file)
127
- if @optimise
128
- @optimiser.store(Optimise.recommended_filename(classifier_file))
129
- end
130
- end
131
-
132
- ###
133
- # exists?
134
- # check if a classifier is living at some particular path
135
-
136
- def exists?(classifier_file)
137
- classifier_file.gsub!(/[<>]/,"")
138
- classifier_file.gsub!(/ /,"_")
139
- return @learner.exists?(classifier_file)
140
- end
141
-
142
- # a classifier can be applied
143
-
144
- # returns true iff application succeeded
145
-
146
- def apply(testfile,outfile) # test either on the training or the test data in the specified dir
147
- # make sure we produce a valid file name
148
- testfile.gsub!(/[<>]/,"")
149
- testfile.gsub!(/ /,"_")
150
- # make sure we produce a valid file name
151
- outfile.gsub!(/[<>]/,"")
152
- outfile.gsub!(/ /,"_")
153
-
154
- unless @ready
155
- STDERR.puts "[ML] Warning: learner not ready for testing! Must be trained or read."
156
- return false
157
- end
158
-
159
- # do we have a testfile?
160
-
161
- unless FileTest.exists?(testfile)
162
- STDERR.puts "[ML] Warning: could not find testfile (maybe empty test set?)."
163
- return false
164
- end
165
-
166
- if @optimise
167
- optimisedfile = testfile+".opted"
168
- @optimiser.apply(testfile,optimisedfile)
169
- return @learner.apply(optimisedfile,outfile)
170
- File.delete(optimisedfile)
171
- else
172
- return @learner.apply(testfile,outfile)
173
- end
174
-
175
- end
176
-
177
- ###
178
- # read classifier result file,
179
- # returns a list of instance_results
180
- # where an instance_result is a list of pairs [label, confidence]
181
- # where the pairs are sorted by confidence
182
- def read_resultfile(file)
183
- return @learner.read_resultfile(file)
184
- end
185
-
186
- end
@@ -1,236 +0,0 @@
1
- # wrapper script for the Mallet toolkit Maxent classifier
2
-
3
- # Problem with Winnow: cannot be serialised (written to file). Support dropped.
4
-
5
- # sp 27 10 04
6
-
7
-
8
- require "tempfile"
9
- require "ftools"
10
-
11
- class Mallet
12
-
13
- ###
14
- def initialize(program_path,parameters)
15
-
16
- if parameters.empty?
17
- puts "Error: Mallet needs two paths (first the location of mallet itself and then the location of the interface, usually program/tools/mallet)."
18
- puts "I got only the program path."
19
- Kernel.exit
20
- end
21
-
22
- @malletpath = program_path
23
- @interface_path = parameters.first
24
- unless @malletpath =~ /\/$/
25
- @malletpath = @malletpath + "/"
26
- end
27
-
28
- @learner = "MaxEnt,gaussianPriorVariance=1.0"
29
-
30
- # classpath for mallet
31
-
32
- @cp = "#{ENV["CLASSPATH"]}:#{@malletpath}class:#{@malletpath}lib/bsh.jar"
33
-
34
- end
35
-
36
- ###
37
- def train(infilename,classifier_location)
38
- csvfile = Tempfile.new(File.basename(infilename)+".csvtrain")
39
- infile = File.new(infilename)
40
- c45_to_csv(infile,csvfile) # training data in csv format
41
- infile.close
42
- csvfile.close
43
- @mallet_train_vectors = infilename+".trainvectors" # training data in mallet format
44
- if classifier_location
45
- @classifier_mallet_path = classifier_location
46
- else
47
- @classifier_mallet_path = infilename+".classifier"
48
- end
49
-
50
- command1 = [@malletpath+"bin/csv2vectors ",
51
- " --input ",csvfile.path,
52
- " --output ",@mallet_train_vectors].join("")
53
-
54
- command2 = ["cd #{@interface_path}; ",
55
- "java -cp #{@cp} -Xmx1000m Train ",
56
- " --train ",@mallet_train_vectors,
57
- " --out ",@classifier_mallet_path,
58
- " --trainer ",@learner].join("")
59
- # STDERR.puts "[train 1] "+command1
60
- successfully_run(command1) # encode
61
- # STDERR.puts "[train 2] "+command2
62
- successfully_run(command2) # train
63
- csvfile.close(true)
64
- end
65
-
66
- def write(classifier_file)
67
- if @classifier_mallet_path
68
- %x{cp #{@classifier_mallet_path} #{classifier_file}.classifier} # store classifier
69
- # File.chmod(0664,classifier_file+".classifier")
70
- end
71
- if @mallet_train_vectors
72
- %x{cp #{@mallet_train_vectors} #{classifier_file}.trainvectors} # store train vectors to recreate pipe for testing data
73
- # File.chmod(0664,classifier_file+".trainvectors")
74
- end
75
- end
76
-
77
- ###
78
- def exists?(classifier_file)
79
- return (FileTest.exists?(classifier_file+".trainvectors") and
80
- FileTest.exists?(classifier_file+".classifier"))
81
- end
82
-
83
- ###
84
- # returns true iff reading the classifier succeeded
85
- def read(classifier_file)
86
- @mallet_train_vectors = classifier_file+".trainvectors" # training data in mallet format
87
- @classifier_mallet_path = classifier_file+".classifier"
88
- unless FileTest.exists?(@mallet_train_vectors)
89
- $stderr.puts "No classifier file "+@mallet_train_vectors
90
- return false
91
- end
92
- unless FileTest.exists?(@classifier_mallet_path)
93
- $stderr.puts "No classifier file "+@classifier_mallet_path
94
- return false
95
- end
96
- return true
97
- end
98
-
99
- ###
100
- def apply(infilename,outfilename)
101
- unless @classifier_mallet_path and @mallet_train_vectors
102
- return false
103
- end
104
-
105
- # STDERR.puts "Testing on "+infilename
106
- csvfile = Tempfile.new(File.basename(infilename)+".csvtest")
107
-
108
- infile = File.new(infilename)
109
- c45_to_csv(infile,csvfile) # training data in csv format
110
- infile.close
111
- csvfile.close
112
-
113
- test_mallet_path = infilename+".test.vectors" # training data in mallet format
114
-
115
- # $stderr.puts "test file in " + infilename
116
- # $stderr.puts "using training vectors from " + @mallet_train_vectors
117
-
118
- # copy train vectors to temp file.
119
- # reason: mallet in std edition reads _and writes_ this file
120
- # if rosy is interrupted, corrupted (ie incomplete) train vector files
121
- # result
122
-
123
- tempfile = Tempfile.new("mallet")
124
- tempfilename = tempfile.path
125
- unless File.copy(@mallet_train_vectors,tempfilename)
126
- return false
127
- end
128
-
129
- command1 = [@malletpath+"bin/csv2vectors", # encode testing data
130
- " --input ",csvfile.path,
131
- " --output ",test_mallet_path,
132
- " --use-pipe-from ",tempfilename].join("")
133
-
134
- # $stderr.puts "Mallet encode: " + command1
135
- unless successfully_run(command1) # encode
136
- return false
137
- end
138
-
139
- File.safe_unlink(tempfilename)
140
-
141
- # some error in encoding?
142
- unless FileTest.exists?(test_mallet_path)
143
- return false
144
- end
145
-
146
- command2 = ["cd #{@interface_path}; ",
147
- "java -cp #{@cp} -Xmx1000m Classify ",
148
- @classifier_mallet_path," ",
149
- test_mallet_path," ",
150
- "> ",outfilename].join("")
151
-
152
- # classify
153
- # $stderr.puts "Mallet classify: " + command2
154
- unless successfully_run(command2)
155
- return false
156
- end
157
-
158
- # some error in classification
159
- unless FileTest.exists?(outfilename)
160
- return false
161
- end
162
-
163
- # no errors = success
164
- csvfile.close(true)
165
- return true
166
- end
167
-
168
- #####
169
- # format of Mallet result file:
170
- # <best label> <confidence> \t <secondbest_label> <confidence>....
171
- def read_resultfile(filename)
172
- begin
173
- f = File.new(filename)
174
- rescue
175
- $stderr.puts "Mallet error: cannot read Mallet result file #{filemame}."
176
- return nil
177
- end
178
-
179
- retv = Array.new()
180
-
181
- f.each { |line|
182
- line_results = Array.new()
183
- pieces = line.split()
184
-
185
- while not(pieces.empty?)
186
- label = pieces.shift()
187
-
188
- begin
189
- confidence = pieces.shift().to_f()
190
- rescue
191
- $stderr.puts "Error reading mallet output: invalid line: #{line}"
192
- confidence = 0
193
- end
194
-
195
- line_results << [label, confidence]
196
- end
197
- retv << line_results
198
- }
199
-
200
- return retv
201
- end
202
-
203
-
204
- ###################################
205
- private
206
-
207
- ###
208
- # mallet needs "comma separated values"-file
209
- # input: features separated by comma
210
- # output:
211
- # line_number classlabel features_joined_by_spaces
212
- def c45_to_csv(inpipe,outpipe)
213
- idx = 0
214
- while (line = inpipe.gets)
215
- line.chomp!
216
- idx += 1
217
- la = line.split(",")
218
- label = la.pop
219
- if label[-1,1] == "."
220
- label.chop!
221
- end
222
- outpipe.puts [idx,label].join(" ")+" "+la.join(" ")
223
- end
224
- end
225
-
226
- ###
227
- def successfully_run(command)
228
- retv = Kernel.system(command)
229
- unless retv
230
- $stderr.puts "Error running classifier. Continuing."
231
- $stderr.puts "Offending command: "+command
232
- # exit 1
233
- end
234
- return retv
235
- end
236
- end
@@ -1,229 +0,0 @@
1
- # wrapper script for the OpenNLP Maxent classifier
2
-
3
- # sp July 2007
4
-
5
-
6
- require "tempfile"
7
- require 'fileutils'
8
-
9
- class Maxent
10
-
11
- ###
12
- def initialize(program_path,parameters)
13
-
14
- # @note AB: <parameters> is an Array with the last part of the
15
- # line from the experiment file, it should contain the path to our
16
- # java wrappers, but we don't want it.
17
- # Since the presence of this part is checked only here we
18
- # assume it is obsolete and set this path manually here.
19
- # if parameters.empty?
20
- # puts "Error: The OpenNLP maxent system needs two paths (first the location of maxent itself and then the location of the interface, usually program/tools/maxent)."
21
- # puts "I got only the program path."
22
- # Kernel.exit
23
- # end
24
- # @interface_path = parameters.first
25
-
26
- # @note AB: Setting path manually.
27
- # It assumes <Maxent.rb> is in <lib/common> and
28
- # <Classify.class> is in <lib/ext/maxent>.
29
- @interface_path = File.expand_path('../ext/maxent', File.dirname(__FILE__))
30
-
31
- @maxentpath = program_path
32
-
33
- unless @maxentpath =~ /\/$/
34
- @maxentpath = @maxentpath + "/"
35
- end
36
-
37
- # classpath for maxent
38
-
39
- @cp = "#{@maxentpath}:#{@maxentpath}lib:#{@maxentpath}lib/trove.jar:#{@maxentpath}output/maxent-2.4.0.jar:#{ENV["CLASSPATH"]}"
40
-
41
- end
42
-
43
- ###
44
- #
45
- # write classifier to training directory...
46
- def train(infilename,classifier_file)
47
- trainfile = Tempfile.new(File.basename(infilename)+".maxenttrain")
48
- infile = File.new(infilename)
49
- c45_to_maxent(infile,trainfile) # training data in csv format
50
- infile.close
51
- trainfile.close
52
-
53
- if classifier_file
54
- @classifier_location = classifier_file
55
- else
56
- @classifier_location = trainfile.path+"Model.bin.gz"
57
- end
58
-
59
- @classifier_location = enforce_compact_storage(@classifier_location)
60
-
61
- # store model in binary, gzipped form...
62
- command = ["cd #{@interface_path}; ",
63
- #"/usr/lib/jvm/java-1.7.0/bin/java -cp #{@cp} -Xmx1000m Train",
64
- "java -cp #{@cp} -Xmx1000m Train",
65
- trainfile.path,
66
- @classifier_location].join(" ")
67
- # remember location
68
- unless successfully_run(command)
69
- return false
70
- end
71
- trainfile.close(true)
72
- end
73
-
74
- def write(classifier_file)
75
-
76
- classifier_file = enforce_compact_storage(classifier_file)
77
-
78
- if @classifier_location
79
- @classifier_location = enforce_compact_storage(@classifier_location)
80
- %x{cp #{@classifier_location} #{classifier_file}} # store classifier
81
- # File.chmod(0664,classifier_file+".classifier")
82
- else
83
- $stderr.puts "Maxent error: cannot read Maxent classifier file #{@classifier_file}."
84
- return nil
85
- end
86
- end
87
-
88
- ###
89
- def exists?(classifier_file)
90
- classifier_file = enforce_compact_storage(classifier_file)
91
- return FileTest.exists?(classifier_file)
92
- end
93
-
94
- ###
95
- # returns true iff reading the classifier succeeded
96
- def read(classifier_file)
97
-
98
- classifier_file = enforce_compact_storage(classifier_file)
99
-
100
- if exists?(classifier_file)
101
- @classifier_location = classifier_file
102
- return true
103
- else
104
- $stderr.puts "No classifier file "+classifier_file
105
- return false
106
- end
107
- end
108
-
109
- ###
110
- def apply(infilename,outfilename)
111
-
112
- @classifier_location = enforce_compact_storage(@classifier_location)
113
- unless @classifier_location
114
- return false
115
- end
116
-
117
- testfile = Tempfile.new(File.basename(infilename)+".maxenttrain")
118
-
119
- infile = File.new(infilename)
120
- c45_to_maxent(infile,testfile) # training data in csv format
121
- infile.close
122
- testfile.close
123
-
124
- command = ["cd #{@interface_path}; ",
125
- #"/usr/lib/jvm/java-1.7.0/bin/java -cp #{@cp} -Xmx1000m Classify ",
126
- "java -cp #{@cp} -Xmx1000m Classify ",
127
- testfile.path,
128
- @classifier_location,
129
- ">",
130
- outfilename].join(" ")
131
-
132
- # classify
133
- unless successfully_run(command)
134
- return false
135
- end
136
-
137
- # some error in classification
138
- unless FileTest.exists?(outfilename)
139
- return false
140
- end
141
-
142
- # no errors = success
143
- testfile.close(true)
144
- return true
145
- end
146
-
147
- #####
148
- # format of Maxent result file:
149
- # <best label>[<confidence>] <secondbest_label>[<confidence>] ....
150
- #
151
- # returns a list of instance_results
152
- # where an instance_result is a list of pairs [label, confidence]
153
- # where the pairs are sorted by confidence
154
- def read_resultfile(filename)
155
- begin
156
- f = File.new(filename)
157
- rescue
158
- $stderr.puts "Maxent error: cannot read Maxent result file #{filemame}."
159
- return nil
160
- end
161
-
162
- retv = []
163
-
164
- f.each do |line|
165
- line_results = Array.new()
166
- pieces = line.split() # split at whitespace
167
-
168
- pieces.each {|piece|
169
- piece =~ /(\S+)\[(.+)\]/
170
- label = $1
171
- confidence = $2.to_f
172
-
173
- line_results << [label, confidence]
174
- }
175
-
176
- # sort: most confident label first
177
- retv << line_results.sort {|a,b| b[1] <=> a[1]}
178
- end
179
-
180
- f.close
181
-
182
- retv
183
- end
184
-
185
-
186
- ###################################
187
- private
188
-
189
- ###
190
- # produce input file for maxent learner: make attribute-value pairs
191
- # where attribute == featureX=
192
- def c45_to_maxent(inpipe,outpipe)
193
- while (line = inpipe.gets)
194
- line.chomp!
195
- la = line.split(",")
196
- label = la.pop
197
- if label[-1,1] == "."
198
- label.chop!
199
- end
200
- la.each_index {|i|
201
- la[i] = i.to_s() + "=" + la[i]
202
- }
203
- la.push(label)
204
- outpipe.puts la.join(" ")
205
- end
206
- end
207
-
208
- # since the OpenNLP MaxEnt system determines storage based on filename,
209
- # make sure that all models are stored internally as binary, gzipped files.
210
-
211
- def enforce_compact_storage(filename)
212
- if filename =~ /Model.bin.gz/
213
- return filename
214
- else
215
- return filename+"Model.bin.gz"
216
- end
217
- end
218
-
219
- ###
220
- def successfully_run(command)
221
- retv = Kernel.system(command)
222
- unless retv
223
- $stderr.puts "Error running classifier. Continuing."
224
- $stderr.puts "Offending command: "+command
225
- # exit 1
226
- end
227
- return retv
228
- end
229
- end