shalmaneser 1.2.0.rc4 → 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/shalmaneser +8 -2
- data/doc/index.md +1 -0
- data/lib/shalmaneser/opt_parser.rb +68 -67
- metadata +49 -119
- data/bin/fred +0 -16
- data/bin/frprep +0 -34
- data/bin/rosy +0 -17
- data/lib/common/AbstractSynInterface.rb +0 -1229
- data/lib/common/Counter.rb +0 -18
- data/lib/common/EnduserMode.rb +0 -27
- data/lib/common/Eval.rb +0 -480
- data/lib/common/FixSynSemMapping.rb +0 -196
- data/lib/common/Graph.rb +0 -345
- data/lib/common/ISO-8859-1.rb +0 -24
- data/lib/common/ML.rb +0 -186
- data/lib/common/Mallet.rb +0 -236
- data/lib/common/Maxent.rb +0 -229
- data/lib/common/Optimise.rb +0 -195
- data/lib/common/Parser.rb +0 -213
- data/lib/common/RegXML.rb +0 -269
- data/lib/common/RosyConventions.rb +0 -171
- data/lib/common/STXmlTerminalOrder.rb +0 -194
- data/lib/common/SalsaTigerRegXML.rb +0 -2347
- data/lib/common/SalsaTigerXMLHelper.rb +0 -99
- data/lib/common/SynInterfaces.rb +0 -282
- data/lib/common/TabFormat.rb +0 -721
- data/lib/common/Tiger.rb +0 -1448
- data/lib/common/Timbl.rb +0 -144
- data/lib/common/Tree.rb +0 -61
- data/lib/common/config_data.rb +0 -470
- data/lib/common/config_format_element.rb +0 -220
- data/lib/common/headz.rb +0 -338
- data/lib/common/option_parser.rb +0 -13
- data/lib/common/prep_config_data.rb +0 -62
- data/lib/common/prep_helper.rb +0 -1330
- data/lib/common/ruby_class_extensions.rb +0 -310
- data/lib/db/db_interface.rb +0 -48
- data/lib/db/db_mysql.rb +0 -145
- data/lib/db/db_sqlite.rb +0 -280
- data/lib/db/db_table.rb +0 -239
- data/lib/db/db_wrapper.rb +0 -176
- data/lib/db/sql_query.rb +0 -243
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/fred/Baseline.rb +0 -150
- data/lib/fred/FileZipped.rb +0 -31
- data/lib/fred/FredBOWContext.rb +0 -877
- data/lib/fred/FredConventions.rb +0 -232
- data/lib/fred/FredDetermineTargets.rb +0 -319
- data/lib/fred/FredEval.rb +0 -312
- data/lib/fred/FredFeatureExtractors.rb +0 -322
- data/lib/fred/FredFeatures.rb +0 -1061
- data/lib/fred/FredFeaturize.rb +0 -602
- data/lib/fred/FredNumTrainingSenses.rb +0 -27
- data/lib/fred/FredParameters.rb +0 -402
- data/lib/fred/FredSplit.rb +0 -84
- data/lib/fred/FredSplitPkg.rb +0 -180
- data/lib/fred/FredTest.rb +0 -606
- data/lib/fred/FredTrain.rb +0 -144
- data/lib/fred/PlotAndREval.rb +0 -480
- data/lib/fred/fred.rb +0 -47
- data/lib/fred/fred_config_data.rb +0 -185
- data/lib/fred/md5.rb +0 -23
- data/lib/fred/opt_parser.rb +0 -250
- data/lib/frprep/Ampersand.rb +0 -39
- data/lib/frprep/CollinsInterface.rb +0 -1165
- data/lib/frprep/Counter.rb +0 -18
- data/lib/frprep/FNCorpusXML.rb +0 -643
- data/lib/frprep/FNDatabase.rb +0 -144
- data/lib/frprep/FrameXML.rb +0 -513
- data/lib/frprep/Graph.rb +0 -345
- data/lib/frprep/MiniparInterface.rb +0 -1388
- data/lib/frprep/RegXML.rb +0 -269
- data/lib/frprep/STXmlTerminalOrder.rb +0 -194
- data/lib/frprep/SleepyInterface.rb +0 -384
- data/lib/frprep/TntInterface.rb +0 -44
- data/lib/frprep/TreetaggerInterface.rb +0 -327
- data/lib/frprep/do_parses.rb +0 -143
- data/lib/frprep/frprep.rb +0 -693
- data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
- data/lib/frprep/interfaces/stanford_interface.rb +0 -353
- data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
- data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
- data/lib/frprep/one_parsed_file.rb +0 -28
- data/lib/frprep/opt_parser.rb +0 -94
- data/lib/frprep/ruby_class_extensions.rb +0 -310
- data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
- data/lib/rosy/ExternalConfigData.rb +0 -58
- data/lib/rosy/FailedParses.rb +0 -130
- data/lib/rosy/FeatureInfo.rb +0 -242
- data/lib/rosy/GfInduce.rb +0 -1115
- data/lib/rosy/GfInduceFeature.rb +0 -148
- data/lib/rosy/InputData.rb +0 -294
- data/lib/rosy/RosyConfusability.rb +0 -338
- data/lib/rosy/RosyEval.rb +0 -465
- data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
- data/lib/rosy/RosyFeaturize.rb +0 -281
- data/lib/rosy/RosyInspect.rb +0 -336
- data/lib/rosy/RosyIterator.rb +0 -478
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
- data/lib/rosy/RosyPruning.rb +0 -165
- data/lib/rosy/RosyServices.rb +0 -744
- data/lib/rosy/RosySplit.rb +0 -232
- data/lib/rosy/RosyTask.rb +0 -19
- data/lib/rosy/RosyTest.rb +0 -829
- data/lib/rosy/RosyTrain.rb +0 -234
- data/lib/rosy/RosyTrainingTestTable.rb +0 -787
- data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
- data/lib/rosy/View.rb +0 -418
- data/lib/rosy/opt_parser.rb +0 -379
- data/lib/rosy/rosy.rb +0 -78
- data/lib/rosy/rosy_config_data.rb +0 -121
- data/lib/shalmaneser/version.rb +0 -3
data/lib/common/ISO-8859-1.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
# KE changed July 05: now no inclusion of modules required,
|
2
|
-
# and names changed from REXML.Encoding to UtfIso
|
3
|
-
|
4
|
-
# Helpers for converting strings between UTF-8 and ISO-8859-1 byte values.
module UtfIso
  # Convert from UTF-8.
  #
  # +content+ is decoded into Unicode code points; every code point up to
  # 0xFF is packed as a single byte. Code points above 0xFF are dropped
  # (a numeric-entity fallback of the form &#nnnn; existed once but is
  # disabled).
  def self.to_iso_8859_1(content)
    single_byte_points = content.unpack('U*').select { |codepoint| codepoint <= 0xFF }
    single_byte_points.pack('C*')
  end

  # Convert to UTF-8.
  #
  # Every byte of +str+ is read as one code point and re-encoded as UTF-8.
  def self.from_iso_8859_1(str)
    byte_values = str.unpack('C*')
    byte_values.pack('U*')
  end
end
|
data/lib/common/ML.rb
DELETED
@@ -1,186 +0,0 @@
|
|
1
|
-
# sp 24 08 04
|
2
|
-
|
3
|
-
# this file provides a very simple wrapper for using different ML systems
|
4
|
-
# all you need to do is to write the appropriate learner class
|
5
|
-
# and insert them in the initialize routine here in ML()
|
6
|
-
#
|
7
|
-
# available at the moment:
|
8
|
-
# * timbl (memory-based learner)
|
9
|
-
# * mallet-maxent (another maxent system)
|
10
|
-
# * maxent (the OpenNLP maxent system)
|
11
|
-
|
12
|
-
# part of contract: learner is not initialised unless it is either trained or read
|
13
|
-
|
14
|
-
require "common/Optimise"
|
15
|
-
|
16
|
-
# Thin wrapper that hides which machine-learning system is used.
#
# A Classifier delegates all real work to a learner object (see LEARNERS) and
# optionally runs the data through an Optimise step before handing it over.
# Part of the contract: the classifier is not ready for #apply unless it has
# been either trained (#train) or read from disk (#read).
#
# NOTE: every method that takes a file name sanitises it IN PLACE (removes
# "<" and ">", turns spaces into underscores), so the caller's string object
# is modified.
class Classifier
  # Known learners as [name used in the experiment file,
  #                    file name below common/, class name].
  LEARNERS = [
    ["timbl", "Timbl", "Timbl"],
    # ["mallet", "Mallet", "Mallet"],
    ["maxent", "Maxent", "Maxent"]
  ].freeze

  # learner: name of the learner, must be listed in LEARNERS
  # params:  array taken from the experiment file. An optional leading
  #          "optimise" switches feature optimisation on; the next element
  #          is the program path of the learner; whatever remains is passed
  #          on to the learner. The array is consumed (shifted) in place.
  #
  # Terminates the process with status 1 if the program path does not exist
  # or the learner is unknown.
  def initialize(learner, params)
    @ready = false

    @optimise = params[0] == "optimise"
    params.shift if @optimise

    program_path = extract_program_path(params)

    # try to find our learner in the pre-set list of learners
    learner_tuple = LEARNERS.assoc(learner)
    unless learner_tuple
      $stderr.puts "Error: I don't know the learner " + learner.to_s
      $stderr.puts "Perhaps an erroneous entry in your experiment file?"
      exit 1
    end

    _learner_name, learner_filename, learner_classname = learner_tuple
    # Loaded lazily on purpose: only the selected learner is required.
    require "common/#{learner_filename}"
    @learner = Object.const_get(learner_classname).new(program_path, params)
  end

  # Train on the data in trainfile.
  #
  # classifier_file optionally names the location where the learner should
  # store the classifier; it is handed to the learner unchanged.
  # With optimisation switched on, the learner is trained on a temporary
  # "<trainfile>.opted" file, which is removed afterwards even if training
  # raises.
  def train(trainfile, classifier_file = nil)
    sanitize_filename!(trainfile)

    if @optimise
      STDERR.puts "[ML] using feature optimisation"
      @optimiser = Optimise.new
      @optimiser.init_from_data(trainfile)
      optimisedfile = trainfile + ".opted"
      begin
        @optimiser.apply(trainfile, optimisedfile)
        @learner.train(optimisedfile, classifier_file)
      ensure
        File.delete(optimisedfile) if File.exist?(optimisedfile)
      end
    else
      STDERR.puts "[ML] no feature optimisation"
      @learner.train(trainfile, classifier_file)
    end

    @ready = true
  end

  # Read a stored classifier (and, if optimisation is on, the stored
  # optimisation that belongs to it).
  #
  # Returns true iff reading has had success; otherwise the falsy status of
  # the learner, or false if the optimisation file is missing.
  def read(classifier_file)
    sanitize_filename!(classifier_file)

    status = @learner.read(classifier_file)
    unless status
      STDERR.puts "reading from #{classifier_file} did not succeed"
      return status
    end

    if @optimise
      optimisations_filename = Optimise.recommended_filename(classifier_file)
      unless File.exist?(optimisations_filename)
        STDERR.puts "[ML] Error: attempted to read stored optimisation, but file does not exist"
        return false
      end
      @optimiser = Optimise.new
      @optimiser.init_from_file(optimisations_filename)
    end

    @ready = true
    true
  end

  # Store the classifier. This can be more than one file
  # (classifier-specific), but all files start with classifier_file.
  # With optimisation on, the optimisation is stored next to it.
  def write(classifier_file)
    sanitize_filename!(classifier_file)
    @learner.write(classifier_file)
    @optimiser.store(Optimise.recommended_filename(classifier_file)) if @optimise
  end

  # Check if a classifier is living at some particular path.
  # Returns whatever the learner's exists? returns.
  def exists?(classifier_file)
    sanitize_filename!(classifier_file)
    @learner.exists?(classifier_file)
  end

  # Apply the classifier to testfile and write the results to outfile.
  #
  # Returns false if the classifier is not ready or testfile does not exist;
  # otherwise the result of the learner's apply.
  # With optimisation on, the learner sees a temporary "<testfile>.opted"
  # file; it is deleted once the learner is done (the original code returned
  # before its delete statement, so the file was never removed).
  def apply(testfile, outfile)
    sanitize_filename!(testfile)
    sanitize_filename!(outfile)

    unless @ready
      STDERR.puts "[ML] Warning: learner not ready for testing! Must be trained or read."
      return false
    end

    # do we have a testfile?
    unless File.exist?(testfile)
      STDERR.puts "[ML] Warning: could not find testfile (maybe empty test set?)."
      return false
    end

    return @learner.apply(testfile, outfile) unless @optimise

    optimisedfile = testfile + ".opted"
    begin
      @optimiser.apply(testfile, optimisedfile)
      @learner.apply(optimisedfile, outfile)
    ensure
      File.delete(optimisedfile) if File.exist?(optimisedfile)
    end
  end

  # Read a classifier result file by delegating to the learner.
  # According to the learners in this code base the result is a list of
  # instance_results, where an instance_result is a list of pairs
  # [label, confidence].
  def read_resultfile(file)
    @learner.read_resultfile(file)
  end

  private

  # Make sure we produce a valid file name: drop "<" and ">", replace spaces
  # by underscores. Works in place on purpose, see the class comment.
  def sanitize_filename!(name)
    name.delete!("<>")
    name.tr!(" ", "_")
    name
  end

  # Take the program path off the front of params.
  # A missing path is only reported (the learner then gets an empty path);
  # a path that does not exist on disk terminates the process.
  def extract_program_path(params)
    raw_path = params.shift
    unless raw_path.respond_to?(:chomp)
      $stderr.puts "Error: No program path provided for classifier system."
      return ""
    end

    program_path = raw_path.chomp
    unless File.exist?(program_path)
      $stderr.puts "Error: Could not find classifier system at " + program_path
      $stderr.puts "Perhaps an erroneous entry in your experiment file?"
      exit 1
    end
    program_path
  end
end
|
data/lib/common/Mallet.rb
DELETED
@@ -1,236 +0,0 @@
|
|
1
|
-
# wrapper script for the Mallet toolkit Maxent classifier
|
2
|
-
|
3
|
-
# Problem with Winnow: cannot be serialised (written to file). Support dropped.
|
4
|
-
|
5
|
-
# sp 27 10 04
|
6
|
-
|
7
|
-
|
8
|
-
require "tempfile"
|
9
|
-
require "ftools"
|
10
|
-
|
11
|
-
# Wrapper for the Mallet toolkit Maxent classifier.
#
# Training and classification are done by external programs: Mallet's
# bin/csv2vectors for encoding the data and the Java classes Train / Classify
# that are started from the interface directory.
class Mallet
  # FileUtils takes over the copying that used to rely on File.copy and
  # File.safe_unlink from the "ftools" library; this class no longer calls
  # anything from "ftools".
  require "fileutils"

  # program_path: location of mallet itself
  # parameters:   array whose first element is the location of the interface
  #               (usually program/tools/mallet)
  #
  # Terminates the process with status 1 if parameters is empty.
  def initialize(program_path, parameters)
    if parameters.empty?
      $stderr.puts "Error: Mallet needs two paths (first the location of mallet itself and then the location of the interface, usually program/tools/mallet)."
      $stderr.puts "I got only the program path."
      Kernel.exit(1)
    end

    @malletpath = program_path
    @interface_path = parameters.first
    @malletpath = @malletpath + "/" unless @malletpath =~ /\/$/

    @learner = "MaxEnt,gaussianPriorVariance=1.0"

    # classpath for mallet
    @cp = "#{ENV["CLASSPATH"]}:#{@malletpath}class:#{@malletpath}lib/bsh.jar"

    # set by #train or #read
    @mallet_train_vectors = nil
    @classifier_mallet_path = nil
  end

  # Train a classifier on the C4.5-style data in infilename.
  #
  # The encoded training vectors go to "<infilename>.trainvectors", the
  # classifier to classifier_location or, if that is nil, to
  # "<infilename>.classifier".
  # Returns true if both external commands ran successfully, else false.
  # The temporary csv file is removed in any case.
  def train(infilename, classifier_location)
    csvfile = Tempfile.new(File.basename(infilename) + ".csvtrain")
    begin
      # training data in csv format
      File.open(infilename) { |infile| c45_to_csv(infile, csvfile) }
      csvfile.close

      @mallet_train_vectors = infilename + ".trainvectors" # training data in mallet format
      @classifier_mallet_path = classifier_location || infilename + ".classifier"

      encode_command = [@malletpath + "bin/csv2vectors ",
                        " --input ", csvfile.path,
                        " --output ", @mallet_train_vectors].join("")

      train_command = ["cd #{@interface_path}; ",
                       "java -cp #{@cp} -Xmx1000m Train ",
                       " --train ", @mallet_train_vectors,
                       " --out ", @classifier_mallet_path,
                       " --trainer ", @learner].join("")

      # training on vectors that could not be encoded makes no sense
      successfully_run(encode_command) && successfully_run(train_command) ? true : false
    ensure
      csvfile.close(true)
    end
  end

  # Store the classifier as "<classifier_file>.classifier" and the training
  # vectors as "<classifier_file>.trainvectors". The vectors are needed to
  # recreate the pipe for testing data. Copy problems are reported on stderr
  # and do not raise.
  def write(classifier_file)
    copy_file(@classifier_mallet_path, classifier_file + ".classifier") if @classifier_mallet_path
    copy_file(@mallet_train_vectors, classifier_file + ".trainvectors") if @mallet_train_vectors
  end

  # True iff both files written by #write exist for classifier_file.
  def exists?(classifier_file)
    File.exist?(classifier_file + ".trainvectors") &&
      File.exist?(classifier_file + ".classifier")
  end

  # Remember the files of a stored classifier.
  # Returns true iff reading the classifier has had success, i.e. both
  # files exist.
  def read(classifier_file)
    @mallet_train_vectors = classifier_file + ".trainvectors" # training data in mallet format
    @classifier_mallet_path = classifier_file + ".classifier"

    [@mallet_train_vectors, @classifier_mallet_path].each do |required_file|
      next if File.exist?(required_file)

      $stderr.puts "No classifier file " + required_file
      return false
    end
    true
  end

  # Classify the C4.5-style data in infilename, writing Mallet's output to
  # outfilename.
  #
  # Returns true on success and false if no classifier is available, a
  # command fails or an expected output file is missing. Both temporary
  # files are removed on every path. The encoded test data
  # "<infilename>.test.vectors" is left in place.
  def apply(infilename, outfilename)
    return false unless @classifier_mallet_path && @mallet_train_vectors

    csvfile = Tempfile.new(File.basename(infilename) + ".csvtest")
    pipe_copy = Tempfile.new("mallet")
    begin
      # test data in csv format
      File.open(infilename) { |infile| c45_to_csv(infile, csvfile) }
      csvfile.close

      test_mallet_path = infilename + ".test.vectors" # test data in mallet format

      # copy train vectors to temp file.
      # reason: mallet in std edition reads _and writes_ this file
      # if rosy is interrupted, corrupted (ie incomplete) train vector files
      # result
      pipe_copy.close
      return false unless copy_file(@mallet_train_vectors, pipe_copy.path)

      encode_command = [@malletpath + "bin/csv2vectors", # encode testing data
                        " --input ", csvfile.path,
                        " --output ", test_mallet_path,
                        " --use-pipe-from ", pipe_copy.path].join("")
      return false unless successfully_run(encode_command)

      # some error in encoding?
      return false unless File.exist?(test_mallet_path)

      classify_command = ["cd #{@interface_path}; ",
                          "java -cp #{@cp} -Xmx1000m Classify ",
                          @classifier_mallet_path, " ",
                          test_mallet_path, " ",
                          "> ", outfilename].join("")
      return false unless successfully_run(classify_command)

      # some error in classification?
      return false unless File.exist?(outfilename)

      # no errors = success
      true
    ensure
      csvfile.close(true)
      pipe_copy.close(true)
    end
  end

  # Read a Mallet result file.
  #
  # Format of the file:
  # <best label> <confidence> \t <secondbest_label> <confidence>....
  #
  # Returns one list per line, each a list of pairs [label, confidence] in
  # file order, or nil if the file cannot be read. A label without a
  # confidence is reported on stderr and gets confidence 0.0.
  def read_resultfile(filename)
    begin
      lines = File.readlines(filename)
    rescue StandardError
      $stderr.puts "Mallet error: cannot read Mallet result file #{filename}."
      return nil
    end

    lines.map do |line|
      line.split.each_slice(2).map do |label, raw_confidence|
        if raw_confidence.nil?
          $stderr.puts "Error reading mallet output: invalid line: #{line}"
        end
        [label, raw_confidence.to_f]
      end
    end
  end

  ###################################
  private

  # mallet needs "comma separated values"-file
  # input: features separated by comma, class label last (a trailing "."
  #        after the label is removed)
  # output:
  # line_number classlabel features_joined_by_spaces
  def c45_to_csv(inpipe, outpipe)
    line_number = 0
    while (line = inpipe.gets)
      line.chomp!
      line_number += 1
      features = line.split(",")
      label = features.pop
      label.chop! if label[-1, 1] == "."
      outpipe.puts [line_number, label].join(" ") + " " + features.join(" ")
    end
  end

  # Run command through the shell. A failure is reported on stderr but does
  # not stop the program; the result of Kernel.system is returned.
  def successfully_run(command)
    retv = Kernel.system(command)
    unless retv
      $stderr.puts "Error running classifier. Continuing."
      $stderr.puts "Offending command: " + command
    end
    retv
  end

  # Copy source to target. Returns true on success; on failure the problem
  # is reported on stderr and false is returned.
  def copy_file(source, target)
    FileUtils.cp(source, target)
    true
  rescue ArgumentError, SystemCallError => e
    $stderr.puts "Mallet error: cannot copy #{source} to #{target}: #{e.message}"
    false
  end
end
|
data/lib/common/Maxent.rb
DELETED
@@ -1,229 +0,0 @@
|
|
1
|
-
# wrapper script for the OpenNLP Maxent classifier
|
2
|
-
|
3
|
-
# sp July 2007
|
4
|
-
|
5
|
-
|
6
|
-
require "tempfile"
|
7
|
-
require 'fileutils'
|
8
|
-
|
9
|
-
# Wrapper for the OpenNLP Maxent classifier.
#
# Training and classification are done by the Java classes Train and
# Classify, which are started from the interface directory.
class Maxent
  # Suffix that every stored model carries, see #enforce_compact_storage.
  MODEL_SUFFIX = "Model.bin.gz".freeze

  # program_path: location of maxent itself
  # parameters:   rest of the line from the experiment file. It used to
  #               carry the path to the java wrappers, but is ignored now:
  #               the interface path is set manually below.
  def initialize(program_path, parameters)
    # @note AB: Setting path manually.
    #   It assumes <Maxent.rb> is in <lib/common> and
    #   <Classify.class> is in <lib/ext/maxent>.
    @interface_path = File.expand_path('../ext/maxent', File.dirname(__FILE__))

    @maxentpath = program_path
    @maxentpath = @maxentpath + "/" unless @maxentpath =~ /\/$/

    # classpath for maxent
    @cp = "#{@maxentpath}:#{@maxentpath}lib:#{@maxentpath}lib/trove.jar:#{@maxentpath}output/maxent-2.4.0.jar:#{ENV["CLASSPATH"]}"

    # set by #train or #read
    @classifier_location = nil
  end

  # Train a classifier on the C4.5-style data in infilename.
  #
  # The model is stored in binary, gzipped form at classifier_file or, if
  # that is nil, next to the temporary training file; the location is
  # remembered for #write and #apply.
  # Returns true if the external command ran successfully, else false.
  # The temporary training file is removed in any case.
  def train(infilename, classifier_file)
    trainfile = Tempfile.new(File.basename(infilename) + ".maxenttrain")
    begin
      # training data in maxent format
      File.open(infilename) { |infile| c45_to_maxent(infile, trainfile) }
      trainfile.close

      @classifier_location = enforce_compact_storage(classifier_file || trainfile.path)

      command = ["cd #{@interface_path}; ",
                 "java -cp #{@cp} -Xmx1000m Train",
                 trainfile.path,
                 @classifier_location].join(" ")

      successfully_run(command) ? true : false
    ensure
      trainfile.close(true)
    end
  end

  # Store the current classifier at classifier_file (with the model suffix
  # enforced).
  #
  # Returns true if the classifier is stored there afterwards, nil if there
  # is no classifier to store or copying failed (both reported on stderr).
  def write(classifier_file)
    target = enforce_compact_storage(classifier_file)

    unless @classifier_location
      $stderr.puts "Maxent error: cannot read Maxent classifier file #{classifier_file}."
      return nil
    end

    @classifier_location = enforce_compact_storage(@classifier_location)

    # Training directly to classifier_file leaves nothing to copy.
    return true if File.expand_path(@classifier_location) == File.expand_path(target)

    begin
      FileUtils.cp(@classifier_location, target) # store classifier
      true
    rescue ArgumentError, SystemCallError => e
      $stderr.puts "Maxent error: cannot store classifier in #{target}: #{e.message}"
      nil
    end
  end

  # True iff a stored model exists for classifier_file.
  def exists?(classifier_file)
    File.exist?(enforce_compact_storage(classifier_file))
  end

  # Remember the location of a stored classifier.
  # Returns true iff reading the classifier has had success, i.e. the model
  # file exists.
  def read(classifier_file)
    model_file = enforce_compact_storage(classifier_file)

    unless File.exist?(model_file)
      $stderr.puts "No classifier file " + model_file
      return false
    end

    @classifier_location = model_file
    true
  end

  # Classify the C4.5-style data in infilename, writing the classifier
  # output to outfilename.
  #
  # Returns true on success and false if no classifier has been trained or
  # read, the command fails or the output file is missing. The temporary
  # test file is removed on every path.
  def apply(infilename, outfilename)
    # Checked first: enforcing the suffix on nil would raise.
    return false unless @classifier_location

    @classifier_location = enforce_compact_storage(@classifier_location)

    testfile = Tempfile.new(File.basename(infilename) + ".maxenttrain")
    begin
      # test data in maxent format
      File.open(infilename) { |infile| c45_to_maxent(infile, testfile) }
      testfile.close

      command = ["cd #{@interface_path}; ",
                 "java -cp #{@cp} -Xmx1000m Classify ",
                 testfile.path,
                 @classifier_location,
                 ">",
                 outfilename].join(" ")

      # classify
      return false unless successfully_run(command)

      # some error in classification?
      return false unless File.exist?(outfilename)

      # no errors = success
      true
    ensure
      testfile.close(true)
    end
  end

  # Read a Maxent result file.
  #
  # Format of the file:
  # <best label>[<confidence>] <secondbest_label>[<confidence>] ....
  #
  # Returns a list of instance_results (one per line), where an
  # instance_result is a list of pairs [label, confidence] sorted by
  # confidence, most confident label first. Returns nil if the file cannot
  # be read. A piece that does not have the expected form yields
  # [nil, 0.0].
  def read_resultfile(filename)
    begin
      lines = File.readlines(filename)
    rescue StandardError
      $stderr.puts "Maxent error: cannot read Maxent result file #{filename}."
      return nil
    end

    lines.map do |line|
      scored_labels = line.split.map do |piece| # split at whitespace
        match = /(\S+)\[(.+)\]/.match(piece)
        match ? [match[1], match[2].to_f] : [nil, 0.0]
      end

      # sort: most confident label first
      scored_labels.sort { |a, b| b[1] <=> a[1] }
    end
  end

  ###################################
  private

  # produce input file for maxent learner: make attribute-value pairs
  # where attribute == feature index, followed by "=";
  # the class label (without a trailing ".") comes last
  def c45_to_maxent(inpipe, outpipe)
    while (line = inpipe.gets)
      line.chomp!
      values = line.split(",")
      label = values.pop
      label.chop! if label[-1, 1] == "."
      pairs = values.each_with_index.map { |value, index| index.to_s + "=" + value }
      pairs.push(label)
      outpipe.puts pairs.join(" ")
    end
  end

  # since the OpenNLP MaxEnt system determines storage based on filename,
  # make sure that all models are stored internally as binary, gzipped files:
  # the suffix is appended unless the name already ends with it.
  def enforce_compact_storage(filename)
    filename.end_with?(MODEL_SUFFIX) ? filename : filename + MODEL_SUFFIX
  end

  # Run command through the shell. A failure is reported on stderr but does
  # not stop the program; the result of Kernel.system is returned.
  def successfully_run(command)
    retv = Kernel.system(command)
    unless retv
      $stderr.puts "Error running classifier. Continuing."
      $stderr.puts "Offending command: " + command
    end
    retv
  end
end
|