shalmaneser 0.0.1.alpha → 1.2.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +2 -2
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +49 -0
- data/bin/fred +18 -0
- data/bin/frprep +34 -0
- data/bin/rosy +17 -0
- data/lib/common/AbstractSynInterface.rb +35 -33
- data/lib/common/Mallet.rb +236 -0
- data/lib/common/Maxent.rb +26 -12
- data/lib/common/Parser.rb +5 -5
- data/lib/common/SynInterfaces.rb +13 -6
- data/lib/common/TabFormat.rb +7 -6
- data/lib/common/Tiger.rb +4 -4
- data/lib/common/Timbl.rb +144 -0
- data/lib/common/{FrprepHelper.rb → frprep_helper.rb} +14 -8
- data/lib/common/headz.rb +1 -1
- data/lib/common/ruby_class_extensions.rb +3 -3
- data/lib/fred/FredBOWContext.rb +14 -2
- data/lib/fred/FredDetermineTargets.rb +4 -9
- data/lib/fred/FredEval.rb +1 -1
- data/lib/fred/FredFeatureExtractors.rb +4 -3
- data/lib/fred/FredFeaturize.rb +1 -1
- data/lib/frprep/CollinsInterface.rb +6 -6
- data/lib/frprep/MiniparInterface.rb +5 -5
- data/lib/frprep/SleepyInterface.rb +7 -7
- data/lib/frprep/TntInterface.rb +1 -1
- data/lib/frprep/TreetaggerInterface.rb +29 -5
- data/lib/frprep/do_parses.rb +1 -0
- data/lib/frprep/frprep.rb +36 -32
- data/lib/{common/BerkeleyInterface.rb → frprep/interfaces/berkeley_interface.rb} +69 -95
- data/lib/frprep/interfaces/stanford_interface.rb +353 -0
- data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
- data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
- data/lib/frprep/opt_parser.rb +2 -2
- data/lib/rosy/AbstractFeatureAndExternal.rb +5 -3
- data/lib/rosy/RosyIterator.rb +11 -10
- data/lib/rosy/rosy.rb +1 -0
- data/lib/shalmaneser/version.rb +1 -1
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +1 -1
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +1 -1
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +2 -2
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +2 -2
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +2 -2
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +2 -2
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +2 -2
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +2 -2
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +1 -1
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +7 -7
- data/test/functional/test_frprep.rb +3 -3
- data/test/functional/test_rosy.rb +20 -0
- metadata +215 -224
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/CollinsInterface.rb +0 -1165
- data/lib/common/MiniparInterface.rb +0 -1388
- data/lib/common/SleepyInterface.rb +0 -384
- data/lib/common/TntInterface.rb +0 -44
- data/lib/common/TreetaggerInterface.rb +0 -303
- data/lib/frprep/AbstractSynInterface.rb +0 -1227
- data/lib/frprep/BerkeleyInterface.rb +0 -375
- data/lib/frprep/ConfigData.rb +0 -694
- data/lib/frprep/FixSynSemMapping.rb +0 -196
- data/lib/frprep/FrPrepConfigData.rb +0 -66
- data/lib/frprep/FrprepHelper.rb +0 -1324
- data/lib/frprep/ISO-8859-1.rb +0 -24
- data/lib/frprep/Parser.rb +0 -213
- data/lib/frprep/SalsaTigerRegXML.rb +0 -2347
- data/lib/frprep/SalsaTigerXMLHelper.rb +0 -99
- data/lib/frprep/SynInterfaces.rb +0 -275
- data/lib/frprep/TabFormat.rb +0 -720
- data/lib/frprep/Tiger.rb +0 -1448
- data/lib/frprep/Tree.rb +0 -61
- data/lib/frprep/headz.rb +0 -338
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 83f5f0ca7cc27a632cb46deef7c093df649c61e1
|
4
|
+
data.tar.gz: dbc9a29186421206de7bf9b0138f05f89228fad6
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 8a87f1e74b16082cba8d2ab49eb33289e8db23f5bdf3cdd4f294901c8119c8bff1239ec870032871d6d2cf69efbaba500058a47827df92be707aba3ab36ab30a
|
7
|
+
data.tar.gz: be1f6b6f3e4aa0b20f26437f30c579faf68f03f7c474cb78e28cb1263ef4ab9397ab4d52fbdffa4ac7ceb50a2d3f44cb4200303a7f14b2bdd0cb06fbfae68f0f
|
data/.yardopts
CHANGED
data/CHANGELOG.md
ADDED
data/LICENSE.md
ADDED
data/README.md
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# [SHALMANESER - a SHALlow seMANtic parSER](http://www.coli.uni-saarland.de/projects/salsa/shal/)
|
2
|
+
|
3
|
+
|
4
|
+
[RubyGems](http://rubygems.org/gems/shalmaneser) | [RTT Project Page](http://bu.chsta.be/projects/shalmaneser/) |
|
5
|
+
[Source Code](https://github.com/arbox/shalmaneser) | [Bug Tracker](https://github.com/arbox/shalmaneser/issues)
|
6
|
+
|
7
|
+
[<img src="https://badge.fury.io/rb/shalmaneser.png" alt="Gem Version" />](http://badge.fury.io/rb/shalmaneser)
|
8
|
+
[<img src="https://travis-ci.org/arbox/shalmaneser.png" alt="Build Status" />](https://travis-ci.org/arbox/shalmaneser)
|
9
|
+
[<img src="https://codeclimate.com/github/arbox/shalmaneser.png" alt="Code Climate" />](https://codeclimate.com/github/arbox/shalmaneser)
|
10
|
+
[<img alt="Bitdeli Badge" src="https://d2weczhvl823v0.cloudfront.net/arbox/shalmaneser/trend.png" />](https://bitdeli.com/free)
|
11
|
+
|
12
|
+
## Description
|
13
|
+
|
14
|
+
Please be careful, the whole thing is under construction!
|
15
|
+
|
16
|
+
Shalmaneser is a supervised learning toolbox for shallow semantic parsing, i.e. the automatic assignment of semantic classes and roles to text. The system was developed for Frame Semantics; thus we use Frame Semantics terminology and call the classes frames and the roles frame elements. However, the architecture is reasonably general, and with a certain amount of adaptation, Shalmaneser should be usable for other paradigms (e.g., PropBank roles) as well. Shalmaneser caters to both end users and researchers.
|
17
|
+
|
18
|
+
For end users, we provide a simple end-user mode that applies the pre-trained classifiers for English (FrameNet annotation / Collins parser) and German (SALSA Frame annotation / Sleepy parser). For researchers interested in investigating shallow semantic parsing, our system is extensively configurable and extensible.
|
19
|
+
|
20
|
+
## Origin
|
21
|
+
You can find original versions of Shalmaneser up to ``1.1`` on the [SALSA](http://www.coli.uni-saarland.de/projects/salsa/shal/) project page.
|
22
|
+
|
23
|
+
## Literature
|
24
|
+
|
25
|
+
K. Erk and S. Padó: Shalmaneser - a flexible toolbox for semantic role assignment. Proceedings of LREC 2006, Genoa, Italy. [Click here for details](http://www.nlpado.de/~sebastian/pub/papers/lrec06_erk.pdf).
|
26
|
+
|
27
|
+
## Documentation
|
28
|
+
|
29
|
+
The project documentation can be found in our [doc](doc/index.md) folder.
|
30
|
+
|
31
|
+
## Development
|
32
|
+
|
33
|
+
We are currently working on the following branches:
|
34
|
+
|
35
|
+
- ``dev`` - our development branch incorporating the current changes, for now pointing to ``1.2``;
|
36
|
+
|
37
|
+
- ``1.2`` - intermediate target;
|
38
|
+
|
39
|
+
- ``2.0`` - final target.
|
40
|
+
|
41
|
+
## Installation
|
42
|
+
|
43
|
+
See the installation instructions in the [doc](doc/index.md#installation) folder.
|
44
|
+
|
45
|
+
### Machine Learning Systems
|
46
|
+
|
47
|
+
- http://sourceforge.net/projects/maxent/files/Maxent/2.4.0/
|
48
|
+
|
49
|
+
|
data/bin/fred
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: utf-8 -*-
|
3
|
+
|
4
|
+
# AB, 2011-11-13
|
5
|
+
|
6
|
+
# fred
|
7
|
+
# Katrin Erk, April 05
|
8
|
+
#
|
9
|
+
# Frame disambiguation system:
|
10
|
+
# frame assignment as word sense disambiguation
|
11
|
+
|
12
|
+
require 'fred/opt_parser'
|
13
|
+
require 'fred/fred'
|
14
|
+
|
15
|
+
# Parse the command line arguments and hand the resulting options
# straight to a new Fred instance, then start the assignment run.
Fred::Fred.new(Fred::OptParser.parse(ARGV)).assign
|
data/bin/frprep
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: utf-8 -*-
|
3
|
+
|
4
|
+
# AB, 2010-11-25
|
5
|
+
|
6
|
+
# frprep
|
7
|
+
# Katrin Erk July 05
|
8
|
+
#
|
9
|
+
# Preprocessing for Fred and Rosy:
|
10
|
+
# accept input as plain text,
|
11
|
+
# FrameNet XML, Salsa-tabular format,
|
12
|
+
# or SalsaTigerXML,
|
13
|
+
# lemmatize, POS-tag and parse
|
14
|
+
# (if asked to do so)
|
15
|
+
# and in any case produce output in
|
16
|
+
# SalsaTigerXML.
|
17
|
+
#
|
18
|
+
# Extensions to SalsaTigerXML introduced by frprep:
|
19
|
+
#
|
20
|
+
# - "lemma": lemma. Attribute of terminals.
|
21
|
+
# - "head": head word (not lemma!) of constituent. Attribute of nonterminals.
|
22
|
+
# - "fn_gf": FrameNet grammatical function label, attached to the maximal
|
23
|
+
# constituents covering the terminals labeled with that label
|
24
|
+
|
25
|
+
|
26
|
+
require 'frprep/frprep'
|
27
|
+
require 'frprep/opt_parser'
|
28
|
+
|
29
|
+
|
30
|
+
# Parse the command line arguments and hand the resulting options
# straight to a new FrPrep instance, then run the transformation.
FrPrep::FrPrep.new(FrPrep::OptParser.parse(ARGV)).transform
|
data/bin/rosy
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: utf-8 -*-
|
3
|
+
|
4
|
+
# AB: 2011-11-14
|
5
|
+
# rosy.rb
|
6
|
+
# KE, SP April 05
|
7
|
+
#
|
8
|
+
# Main file of the Rosy role assignment system.
|
9
|
+
|
10
|
+
|
11
|
+
require 'rosy/opt_parser'
|
12
|
+
require 'rosy/rosy'
|
13
|
+
|
14
|
+
# Parse the command line arguments and hand the resulting options
# straight to a new Rosy instance, then start the assignment run.
Rosy::Rosy.new(Rosy::OptParser.parse(ARGV)).assign
|
@@ -25,10 +25,10 @@
|
|
25
25
|
|
26
26
|
require "tempfile"
|
27
27
|
|
28
|
-
require
|
28
|
+
require 'common/ruby_class_extensions'
|
29
29
|
|
30
|
-
require
|
31
|
-
require
|
30
|
+
require 'common/ISO-8859-1'
|
31
|
+
require 'common/Parser'
|
32
32
|
require "common/SalsaTigerRegXML"
|
33
33
|
require "common/TabFormat"
|
34
34
|
|
@@ -42,14 +42,14 @@ class SynInterface
|
|
42
42
|
###
|
43
43
|
# returns a string: the name of the system
|
44
44
|
# e.g. "Collins" or "TNT"
|
45
|
-
def
|
45
|
+
def self.system
|
46
46
|
raise "Overwrite me"
|
47
47
|
end
|
48
48
|
|
49
49
|
###
|
50
50
|
# returns a string: the service offered
|
51
51
|
# one of "lemmatizer", "parser", "pos tagger"
|
52
|
-
def
|
52
|
+
def self.service
|
53
53
|
raise "Overwrite me"
|
54
54
|
end
|
55
55
|
|
@@ -73,10 +73,10 @@ class SynInterface
|
|
73
73
|
def process_dir(in_dir, # string: name of input directory
|
74
74
|
out_dir) # string: name of output directory
|
75
75
|
|
76
|
-
Dir[in_dir
|
77
|
-
outfilename = out_dir
|
78
|
-
process_file(infilename,outfilename)
|
79
|
-
|
76
|
+
Dir["#{in_dir}*#{@insuffix}"].each do |infilename|
|
77
|
+
outfilename = "#{out_dir}#{File.basename(infilename, @insuffix)}#{@outsuffix}"
|
78
|
+
process_file(infilename, outfilename)
|
79
|
+
end
|
80
80
|
end
|
81
81
|
|
82
82
|
###
|
@@ -91,13 +91,13 @@ class SynInterface
|
|
91
91
|
######
|
92
92
|
protected
|
93
93
|
|
94
|
-
def
|
94
|
+
def self.announce_me
|
95
95
|
if defined?(SynInterfaces)
|
96
96
|
# yup, we have a class to which we can announce ourselves
|
97
|
-
SynInterfaces.add_interface(eval(self.name
|
97
|
+
SynInterfaces.add_interface(eval(self.name))
|
98
98
|
else
|
99
99
|
# no interface collector class
|
100
|
-
|
100
|
+
STDERR.puts "Interface #{self.name} not announced: no SynInterfaces."
|
101
101
|
end
|
102
102
|
end
|
103
103
|
end
|
@@ -124,14 +124,13 @@ class SynInterfaceSTXML < SynInterface
|
|
124
124
|
def to_stxml_dir(in_dir, # string: name of dir with parse files
|
125
125
|
out_dir) # string: name of output dir
|
126
126
|
|
127
|
-
Dir[in_dir
|
128
|
-
stxmlfilename = out_dir
|
127
|
+
Dir["#{in_dir}*#{@outsuffix}"].each do |parsefilename|
|
128
|
+
stxmlfilename = "#{out_dir}#{File.basename(parsefilename, @outsuffix)}#{@stsuffix}"
|
129
129
|
to_stxml_file(parsefilename, stxmlfilename)
|
130
|
-
|
130
|
+
end
|
131
131
|
end
|
132
132
|
|
133
|
-
def to_stxml_file(infilename,
|
134
|
-
outfilename)
|
133
|
+
def to_stxml_file(infilename, outfilename)
|
135
134
|
raise "Overwrite me"
|
136
135
|
end
|
137
136
|
|
@@ -142,22 +141,25 @@ class SynInterfaceSTXML < SynInterface
|
|
142
141
|
# SalsaTigerSentence nodes returned by each_sentence():
|
143
142
|
# map the n-th word of the tab sentence to the n-th terminal of
|
144
143
|
# the SalsaTigerSentence
|
145
|
-
def
|
146
|
-
retv =
|
144
|
+
def self.standard_mapping(sent, tabsent)
|
145
|
+
retv = {}
|
146
|
+
|
147
147
|
if sent.nil?
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
148
|
+
retv = nil
|
149
|
+
else
|
150
|
+
terminals = sent.terminals_sorted
|
151
|
+
if tabsent
|
152
|
+
tabsent.each_line_parsed do |l|
|
153
|
+
if (t = terminals[l.get("lineno")])
|
154
|
+
retv[l.get("lineno")] = [t]
|
155
|
+
else
|
156
|
+
retv[l.get("lineno")] = []
|
157
|
+
end
|
157
158
|
end
|
158
|
-
|
159
|
+
end
|
159
160
|
end
|
160
|
-
|
161
|
+
|
162
|
+
retv
|
161
163
|
end
|
162
164
|
|
163
165
|
|
@@ -185,13 +187,13 @@ class SynInterfaceSTXML < SynInterface
|
|
185
187
|
|
186
188
|
# write Salsa/Tiger XML to tempfile
|
187
189
|
tf = Tempfile.new("SynInterface")
|
188
|
-
tf.close
|
190
|
+
tf.close
|
189
191
|
to_stxml_file(infilename, tf.path)
|
190
|
-
tf.flush
|
192
|
+
tf.flush
|
191
193
|
|
192
194
|
# get matching tab file, read
|
193
195
|
tab_reader = get_tab_reader(infilename)
|
194
|
-
tab_sentences =
|
196
|
+
tab_sentences = []
|
195
197
|
tab_reader.each_sentence { |s| tab_sentences << s }
|
196
198
|
|
197
199
|
# read Salsa/Tiger sentences and yield them
|
@@ -0,0 +1,236 @@
|
|
1
|
+
# wrapper script for the Mallet toolkit Maxent classifier
|
2
|
+
|
3
|
+
# Problem with Winnow: cannot be serialised (written to file). Support dropped.
|
4
|
+
|
5
|
+
# sp 27 10 04
|
6
|
+
|
7
|
+
|
8
|
+
require "fileutils"
require "tempfile"

begin
  # ftools was removed from the standard library in Ruby 1.9; it is only
  # loaded where it still exists so that this file stays loadable.
  require "ftools"
rescue LoadError
end
|
10
|
+
|
11
|
+
# Wrapper around the command line tools of the Mallet toolkit, using its
# maximum entropy learner.
#
# Feature files are expected with comma-separated features and the class
# label in the last column (see c45_to_csv). They are converted to the
# format expected by Mallet's csv2vectors tool, encoded, and then passed to
# the Java classes "Train" and "Classify", which are run from the interface
# directory given to the constructor.
class Mallet
  ###
  # program_path: string, directory of the Mallet installation
  # parameters:   array; its first element is the directory from which the
  #               "Train" and "Classify" Java classes are run
  #
  # Exits the program (status 1) if no interface directory is given.
  def initialize(program_path, parameters)
    if parameters.empty?
      $stderr.puts "Error: Mallet needs two paths (first the location of mallet itself and then the location of the interface, usually program/tools/mallet)."
      $stderr.puts "I got only the program path."
      Kernel.exit(1)
    end

    @interface_path = parameters.first

    # the command lines below are built by plain concatenation,
    # so the installation path has to end in a slash
    @malletpath = program_path =~ %r{/$} ? program_path : "#{program_path}/"

    @learner = "MaxEnt,gaussianPriorVariance=1.0"

    # classpath for mallet
    @cp = "#{ENV["CLASSPATH"]}:#{@malletpath}class:#{@malletpath}lib/bsh.jar"
  end

  ###
  # Encode the training data in infilename and train a classifier on it.
  #
  # infilename:          string, name of the training data file
  # classifier_location: string or nil, where to put the trained classifier;
  #                      defaults to infilename + ".classifier"
  #
  # A failing command is reported on stderr but does not abort training.
  def train(infilename, classifier_location)
    csvfile = Tempfile.new(File.basename(infilename) + ".csvtrain")
    # training data in csv format; the block form closes the input file
    # even if the conversion raises
    File.open(infilename) { |infile| c45_to_csv(infile, csvfile) }
    csvfile.close

    # training data in mallet format
    @mallet_train_vectors = "#{infilename}.trainvectors"
    @classifier_mallet_path = classifier_location || "#{infilename}.classifier"

    encode_command = ["#{@malletpath}bin/csv2vectors",
                      "--input", csvfile.path,
                      "--output", @mallet_train_vectors].join(" ")

    train_command = ["cd #{@interface_path};",
                     "java -cp #{@cp} -Xmx1000m Train",
                     "--train", @mallet_train_vectors,
                     "--out", @classifier_mallet_path,
                     "--trainer", @learner].join(" ")

    successfully_run(encode_command) # encode
    successfully_run(train_command)  # train
    csvfile.close(true)
  end

  ###
  # Copy the current classifier and its training vectors to
  # classifier_file + ".classifier" and classifier_file + ".trainvectors".
  # The training vectors are kept to recreate the pipe for testing data.
  def write(classifier_file)
    if @classifier_mallet_path
      %x{cp #{@classifier_mallet_path} #{classifier_file}.classifier} # store classifier
    end
    if @mallet_train_vectors
      %x{cp #{@mallet_train_vectors} #{classifier_file}.trainvectors} # store train vectors to recreate pipe for testing data
    end
  end

  ###
  # returns true iff both files written by write() exist for classifier_file
  def exists?(classifier_file)
    File.exist?("#{classifier_file}.trainvectors") &&
      File.exist?("#{classifier_file}.classifier")
  end

  ###
  # Point this object at a classifier stored by write().
  # return true iff reading the classifier has had success
  def read(classifier_file)
    @mallet_train_vectors = "#{classifier_file}.trainvectors" # training data in mallet format
    @classifier_mallet_path = "#{classifier_file}.classifier"

    [@mallet_train_vectors, @classifier_mallet_path].each do |path|
      next if File.exist?(path)

      $stderr.puts "No classifier file " + path
      return false
    end

    true
  end

  ###
  # Classify the instances in infilename, writing Mallet's output to
  # outfilename. Requires a classifier set by train() or read().
  #
  # returns true on success, false if no classifier is available or any
  # step failed. Temporary files are removed in either case.
  def apply(infilename, outfilename)
    return false unless @classifier_mallet_path && @mallet_train_vectors

    csvfile = Tempfile.new(File.basename(infilename) + ".csvtest")
    pipe_copy = Tempfile.new("mallet")

    begin
      # test data in csv format
      File.open(infilename) { |infile| c45_to_csv(infile, csvfile) }
      csvfile.close
      pipe_copy.close

      # test data in mallet format
      test_mallet_path = "#{infilename}.test.vectors"

      # copy train vectors to temp file.
      # reason: mallet in std edition reads _and writes_ this file
      # if rosy is interrupted, corrupted (ie incomplete) train vector files
      # result
      begin
        FileUtils.cp(@mallet_train_vectors, pipe_copy.path)
      rescue SystemCallError => e
        $stderr.puts "Mallet error: cannot copy #{@mallet_train_vectors}: #{e.message}"
        return false
      end

      encode_command = ["#{@malletpath}bin/csv2vectors", # encode testing data
                        "--input", csvfile.path,
                        "--output", test_mallet_path,
                        "--use-pipe-from", pipe_copy.path].join(" ")
      return false unless successfully_run(encode_command)

      # some error in encoding?
      return false unless File.exist?(test_mallet_path)

      classify_command = ["cd #{@interface_path};",
                          "java -cp #{@cp} -Xmx1000m Classify",
                          @classifier_mallet_path,
                          test_mallet_path,
                          ">", outfilename].join(" ")
      return false unless successfully_run(classify_command)

      # some error in classification? otherwise: no errors = success
      File.exist?(outfilename)
    ensure
      # remove the temporary files on every exit path
      csvfile.close(true)
      pipe_copy.close(true)
    end
  end

  #####
  # format of Mallet result file:
  # <best label> <confidence> \t <secondbest_label> <confidence>....
  #
  # returns: array with one entry per line of the file, each entry an array
  # of [label, confidence] pairs (confidence as float; a label without a
  # confidence gets 0.0), or nil if the file cannot be read
  def read_resultfile(filename)
    begin
      lines = File.readlines(filename)
    rescue SystemCallError, IOError
      $stderr.puts "Mallet error: cannot read Mallet result file #{filename}."
      return nil
    end

    lines.map do |line|
      line.split.each_slice(2).map { |label, confidence| [label, confidence.to_f] }
    end
  end

  ###################################
  private

  ###
  # mallet needs "comma separated values"-file
  # input: features separated by comma
  # output:
  # line_number classlabel features_joined_by_spaces
  #
  # The class label is the last column of each input line; a trailing
  # "." on the label is removed. Line numbers start at 1.
  def c45_to_csv(inpipe, outpipe)
    inpipe.each_line.with_index(1) do |line, idx|
      features = line.chomp.split(",")
      label = features.pop
      label.chop! if label.end_with?(".")
      outpipe.puts "#{idx} #{label} #{features.join(' ')}"
    end
  end

  ###
  # Run a shell command, reporting a failure on stderr without aborting.
  # returns the value of Kernel.system: true on success, false or nil otherwise
  def successfully_run(command)
    retv = Kernel.system(command)
    unless retv
      $stderr.puts "Error running classifier. Continuing."
      $stderr.puts "Offending command: " + command
    end
    retv
  end
end
|