frprep 0.0.1.prealpha
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
data/lib/common/Graph.rb
ADDED
@@ -0,0 +1,345 @@
|
|
1
|
+
# GraphNode: describes one node in a graph.
|
2
|
+
#
|
3
|
+
# A node may have an arbitrary number of parents (sources of incoming edges)
|
4
|
+
# and an arbitrary number of children (targets of outgoing edges)
|
5
|
+
#
|
6
|
+
# All edges are labeled and directed
|
7
|
+
#
|
8
|
+
# The add_parent, add_child, remove_parent, remove_child methods
|
9
|
+
# take care of both ends of an edge
|
10
|
+
# (i.e. n1.add_child(n2, label) also adds n1 as parent of n2 with edge label 'label')
|
11
|
+
#
|
12
|
+
# It is possible to create a 'pointer' rather than an edge:
|
13
|
+
# n1.add_child(n2, label, "pointer_insteadof_edge" => true)
|
14
|
+
# will create an edge from n1 to n2 labeled 'label' that is
|
15
|
+
# listed under the outgoing edges of n1, but not among
|
16
|
+
# the incoming edges of n2
|
17
|
+
# The same option is available for add_parent, remove_parent, remove_child.
|
18
|
+
|
19
|
+
class GraphNode

  # Separators of the flat string format written by _dump and parsed by
  # fill_from_pickle: between the four fields, between [label, id] pairs,
  # and between label and id inside one pair.
  DUMP_FIELD_SEP = "QQSEPVALUESQQ"
  DUMP_PAIR_SEP = "QQPAIRQQ"
  DUMP_LABEL_SEP = "QQSEPQQ"

  # id: identifier of this node. Two GraphNode objects are == iff their
  # ids are ==.
  def initialize(id)
    @id = id
    # both edge lists hold pairs in "internal format": [edgelabel, node]
    @children = []
    @parents = []
    @features = {}
  end

  # for Marshalling:
  # Dump just IDs instead of actual nodes from the parents and children lists.
  # Otherwise the Marshaller would follow all the links to the objects
  # mentioned. After loading, the IDs have to be replaced by actual objects
  # with a little help from the caller (see recover_from_dump).

  # Returns the node as a string with four fields: id, marshalled features,
  # children and parents. Ids and labels are written via to_s, so they
  # come back as strings after loading. The depth argument is not used.
  def _dump(depth)
    @id.to_s +
      DUMP_FIELD_SEP +
      Marshal.dump(@features) +
      DUMP_FIELD_SEP +
      dump_edges(@children) +
      DUMP_FIELD_SEP +
      dump_edges(@parents)
  end

  # Counterpart of _dump: builds a GraphNode from the dumped string.
  # The edge lists of the result contain [label, id] pairs until
  # recover_from_dump has been called on it.
  def self._load(string)
    id = string.split(DUMP_FIELD_SEP).first

    result = GraphNode.new(id)
    result.fill_from_pickle(string)
    return result
  end

  # Sets features, children and parents from a string produced by _dump.
  # The id field of the string is ignored here.
  # Children and parents are stored as [label, id] pairs.
  def fill_from_pickle(string)
    _id, features_s, children_s, parents_s = string.split(DUMP_FIELD_SEP)

    @features = Marshal.load(features_s)
    @children = parse_edges(children_s)
    @parents = parse_edges(parents_s)
  end

  # Replaces the ids stored in the edge lists after loading by whatever
  # node_by_id.call(id) returns for them.
  # node_by_id: object responding to call(id), e.g. a Proc
  def recover_from_dump(node_by_id)
    @children = @children.map { |label, id| [label, node_by_id.call(id)] }
    @parents = @parents.map { |label, id| [label, node_by_id.call(id)] }
  end

  # ID-related things

  # true iff other_node is a GraphNode with an equal id
  def ==(other_node)
    return false unless other_node.kind_of? GraphNode

    @id == other_node.id()
  end

  def id()
    return @id
  end

  # change the id of this node
  def chid(newid)
    @id = newid
  end

  # setting and retrieving features

  def get_f(feature)
    return @features[feature]
  end

  def set_f(feature, value)
    @features[feature] = value
  end

  # like set_f, but raises a RuntimeError if the feature
  # already has a non-nil value
  def add_f(feature, value)
    unless @features[feature].nil?
      # interpolation: the feature key need not be a String
      raise "Feature #{feature} already set."
    end
    set_f(feature, value)
  end

  # ancestors

  # list of parent nodes, without edge labels
  def parents()
    return @parents.map { |_label, parent| parent }
  end

  def parent_labels()
    return @parents.map { |label, _parent| label }
  end

  # label of the first edge coming in from the given parent,
  # nil if there is no such edge
  def parent_label(parent)
    pair = @parents.find { |_label, node| node == parent }
    return pair && pair.first
  end

  # the internal list of [edgelabel, parent] pairs (not a copy)
  def parents_with_edgelabel()
    return @parents
  end

  def each_parent()
    @parents.each { |_label, parent| yield parent }
  end

  # yields [edgelabel, parent] pairs
  def each_parent_with_edgelabel()
    @parents.each { |label_parent| yield label_parent }
  end

  # parents reached via an edge whose label is included in labels
  def parents_by_edgelabels(labels)
    selected = @parents.select { |label, _parent| labels.include? label }
    return selected.map { |_label, parent| parent }
  end

  # Adds an incoming edge from parent, labeled edgelabel.
  # Unless varhash["pointer_insteadof_edge"] is set, self is also
  # registered as child of parent (if it is not registered there yet).
  def add_parent(parent, edgelabel, varhash={})
    @parents << [edgelabel, parent]

    # and vice versa: add self as child to parent
    unless varhash["pointer_insteadof_edge"]
      unless parent.children_with_edgelabel().include? [edgelabel, self]
        parent.add_child(self, edgelabel)
      end
    end
  end

  # Removes all incoming edges from parent that are labeled edgelabel.
  # Unless varhash["pointer_insteadof_edge"] is set, self is also
  # removed from the children of parent.
  def remove_parent(parent, edgelabel, varhash={})
    @parents = @parents.reject { |label, node|
      label == edgelabel && node == parent
    }

    # and vice versa: remove self as child from parent
    unless varhash["pointer_insteadof_edge"]
      if parent.children_with_edgelabel().include? [edgelabel, self]
        parent.remove_child(self, edgelabel)
      end
    end
  end

  # number of incoming edges
  def indeg()
    return @parents.length()
  end

  # all nodes reachable via incoming edges, each listed once
  def ancestors
    return ancestors_noduplicates([], [])
  end

  # like ancestors, but follows only edges whose label is included in
  # labels (an empty list of labels means: follow all edges)
  def ancestors_by_edgelabels(labels)
    return ancestors_noduplicates([], labels)
  end

  # descendants

  # list of child nodes, without edge labels
  def children()
    return @children.map { |_label, child| child }
  end

  def child_labels()
    return @children.map { |label, _child| label }
  end

  # label of the first edge going out to the given child,
  # nil if there is no such edge
  def child_label(child)
    pair = @children.find { |_label, node| node == child }
    return pair && pair.first
  end

  # the internal list of [edgelabel, child] pairs (not a copy)
  def children_with_edgelabel()
    return @children
  end

  def each_child()
    @children.each { |_label, child| yield child }
  end

  # yields [edgelabel, child] pairs
  def each_child_with_edgelabel()
    @children.each { |label_child| yield label_child }
  end

  # children reached via an edge whose label is included in labels
  def children_by_edgelabels(labels)
    selected = @children.select { |label, _child| labels.include? label }
    return selected.map { |_label, child| child }
  end

  # Adds an outgoing edge to child, labeled edgelabel.
  # Unless varhash["pointer_insteadof_edge"] is set, self is also
  # registered as parent of child (if it is not registered there yet).
  def add_child(child, edgelabel, varhash={})
    @children << [edgelabel, child]

    # and vice versa: add self as parent to child
    unless varhash["pointer_insteadof_edge"]
      unless child.parents_with_edgelabel().include? [edgelabel, self]
        child.add_parent(self, edgelabel)
      end
    end
  end

  # Removes all outgoing edges to child that are labeled edgelabel.
  # Unless varhash["pointer_insteadof_edge"] is set, self is also
  # removed from the parents of child.
  def remove_child(child, edgelabel, varhash={})
    @children = @children.reject { |label, node|
      label == edgelabel && node == child
    }

    # and vice versa: remove self as parent from child
    unless varhash["pointer_insteadof_edge"]
      if child.parents_with_edgelabel().include? [edgelabel, self]
        child.remove_parent(self, edgelabel)
      end
    end
  end

  # Relabels the edge to child from oldlabel to newlabel.
  # Does nothing if there is no edge to child labeled oldlabel.
  def change_child_label(child, oldlabel, newlabel, varhash={})
    if @children.include? [oldlabel, child]
      remove_child(child, oldlabel, varhash)
      add_child(child, newlabel, varhash)
    end
  end

  def remove_all_children(varhash={})
    # remove_child assigns a new list to @children instead of changing
    # the old one, so the list being iterated over here stays intact
    each_child_with_edgelabel { |label, child|
      remove_child(child, label, varhash)
    }
  end

  #### CAUTION: set_children must be called with an "internal format" list of children:
  #### instead of using [node, edgelabel], use [edgelabel, node]
  #
  # The old children are removed via remove_all_children; the new list
  # is stored as it is, i.e. self is NOT registered as parent of the
  # new children.
  def set_children(list, varhash={})
    remove_all_children(varhash)

    @children = list
  end

  # number of outgoing edges
  def outdeg()
    return @children.length()
  end

  # Returns the nodes without outgoing edges found when following the
  # outgoing edges from self: self if it has no children, otherwise the
  # childless nodes below it in the order of traversal.
  def yield_nodes()
    arr = []
    if outdeg() == 0
      arr << self
    end
    each_child { |c|
      if c.outdeg() == 0
        arr << c
      else
        arr.concat c.yield_nodes
      end
    }
    return arr
  end

  # all nodes reachable via outgoing edges, each listed once
  def descendants
    return descendants_noduplicates([], [])
  end

  # like descendants, but follows only edges whose label is included in
  # labels (an empty list of labels means: follow all edges)
  def descendants_by_edgelabels(labels)
    return descendants_noduplicates([], labels)
  end

  protected

  # Depth-first collection of descendants.
  # nodes: the nodes found so far; they are not visited again
  # labels: edge labels to follow; an empty list means all labels
  def descendants_noduplicates(nodes, labels)
    each_child_with_edgelabel() { |label, child|
      next unless labels.empty? || labels.include?(label)
      next if nodes.include? child

      nodes = child.descendants_noduplicates(nodes << child, labels)
    }
    return nodes
  end

  # Depth-first collection of ancestors.
  # nodes: the nodes found so far; they are not visited again
  # labels: edge labels to follow; an empty list means all labels
  def ancestors_noduplicates(nodes, labels)
    each_parent_with_edgelabel() { |label, parent|
      next unless labels.empty? || labels.include?(label)
      next if nodes.include? parent

      nodes = parent.ancestors_noduplicates(nodes << parent, labels)
    }
    return nodes
  end

  #### CAUTION: set_parents must be called with an "internal format" list of parents:
  #### instead of using [node, edgelabel], use [edgelabel, node]
  #
  # Removes all current parents, then adds the listed ones via
  # add_parent, passing varhash on in both steps.
  def set_parents(list, varhash={})
    # remove_parent assigns a new list to @parents, so iterating over
    # the old list here is safe
    each_parent_with_edgelabel { |label, parent|
      remove_parent(parent, label, varhash)
    }

    list.each { |label, parent|
      # add_parent takes the node first, then the label
      add_parent(parent, label, varhash)
    }
  end

  private

  # list of [label, node] pairs -> string of label/id pairs for _dump
  def dump_edges(pairs)
    pairs.map { |label, node|
      label.to_s + DUMP_LABEL_SEP + node.id().to_s
    }.join(DUMP_PAIR_SEP)
  end

  # edge field of a dumped string -> list of [label, id] pairs;
  # a missing or empty field yields an empty list
  def parse_edges(field)
    return [] if field.nil? || field.empty?

    field.split(DUMP_PAIR_SEP).map { |pair| pair.split(DUMP_LABEL_SEP) }
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# KE changed July 05: now no inclusion of modules required,
|
2
|
+
# and names changed from REXML.Encoding to UtfIso
|
3
|
+
|
4
|
+
module UtfIso
  # Convert from UTF-8:
  # returns the code points of content that are <= 0xFF as a string with
  # one byte per code point. Code points above 0xFF are left out.
  # (The original sketch of a numeric-entity fallback (&#nnnn;),
  # shared by Stefan Scholl, was never enabled.)
  def self.to_iso_8859_1(content)
    representable = content.unpack('U*').select { |codepoint| codepoint <= 0xFF }
    representable.pack('C*')
  end

  # Convert to UTF-8:
  # each byte of str is taken as one code point.
  def self.from_iso_8859_1(str)
    byte_values = str.unpack('C*')
    byte_values.pack('U*')
  end
end
|
data/lib/common/ML.rb
ADDED
@@ -0,0 +1,186 @@
|
|
1
|
+
# sp 24 08 04
|
2
|
+
|
3
|
+
# this file provides a very simple wrapper for using different ML systems
|
4
|
+
# all you need to do is to write the appropriate learner class
|
5
|
+
# and insert them in the initialize routine here in ML()
|
6
|
+
#
|
7
|
+
# available at the moment:
|
8
|
+
# * timbl (memory-based learner)
|
9
|
+
# * mallet-maxent (another maxent system)
|
10
|
+
# * maxent (the OpenNLP maxent system)
|
11
|
+
|
12
|
+
# part of contract: learner is not initialised unless it is either trained or read
|
13
|
+
|
14
|
+
require "common/Optimise"
|
15
|
+
|
16
|
+
# Wrapper around one of several ML systems.
#
# The file name arguments of train, read, write, exists? and apply are
# cleaned up before use: the characters "<" and ">" are removed and
# blanks are replaced by "_". This is done in place on the string that
# was passed in, unless that string is frozen.
class Classifier

  # known learners: [name, file name below common/, class name]
  @@learners = [
    ["timbl", "Timbl", "Timbl"],
    # ["mallet", "Mallet", "Mallet"],
    ["maxent", "Maxent", "Maxent"]
  ]

  # learner: name of the learner, first entry of one of the @@learners tuples
  # params: array; an optional leading "optimise" switches on feature
  #         optimisation, the next entry is the path of the classifier
  #         program, what is left is passed on to the learner class.
  #         Both leading entries are removed from the array.
  #
  # Exits with status 1 if the program path does not exist or the
  # learner is unknown.
  def initialize(learner, params)

    @ready = false

    if params[0] == "optimise"
      params.shift
      @optimise = true
    else
      @optimise = false
    end

    program_path = ""
    path_entry = params.shift
    if path_entry.respond_to?(:chomp)
      program_path = path_entry.chomp
      unless FileTest.exist? program_path
        $stderr.puts "Error: Could not find classifier system at " + program_path
        $stderr.puts "Perhaps an erroneous entry in your experiment file?"
        exit 1
      end
    else
      # NOTE(review): as before, this only reports the problem and goes
      # on with an empty program path -- confirm that no exit is wanted
      $stderr.puts "Error: No program path provided for classifier system."
    end

    # try to find our learner in the pre-set list of learners
    learner_tuple = @@learners.assoc(learner)
    unless learner_tuple
      $stderr.puts "Error: I don't know the learner " + learner.to_s
      $stderr.puts "Perhaps an erroneous entry in your experiment file?"
      exit 1
    end

    _learner_name, learner_filename, learner_classname = learner_tuple
    require "common/#{learner_filename}"
    # constant lookup instead of eval; the name comes from @@learners
    @learner = Classifier.const_get(learner_classname).new(program_path, params)
  end

  # a classifier can (and has to be) either trained or read

  # Train on the training data in trainfile.
  # classifier_file: optionally, a file name for storing the classifier;
  #                  it is passed on to the learner as it is
  #
  # With feature optimisation, the learner is trained on a temporary
  # file trainfile + ".opted", which is removed afterwards.
  def train(trainfile, classifier_file=nil)
    # make sure we produce a valid file name
    trainfile = sanitise_filename(trainfile)

    if @optimise
      STDERR.puts "[ML] using feature optimisation"
      @optimiser = Optimise.new
      @optimiser.init_from_data(trainfile)
      optimisedfile = trainfile + ".opted"
      begin
        @optimiser.apply(trainfile, optimisedfile)
        @learner.train(optimisedfile, classifier_file)
      ensure
        # remove the temporary file even if training raised
        File.delete(optimisedfile) if FileTest.exist?(optimisedfile)
      end
    else
      STDERR.puts "[ML] no feature optimisation"
      @learner.train(trainfile, classifier_file)
    end
    @ready = true
  end


  # returns true iff reading the classifier from the file has had success;
  # on failure, returns the (false or nil) result of the learner, or false
  # if the stored optimisation is missing

  def read(classifier_file)
    # make sure we produce a valid file name
    classifier_file = sanitise_filename(classifier_file)

    # read file, if present
    status = @learner.read(classifier_file)

    # if reading has failed, return "false"
    unless status
      STDERR.puts "reading from #{classifier_file} did not succeed"
      return status
    end

    # read optimisation, if desired
    if @optimise
      optimisations_filename = Optimise.recommended_filename(classifier_file)
      unless FileTest.exist? optimisations_filename
        STDERR.puts "[ML] Error: attempted to read stored optimisation, but file does not exist"
        return false
      end

      @optimiser = Optimise.new
      @optimiser.init_from_file(optimisations_filename)
    end

    @ready = true
    return true
  end

  # a classifier can be stored somewhere. This can be more than one file (classifier-specific),
  # but all files start with "classifier_file"

  def write(classifier_file)
    # make sure we produce a valid file name
    classifier_file = sanitise_filename(classifier_file)

    @learner.write(classifier_file)
    if @optimise
      @optimiser.store(Optimise.recommended_filename(classifier_file))
    end
  end

  ###
  # exists?
  # check if a classifier is living at some particular path
  # (returns whatever the learner answers)

  def exists?(classifier_file)
    classifier_file = sanitise_filename(classifier_file)
    return @learner.exists?(classifier_file)
  end

  # a classifier can be applied

  # Apply the classifier to the data in testfile, writing to outfile.
  #
  # Returns false if the classifier has been neither trained nor read,
  # or if testfile does not exist; otherwise the result of the learner.
  #
  # With feature optimisation, the learner is applied to a temporary
  # file testfile + ".opted", which is removed afterwards.

  def apply(testfile, outfile)
    # make sure we produce valid file names
    testfile = sanitise_filename(testfile)
    outfile = sanitise_filename(outfile)

    unless @ready
      STDERR.puts "[ML] Warning: learner not ready for testing! Must be trained or read."
      return false
    end

    # do we have a testfile?
    unless FileTest.exist?(testfile)
      STDERR.puts "[ML] Warning: could not find testfile (maybe empty test set?)."
      return false
    end

    return @learner.apply(testfile, outfile) unless @optimise

    optimisedfile = testfile + ".opted"
    begin
      @optimiser.apply(testfile, optimisedfile)
      # value of the method: the result of the learner
      @learner.apply(optimisedfile, outfile)
    ensure
      # the temporary file used to be left behind, because the method
      # returned before deleting it
      File.delete(optimisedfile) if FileTest.exist?(optimisedfile)
    end
  end

  ###
  # read classifier result file,
  # returns a list of instance_results
  # where an instance_result is a list of pairs [label, confidence]
  # where the pairs are sorted by confidence
  def read_resultfile(file)
    return @learner.read_resultfile(file)
  end

  private

  # Removes "<" and ">" from name and replaces blanks by "_".
  # The string is changed in place, so that the caller sees the cleaned
  # name too; a frozen string is left alone and a cleaned copy is made.
  # Returns the cleaned name.
  def sanitise_filename(name)
    if name.frozen?
      return name.gsub(/[<>]/, "").gsub(/ /, "_")
    end

    name.gsub!(/[<>]/, "")
    name.gsub!(/ /, "_")
    return name
  end
end
|