frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,345 @@
1
+ # GraphNode: describes one node in a graph.
2
+ #
3
+ # A node may have an arbitrary number of parents (sources of incoming edges)
4
+ # and an arbitrary number of children (targets of outgoing edges)
5
+ #
6
+ # All edges are labeled and directed
7
+ #
8
+ # The add_parent, add_child, remove_parent, remove_child methods
9
+ # take care of both ends of an edge
10
+ # (i.e. n1.add_child(n2, label) also adds n1 as parent of n2 with edge label 'label')
11
+ #
12
+ # It is possible to create a 'pointer' rather than an edge:
13
+ # n1.add_child(n2, label, "pointer_insteadof_edge" => true)
14
+ # will create an edge from n1 to n2 labeled 'label' that is
15
+ # listed under the outgoing edges of n1, but not among
16
+ # the incoming edges of n2
17
+ # The same option is available for add_parent, remove_parent, remove_child.
18
+
19
class GraphNode

  # Creates an isolated node without edges or features.
  #
  # id:: identifier of the node; node equality (==) is based on it alone
  def initialize(id)
    @id = id
    @children = []   # outgoing edges, stored as [edgelabel, node] pairs
    @parents = []    # incoming edges, stored as [edgelabel, node] pairs
    @features = {}
  end

  # for Marshalling:
  # Dump just IDs instead of actual nodes from the parent and child lists.
  # Otherwise the Marshaller would follow all links to the objects mentioned.
  # After loading, the IDs have to be replaced by actual objects again
  # with a little help from the caller (see recover_from_dump).
  #
  # Format of the returned string: four fields joined by "QQSEPVALUESQQ":
  # own id, marshalled feature hash, child edges, parent edges.
  # Each edge is written as label + "QQSEPQQ" + id, edges are joined by
  # "QQPAIRQQ". Ids and labels are stringified, so nodes with non-String
  # ids can be dumped as well (they come back with String ids).
  def _dump(depth)
    [
      @id.to_s,
      Marshal.dump(@features),
      @children.map { |label, child| "#{label}QQSEPQQ#{child.id}" }.join("QQPAIRQQ"),
      @parents.map { |label, parent| "#{label}QQSEPQQ#{parent.id}" }.join("QQPAIRQQ")
    ].join("QQSEPVALUESQQ")
  end

  # Counterpart of _dump: builds a GraphNode from the dumped string.
  # The edge lists of the result hold node IDs instead of nodes
  # until recover_from_dump has been called.
  def GraphNode._load(string)
    id = string.split("QQSEPVALUESQQ").first

    result = GraphNode.new(id)
    result.fill_from_pickle(string)
    return result
  end

  # Restores features and (id-based) edge lists from a string
  # produced by _dump. The id field of the string is ignored here.
  #
  # NOTE: the feature hash is read with Marshal.load, so only
  # strings from trusted sources should be passed in.
  def fill_from_pickle(string)
    _id, features_s, children_s, parents_s = string.split("QQSEPVALUESQQ")

    @features = Marshal.load(features_s)
    @children = edges_from_pickle(children_s)
    @parents = edges_from_pickle(parents_s)
  end

  # Replaces the node IDs left in the edge lists by fill_from_pickle
  # with actual node objects.
  #
  # node_by_id:: object responding to call(id) that returns the node for an id
  def recover_from_dump(node_by_id)
    @children = @children.map { |label, id| [label, node_by_id.call(id)] }
    @parents = @parents.map { |label, id| [label, node_by_id.call(id)] }
  end

  # ID-related things

  # Two nodes are equal iff both are GraphNodes and have equal ids.
  def ==(other_node)
    other_node.kind_of?(GraphNode) && @id == other_node.id()
  end

  # returns the id of this node
  def id()
    return @id
  end

  # changes the id of this node to newid
  def chid(newid)
    @id = newid
  end

  # setting and retrieving features

  # returns the value of the given feature, nil if it is not set
  def get_f(feature)
    return @features[feature]
  end

  # sets the given feature, overwriting any previous value
  def set_f(feature, value)
    @features[feature] = value
  end

  # sets the given feature, but raises a RuntimeError
  # if it already has a non-nil value
  def add_f(feature, value)
    unless @features[feature].nil?
      raise "Feature #{feature} already set."
    end
    set_f(feature, value)
  end

  # ancestors

  # returns the list of parent nodes
  def parents()
    return @parents.map { |_label, parent| parent }
  end

  # returns the list of labels of incoming edges
  def parent_labels()
    return @parents.map { |label, _parent| label }
  end

  # returns the label of the first incoming edge that starts at
  # the given parent, nil if there is none
  def parent_label(parent)
    pair = @parents.find { |_label, node| node == parent }
    return pair && pair.first
  end

  # returns the internal list of [edgelabel, parent] pairs (not a copy)
  def parents_with_edgelabel()
    return @parents
  end

  # yields each parent node
  def each_parent()
    @parents.each { |_label, parent| yield parent }
  end

  # yields each [edgelabel, parent] pair
  def each_parent_with_edgelabel()
    @parents.each { |pair| yield pair }
  end

  # returns all parents reached via an edge whose label is in labels
  def parents_by_edgelabels(labels)
    wanted = @parents.select { |label, _parent| labels.include? label }
    return wanted.map { |_label, parent| parent }
  end

  # Adds an incoming edge from parent with the given label.
  # Unless varhash["pointer_insteadof_edge"] is set, self is also
  # registered as child of parent (if it is not listed there yet).
  def add_parent(parent, edgelabel, varhash={})
    @parents << [edgelabel, parent]

    # and vice versa: add self as child to parent
    unless varhash["pointer_insteadof_edge"] ||
           parent.children_with_edgelabel().include?([edgelabel, self])
      parent.add_child(self, edgelabel)
    end
  end

  # Removes all incoming edges from parent with the given label.
  # Unless varhash["pointer_insteadof_edge"] is set, self is also
  # removed from the children of parent.
  def remove_parent(parent, edgelabel, varhash={})
    @parents = @parents.reject { |label, node|
      label == edgelabel && node == parent
    }

    # and vice versa: remove self as child from parent
    if !varhash["pointer_insteadof_edge"] &&
       parent.children_with_edgelabel().include?([edgelabel, self])
      parent.remove_child(self, edgelabel)
    end
  end

  # number of incoming edges
  def indeg()
    return @parents.length()
  end

  # all nodes reachable via incoming edges, without duplicates
  def ancestors
    return ancestors_noduplicates([], [])
  end

  # like ancestors, but follows only edges whose label is in labels
  # (an empty list of labels means: follow all edges)
  def ancestors_by_edgelabels(labels)
    return ancestors_noduplicates([], labels)
  end

  # descendants

  # returns the list of child nodes
  def children()
    return @children.map { |_label, child| child }
  end

  # returns the list of labels of outgoing edges
  def child_labels()
    return @children.map { |label, _child| label }
  end

  # returns the label of the first outgoing edge that leads to
  # the given child, nil if there is none
  def child_label(child)
    pair = @children.find { |_label, node| node == child }
    return pair && pair.first
  end

  # returns the internal list of [edgelabel, child] pairs (not a copy)
  def children_with_edgelabel()
    return @children
  end

  # yields each child node
  def each_child()
    @children.each { |_label, child| yield child }
  end

  # yields each [edgelabel, child] pair
  def each_child_with_edgelabel()
    @children.each { |pair| yield pair }
  end

  # returns all children reached via an edge whose label is in labels
  def children_by_edgelabels(labels)
    wanted = @children.select { |label, _child| labels.include? label }
    return wanted.map { |_label, child| child }
  end

  # Adds an outgoing edge to child with the given label.
  # Unless varhash["pointer_insteadof_edge"] is set, self is also
  # registered as parent of child (if it is not listed there yet).
  def add_child(child, edgelabel, varhash={})
    @children << [edgelabel, child]

    # and vice versa: add self as parent to child
    unless varhash["pointer_insteadof_edge"] ||
           child.parents_with_edgelabel().include?([edgelabel, self])
      child.add_parent(self, edgelabel)
    end
  end

  # Removes all outgoing edges to child with the given label.
  # Unless varhash["pointer_insteadof_edge"] is set, self is also
  # removed from the parents of child.
  def remove_child(child, edgelabel, varhash={})
    @children = @children.reject { |label, node|
      label == edgelabel && node == child
    }

    # and vice versa: remove self as parent from child
    if !varhash["pointer_insteadof_edge"] &&
       child.parents_with_edgelabel().include?([edgelabel, self])
      child.remove_parent(self, edgelabel)
    end
  end

  # Relabels the edge to child from oldlabel to newlabel.
  # Does nothing if there is no such edge.
  def change_child_label(child, oldlabel, newlabel, varhash={})
    if @children.include? [oldlabel, child]
      remove_child(child, oldlabel, varhash)
      add_child(child, newlabel, varhash)
    end
  end

  # Removes all outgoing edges.
  # remove_child replaces @children by a new array on every call,
  # so the iteration below safely runs over the old list.
  def remove_all_children(varhash={})
    each_child_with_edgelabel { |label, child|
      remove_child(child, label, varhash)
    }
  end

  # Replaces the outgoing edges of this node by the given list.
  #
  #### CAUTION: set_children must be called with an "internal format" list of children:
  #### instead of using [node, edgelabel], use [edgelabel, node]
  #
  # The old children are detached via remove_child. The new list is
  # stored as it is: self is NOT registered as parent of the new children.
  def set_children(list, varhash={})
    remove_all_children(varhash)

    @children = list
  end

  # number of outgoing edges
  def outdeg()
    return @children.length()
  end

  # Returns the nodes without outgoing edges that are reachable from
  # this node (the node itself, if it has no outgoing edges),
  # in the order of the child lists.
  def yield_nodes()
    return [self] if outdeg() == 0

    return children().flat_map { |child| child.yield_nodes }
  end

  # all nodes reachable via outgoing edges, without duplicates
  def descendants
    return descendants_noduplicates([], [])
  end

  # like descendants, but follows only edges whose label is in labels
  # (an empty list of labels means: follow all edges)
  def descendants_by_edgelabels(labels)
    return descendants_noduplicates([], labels)
  end

  protected

  # Collects into nodes all not yet listed nodes reachable via
  # outgoing edges (restricted to the given labels unless labels is empty).
  # Nodes already in the list are not visited again, so cycles terminate.
  def descendants_noduplicates(nodes, labels)
    each_child_with_edgelabel() { |label, child|
      next unless labels.empty? || labels.include?(label)
      next if nodes.include? child

      nodes = child.descendants_noduplicates(nodes << child, labels)
    }
    return nodes
  end

  # Same as descendants_noduplicates, following incoming edges.
  def ancestors_noduplicates(nodes, labels)
    each_parent_with_edgelabel() { |label, parent|
      next unless labels.empty? || labels.include?(label)
      next if nodes.include? parent

      nodes = parent.ancestors_noduplicates(nodes << parent, labels)
    }
    return nodes
  end

  #### CAUTION: set_parents must be called with an "internal format" list of parents:
  #### instead of using [node, edgelabel], use [edgelabel, node]
  #
  # Replaces the incoming edges of this node by the given list.
  # Old parents are detached via remove_parent, new ones attached via
  # add_parent, both honouring varhash.
  def set_parents(list, varhash={})
    each_parent_with_edgelabel { |label, parent|
      remove_parent(parent, label, varhash)
    }

    list.each { |label, parent|
      # add_parent expects the node first and the label second
      add_parent(parent, label, varhash)
    }
  end

  private

  # Parses one edge field of a dump string into a list of
  # [edgelabel, id] pairs; a missing or empty field yields [].
  def edges_from_pickle(edges_s)
    return [] if edges_s.nil? || edges_s.empty?

    edges_s.split("QQPAIRQQ").map { |pair| pair.split("QQSEPQQ") }
  end
end
@@ -0,0 +1,24 @@
1
+ # KE changed July 05: now no inclusion of modules required,
2
+ # and names changed from REXML.Encoding to UtfIso
3
+
4
module UtfIso
  # Convert from UTF-8 to ISO-8859-1.
  #
  # content:: UTF-8 encoded string
  #
  # Returns a string with one byte per code point. Code points above 0xFF
  # have no ISO-8859-1 counterpart and are dropped (a fallback that wrote
  # them as numeric entities &#nnnn; was disabled in the original code).
  def self.to_iso_8859_1(content)
    code_points = content.unpack('U*')
    representable = code_points.select { |code_point| code_point <= 0xFF }
    representable.pack('C*')
  end

  # Convert from ISO-8859-1 to UTF-8.
  #
  # str:: ISO-8859-1 encoded string; every byte is read as one code point
  def self.from_iso_8859_1(str)
    byte_values = str.unpack('C*')
    byte_values.pack('U*')
  end
end
@@ -0,0 +1,186 @@
1
+ # sp 24 08 04
2
+
3
+ # this file provides a very simple wrapper for using different ML systems
4
+ # all you need to do is to write the appropriate learner class
5
+ # and insert them in the initialize routine here in ML()
6
+ #
7
+ # available at the moment:
8
+ # * timbl (memory-based learner)
9
+ # * mallet-maxent (another maxent system)
10
+ # * maxent (the OpenNLP maxent system)
11
+
12
+ # part of contract: learner is not initialised unless it is either trained or read
13
+
14
+ require "common/Optimise"
15
+
16
# Thin wrapper around the individual learner classes.
#
# A Classifier is not usable for apply() until it has either been
# trained (train) or read from file (read). If the first parameter is
# "optimise", feature files are passed through an Optimise object
# before they are handed to the learner.
class Classifier

  # known learners: [name used by the caller, file below common/, class name]
  @@learners = [
    ["timbl", "Timbl", "Timbl"],
    # ["mallet", "Mallet", "Mallet"],
    ["maxent", "Maxent", "Maxent"]
  ]

  # learner:: name of the learner, one of the first entries in @@learners
  # params::  array: optional leading "optimise", then the path of the
  #           learner program, then learner-specific parameters.
  #           The entries consumed here are removed from the array.
  #
  # Exits the program with status 1 if the program path does not exist
  # or the learner is unknown. If no program path is given at all,
  # an error is printed and the learner is created with an empty path.
  def initialize(learner, params)

    @ready = false

    @optimise = (params[0] == "optimise")
    params.shift if @optimise

    program_path = ""
    given_path = params.shift
    if given_path.respond_to?(:chomp)
      program_path = given_path.chomp
      unless File.exist? program_path
        $stderr.puts "Error: Could not find classifier system at " + program_path
        $stderr.puts "Perhaps an erroneous entry in your experiment file?"
        exit 1
      end
    else
      $stderr.puts "Error: No program path provided for classifier system."
    end

    # try to find our learner in the pre-set list of learners
    learner_tuple = @@learners.assoc(learner)
    unless learner_tuple
      $stderr.puts "Error: I don't know the learner " + learner.to_s
      $stderr.puts "Perhaps an erroneous entry in your experiment file?"
      exit 1
    end

    _learner_name, learner_filename, learner_classname = learner_tuple
    # the learner implementation is loaded only when it is actually needed
    require "common/#{learner_filename}"
    @learner = Object.const_get(learner_classname).new(program_path, params)
  end

  # a classifier can (and has to be) either trained or read

  # Trains the learner on the training data in trainfile.
  #
  # trainfile::       name of the training file; sanitised in place
  # classifier_file:: optional file name for storing the classifier,
  #                   passed on to the learner unchanged
  #
  # With optimisation, the learner is trained on a temporary file
  # trainfile + ".opted", which is removed afterwards.
  def train(trainfile, classifier_file=nil)
    sanitize_filename!(trainfile)

    if @optimise
      STDERR.puts "[ML] using feature optimisation"
      @optimiser = Optimise.new
      @optimiser.init_from_data(trainfile)
      optimisedfile = trainfile + ".opted"
      begin
        @optimiser.apply(trainfile, optimisedfile)
        @learner.train(optimisedfile, classifier_file)
      ensure
        # remove the temporary file even if optimiser or learner fail
        File.delete(optimisedfile) if File.exist?(optimisedfile)
      end
    else
      STDERR.puts "[ML] no feature optimisation"
      @learner.train(trainfile, classifier_file)
    end
    @ready = true
  end

  # Reads a stored classifier (and, with optimisation, the stored
  # optimisation that belongs to it).
  #
  # classifier_file:: name of the classifier file; sanitised in place
  #
  # returns true iff reading the classifier from the file has had success
  def read(classifier_file)
    sanitize_filename!(classifier_file)

    # read file, if present
    status = @learner.read(classifier_file)

    # if reading has failed, return the learner's (false) status
    unless status
      STDERR.puts "reading from #{classifier_file} did not succeed"
      return status
    end

    # read optimisation, if desired
    if @optimise
      optimisations_filename = Optimise.recommended_filename(classifier_file)
      unless File.exist? optimisations_filename
        STDERR.puts "[ML] Error: attempted to read stored optimisation, but file does not exist"
        return false
      end
      @optimiser = Optimise.new
      @optimiser.init_from_file(optimisations_filename)
    end

    @ready = true
    return true
  end

  # a classifier can be stored somewhere. This can be more than one file (classifier-specific),
  # but all files start with "classifier_file"
  #
  # classifier_file:: name of the classifier file; sanitised in place
  def write(classifier_file)
    sanitize_filename!(classifier_file)
    @learner.write(classifier_file)
    if @optimise
      @optimiser.store(Optimise.recommended_filename(classifier_file))
    end
  end

  ###
  # exists?
  # check if a classifier is living at some particular path
  #
  # classifier_file:: name of the classifier file; sanitised in place
  def exists?(classifier_file)
    sanitize_filename!(classifier_file)
    return @learner.exists?(classifier_file)
  end

  # a classifier can be applied

  # Applies the classifier to testfile and has the learner write
  # its results to outfile. Both file names are sanitised in place.
  #
  # Returns false if the classifier has been neither trained nor read,
  # or if testfile does not exist; otherwise the result of the learner.
  #
  # With optimisation, the learner works on a temporary file
  # testfile + ".opted", which is removed afterwards.
  def apply(testfile, outfile)
    sanitize_filename!(testfile)
    sanitize_filename!(outfile)

    unless @ready
      STDERR.puts "[ML] Warning: learner not ready for testing! Must be trained or read."
      return false
    end

    # do we have a testfile?
    unless File.exist?(testfile)
      STDERR.puts "[ML] Warning: could not find testfile (maybe empty test set?)."
      return false
    end

    return @learner.apply(testfile, outfile) unless @optimise

    optimisedfile = testfile + ".opted"
    begin
      @optimiser.apply(testfile, optimisedfile)
      return @learner.apply(optimisedfile, outfile)
    ensure
      # the cleanup has to happen in an ensure clause:
      # a statement placed after the return would never be reached
      File.delete(optimisedfile) if File.exist?(optimisedfile)
    end
  end

  ###
  # read classifier result file,
  # returns a list of instance_results
  # where an instance_result is a list of pairs [label, confidence]
  # where the pairs are sorted by confidence
  def read_resultfile(file)
    return @learner.read_resultfile(file)
  end

  private

  # make sure we produce a valid file name:
  # removes "<" and ">" and replaces blanks by underscores.
  # The string is changed IN PLACE (callers may rely on seeing the
  # sanitised name afterwards), so it must not be frozen.
  def sanitize_filename!(filename)
    filename.gsub!(/[<>]/, "")
    filename.gsub!(/ /, "_")
    filename
  end

end