shalmaneser 0.0.1.alpha → 1.2.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +2 -2
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +49 -0
  6. data/bin/fred +18 -0
  7. data/bin/frprep +34 -0
  8. data/bin/rosy +17 -0
  9. data/lib/common/AbstractSynInterface.rb +35 -33
  10. data/lib/common/Mallet.rb +236 -0
  11. data/lib/common/Maxent.rb +26 -12
  12. data/lib/common/Parser.rb +5 -5
  13. data/lib/common/SynInterfaces.rb +13 -6
  14. data/lib/common/TabFormat.rb +7 -6
  15. data/lib/common/Tiger.rb +4 -4
  16. data/lib/common/Timbl.rb +144 -0
  17. data/lib/common/{FrprepHelper.rb → frprep_helper.rb} +14 -8
  18. data/lib/common/headz.rb +1 -1
  19. data/lib/common/ruby_class_extensions.rb +3 -3
  20. data/lib/fred/FredBOWContext.rb +14 -2
  21. data/lib/fred/FredDetermineTargets.rb +4 -9
  22. data/lib/fred/FredEval.rb +1 -1
  23. data/lib/fred/FredFeatureExtractors.rb +4 -3
  24. data/lib/fred/FredFeaturize.rb +1 -1
  25. data/lib/frprep/CollinsInterface.rb +6 -6
  26. data/lib/frprep/MiniparInterface.rb +5 -5
  27. data/lib/frprep/SleepyInterface.rb +7 -7
  28. data/lib/frprep/TntInterface.rb +1 -1
  29. data/lib/frprep/TreetaggerInterface.rb +29 -5
  30. data/lib/frprep/do_parses.rb +1 -0
  31. data/lib/frprep/frprep.rb +36 -32
  32. data/lib/{common/BerkeleyInterface.rb → frprep/interfaces/berkeley_interface.rb} +69 -95
  33. data/lib/frprep/interfaces/stanford_interface.rb +353 -0
  34. data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
  35. data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
  36. data/lib/frprep/opt_parser.rb +2 -2
  37. data/lib/rosy/AbstractFeatureAndExternal.rb +5 -3
  38. data/lib/rosy/RosyIterator.rb +11 -10
  39. data/lib/rosy/rosy.rb +1 -0
  40. data/lib/shalmaneser/version.rb +1 -1
  41. data/test/functional/sample_experiment_files/fred_test.salsa.erb +1 -1
  42. data/test/functional/sample_experiment_files/fred_train.salsa.erb +1 -1
  43. data/test/functional/sample_experiment_files/prp_test.salsa.erb +2 -2
  44. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +2 -2
  45. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +2 -2
  46. data/test/functional/sample_experiment_files/prp_train.salsa.erb +2 -2
  47. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +2 -2
  48. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +2 -2
  49. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +1 -1
  50. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +7 -7
  51. data/test/functional/test_frprep.rb +3 -3
  52. data/test/functional/test_rosy.rb +20 -0
  53. metadata +215 -224
  54. data/CHANGELOG.rdoc +0 -0
  55. data/LICENSE.rdoc +0 -0
  56. data/README.rdoc +0 -0
  57. data/lib/common/CollinsInterface.rb +0 -1165
  58. data/lib/common/MiniparInterface.rb +0 -1388
  59. data/lib/common/SleepyInterface.rb +0 -384
  60. data/lib/common/TntInterface.rb +0 -44
  61. data/lib/common/TreetaggerInterface.rb +0 -303
  62. data/lib/frprep/AbstractSynInterface.rb +0 -1227
  63. data/lib/frprep/BerkeleyInterface.rb +0 -375
  64. data/lib/frprep/ConfigData.rb +0 -694
  65. data/lib/frprep/FixSynSemMapping.rb +0 -196
  66. data/lib/frprep/FrPrepConfigData.rb +0 -66
  67. data/lib/frprep/FrprepHelper.rb +0 -1324
  68. data/lib/frprep/ISO-8859-1.rb +0 -24
  69. data/lib/frprep/Parser.rb +0 -213
  70. data/lib/frprep/SalsaTigerRegXML.rb +0 -2347
  71. data/lib/frprep/SalsaTigerXMLHelper.rb +0 -99
  72. data/lib/frprep/SynInterfaces.rb +0 -275
  73. data/lib/frprep/TabFormat.rb +0 -720
  74. data/lib/frprep/Tiger.rb +0 -1448
  75. data/lib/frprep/Tree.rb +0 -61
  76. data/lib/frprep/headz.rb +0 -338
data/lib/common/Maxent.rb CHANGED
@@ -11,21 +11,32 @@ class Maxent
11
11
  ###
12
12
  def initialize(program_path,parameters)
13
13
 
14
- if parameters.empty?
15
- puts "Error: The OpenNLP maxent system needs two paths (first the location of maxent itself and then the location of the interface, usually program/tools/maxent)."
16
- puts "I got only the program path."
17
- Kernel.exit
18
- end
19
-
14
+ # @note AB: <parameters> is an Array with the last part of the
15
+ # line from the experiment file, it should contain the path to our
16
+ # java wrappers, but we don't want it.
17
+ # Since the presence of this part is checked only here we
18
+ # suppose it obsolete and set this path manually here.
19
+ # if parameters.empty?
20
+ # puts "Error: The OpenNLP maxent system needs two paths (first the location of maxent itself and then the location of the interface, usually program/tools/maxent)."
21
+ # puts "I got only the program path."
22
+ # Kernel.exit
23
+ # end
24
+ # @interface_path = parameters.first
25
+
26
+ # @note AB: Setting path manually.
27
+ # It assumes <Maxent.rb> is in <lib/common> and
28
+ # <Classify.class> is in <lib/ext/maxent>.
29
+ @interface_path = File.expand_path('../ext/maxent', File.dirname(__FILE__))
30
+
20
31
  @maxentpath = program_path
21
- @interface_path = parameters.first
32
+
22
33
  unless @maxentpath =~ /\/$/
23
34
  @maxentpath = @maxentpath + "/"
24
35
  end
25
36
 
26
37
  # classpath for maxent
27
38
 
28
- @cp = "#{ENV["CLASSPATH"]}:#{@maxentpath}:#{@maxentpath}lib:#{@maxentpath}lib/trove.jar:#{@maxentpath}output/maxent-2.4.0.jar"
39
+ @cp = "#{@maxentpath}:#{@maxentpath}lib:#{@maxentpath}lib/trove.jar:#{@maxentpath}output/maxent-2.4.0.jar:#{ENV["CLASSPATH"]}"
29
40
 
30
41
  end
31
42
 
@@ -148,9 +159,9 @@ class Maxent
148
159
  return nil
149
160
  end
150
161
 
151
- retv = Array.new()
162
+ retv = []
152
163
 
153
- f.each { |line|
164
+ f.each do |line|
154
165
  line_results = Array.new()
155
166
  pieces = line.split() # split at whitespace
156
167
 
@@ -164,8 +175,11 @@ class Maxent
164
175
 
165
176
  # sort: most confident label first
166
177
  retv << line_results.sort {|a,b| b[1] <=> a[1]}
167
- }
168
- return retv
178
+ end
179
+
180
+ f.close
181
+
182
+ retv
169
183
  end
170
184
 
171
185
 
data/lib/common/Parser.rb CHANGED
@@ -72,12 +72,12 @@ end
72
72
 
73
73
 
74
74
  class FilePartsParser
75
- # @file = File object for the corpus
76
- # @head = string up to the first <s> tag
77
- # @tail = string after the last </s> tag
78
- # @rest = string starting with the latest <s> tag (complete this to
75
+ # <@file> = File object for the corpus
76
+ # <@head> = string up to the first <s> tag
77
+ # <@tail> = string after the last </s> tag
78
+ # <@rest> = string starting with the latest <s> tag (complete this to
79
79
  # a <s>...</s> structure by reading up to next </s> tag)
80
- # @readCompletely = boolean specifying whether there's still something
80
+ # <@readCompletely> = boolean specifying whether there's still something
81
81
  # left to read in the file
82
82
 
83
83
  attr_reader :head, :tail
@@ -245,12 +245,19 @@ class SynInterfaces
245
245
  end
246
246
 
247
247
 
248
- require "common/CollinsInterface"
249
- require "common/BerkeleyInterface"
250
- require "common/SleepyInterface"
251
- require "common/MiniparInterface"
252
- require "common/TntInterface"
253
- require "common/TreetaggerInterface"
248
+ # AB: TODO We should require programmatically all files in
249
+ # <frprep/interpreters> and <frprep/interfaces>.
250
+ require "frprep/CollinsInterface"
251
+
252
+ require 'frprep/interfaces/berkeley_interface'
253
+ require 'frprep/interpreters/berkeley_interpreter'
254
+ require 'frprep/interfaces/stanford_interface'
255
+ require 'frprep/interpreters/stanford_interpreter'
256
+
257
+ require "frprep/SleepyInterface"
258
+ require "frprep/MiniparInterface"
259
+ require "frprep/TntInterface"
260
+ require "frprep/TreetaggerInterface"
254
261
 
255
262
 
256
263
  class EmptyInterpreter < SynInterpreter
@@ -25,15 +25,16 @@ require "tempfile"
25
25
  require "common/ISO-8859-1"
26
26
  require "common/ruby_class_extensions"
27
27
 
28
+ # @todo Remove this definition in the top namespace!
28
29
  #######################
29
30
  # This function takes a variable number of arguments and
30
31
  # returns them as an array
31
32
  # Idea: make formulation of tab format entries easier to read,
32
33
  # enclose variable arguments in a repeat() call,
33
34
  # which immediately gets transformed into a list
34
- def repeat(*args)
35
- return args
36
- end
35
+ #def repeat(*args)
36
+ # return args
37
+ #end
37
38
 
38
39
  #######################
39
40
  class TabFormatFile
@@ -84,7 +85,7 @@ class TabFormatFile
84
85
  # (in terms of number of lines) as the first file.
85
86
  # each sentence is returned in the form of an
86
87
  # array of TabFormatSentence sentences.
87
-
88
+ # AB: TODO Delete this nasty exception!!!
88
89
  def each_sentence
89
90
  unless @read_completely
90
91
  sentence = @my_sentence_class.new(@patterns)
@@ -98,9 +99,9 @@ class TabFormatFile
98
99
  }
99
100
  #STDERR.puts linearray
100
101
  @no_of_read_lines += 1
101
- if linearray.detect{|x| x.strip == ""}
102
+ if linearray.detect {|x| x.strip == ""}
102
103
  if linearray.detect {|x| x.strip != ""}
103
- STDERR.puts "Error: Mismatching empty lines!"
104
+ STDERR.puts "Error: Mismatching empty lines! <from lib/common>"
104
105
  exit(1)
105
106
  else
106
107
  # sentence finished. yield it and start a new one
data/lib/common/Tiger.rb CHANGED
@@ -1,13 +1,13 @@
1
1
  # -*- coding: utf-8 -*-
2
- require "common/headz"
3
- require "common/SalsaTigerRegXML"
4
- require "common/ruby_class_extensions"
2
+ require 'common/headz'
3
+ require 'common/SalsaTigerRegXML'
4
+ require 'common/ruby_class_extensions'
5
5
  class Array
6
6
  include EnumerableDistribute
7
7
  end
8
8
 
9
9
 
10
- require "common/AbstractSynInterface"
10
+ require 'common/AbstractSynInterface'
11
11
 
12
12
  #############################################
13
13
  #
@@ -0,0 +1,144 @@
1
+ # wrapper script for timbl learner
2
+ # sp 24 08 04
3
+
4
+ # contract for Learner classes:
5
+
6
+
7
+ class Timbl
8
+
9
+ def initialize(program_path, parameters)
10
+
11
+ # @timblpath="/proj/llx/Software/MachineLearning/Timbl5/Timbl "
12
+
13
+ @timblpath = File.join(program_path,"Timbl")
14
+ unless @timblpath =~ /\s$/
15
+ # path must end in space so we can just attach parameters
16
+ @timblpath << " "
17
+ end
18
+
19
+ if parameters.empty?
20
+ # was: +vs
21
+ @params = "-mM -k5 +vs" # default parameters
22
+ else
23
+ @params = parameters.join(" ") + " "
24
+ end
25
+ end
26
+
27
+ def timbl_out_to_malouf_out(infilename,outfilename) # timbl: [all features], [gold standard label]
28
+ infile = File.new(infilename)
29
+ outfile = File.new(outfilename,"w")
30
+ while (line = infile.gets)
31
+ larray = line.chomp.split(",")
32
+ ml_label = larray.last
33
+ outfile.puts ml_label+"\t1"
34
+ end
35
+ infile.close
36
+ outfile.close
37
+ end
38
+
39
+ def train(infile,classifier_location) # lazy learning: for training, store the
40
+ # instancebase as a tree (TiMBL -I / -i option)
41
+ # figure out how many features we have
42
+ f = File.new(infile)
43
+ line = f.gets().chomp()
44
+ num_features = line.split(",").length() - 1
45
+
46
+ # and train
47
+ if classifier_location then
48
+ @instancebase = classifier_location
49
+ else
50
+ @instancebase = infile+".instancebase"
51
+ end
52
+ successfully_run(@timblpath+@params+" -N#{num_features} -f "+infile+" -I "+@instancebase)
53
+ end
54
+
55
+ # return true iff reading the classifier has had success
56
+ def read(classifierfile)
57
+ unless FileTest.exists?(classifierfile)
58
+ STDERR.puts "[Timbl] Cannot find instancebase at #{classifierfile}"
59
+ return false
60
+ end
61
+ @instancebase = classifierfile
62
+ return true
63
+ end
64
+
65
+ def exists?(classifierfile)
66
+ return FileTest.exists?(classifierfile)
67
+ end
68
+
69
+ def write(classifierfile)
70
+ %x{cp #{@instancebase} #{classifierfile}} # store training data as "modelfile"
71
+ File.chmod(0664,classifierfile)
72
+ end
73
+
74
+ def apply(infile,outfile)
75
+ temp_outfile = outfile+".temp"
76
+ successfully_run(@timblpath+@params+" -i "+@instancebase+" -t "+infile+" -o "+temp_outfile)
77
+
78
+ # if we have an empty input file, timbl will not produce an output file
79
+ unless FileTest.exists?(temp_outfile)
80
+ # STDERR.puts "[Timbl] Warning: Timbl failed to produce an outfile."
81
+ return false
82
+ end
83
+
84
+ # no error
85
+ timbl_out_to_malouf_out(temp_outfile,outfile)
86
+ File.unlink(temp_outfile)
87
+
88
+ # true iff outfile exists
89
+ if FileTest.exists?(outfile)
90
+ return true
91
+ else
92
+ # STDERR.puts "[Timbl] Warning: Final outfile could not be produced."
93
+ return false
94
+ end
95
+
96
+ end
97
+
98
+ #####
99
+ def read_resultfile(filename)
100
+ begin
101
+ f = File.new(filename)
102
+ rescue
103
+ $stderr.puts "TiMBL error: cannot read TiMBL result file #{filemame}."
104
+ return nil
105
+ end
106
+
107
+ retv = Array.new()
108
+
109
+ f.each { |line|
110
+ line_results = Array.new()
111
+ pieces = line.split()
112
+
113
+ while not(pieces.empty?)
114
+ label = pieces.shift()
115
+
116
+ begin
117
+ confidence = pieces.shift().to_f()
118
+ rescue
119
+ $stderr.puts "Error reading mallet output: invalid line: #{line}"
120
+ confidence = 0
121
+ end
122
+
123
+ line_results << [label, confidence]
124
+ end
125
+ retv << line_results
126
+ }
127
+
128
+ return retv
129
+ end
130
+
131
+ #########################
132
+ private
133
+
134
+ ###
135
+ def successfully_run(command)
136
+ retv = Kernel.system(command)
137
+ unless retv
138
+ $stderr.puts "Error running classifier. Exiting."
139
+ $stderr.puts "Offending command: "+command
140
+ exit 1
141
+ end
142
+ end
143
+
144
+ end
@@ -10,7 +10,6 @@ require "common/AbstractSynInterface"
10
10
 
11
11
  ############################################3
12
12
  # Module FrprepHelper:
13
- #
14
13
  # diverse transformation methods for frprep.rb
15
14
  # moved over here to make the main file less crowded
16
15
  module FrprepHelper
@@ -162,23 +161,28 @@ module FrprepHelper
162
161
  raise "Could not read #{input_filename}, or could not write to #{output_filename}."
163
162
  end
164
163
 
165
- filename_core = File.basename(input_filename, "txt")
164
+ # AB: TODO This assumes all input files have the extension <txt>.
165
+ # Is it good?
166
+ filename_core = File.basename(input_filename, 'txt')
166
167
 
167
168
  # array(string): keep the words of each sentence
168
- sentence = Array.new
169
+ sentence = []
169
170
  # sentence number for making the sentence ID:
170
171
  # global count, over all input files
171
172
  sentno = 0
172
173
 
173
- while (line = infile.gets())
174
+ while line = infile.gets
174
175
 
175
176
  # make a sentence ID for the next sentence: running number
176
- sentid = filename_core + "_" + sentno.to_s
177
+ sentid = "#{filename_core}_#{sentno}"
177
178
  sentno += 1
178
179
 
179
180
  # read words into the sentence array,
180
181
  # separating out punctuation attached to the beginning or end of words
181
- sentence.clear()
182
+ sentence.clear
183
+
184
+ # AB: TODO Remove this naive tokenizer, better to have a fully
185
+ # tokenized input using an external tokenizer than that.
182
186
  line.split.each { |word|
183
187
  # punctuation at the beginning of the word
184
188
  #if word =~ /^([\(\[`'\"-]+)(.*)$/
@@ -206,7 +210,9 @@ module FrprepHelper
206
210
  }
207
211
 
208
212
 
213
+
209
214
  # remove empty words
215
+ # AB: TODO Is it possible? Remove this.
210
216
  sentence.reject! { |word| word.nil? or word.strip.empty? }
211
217
 
212
218
  # write words to tab file
@@ -222,13 +228,13 @@ module FrprepHelper
222
228
  }
223
229
  outfile.puts
224
230
  end
225
- outfile.close()
231
+ outfile.close
226
232
  end
227
233
 
228
234
  ###########
229
235
  #
230
236
  # class method split_dir:
231
- # read all files in one directory and produce chunk files *#{suffix} in outdir
237
+ # read all files in one directory and produce chunk files with _suffix_ in outdir
232
238
  # with a certain number of files in them (sent_num).
233
239
  # Optionally, remove all sentences longer than sent_leng
234
240
  #
data/lib/common/headz.rb CHANGED
@@ -27,7 +27,7 @@
27
27
  # print "preposition of conjunction involved"
28
28
  # end
29
29
 
30
- require "common/SalsaTigerRegXML"
30
+ require 'common/SalsaTigerRegXML'
31
31
 
32
32
  class Headz
33
33
 
@@ -234,7 +234,7 @@ end
234
234
  ################
235
235
  module EnumerableBool
236
236
  ###
237
- # And_{x \in X} block(x)
237
+ # And_(x \in X) block(x)
238
238
  def big_and(&block)
239
239
  each { |x|
240
240
  unless block.call(x)
@@ -245,7 +245,7 @@ module EnumerableBool
245
245
  end
246
246
 
247
247
  ###
248
- # Or_{x \in X} block(x)
248
+ # Or_(x \in X) block(x)
249
249
  def big_or(&block)
250
250
  each { |x|
251
251
  if block.call(x)
@@ -256,7 +256,7 @@ module EnumerableBool
256
256
  end
257
257
 
258
258
  ###
259
- # Sum_{x \in X} block(x)
259
+ # Sum_(x \in X) block(x)
260
260
  def big_sum(init = 0, &block)
261
261
  sum = init
262
262
  unless block_given?
@@ -432,7 +432,16 @@ class NoncontiguousContextProvider < AbstractContextProvider
432
432
  File.symlink(filename, frprep_in + "infile")
433
433
 
434
434
  # call frprep
435
- retv = Kernel.system("ruby frprep.rb -e #{tf_exp_frprep.path()}")
435
+ # AB: Bad hack, find a way to invoke FrPrep directly.
436
+ # We will need an FrPrep instance and an options object.
437
+ base_dir_path = File.expand_path(File.dirname(__FILE__) + '/../..')
438
+
439
+ # Remove this
440
+ FileUtils.cp(tf_exp_frprep.path, '/tmp/frprep.exp')
441
+ # after debugging
442
+
443
+ retv = system("ruby -rubygems -I #{base_dir_path}/lib #{base_dir_path}/bin/frprep -e #{tf_exp_frprep.path}")
444
+
436
445
  unless retv
437
446
  $stderr.puts "Error analyzing #{filename}. Exiting."
438
447
  exit 1
@@ -469,6 +478,8 @@ class NoncontiguousContextProvider < AbstractContextProvider
469
478
 
470
479
  # remove temporary data
471
480
  temptable_obj.drop_temp_table()
481
+
482
+ # AB: TODO Rewrite this passage using pure Ruby.
472
483
  %x{rm -rf #{frprep_in}}
473
484
  %x{rm -rf #{frprep_out}}
474
485
  %x{rm -rf #{frprep_dir}}
@@ -487,6 +498,7 @@ class NoncontiguousContextProvider < AbstractContextProvider
487
498
  # - hash table containing all hash keys
488
499
  def make_index(dir)
489
500
 
501
+ # AB: Why these limits? Use constants!
490
502
  space_for_sentstring = 30000
491
503
  space_for_hashkey = 500
492
504
 
@@ -644,7 +656,7 @@ class NoncontiguousContextProvider < AbstractContextProvider
644
656
  end
645
657
  # and enter recursion
646
658
  remove_files(subdir)
647
- File.rm_f(subdir)
659
+ FileUtils.rm_f(subdir)
648
660
  }
649
661
  end
650
662