frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,13 @@
1
+ # -*- encoding: us-ascii -*-
2
+
3
+ # AB, 2010-11-25
4
+
5
+
6
+ # It is a general class for parsing options.
7
+ # It is now empty; we are implementing three different classes:
8
+ # FRPrepOptionParser, RosyOptionParser and FredOptionParser.
9
+ # All classes above inherit from OptionParser.
10
+ #--
11
+ # TODO: move the functionality to the parent class.
12
class OptionParser
  # Deliberately empty for now: this is only the common ancestor of the
  # tool-specific option parsers, which still carry all functionality.
end
@@ -0,0 +1,310 @@
1
+ # Katrin Erk Oct 05
2
+ #
3
+ # useful extensions to standard classes
4
+
5
+ require 'fileutils'
6
+
7
# Python-style prefix/suffix predicates on String.
class String
  ###
  # Does this string begin with other_string?
  # The empty string counts as a prefix of every string.
  #
  # returns: Boolean
  def startswith(other_string)
    # The hand-rolled slice self[0..len-1] turned into self[0..-1] for an
    # empty prefix and wrongly answered false; the core predicate does not.
    start_with?(other_string)
  end

  ###
  # Does this string end with other_string?
  # The empty string counts as a suffix of every string.
  #
  # returns: Boolean
  def endswith(other_string)
    end_with?(other_string)
  end
end
17
+
18
# Path helpers added to File: piece a path together from strings and either
# create its directory part or insist that it already exists.
class File
  ########
  # check whether a given path exists,
  # and if it doesn't, make sure it is created.
  #
  # piece together the strings in 'pieces' to make the path,
  # appending "/" to all strings if necessary
  #
  # Exits the process (status 1) if the directory is still missing or
  # cannot be entered afterwards.
  #
  # returns: the path pieced together
  def File.new_dir(*pieces) # strings, to be pieced together
    dir_path, = File.make_path(pieces, true)
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere
    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
    # check that all went well in creating the directory
    File.existing_dir(dir_path)

    dir_path
  end

  ########
  # same as new_dir, but last piece is a filename:
  # only the directory part is created, the file itself is not touched
  #
  # returns: the whole path (directory part plus filename)
  def File.new_filename(*pieces)
    dir_path, whole_path = File.make_path(pieces, false)
    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
    # check that all went well in creating the directory
    File.existing_dir(dir_path)

    whole_path
  end

  #####
  # check whether a given path exists,
  # and report failure if it does not exist.
  #
  # piece together the strings in 'pieces' to make the path,
  # appending "/" to all strings if necessary
  #
  # Exits the process (status 1) if the path is not a directory or the
  # directory cannot be entered.
  #
  # returns: the path pieced together
  def File.existing_dir(*pieces) # strings
    dir_path, = File.make_path(pieces, true)

    # directory? is false for missing paths, so no separate existence test
    unless File.directory?(dir_path)
      $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting."
      exit(1)
    end
    # the executable bit on a directory is what permits entering it
    unless File.executable?(dir_path)
      $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
      exit(1)
    end

    dir_path
  end

  ####
  # like existing_dir, but last bit is filename:
  # only the directory part is checked, not the file itself
  #
  # returns: the whole path (directory part plus filename)
  def File.existing_filename(*pieces) # strings
    dir_path, whole_path = File.make_path(pieces, false)

    unless File.directory?(dir_path)
      $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting"
      exit(1)
    end
    unless File.executable?(dir_path)
      $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
      exit(1)
    end

    whole_path
  end

  ####
  # piece together the strings in 'pieces' to make a path,
  # appending "/" to all but the last string if necessary
  #
  # if 'pieces' is already a string, take that as a one-piece path
  #
  # if is_dir is true, also append "/" to the last piece of the string
  #
  # nil pieces are reported on stderr and skipped
  #
  # the resulting path is expanded: For example, initial
  # ~ is expanded to the setting of $HOME
  #
  # returns: pair of strings (directory_part, whole_path)
  #
  def File.make_path(pieces,         # string or array:string
                     is_dir = false) # Boolean: is the path a directory?
    pieces = [pieces] if pieces.kind_of? String

    # all pieces are directories, or all but the trailing filename
    dir_pieces = is_dir ? pieces : pieces[0..-2]

    dir = "".dup
    dir_pieces.each do |piece|
      if piece.nil?
        # whoops, nil entry in name of path!
        $stderr.puts "File.make_path ERROR: nil for piece of path name."
        next
      end
      dir << piece
      dir << "/" unless piece.end_with?("/")
    end

    dir = File.expand_path(dir)
    # expand_path removes the final "/" again
    dir += "/" unless dir.end_with?("/")

    return [dir, dir] if is_dir

    filename = pieces[-1]
    if filename.nil?
      # treat a missing filename like any other nil piece instead of
      # failing with a TypeError on string concatenation
      $stderr.puts "File.make_path ERROR: nil for piece of path name."
      filename = ""
    end
    [dir, dir + filename]
  end
end
148
+
149
+ #############################################
150
# Convenience methods on Array. Where a method shares its name with a core
# Array method (count, sample, prepend) it accepts everything the core
# method accepts, so code relying on the core behaviour keeps working.
class Array
  ###
  # interleave N arrays:
  # given arrays [a1... an], [b1,...,bn], ..[z1, ...,zn]
  # return [[a1,b1, .., z1]...,[an,bn, .., zn]]
  #
  # if one array is longer than the other,
  # e.g. [a1...an], [b1,...,bm] with n> m
  # the result is
  # [[a1,b1],...[am, bm], [am+1, nil], ..., [an, nil]]
  # and analogously for m>n
  #
  # called without arguments, wraps each element in a one-element array
  def interleave(*arrays)
    # include our own length in the list so that an empty 'arrays'
    # does not end up comparing an Integer with nil
    len = ([length] + arrays.map { |a| a.length }).max
    Array.new(len) { |ix| [at(ix)] + arrays.map { |a| a[ix] } }
  end

  ###
  # prepend: prepend element(s) to array
  # because I can never remember which is 'shift'
  # and which is 'unshift'
  #
  # returns: this array
  def prepend(*elements)
    unshift(*elements)
  end

  ###
  # count the number of occurrences of element in this array
  #
  # with no argument, or with a block, behaves like Enumerable#count
  def count(*args, &block)
    return super unless args.length == 1 && block.nil?

    element = args.first
    inject(0) { |num, my_element| my_element == element ? num + 1 : num }
  end

  ###
  # count the number of occurrences of
  # elements from list in this array
  def counts(list)
    inject(0) { |num, my_element| list.include?(my_element) ? num + 1 : num }
  end

  ###
  # draw a random sample of size N
  # from this array
  #
  # size:   nil (default) -> one random element, or nil if the array is empty
  #         negative      -> nil
  #         0             -> []
  #         >= length     -> a copy of the whole array, in original order
  # random: random number source responding to rand
  #
  # returns: array of 'size' elements drawn without replacement
  def sample(size = nil, random: Random)
    if size.nil?
      return empty? ? nil : at(random.rand(length))
    end
    return nil if size < 0
    return [] if size == 0
    return clone if size >= length

    # shuffling positions rather than ranking by element value keeps the
    # draw uniform when the array contains duplicates
    shuffle(random: random).first(size)
  end
end
221
+
222
class Float
  ###
  # round a float to the given number of decimal points
  #
  # NaN and +/-Infinity are returned unchanged, as is any value so large
  # that scaling it by 10**n leaves the Float range (such a value has no
  # digits at that position that could be rounded away).
  def round_to_decpts(n)
    # Float#round raises FloatDomainError for NaN and Infinity
    return self unless finite?

    factor = 10**n
    scaled = self * factor
    return self unless scaled.finite?

    scaled.round.to_f / factor
  end
end
233
+
234
+ ################
235
# Boolean and arithmetic folds for any class that provides each().
# In all three methods the block is optional; without one, the elements
# themselves are used.
module EnumerableBool
  ###
  # And_{x \in X} block(x)
  #
  # stops at the first element for which the block is false or nil
  #
  # returns: true or false (true for an empty collection)
  def big_and(&block)
    each do |x|
      return false unless (block ? block.call(x) : x)
    end
    true
  end

  ###
  # Or_{x \in X} block(x)
  #
  # stops at the first element for which the block is neither false nor nil
  #
  # returns: true or false (false for an empty collection)
  def big_or(&block)
    each do |x|
      return true if (block ? block.call(x) : x)
    end
    false
  end

  ###
  # Sum_{x \in X} block(x)
  #
  # init: value the sum starts from
  #
  # returns: init plus all block values, combined with +
  def big_sum(init = 0, &block)
    sum = init
    each do |x|
      sum += block ? block.call(x) : x
    end
    sum
  end
end
271
+
272
+ ################
273
+ # Given an enumerable, distribute its items into two bins (arrays)
274
+ # depending on whether the block returns true
275
module EnumerableDistribute
  ###
  # Split the items produced by each() into two arrays: first the items
  # for which the block returns true, then all remaining items.
  # Order within each array follows iteration order.
  #
  # returns: pair [accepted, rejected];
  #          an Enumerator if no block is given
  def distribute(&block)
    return to_enum(:distribute) unless block

    accepted = []
    rejected = []
    each do |x|
      (block.call(x) ? accepted : rejected) << x
    end
    [accepted, rejected]
  end
end
289
+
290
+ #####################
291
+ # map with index
292
module MapWithIndex
  ###
  # Like map, but the block receives each item together with its
  # zero-based position, as delivered by each_with_index.
  #
  # returns: array of the block's results, in iteration order;
  #          an Enumerator if no block is given
  def map_with_index(&block)
    return to_enum(:map_with_index) unless block

    results = []
    each_with_index do |x, index|
      results << block.call(x, index)
    end
    results
  end
end
303
+
304
+ # include new Mixins into array already.
305
+ # for other classes, do this when requiring StandardPkgExtensions
306
# Mix the three helper modules into Array one after the other, in this
# order, so that every array offers big_and/big_or/big_sum, distribute
# and map_with_index.
[EnumerableBool, EnumerableDistribute, MapWithIndex].each do |mixin|
  Array.include(mixin)
end
@@ -0,0 +1,150 @@
1
+ # Baseline
2
+ # Katrin Erk April 05
3
+ #
4
+ # baseline for WSD:
5
+ # always assign most frequent sense
6
+ # The baseline doesn't do binary classifiers.
7
+
8
+ require "fred/FredConventions"
9
+ require "fred/FredSplitPkg"
10
+ require "fred/FredFeatures"
11
+ require "fred/FredDetermineTargets"
12
+
13
# Most-frequent-sense baseline, presenting the same train/write/read/apply
# methods as a classifier. All "learning" happens in the constructor,
# which counts senses in the training answer keys; applying the baseline
# just emits the stored sense once per input line.
class Baseline
  ###
  # new
  #
  # get splitlog dir (if any) along with everything else
  # because we are only evaluating the training data
  # at test time
  #
  # Exits the process (status 1) if the list of known targets
  # cannot be read.
  def initialize(exp,            # FredConfigData object
                 split_id = nil) # string: split ID
    @exp = exp
    @split_id = split_id

    # for each lemma: remember prevalent sense
    @lemma_to_sense = {}

    # NOTE(review): the object is never used here; it is still constructed
    # in case FredSplitPkg.new has side effects on the experiment -- confirm
    # and drop if it has none.
    FredSplitPkg.new(@exp) if @split_id

    # iterate through lemmas
    @target_obj = Targets.new(@exp, nil, "r")
    unless @target_obj.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    @target_obj.get_lemmas().each do |lemmapos|
      answer_obj =
        if @split_id
          # read training split of answer keys
          AnswerKeyAccess.new(@exp, "train", lemmapos, "r", @split_id, "train")
        else
          # read full answer key file of training data
          AnswerKeyAccess.new(@exp, "train", lemmapos, "r")
        end

      count_senses = Hash.new(0)
      answer_obj.each do |_lemma, _pos, _ids, _sid, _senses_all, senses_this|
        # senses_this may include more than one sense for multi-label assignment
        senses_this.each { |sense| count_senses[sense] += 1 }
      end

      # most frequent sense; nil if no senses were seen for this lemma
      @lemma_to_sense[lemmapos] = count_senses.keys.max_by { |sense| count_senses[sense] }
    end

    # set by read(), consumed by apply()
    @lemma = nil
  end

  ###
  # no training here: the counting was done in the constructor
  def train(infilename)
  end

  ###
  # no classifiers to write
  def write(classifier_file)
  end

  ###
  # there is no classifier file to look for, so it always "exists"
  #
  # returns: true
  def exists?(classifier_file)
    true
  end

  ###
  # "read" a classifier: only remembers, for the next call to apply(),
  # the lemma encoded in the classifier file name
  #
  # returns: true if a lemma could be determined, else false
  def read(classifier_file)
    values = deconstruct_fred_classifier_filename(File.basename(classifier_file))
    @lemma = values["lemma"]
    return true if @lemma

    $stderr.puts "Warning: couldn't determine lemma name in #{classifier_file}, skipping"
    false
  end

  ###
  # read a file written by apply(): one sense label per line
  #
  # returns: array with one entry per line,
  #          each of the form [[label, 1.0]]
  # raises:  RuntimeError if the file cannot be opened
  def read_resultfile(filename)
    begin
      f = File.new(filename)
    rescue
      raise "Could not read baseline result file #{filename}"
    end

    begin
      f.map { |line| [[line.chomp, 1.0]] }
    ensure
      # the handle used to be left open
      f.close
    end
  end

  ###
  # write the baseline sense for the lemma set by read() to outfilename,
  # once for every line of infilename. If no sense is known for the lemma,
  # the experiment's "negsense" setting is used, or "NONE" if that is unset.
  #
  # Exits the process (status 1) if either file cannot be opened.
  #
  # returns: true on success, false if read() did not determine a lemma
  #          (the output file has been created, empty, in that case)
  def apply(infilename, outfilename)
    # open input and output file
    begin
      out_f = File.new(outfilename, "w")
    rescue
      $stderr.puts "Error: cannot write to classification output file #{outfilename}."
      exit 1
    end
    begin
      f = File.new(infilename)
    rescue
      out_f.close
      $stderr.puts "Error: cannot read feature file #{infilename}."
      exit 1
    end

    begin
      # something went wrong in read()
      return false unless @lemma

      # do we have a sense for this?
      # nope: assign "NONE" (or whatever the null label is here)
      sense = @lemma_to_sense[@lemma] || @exp.get("negsense") || "NONE"

      f.each { |_line| out_f.puts sense }
      true
    ensure
      # runs on the early return as well, so neither handle leaks
      out_f.close
      f.close
    end
  end
end