frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,13 @@
1
+ # -*- encoding: us-ascii -*-
2
+
3
+ # AB, 2010-11-25
4
+
5
+
6
+ # It is a general class for parsing options.
7
+ # It is now emtpy, we are implementing three different classes:
8
+ # FRPrepOptionParser, RosyOptionParser and FredOptionParser.
9
+ # All classes above inherit from OptionParser.
10
+ #--
11
+ # TODO: move the functionality to the parent class.
12
+ class OptionParser
13
+ end
@@ -0,0 +1,310 @@
1
+ # Katrin Erk Oct 05
2
+ #
3
+ # useful extensions to standard classes
4
+
5
+ require 'fileutils'
6
+
7
+ class String
8
+ def startswith(other_string)
9
+ self[0..other_string.length() - 1] == other_string
10
+ end
11
+
12
+ def endswith(other_string)
13
+ not(other_string.length() > self.length()) and
14
+ self[self.length() - other_string.length()..-1] == other_string
15
+ end
16
+ end
17
+
18
+ class File
19
+ ########
20
+ # check whether a given path exists,
21
+ # and if it doesn't, make sure it is created.
22
+ #
23
+ # piece together the strings in 'pieces' to make the path,
24
+ # appending "/" to all strings if necessary
25
+ #
26
+ # returns: the path pieced together
27
+ def File.new_dir(*pieces) # strings, to be pieced together
28
+
29
+ dir_path, dummy = File.make_path(pieces, true)
30
+ unless File.exists? dir_path
31
+ FileUtils.mkdir_p dir_path
32
+ end
33
+ # check that all went well in creating the directory)
34
+ File.existing_dir(dir_path)
35
+
36
+ return dir_path
37
+ end
38
+
39
+ ########
40
+ # same as new_dir, but last piece is a filename
41
+ def File.new_filename(*pieces)
42
+ dir_path, whole_path = File.make_path(pieces, false)
43
+ unless File.exists? dir_path
44
+ FileUtils.mkdir_p dir_path
45
+ end
46
+ # check that all went well in creating the directory)
47
+ File.existing_dir(dir_path)
48
+
49
+ return whole_path
50
+ end
51
+
52
+
53
+ #####
54
+ # check whether a given path exists,
55
+ # and report failure of it does not exist.
56
+ #
57
+ # piece together the strings in 'pieces' to make the path,
58
+ # appending "/" to all strings if necessary
59
+ #
60
+ # returns: the path pieced together
61
+ def File.existing_dir(*pieces) # strings
62
+
63
+ dir_path, dummy = File.make_path(pieces, true)
64
+
65
+ unless File.exists? dir_path and File.directory? dir_path
66
+ $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting."
67
+ exit(1)
68
+ end
69
+ unless File.executable? dir_path
70
+ $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
71
+ exit(1)
72
+ end
73
+
74
+ return dir_path
75
+ end
76
+
77
+ ####
78
+ # like existing_dir, but last bit is filename
79
+ def File.existing_filename(*pieces) # strings
80
+
81
+ dir_path, whole_path = File.make_path(pieces, false)
82
+
83
+ unless File.exists? dir_path and File.directory? dir_path
84
+ $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting"
85
+ exit(1)
86
+ end
87
+ unless File.executable? dir_path
88
+ $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
89
+ exit(1)
90
+ end
91
+
92
+ return whole_path
93
+ end
94
+
95
+ ####
96
+ # piece together the strings in 'pieces' to make a path,
97
+ # appending "/" to all but the last string if necessary
98
+ #
99
+ # if 'pieces' is already a string, take that as a one-piece path
100
+ #
101
+ # if dir is true, also append "/" to the last piece of the string
102
+ #
103
+ # the resulting path is expanded: For example, initial
104
+ # ~ is expanded to the setting of $HOME
105
+ #
106
+ # returns: pair of strings (directory_part, whole_path)
107
+ #
108
+ def File.make_path(pieces, # string or array:string
109
+ is_dir = false) # Boolean: is the path a directory?
110
+
111
+ if pieces.kind_of? String
112
+ pieces = [ pieces ]
113
+ end
114
+
115
+ dir = ""
116
+ # iterate over all but the filename
117
+ if is_dir
118
+ last_dir_index = -1
119
+ else
120
+ last_dir_index = -2
121
+ end
122
+ pieces[0..last_dir_index].each { |piece|
123
+ if piece.nil?
124
+ # whoops, nil entry in name of path!
125
+ $stderr.puts "File.make_path ERROR: nil for piece of path name."
126
+ next
127
+ end
128
+ if piece =~ /\/$/
129
+ dir << piece
130
+ else
131
+ dir << piece << "/"
132
+ end
133
+ }
134
+ dir = File.expand_path(dir)
135
+ # expand_path removes the final "/" again
136
+ unless dir =~ /\/$/
137
+ dir = dir + "/"
138
+ end
139
+
140
+ if is_dir
141
+ return [dir, dir]
142
+ else
143
+ return [dir, dir + pieces[-1]]
144
+ end
145
+ end
146
+
147
+ end
148
+
149
+ #############################################
150
+ class Array
151
+
152
+ ###
153
+ # interleave N arrays:
154
+ # given arrays [a1... an], [b1,...,bn], ..[z1, ...,zn]
155
+ # return [[a1,b1, .., z1]...,[an,bn, .., zn]]
156
+ #
157
+ # if one array is longer than the other,
158
+ # e.g. [a1...an], [b1,...,bm] with n> m
159
+ # the result is
160
+ # [[a1,b1],...[am, bm], [am+1, nil], ..., [an, nil]]
161
+ # and analogously for m>n
162
+ def interleave(*arrays)
163
+ len = [length(), arrays.map { |a| a.length() }.max()].max()
164
+ (0..len-1).to_a.map { |ix|
165
+ [at(ix)] + arrays.map { |a| a[ix] }
166
+ }
167
+ end
168
+
169
+ ###
170
+ # prepend: prepend element to array
171
+ # because I can never remember which is 'shift'
172
+ # and which is 'unshift'
173
+ def prepend(element)
174
+ unshift(element)
175
+ end
176
+
177
+ ###
178
+ # count the number of occurrences of element in this array
179
+ def count(element)
180
+ num = 0
181
+ each { |my_element|
182
+ if my_element == element
183
+ num += 1
184
+ end
185
+ }
186
+ return num
187
+ end
188
+
189
+ ###
190
+ # count the number of occurrences of
191
+ # elements from list in this array
192
+ def counts(list)
193
+ num = 0
194
+ each { |my_element|
195
+ if list.include? my_element
196
+ num += 1
197
+ end
198
+ }
199
+ return num
200
+ end
201
+
202
+ ###
203
+ # draw a random sample of size N
204
+ # from this array
205
+ def sample(size)
206
+ if size < 0
207
+ return nil
208
+ elsif size == 0
209
+ return []
210
+ elsif size >= length()
211
+ return self.clone()
212
+ end
213
+
214
+ rank = Hash.new()
215
+ each { |my_element|
216
+ rank[my_element] = rand()
217
+ }
218
+ return self.sort { |a, b| rank[a] <=> rank[b] }[0..size-1]
219
+ end
220
+ end
221
+
222
+ class Float
223
+ ###
224
+ # round a float to the given number of decimal points
225
+ def round_to_decpts(n)
226
+ if self.nan?
227
+ return self
228
+ else
229
+ return (self * 10**n).round.to_f / 10**n
230
+ end
231
+ end
232
+ end
233
+
234
+ ################
235
+ module EnumerableBool
236
+ ###
237
+ # And_{x \in X} block(x)
238
+ def big_and(&block)
239
+ each { |x|
240
+ unless block.call(x)
241
+ return false
242
+ end
243
+ }
244
+ return true
245
+ end
246
+
247
+ ###
248
+ # Or_{x \in X} block(x)
249
+ def big_or(&block)
250
+ each { |x|
251
+ if block.call(x)
252
+ return true
253
+ end
254
+ }
255
+ return false
256
+ end
257
+
258
+ ###
259
+ # Sum_{x \in X} block(x)
260
+ def big_sum(init = 0, &block)
261
+ sum = init
262
+ unless block_given?
263
+ block = Proc.new { |x| x}
264
+ end
265
+ each { |x|
266
+ sum += block.call(x)
267
+ }
268
+ return sum
269
+ end
270
+ end
271
+
272
+ ################
273
+ # Given an enumerable, distribute its items into two bins (arrays)
274
+ # depending on whether the block returns true
275
+ module EnumerableDistribute
276
+ def distribute(&block)
277
+ retv1 = Array.new
278
+ retv2 = Array.new
279
+ each { |x|
280
+ if block.call(x)
281
+ retv1 << x
282
+ else
283
+ retv2 << x
284
+ end
285
+ }
286
+ return [retv1, retv2]
287
+ end
288
+ end
289
+
290
+ #####################
291
+ # map with index
292
+ module MapWithIndex
293
+ def map_with_index(&block)
294
+ retv = Array.new
295
+
296
+ each_with_index { |x, index|
297
+ retv << block.call(x, index)
298
+ }
299
+
300
+ return retv
301
+ end
302
+ end
303
+
304
+ # include new Mixins into array already.
305
+ # for other classes, do this when requiring StandardPkgExtensions
306
+ class Array
307
+ include EnumerableBool
308
+ include EnumerableDistribute
309
+ include MapWithIndex
310
+ end
@@ -0,0 +1,150 @@
1
+ # Baseline
2
+ # Katrin Erk April 05
3
+ #
4
+ # baseline for WSD:
5
+ # always assign most frequent sense
6
+ # The baseline doesn't do binary classifiers.
7
+
8
+ require "fred/FredConventions"
9
+ require "fred/FredSplitPkg"
10
+ require "fred/FredFeatures"
11
+ require "fred/FredDetermineTargets"
12
+
13
+ class Baseline
14
+ ###
15
+ # new
16
+ #
17
+ # get splitlog dir (if any) along with everything else
18
+ # because we are only evaluating the training data
19
+ # at test time
20
+ #
21
+ def initialize(exp, # FredConfigData object
22
+ split_id = nil) # string: split ID
23
+ @exp = exp
24
+ @split_id = split_id
25
+
26
+ # for each lemma: remember prevalent sense
27
+ @lemma_to_sense = Hash.new()
28
+
29
+ if @split_id
30
+ split_obj = FredSplitPkg.new(@exp)
31
+ end
32
+
33
+ lemma_done = Hash.new()
34
+
35
+ # iterate through lemmas
36
+ @target_obj = Targets.new(@exp, nil, "r")
37
+ unless @target_obj.targets_okay
38
+ # error during initialization
39
+ $stderr.puts "Error: Could not read list of known targets, bailing out."
40
+ exit 1
41
+ end
42
+
43
+ @target_obj.get_lemmas().each { |lemmapos|
44
+
45
+ if @split_id
46
+ # read training split of answer keys
47
+ answer_obj = AnswerKeyAccess.new(@exp, "train", lemmapos, "r", @split_id, "train")
48
+ else
49
+ # read full answer key file of training data
50
+ answer_obj = AnswerKeyAccess.new(@exp, "train", lemmapos, "r")
51
+ end
52
+
53
+ count_senses = Hash.new(0)
54
+
55
+ answer_obj.each { |lemma, pos, ids, sid, senses_all, senses_this|
56
+ # senses_this may include more than one sense for multi-label assignment
57
+ senses_this.each { |sense|
58
+ count_senses[sense] += 1
59
+ }
60
+ }
61
+
62
+ @lemma_to_sense[lemmapos] = count_senses.keys().max { |a, b|
63
+ count_senses[a] <=> count_senses[b]
64
+ }
65
+ }
66
+
67
+
68
+ @lemma = nil
69
+ end
70
+
71
+ ###
72
+ def train(infilename)
73
+ # no training here
74
+ end
75
+
76
+ ###
77
+ def write(classifier_file)
78
+ # no classifiers to write
79
+ end
80
+
81
+ def exists?(classifier_file)
82
+ return true
83
+ end
84
+
85
+ def read(classifier_file)
86
+ values = deconstruct_fred_classifier_filename(File.basename(classifier_file))
87
+ @lemma = values["lemma"]
88
+ if @lemma
89
+ return true
90
+ else
91
+ $stderr.puts "Warning: couldn't determine lemma name in #{classifier_file}, skipping"
92
+ return false
93
+ end
94
+ end
95
+
96
+
97
+ def read_resultfile(filename)
98
+ retv = Array.new()
99
+ begin
100
+ f = File.new(filename)
101
+ rescue
102
+ raise "Could not read baseline result file #{filename}"
103
+ end
104
+
105
+ f.each { |line|
106
+ retv << [[ line.chomp(), 1.0 ]]
107
+ }
108
+
109
+ return retv
110
+ end
111
+
112
+ def apply(infilename, outfilename)
113
+ # open input and output file
114
+ begin
115
+ out_f = File.new(outfilename, "w")
116
+ rescue
117
+ $stderr.puts "Error: cannot write to classification output file #{outfilename}."
118
+ exit 1
119
+ end
120
+ begin
121
+ f = File.new(infilename)
122
+ rescue
123
+ $stderr.puts "Error: cannot read feature file #{infilename}."
124
+ exit 1
125
+ end
126
+
127
+ # deconstruct input filename to determine lemma
128
+ unless @lemma
129
+ # something went wrong in read()
130
+ return false
131
+ end
132
+
133
+ # do we have a sense for this?
134
+ unless (sense = @lemma_to_sense[@lemma])
135
+ # nope: assign "NONE" (or whatever the null label is here)
136
+ sense = @exp.get("negsense")
137
+ unless sense
138
+ sense = "NONE"
139
+ end
140
+ end
141
+
142
+ f.each { |line|
143
+ out_f.puts sense
144
+ }
145
+ out_f.close()
146
+ f.close()
147
+
148
+ return true
149
+ end
150
+ end