shalmaneser-prep 1.2.0.rc4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +93 -0
- data/lib/frprep/Ampersand.rb +39 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/TreetaggerInterface.rb +327 -0
- data/lib/frprep/do_parses.rb +143 -0
- data/lib/frprep/frprep.rb +693 -0
- data/lib/frprep/interfaces/berkeley_interface.rb +372 -0
- data/lib/frprep/interfaces/stanford_interface.rb +353 -0
- data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
- data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +58 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +99 -0
- data/test/functional/test_rosy.rb +40 -0
- metadata +85 -0
@@ -0,0 +1,22 @@
|
|
1
|
+
# AB: 2013-12-25
|
2
|
+
# Interpreter class registered for output of the "berkeley" parser.
# All interpretation behaviour is inherited from Tiger; this class only
# declares which systems it is responsible for.
class BerkeleyInterpreter < Tiger
  # Register this class (announce_me is not defined in this file;
  # presumably inherited via Tiger -- verify against the superclass).
  announce_me

  class << self
    # Names of the systems interpreted by this class.
    #
    # @return [Hash{String=>String}] service name => system name,
    #   in the same format as e.g.
    #   { "parser" => "collins", "lemmatizer" => "treetagger" }
    def systems
      { "parser" => "berkeley" }
    end

    # Names of additional systems that may be interpreted by this class.
    #
    # @return [Hash{String=>String}] service name => system name,
    #   same format as the return value of systems
    def optional_systems
      {
        "lemmatizer" => "treetagger",
        "pos_tagger" => "treetagger"
      }
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# AB: 2013-12-25
|
2
|
+
# Interpreter class registered for output of the "stanford" parser.
# All interpretation behaviour is inherited from Tiger; this class only
# declares which systems it is responsible for.
class StanfordInterpreter < Tiger
  # Register this class (announce_me is not defined in this file;
  # presumably inherited via Tiger -- verify against the superclass).
  announce_me

  class << self
    # Names of the systems interpreted by this class.
    #
    # @return [Hash{String=>String}] service name => system name,
    #   in the same format as e.g.
    #   { "parser" => "collins", "lemmatizer" => "treetagger" }
    def systems
      { "parser" => "stanford" }
    end

    # Names of additional systems that may be interpreted by this class.
    #
    # @return [Hash{String=>String}] service name => system name,
    #   same format as the return value of systems
    def optional_systems
      {
        "lemmatizer" => "treetagger",
        "pos_tagger" => "treetagger"
      }
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# AB, 2010-11-25
|
3
|
+
|
4
|
+
|
5
|
+
##############################
|
6
|
+
# class for managing the parses of one file
|
7
|
+
# Manages the parses of one file: remembers which parse file to read and
# which object knows how to iterate over its sentences.
class OneParsedFile
  # @return [String] core of the filename for the parse file
  attr_reader :filename

  # @param filename [String] core of filename for the parse file
  # @param complete_filename [String] complete filename of the parse file
  # @param obj_with_iterator [#each_sentence] object whose each_sentence
  #   method takes the complete filename and yields three values per sentence
  def initialize(filename, complete_filename, obj_with_iterator)
    @obj_with_iterator = obj_with_iterator
    @filename = filename
    @complete_filename = complete_filename
  end

  # Yield each parsed sentence as a single three-element array
  #   [salsa/tiger xml sentence, tab format sentence, mapping]
  # i.e. a SalsaTigerSentence object, a FNTabSentence object, and a hash
  # FNTab sentence lineno (integer) -> array:SynNode pointing each tab word
  # to one or more SalsaTigerSentence terminals.
  #
  # When called without a block, an Enumerator over the same triples is
  # returned instead of raising LocalJumpError.
  #
  # @yieldparam triple [Array] [st_sent, tab_sent, mapping]
  # @return whatever the wrapped iterator's each_sentence returns,
  #   or an Enumerator when no block is given
  def each_sentence
    return enum_for(:each_sentence) unless block_given?

    @obj_with_iterator.each_sentence(@complete_filename) do |st_sent, tab_sent, mapping|
      yield [st_sent, tab_sent, mapping]
    end
  end
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# AB, 2010-11-25
|
4
|
+
|
5
|
+
require 'optparse'
|
6
|
+
require 'common/prep_config_data'
|
7
|
+
require 'common/SynInterfaces'
|
8
|
+
module FrPrep

  # This class parses the command line options for FrPrep.
  class OptParser

    # Main class method.
    # OP expects cmd_args to be an array like ARGV.
    #
    # Prints a message to stderr and exits with status 1 if cmd_args is
    # empty or contains an invalid option or option argument; prints the
    # help text and exits if --help is given. Any other error raised while
    # parsing is propagated to the caller.
    #
    # @param cmd_args [Array<String>] command line arguments
    # @return [FrPrepConfigData] the experiment configuration built from
    #   the file given with -e/--expfile
    # @raise [RuntimeError] if the configured prep_experiment_ID contains
    #   characters other than A-Za-z0-9_
    def self.parse(cmd_args)
      @prg_name = 'frprep'
      @@options = {}

      parser = create_parser

      # If no options provided print a hint and stop.
      if cmd_args.empty?
        $stderr.puts('You have to provide some options.',
                     "Please start with <#{@prg_name} --help>.")
        exit(1)
      end

      # Parse the arguments and fill the options hash.
      # Invalid options/arguments are reported in a user-friendly way;
      # everything else is deliberately left to propagate.
      begin
        parser.parse(cmd_args)
      rescue OptionParser::InvalidArgument => e
        arg = e.message.split.last
        $stderr.puts "The provided argument #{arg} is currently not supported!"
        $stderr.puts "Please consult <#{@prg_name} --help>."
        exit(1)
      rescue OptionParser::InvalidOption => e
        $stderr.puts "You have provided an #{e.message}."
        $stderr.puts "Please consult <#{@prg_name} --help>."
        exit(1)
      end

      # NOTE(review): -e is documented as required but is not enforced here;
      # without it FrPrepConfigData receives nil -- confirm how it handles that.
      exp = FrPrepConfigData.new(@@options[:exp_file])

      # AB: this stuff should be moved into FrPrepConfigData.
      # Sanity check. \A and \z anchor the whole string; ^ and $ would
      # accept any multi-line value that merely contains one valid line.
      unless exp.get("prep_experiment_ID") =~ /\A[A-Za-z0-9_]+\z/
        raise "Please choose an experiment ID consisting only of the letters A-Za-z0-9_."
      end

      SynInterfaces.check_interfaces_abort_if_missing(exp)

      exp
    end

    # Build the OptionParser that knows FrPrep's options.
    # The -e handler stores the expanded experiment file path in
    # @@options[:exp_file].
    #
    # @return [OptionParser]
    def self.create_parser
      OptionParser.new do |opts|
        opts.banner = <<STOP
Fred Preprocessor <FrPrep>. Preprocessing stage before Fred and Rosy
for further frame/word sense assignment and semantic role assignment.

Usage: frprep -h|-e FILENAME
STOP
        opts.separator ''
        opts.separator 'Program specific options:'

        opts.on('-e', '--expfile FILENAME',
                'Provide the path to an experiment file.',
                'FrPrep will preprocess data according to the specifications',
                'given in your experiment file.',
                'This option is required!',
                'Also consider the documentation on format and features.'
               ) do |exp_file|
          @@options[:exp_file] = File.expand_path(exp_file)
        end

        opts.separator ''
        opts.separator 'Common options:'

        opts.on_tail('-h', '--help', 'Show this help message.') do
          puts opts
          exit
        end

      end

    end # def self.create_parser

    # A bare `private` has no effect on methods defined with `def self.`,
    # so the helper is hidden explicitly.
    private_class_method :create_parser

  end # class OptParser

end # module FrPrep
|
@@ -0,0 +1,310 @@
|
|
1
|
+
# Katrin Erk Oct 05
|
2
|
+
#
|
3
|
+
# useful extensions to standard classes
|
4
|
+
|
5
|
+
require 'fileutils'
|
6
|
+
|
7
|
+
class String
  # Does this string begin with other_string?
  # The empty string is a prefix of every string (the previous slicing
  # implementation wrongly answered false for "abc".startswith("")).
  #
  # @param other_string [String] candidate prefix
  # @return [Boolean]
  def startswith(other_string)
    start_with?(other_string)
  end

  # Does this string end with other_string?
  #
  # @param other_string [String] candidate suffix
  # @return [Boolean]
  def endswith(other_string)
    end_with?(other_string)
  end
end
|
17
|
+
|
18
|
+
class File
  ########
  # Check whether a given directory path exists,
  # and if it doesn't, make sure it is created.
  #
  # Pieces together the strings in 'pieces' to make the path,
  # appending "/" to all strings if necessary.
  #
  # Exits the process (via File.existing_dir) if the directory still does
  # not exist or cannot be accessed afterwards.
  #
  # @param pieces [Array<String>] strings, to be pieced together
  # @return [String] the path pieced together (expanded, with trailing "/")
  def self.new_dir(*pieces)
    dir_path, = File.make_path(pieces, true)
    # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2
    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
    # check that all went well in creating the directory
    File.existing_dir(dir_path)

    dir_path
  end

  ########
  # Same as new_dir, but the last piece is a filename: only the directory
  # part is created; the file itself is not touched.
  #
  # @param pieces [Array<String>] directory pieces followed by a filename
  # @return [String] the whole path (directory part + filename)
  def self.new_filename(*pieces)
    dir_path, whole_path = File.make_path(pieces, false)
    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
    # check that all went well in creating the directory
    File.existing_dir(dir_path)

    whole_path
  end

  #####
  # Check whether a given directory path exists,
  # and report failure if it does not exist.
  #
  # Pieces together the strings in 'pieces' to make the path,
  # appending "/" to all strings if necessary.
  #
  # Prints an error to stderr and exits with status 1 if the path is not
  # an existing directory or is not executable (i.e. cannot be entered).
  #
  # @param pieces [Array<String>] strings
  # @return [String] the path pieced together
  def self.existing_dir(*pieces)
    dir_path, = File.make_path(pieces, true)

    unless File.exist?(dir_path) && File.directory?(dir_path)
      $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting."
      exit(1)
    end
    unless File.executable?(dir_path)
      $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
      exit(1)
    end

    dir_path
  end

  ####
  # Like existing_dir, but the last piece is a filename. Only the directory
  # part is checked; the file itself need not exist.
  #
  # @param pieces [Array<String>] directory pieces followed by a filename
  # @return [String] the whole path (directory part + filename)
  def self.existing_filename(*pieces)
    dir_path, whole_path = File.make_path(pieces, false)

    unless File.exist?(dir_path) && File.directory?(dir_path)
      $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting"
      exit(1)
    end
    unless File.executable?(dir_path)
      $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
      exit(1)
    end

    whole_path
  end

  ####
  # Piece together the strings in 'pieces' to make a path,
  # appending "/" to all but the last string if necessary.
  #
  # If 'pieces' is already a string, take that as a one-piece path.
  #
  # If is_dir is true, also append "/" to the last piece of the string.
  #
  # The resulting directory part is expanded with File.expand_path:
  # for example, an initial ~ is expanded to the setting of $HOME.
  # nil entries among the directory pieces are reported on stderr and skipped.
  #
  # @param pieces [String, Array<String>] path pieces
  # @param is_dir [Boolean] is the path a directory?
  # @return [Array(String, String)] pair (directory_part, whole_path);
  #   both are identical when is_dir is true
  def self.make_path(pieces, is_dir = false)
    pieces = [pieces] if pieces.kind_of?(String)

    # when the last piece is a filename, it is not part of the directory
    dir_pieces = is_dir ? pieces[0..-1] : pieces[0..-2]

    slashed = []
    dir_pieces.each do |piece|
      if piece.nil?
        # whoops, nil entry in name of path!
        $stderr.puts "File.make_path ERROR: nil for piece of path name."
        next
      end
      slashed << (piece =~ %r{/$} ? piece : piece + "/")
    end

    dir = File.expand_path(slashed.join)
    # expand_path removes the final "/" again
    dir += "/" unless dir =~ %r{/$}

    if is_dir
      [dir, dir]
    else
      [dir, dir + pieces[-1]]
    end
  end

end
|
148
|
+
|
149
|
+
#############################################
|
150
|
+
class Array

  ###
  # Interleave N arrays:
  # given arrays [a1... an], [b1,...,bn], ..[z1, ...,zn]
  # return [[a1,b1, .., z1]...,[an,bn, .., zn]]
  #
  # If one array is longer than the other,
  # e.g. [a1...an], [b1,...,bm] with n > m,
  # the result is
  # [[a1,b1],...[am, bm], [am+1, nil], ..., [an, nil]]
  # and analogously for m > n.
  #
  # Called without arguments, each element is wrapped in a one-element array.
  #
  # @param arrays [Array<Array>] the arrays to interleave with this one
  # @return [Array<Array>]
  def interleave(*arrays)
    # include our own length in the list so that max never sees nil
    len = ([length] + arrays.map { |a| a.length }).max
    (0...len).map { |ix|
      [at(ix)] + arrays.map { |a| a[ix] }
    }
  end

  ###
  # prepend: prepend element(s) to this array
  # because I can never remember which is 'shift'
  # and which is 'unshift'.
  # Accepts any number of elements, like the built-in Array#prepend.
  #
  # @return [Array] self
  def prepend(*elements)
    unshift(*elements)
  end

  # NOTE: no custom count(element) here. Ruby's built-in Array#count(obj)
  # already counts the elements == obj; redefining it with one mandatory
  # argument broke the built-in forms `count` and `count { |x| ... }`.

  ###
  # Count the number of elements of this array
  # that occur in list.
  #
  # @param list [#include?]
  # @return [Integer]
  def counts(list)
    count { |my_element| list.include?(my_element) }
  end

  ###
  # Draw a random sample of the given size from this array.
  #
  # Without an argument, one random element is returned (nil for an empty
  # array), like the built-in Array#sample that this method replaces.
  #
  # @param size [Integer, nil] number of elements to draw
  # @return [Array, Object, nil] nil for negative size, [] for size 0,
  #   a copy of the whole array (original order) if size >= length,
  #   otherwise size distinct positions of this array in random order
  def sample(size = nil)
    return (empty? ? nil : self[rand(length)]) if size.nil?
    return nil if size < 0
    return [] if size == 0
    return clone if size >= length

    shuffle.first(size)
  end
end
|
221
|
+
|
222
|
+
class Float
  ###
  # Round a float to the given number of decimal points.
  #
  # NaN and +/-Infinity are returned unchanged (rounding them via
  # Float#round without digits would raise FloatDomainError).
  #
  # @param n [Integer] number of decimal points to keep
  # @return [Float]
  def round_to_decpts(n)
    return self if nan? || infinite?

    scale = 10**n
    (self * scale).round.to_f / scale
  end
end
|
233
|
+
|
234
|
+
################
|
235
|
+
# Boolean and arithmetic folds for any class that provides `each`.
module EnumerableBool
  ###
  # And_{x \in X} block(x)
  #
  # Stops at the first element for which the block is false/nil.
  # @return [Boolean] true if the block holds for every element
  def big_and(&block)
    each { |item| return false unless block.call(item) }
    true
  end

  ###
  # Or_{x \in X} block(x)
  #
  # Stops at the first element for which the block is truthy.
  # @return [Boolean] true if the block holds for at least one element
  def big_or(&block)
    each { |item| return true if block.call(item) }
    false
  end

  ###
  # Sum_{x \in X} block(x)
  #
  # Without a block the elements themselves are summed.
  # @param init start value of the sum
  # @return init plus the block values of all elements
  def big_sum(init = 0, &block)
    block = Proc.new { |x| x } unless block_given?

    total = init
    each { |item| total += block.call(item) }
    total
  end
end
|
271
|
+
|
272
|
+
################
|
273
|
+
# Given an enumerable, distribute its items into two bins (arrays)
|
274
|
+
# depending on whether the block returns true
|
275
|
+
# Splits the elements of any class that provides `each` into two bins.
module EnumerableDistribute
  # Distribute the items into two arrays depending on the block's verdict.
  #
  # @yieldparam x an element
  # @return [Array(Array, Array)] pair [accepted, rejected]: elements for
  #   which the block returned a truthy value, then all the others,
  #   each in iteration order
  def distribute(&block)
    accepted = []
    rejected = []
    each do |item|
      bin = block.call(item) ? accepted : rejected
      bin << item
    end
    [accepted, rejected]
  end
end
|
289
|
+
|
290
|
+
#####################
|
291
|
+
# map with index
|
292
|
+
# map that also hands the element's index to the block;
# works for any class that provides `each_with_index`.
module MapWithIndex
  # @yieldparam x an element
  # @yieldparam index [Integer] its position as reported by each_with_index
  # @return [Array] the block results, in iteration order
  def map_with_index(&block)
    results = []
    each_with_index do |item, position|
      results.push(block.call(item, position))
    end
    results
  end
end
|
303
|
+
|
304
|
+
# include new Mixins into array already.
|
305
|
+
# for other classes, do this when requiring StandardPkgExtensions
|
306
|
+
# Mix the helper modules into Array right away, in the order
# EnumerableBool, EnumerableDistribute, MapWithIndex.
class Array
  [EnumerableBool, EnumerableDistribute, MapWithIndex].each do |mixin|
    include mixin
  end
end
|