shalmaneser-prep 1.2.0.rc4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +93 -0
- data/lib/frprep/Ampersand.rb +39 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/TreetaggerInterface.rb +327 -0
- data/lib/frprep/do_parses.rb +143 -0
- data/lib/frprep/frprep.rb +693 -0
- data/lib/frprep/interfaces/berkeley_interface.rb +372 -0
- data/lib/frprep/interfaces/stanford_interface.rb +353 -0
- data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
- data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +58 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +99 -0
- data/test/functional/test_rosy.rb +40 -0
- metadata +85 -0
@@ -0,0 +1,384 @@
|
|
1
|
+
####
|
2
|
+
# sp 21 07 05
|
3
|
+
#
|
4
|
+
# modified ke 30 10 05: adapted to fit into SynInterface
|
5
|
+
#
|
6
|
+
# represents a file containing Sleepy parses
|
7
|
+
#
|
8
|
+
# underlying data structure for individual sentences: SalsaTigerSentence
|
9
|
+
require 'tempfile'
|
10
|
+
|
11
|
+
require 'common/SalsaTigerRegXML'
|
12
|
+
require 'common/SalsaTigerXMLHelper'
|
13
|
+
require 'common/TabFormat'
|
14
|
+
require 'common/Counter'
|
15
|
+
|
16
|
+
require 'common/AbstractSynInterface'
|
17
|
+
require 'common/Tiger.rb'
|
18
|
+
|
19
|
+
################################################
|
20
|
+
# Interface class
|
21
|
+
class SleepyInterface < SynInterfaceSTXML
|
22
|
+
SleepyInterface.announce_me()
|
23
|
+
|
24
|
+
###
|
25
|
+
def SleepyInterface.system()
|
26
|
+
return "sleepy"
|
27
|
+
end
|
28
|
+
|
29
|
+
###
|
30
|
+
def SleepyInterface.service()
|
31
|
+
return "parser"
|
32
|
+
end
|
33
|
+
|
34
|
+
###
|
35
|
+
# initialize to set values for all subsequent processing
|
36
|
+
def initialize(program_path, # string: path to system
|
37
|
+
insuffix, # string: suffix of tab files
|
38
|
+
outsuffix, # string: suffix for parsed files
|
39
|
+
stsuffix, # string: suffix for Salsa/TIGER XML files
|
40
|
+
var_hash = {}) # optional arguments in a hash
|
41
|
+
|
42
|
+
super(program_path, insuffix, outsuffix, stsuffix, var_hash)
|
43
|
+
unless @program_path =~ /\/$/
|
44
|
+
@program_path = @program_path + "/"
|
45
|
+
end
|
46
|
+
|
47
|
+
# new: evaluate var hash
|
48
|
+
@pos_suffix = var_hash["pos_suffix"]
|
49
|
+
@lemma_suffix = var_hash["lemma_suffix"]
|
50
|
+
@tab_dir = var_hash["tab_dir"]
|
51
|
+
end
|
52
|
+
|
53
|
+
####
|
54
|
+
# parse a directory with TabFormat files and write the parse trees to outputdir
|
55
|
+
# I assume that the files in inputdir are smaller than
|
56
|
+
# the maximum number of sentences that
|
57
|
+
# Sleepy can parse in one go (i.e. that they are split)
|
58
|
+
def process_dir(in_dir, # string: input directory name
|
59
|
+
out_dir) # string: output directory name
|
60
|
+
|
61
|
+
sleepy_prog = "#{@program_path}sleepy --beam 1000 --model-file #{@program_path}negra.model --parse "
|
62
|
+
|
63
|
+
Dir[in_dir + "*" + @insuffix].each {|inputfilename|
|
64
|
+
STDERR.puts "*** Parsing #{inputfilename} with Sleepy"
|
65
|
+
corpusfilename = File.basename(inputfilename, @insuffix)
|
66
|
+
parsefilename = out_dir + corpusfilename + @outsuffix
|
67
|
+
tempfile = Tempfile.new(corpusfilename)
|
68
|
+
|
69
|
+
# we need neither lemmata nor POS tags; sleepy can do with the words
|
70
|
+
corpusfile = FNTabFormatFile.new(inputfilename,nil, nil)
|
71
|
+
corpusfile.each_sentence {|sentence|
|
72
|
+
tempfile.puts sentence.to_s
|
73
|
+
}
|
74
|
+
tempfile.close
|
75
|
+
# parse and remove comments in the parser output
|
76
|
+
Kernel.system(sleepy_prog+" "+tempfile.path+" 2>&1 | grep -v \"Span:\" > "+parsefilename)
|
77
|
+
}
|
78
|
+
end
|
79
|
+
|
80
|
+
###
|
81
|
+
# for a given parsed file:
|
82
|
+
# yield each sentence as a pair
|
83
|
+
# [SalsaTigerSentence object, FNTabFormatSentence object]
|
84
|
+
# of the sentence in SalsaTigerXML and the matching tab format sentence
|
85
|
+
#
|
86
|
+
# If a parse has failed, returns
|
87
|
+
# [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
|
88
|
+
# to allow more detailed accounting for failed parses
|
89
|
+
# (basically just a flat structure with a failed=true attribute
|
90
|
+
# at the sentence node)
|
91
|
+
def each_sentence(parsefilename)
|
92
|
+
# sanity checks
|
93
|
+
unless @tab_dir
|
94
|
+
$stderr.puts "SleepyInterface error: Need to set tab directory on initialization"
|
95
|
+
exit 1
|
96
|
+
end
|
97
|
+
|
98
|
+
# get matching tab file for this parser output file
|
99
|
+
parsefile = File.new(parsefilename)
|
100
|
+
tabfilename = @tab_dir+File.basename(parsefilename, @outsuffix)+ @insuffix
|
101
|
+
tabfile = FNTabFormatFile.new(tabfilename, @postag_suffix, @lemma_suffix)
|
102
|
+
|
103
|
+
sentid = 0
|
104
|
+
|
105
|
+
tabfile.each_sentence {|tab_sent| # iterate over corpus sentences
|
106
|
+
|
107
|
+
sentence_str = ""
|
108
|
+
status = true # error encountered?
|
109
|
+
|
110
|
+
# assemble next sentence in Sleepy file by reading lines from parsefile
|
111
|
+
while true
|
112
|
+
line = parsefile.gets
|
113
|
+
case line
|
114
|
+
when /% Parse failed/
|
115
|
+
status = false
|
116
|
+
break
|
117
|
+
when nil # end of file: nothing more to break
|
118
|
+
break
|
119
|
+
when /^%/, /^\s*$/ # empty lines, other comments: end of current sentence
|
120
|
+
unless sentence_str == "" # only break if you have read something
|
121
|
+
break
|
122
|
+
end
|
123
|
+
else
|
124
|
+
sentence_str += line.chomp # collect line of current parse and continue reading
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
# we have reached some kind of end
|
129
|
+
sentid +=1
|
130
|
+
|
131
|
+
# we don't have a sentence: hopefully, this is becase parsing has failed
|
132
|
+
# if this is not the case, we are in trouble
|
133
|
+
if sentence_str == ""
|
134
|
+
case status
|
135
|
+
|
136
|
+
when false
|
137
|
+
# return a SalsaTigerSentence object for the failed sentence
|
138
|
+
# with a virtual top node and one terminal per word.
|
139
|
+
if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
|
140
|
+
my_sent_id = tab_sent.get_sent_id()
|
141
|
+
else
|
142
|
+
my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
|
143
|
+
end
|
144
|
+
sent = SleepyInterface.failed_sentence(tab_sent, my_sent_id)
|
145
|
+
yield [sent, tab_sent, SleepyInterface.standard_mapping(sent, tab_sent)]
|
146
|
+
|
147
|
+
else
|
148
|
+
# this may not happen: we need some sentence for the current
|
149
|
+
# TabFile sentence
|
150
|
+
$stderr.puts "SleepyInterface error: premature end of parser file!"
|
151
|
+
exit 1
|
152
|
+
end
|
153
|
+
else
|
154
|
+
# if we are here, we have a sentence_str to work on
|
155
|
+
# hopefully, our status is OK
|
156
|
+
case status
|
157
|
+
when true
|
158
|
+
if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
|
159
|
+
my_sent_id = tab_sent.get_sent_id()
|
160
|
+
else
|
161
|
+
my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
|
162
|
+
end
|
163
|
+
st_sent = build_salsatiger(" " + sentence_str + " ", 0,
|
164
|
+
Array.new, Counter.new(0),
|
165
|
+
Counter.new(500),
|
166
|
+
SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
|
167
|
+
yield [st_sent, tab_sent, SleepyInterface.standard_mapping(st_sent, tab_sent)]
|
168
|
+
|
169
|
+
else # i.e. when "failed"
|
170
|
+
$stderr.puts "SleepyInterface error: failed parse, but parse tree exists??"
|
171
|
+
exit 1
|
172
|
+
end
|
173
|
+
end
|
174
|
+
}
|
175
|
+
|
176
|
+
# all TabFile sentences are consumed:
|
177
|
+
# now we may just encounter comments, garbage, empty lines etc.
|
178
|
+
|
179
|
+
while not parsefile.eof?
|
180
|
+
case parsefile.gets
|
181
|
+
when nil, /^%/, /^\s*$/ # empty lines, comments, end of input indicate end of current parse
|
182
|
+
else
|
183
|
+
$stderr.puts "SleepyInterface error: premature end of tab file"
|
184
|
+
exit 1
|
185
|
+
end
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
|
190
|
+
###
|
191
|
+
# write Salsa/TIGER XML output to file
|
192
|
+
def to_stxml_file(infilename, # string: name of parse file
|
193
|
+
outfilename) # string: name of output stxml file
|
194
|
+
|
195
|
+
outfile = File.new(outfilename, "w")
|
196
|
+
outfile.puts SalsaTigerXMLHelper.get_header()
|
197
|
+
each_sentence(infilename) { |st_sent, tabsent|
|
198
|
+
outfile.puts st_sent.get()
|
199
|
+
}
|
200
|
+
outfile.puts SalsaTigerXMLHelper.get_footer()
|
201
|
+
outfile.close()
|
202
|
+
end
|
203
|
+
|
204
|
+
|
205
|
+
|
206
|
+
########################
|
207
|
+
private
|
208
|
+
|
209
|
+
###
|
210
|
+
# Recursive function for parsing a Sleepy parse tree and
|
211
|
+
# building a SalsaTigerSentence recursively
|
212
|
+
#
|
213
|
+
# Algorithm: manage stack which contains, for the current constituent,
|
214
|
+
# child constituents (if a nonterminal), and the category label.
|
215
|
+
# When the end of a constituent is reached, a new SynNode (TigerSalsa node) ist created.
|
216
|
+
# All children and the category label are popped from the stack and integrated into the
|
217
|
+
# TigerSalsa data structure. The new node is re-pushed onto the stack.
|
218
|
+
def build_salsatiger(sentence, # string
|
219
|
+
pos, # position in string (index): integer
|
220
|
+
stack, # stack with incomplete nodes: Array
|
221
|
+
termc, # terminal counter
|
222
|
+
nontc, # nonterminal counter
|
223
|
+
sent_obj) # SalsaTigerSentence
|
224
|
+
|
225
|
+
|
226
|
+
# main case distinction: match the beginning of our string
|
227
|
+
# (i.e. what follows our current position in the string)
|
228
|
+
|
229
|
+
case sentence[pos..-1]
|
230
|
+
|
231
|
+
when /^ *$/ # nothing -> whole sentence parsed
|
232
|
+
if stack.length == 1
|
233
|
+
# sleepy always delivers one "top" node; if we don't get just one
|
234
|
+
# node, something has gone wrong
|
235
|
+
node = stack.pop
|
236
|
+
node.del_attribute("gf")
|
237
|
+
return sent_obj
|
238
|
+
else
|
239
|
+
$stderr.puts "SleepyINterface Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
|
240
|
+
exit 1
|
241
|
+
end
|
242
|
+
|
243
|
+
when /^\s*\(([^ )]+) /
|
244
|
+
# match the beginning of a new constituent
|
245
|
+
# (opening bracket + category + space, may not contain closing bracket)
|
246
|
+
cat = $1
|
247
|
+
if cat.nil? or cat == ""
|
248
|
+
$stderr.puts "SleepyInterface Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
|
249
|
+
exit 1
|
250
|
+
end
|
251
|
+
# STDERR.puts "new const #{cat}"
|
252
|
+
stack.push cat # throw the category label on the stack
|
253
|
+
return build_salsatiger(sentence,pos+$&.length,stack,termc,nontc,sent_obj)
|
254
|
+
|
255
|
+
when /^\s*(\S+)\) /
|
256
|
+
# match the end of a terminal constituent (something before a closing bracket + space)
|
257
|
+
word = $1
|
258
|
+
comb_cat = stack.pop
|
259
|
+
if comb_cat.to_s == ""
|
260
|
+
$stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
|
261
|
+
exit 1
|
262
|
+
end
|
263
|
+
cat,gf = split_cat(comb_cat)
|
264
|
+
node = sent_obj.add_syn("t",
|
265
|
+
nil, # cat (doesn't matter here)
|
266
|
+
SalsaTigerXMLHelper.escape(word), # word
|
267
|
+
cat, # pos
|
268
|
+
termc.next.to_s)
|
269
|
+
node.set_attribute("gf",gf)
|
270
|
+
# STDERR.puts "completed terminal #{cat}, #{word}"
|
271
|
+
stack.push node
|
272
|
+
return build_salsatiger(sentence,pos+$&.length,stack,termc,nontc,sent_obj)
|
273
|
+
|
274
|
+
when /^\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
|
275
|
+
# now collect children:
|
276
|
+
# pop items from the stack until you find the category
|
277
|
+
children = Array.new
|
278
|
+
while true
|
279
|
+
if stack.empty?
|
280
|
+
$stderr.puts "SleepyInterface Error: stack empty; cannot find more children"
|
281
|
+
exit 1
|
282
|
+
end
|
283
|
+
item = stack.pop
|
284
|
+
case item.class.to_s
|
285
|
+
when "SynNode" # this is a child
|
286
|
+
children.push item
|
287
|
+
when "String" # this is the category label
|
288
|
+
if item.to_s == ""
|
289
|
+
$stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
|
290
|
+
exit 1
|
291
|
+
end
|
292
|
+
cat,gf = split_cat(item)
|
293
|
+
break
|
294
|
+
else
|
295
|
+
$stderr.puts "SleepyInterface Error: unknown item class #{item.class.to_s}"
|
296
|
+
exit 1
|
297
|
+
end
|
298
|
+
end
|
299
|
+
# now add a nonterminal node to the sentence object and
|
300
|
+
# register the children nodes
|
301
|
+
node = sent_obj.add_syn("nt",
|
302
|
+
cat, # cat
|
303
|
+
nil, # word (doesn't matter)
|
304
|
+
nil, # pos (doesn't matter)
|
305
|
+
nontc.next.to_s)
|
306
|
+
children.each {|child|
|
307
|
+
child_gf = child.get_attribute("gf")
|
308
|
+
child.del_attribute("gf")
|
309
|
+
node.add_child(child,child_gf)
|
310
|
+
child.add_parent(node, child_gf)
|
311
|
+
}
|
312
|
+
node.set_attribute("gf",gf)
|
313
|
+
# STDERR.puts "Completed nonterm #{cat}, #{children.length} children."
|
314
|
+
stack.push node
|
315
|
+
return build_salsatiger(sentence,pos+$&.length, stack,termc,nontc,sent_obj)
|
316
|
+
else
|
317
|
+
|
318
|
+
if sentence =~ /Fatal error: exception Out_of_memory/
|
319
|
+
$stderr.puts "SleepyInterface error: Sleepy parser ran out of memory."
|
320
|
+
$stderr.puts "Try reducing the max. sentence length"
|
321
|
+
$stderr.puts "in the experiment file."
|
322
|
+
exit 1
|
323
|
+
end
|
324
|
+
|
325
|
+
|
326
|
+
$stderr.puts "SleepyInterface Error: cannot analyse sentence at pos #{pos}:\n #{sentence[pos..-1]}\n Complete sentence: \n#{sentence}"
|
327
|
+
exit 1
|
328
|
+
end
|
329
|
+
end
|
330
|
+
|
331
|
+
###
|
332
|
+
# Sleepy delivers node labels as "phrase type"-"grammatical function"
|
333
|
+
# but the GF may not be present.
|
334
|
+
|
335
|
+
def split_cat(cat)
|
336
|
+
|
337
|
+
cat =~ /^([^-]*)(-([^-]*))?$/
|
338
|
+
unless $1
|
339
|
+
$stderr.puts "SleepyInterface Error: could not identify category in #{cat}"
|
340
|
+
exit 1
|
341
|
+
end
|
342
|
+
|
343
|
+
proper_cat = $1
|
344
|
+
|
345
|
+
if $3
|
346
|
+
gf = $3
|
347
|
+
else
|
348
|
+
gf = ""
|
349
|
+
end
|
350
|
+
|
351
|
+
return [proper_cat,gf]
|
352
|
+
|
353
|
+
end
|
354
|
+
end
|
355
|
+
|
356
|
+
|
357
|
+
|
358
|
+
################################################
|
359
|
+
# Interpreter class
|
360
|
+
class SleepyInterpreter < Tiger
  SleepyInterpreter.announce_me()

  ###
  # Services interpreted by this class.
  # Returns a hash service (string) -> system name (string),
  # e.g. { "parser" => "collins", "lemmatizer" => "treetagger" }.
  def self.systems
    { "parser" => "sleepy" }
  end

  ###
  # Additional, optional services that may be interpreted by this class.
  # Same hash format as systems().
  def self.optional_systems
    { "lemmatizer" => "treetagger" }
  end
end
@@ -0,0 +1,44 @@
|
|
1
|
+
require "tempfile"
|
2
|
+
require "common/AbstractSynInterface"
|
3
|
+
|
4
|
+
################################################
|
5
|
+
# Interface class
|
6
|
+
# Interface to the TnT POS tagger: writes the corpus words to a temp
# file, runs the external tagger, and post-processes its output into
# a one-tag-per-line file.
class TntInterface < SynInterfaceTab
  TntInterface.announce_me()

  # name of the wrapped system
  def TntInterface.system()
    return "tnt"
  end

  # service provided by this interface
  def TntInterface.service()
    return "pos_tagger"
  end

  # Run TnT on infilename (fntab format) and write one POS tag per
  # line to outfilename. Raises if the line counts of input and
  # output differ.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # extract just the word column into a temp file for the tagger
    tempfile = Tempfile.new("Tnt")
    TntInterface.fntab_words_to_file(infilename, tempfile)
    tempfile.close

    # 1. use grep to remove commentaries from file
    # 2. use sed to extract tags tag list:
    # - match one or more non-spaces
    # - match one or more spaces
    # - match one or more non-spaces and write to outfilename

    # This assumes that the experiment file entry for pos_tagger_path
    # has the form
    # pos_tagger_path = <program_name> <model>

    # NOTE(review): @program_path is interpolated unquoted into a shell
    # pipeline; paths with spaces or metacharacters would break this.
    Kernel.system(@program_path + " " + tempfile.path +
                  ' | grep -v -E "^%%" | sed -e\'s/^[^ ]\{1,\}[[:space:]]\{1,\}\([^ ]\{1,\}\)/\1/\' > '+outfilename)

    tempfile.close(true) # delete tempfile
    # sanity check: tagger output must have exactly one line per input line
    unless `cat #{infilename} | wc -l`.strip ==
           `cat #{outfilename} | wc -l`.strip
      raise "Error: tagged file has different line number from corpus file!"
    end
  end
end
|
44
|
+
|
@@ -0,0 +1,327 @@
|
|
1
|
+
# sp 30 11 06
|
2
|
+
# extended by TreeTaggerPOSInterface
|
3
|
+
|
4
|
+
require "tempfile"
|
5
|
+
require 'pathname'
|
6
|
+
require "common/AbstractSynInterface"
|
7
|
+
|
8
|
+
###########
|
9
|
+
# KE dec 7, 06
|
10
|
+
# common mixin for both Treetagger modules, doing the actual processing
|
11
|
+
module TreetaggerModule
  ###
  # Treetagger does both lemmatization and POS-tagging.
  # However, the way the SynInterface system is set up in Shalmaneser,
  # each SynInterface can offer only _one_ service.
  # This means that we cannot do a SynInterface that writes
  # both a POS file and a lemma file.
  # Instead, both will include this module, which does the
  # actual TreeTagger call and then stores the result in a file
  # of its own, similar to the 'outfilename' given to TreetaggerInterface.process_file
  # but with a separate extension.
  # really_process_file checks for existence of this file because,
  # if the TreeTagger lemmatization and POS-tagging classes are called separately,
  # one of them will go first, and the 2nd one will not need to do the
  # TreeTagger call anymore
  #
  # really_process_file returns a filename, the name of the file containing
  # the TreeTagger output with both POS tags and lemma information
  #
  # WARNING: this method assumes that outfilename contains a suffix
  # that can be replaced by .TreeTagger
  def really_process_file(infilename, # string: name of input file
                          outfilename,# string: name of file that the caller is to produce
                          make_new_outfile_anyway = false) # Boolean: run TreeTagger in any case?

    # fabricate the filename in which the
    # actual TreeTagger output will be placed:
    # <directory> + <outfilename minus last suffix> + ".TreeTagger"
    current_suffix = outfilename[outfilename.rindex(".")..-1]
    my_outfilename = File.dirname(outfilename) + "/" +
                     File.basename(outfilename, current_suffix) +
                     ".TreeTagger"

    ##
    # does it exist? then just return it
    # BUGFIX: File.exists? is deprecated (removed in Ruby 3.2); use File.exist?
    if not(make_new_outfile_anyway) and File.exist?(my_outfilename)
      return my_outfilename
    end

    ##
    # else construct it, then return it
    tempfile = Tempfile.new("Treetagger")
    TreetaggerInterface.fntab_words_to_file(infilename, tempfile, "<EOS>", "iso")
    tempfile.close

    # @todo AB: Remove it by my shame :(
    # AB: A very dirty hack of mine:
    # We need the language attribute, but we don't have the FrPrepConfigData,
    # then we'll try to find it in the ObjectSpace since we should have only one.
    lang = ''
    ObjectSpace.each_object(FrPrepConfigData) do |o|
      lang = o.get('language')
    end

    # NOTE(review): for languages other than 'en'/'de', tt_model and
    # tt_filter stay nil and the invocation below interpolates them as
    # empty strings — confirm only these two languages are configured.
    case lang
    when 'en'
      tt_model = Pathname.new(@program_path).join('lib').join(ENV['SHALM_TREETAGGER_MODEL'] || 'english.par')
      tt_filter = ''
    when 'de'
      tt_model = Pathname.new(@program_path).join('lib').join(ENV['SHALM_TREETAGGER_MODEL'] || 'german.par')
      tt_filter = "| #{Pathname.new(@program_path).join('cmd').join('filter-german-tags')}"
    end

    # call TreeTagger
    tt_binary = Pathname.new(@program_path).join('bin').join(ENV['SHALM_TREETAGGER_BIN'] || 'tree-tagger')

    invocation_str = "#{tt_binary} -lemma -token -sgml #{tt_model} #{tempfile.path} #{tt_filter} > #{my_outfilename}"

    STDERR.puts "*** Tagging and lemmatizing #{tempfile.path} with TreeTagger."
    STDERR.puts invocation_str

    Kernel.system(invocation_str)
    tempfile.close(true) # delete first tempfile

    # external problem: sometimes, the treetagger keeps the last <EOS> for itself,
    # resulting on a .tagged file missing the last (blank) line

    original_length = `cat #{infilename} | wc -l`.strip.to_i
    # (removed stray debug `puts infilename` that leaked to stdout)
    lemmatised_length = `cat #{my_outfilename} | wc -l`.strip.to_i

    case original_length - lemmatised_length
    when 0
      # everything ok, don't do anything
    when 1
      # add one more newline to the .tagged file
      `echo "" >> #{my_outfilename}`
    else
      # this is "real" error
      STDERR.puts "Original length: #{original_length}\tLemmatised length: #{lemmatised_length}"
      STDERR.puts "Error: lemmatiser/tagger output for for #{File.basename(infilename)}"
      $stderr.puts "has different line number from corpus file!"
      # BUGFIX: bare `raise` outside a rescue raises an uninformative
      # RuntimeError; give it a message
      raise "Error: lemmatiser/tagger output has different line number from corpus file!"
    end


    return my_outfilename
  end
end
|
112
|
+
|
113
|
+
#######################################
|
114
|
+
# Interface class: lemmatization service backed by TreeTagger.
# The shared TreeTagger invocation lives in TreetaggerModule; this class
# extracts the lemma column (field 3) from the tagger output.
class TreetaggerInterface < SynInterfaceTab
  TreetaggerInterface.announce_me()

  include TreetaggerModule

  ###
  # name of the wrapped system
  def self.system
    'treetagger'
  end

  ###
  # service provided by this interface
  def self.service
    'lemmatizer'
  end

  ###
  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
  # @todo AB: Generalize this method to work with different parsers.
  # NOTE: chomp! mutates the argument in place before building the
  # escaped copy that is returned.
  def convert_to_berkeley(line)
    line.chomp!
    return line.gsub(/\(/,"-LRB-").gsub(/\)/,"-RRB-").gsub(/''/,"\"").gsub(/\`\`/,"\"")
  end


  ###
  # Run (or reuse) the TreeTagger output for infilename and write the
  # lemma column, converted back to UTF-8, to outfilename.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    ttfilename = really_process_file(infilename, outfilename)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    tempfile2.close()

    # 2. use cut to get the actual lemmatisation (field 3),
    #    stripping the <EOS> sentence markers first

    Kernel.system("cat " + ttfilename +
                  ' | sed -e\'s/<EOS>//\' | cut -f3 > '+tempfile2.path())

    # transform ISO-8859-1 back to UTF-8,
    # write to 'outfilename'
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to #{outfilename}"
    end
    tempfile2.open
    # AB: Internally all the flow is an utf-8 encoded stream.
    # TreeTagger consumes one byte encodings (but we should provide a
    # utf-8 model for German). So we convert utf-8 to latin1, then
    # process the text and convert it back to utf-8.
    #
    while line = tempfile2.gets
      #outfile.puts UtfIso.from_iso_8859_1(line)
      utf8line = UtfIso.from_iso_8859_1(line)
      outfile.puts convert_to_berkeley(utf8line)
    end

    # remove second tempfile, finalize output file
    tempfile2.close(true)
    outfile.close()

  end
end
|
180
|
+
|
181
|
+
|
182
|
+
# sp 30 11 06
|
183
|
+
#
|
184
|
+
# using TreeTagger for POS tagging of English text
|
185
|
+
#
|
186
|
+
# copy-and-paste from lemmatisation
|
187
|
+
#
|
188
|
+
# differences:
|
189
|
+
# 1. use field 2 and not 3 from the output
|
190
|
+
# 2. convert tags from what Treetagger thinks is the Penn Tagset to what TnT and Collins think is the Penn Tagset
|
191
|
+
#
|
192
|
+
# KE 7 12 06
|
193
|
+
# change interface such that TreeTagger is called only once
|
194
|
+
# and both POS tags and lemma are read from the same files,
|
195
|
+
# rather than calling the tagger twice
|
196
|
+
# Interface class: POS-tagging service backed by TreeTagger.
# Copy-and-paste sibling of TreetaggerInterface; differences:
# 1. uses field 2 (POS tag) instead of field 3 (lemma) of the output
# 2. converts tags from TreeTagger's idea of the Penn Tagset to the
#    one expected by TnT/Collins
class TreetaggerPOSInterface < SynInterfaceTab
  TreetaggerPOSInterface.announce_me()
  include TreetaggerModule

  ###
  # name of the wrapped system
  def TreetaggerPOSInterface.system()
    return "treetagger"
  end

  ###
  # service provided by this interface
  def TreetaggerPOSInterface.service()
    return "pos_tagger"
  end

  ###
  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
  # NOTE: chomp! mutates the argument in place.
  def convert_to_collins(line)
    line.chomp!
    return line.gsub(/^PP/,"PRP").gsub(/^NP/,"NNP").gsub(/^VV/,"VB").gsub(/^VH/,"VB").gsub(/^SENT/,".")
  end

  ###
  # Re-run TreeTagger (make_new_outfile_anyway = true) and write the
  # converted POS-tag column, back in UTF-8, to outfilename.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # KE change here
    tt_filename = really_process_file(infilename, outfilename, true)

    # write all output to tempfile2 first, then
    # change ISO to UTF-8 into outputfile
    tempfile2 = Tempfile.new("treetagger")
    tempfile2.close()

    # 2. use cut to get the POS tags (field 2),
    #    stripping the <EOS> sentence markers first

    Kernel.system("cat " + tt_filename +
                  ' | sed -e\'s/<EOS>//\' | cut -f2 > '+tempfile2.path())

    # transform ISO-8859-1 back to UTF-8,
    # write to 'outfilename'
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to #{outfilename}"
    end
    tempfile2.open()
    while (line = tempfile2.gets())
      outfile.puts UtfIso.from_iso_8859_1(convert_to_collins(line))
    end

    # remove second tempfile, finalize output file
    tempfile2.close(true)
    outfile.close()
  end
end
|
252
|
+
|
253
|
+
###############
|
254
|
+
# an interpreter that only has Treetagger, no parser
|
255
|
+
# An interpreter that only has Treetagger, no parser.
class TreetaggerInterpreter < SynInterpreter
  TreetaggerInterpreter.announce_me()

  ###
  # names of the systems interpreted by this class:
  # returns a hash service(string) -> system name (string),
  # e.g.
  # { "parser" => "collins", "lemmatizer" => "treetagger" }
  def TreetaggerInterpreter.systems()
    return {
      "pos_tagger" => "treetagger",
    }
  end

  ###
  # names of additional systems that may be interpreted by this class
  # returns a hash service(string) -> system name(string)
  # same as names()
  def TreetaggerInterpreter.optional_systems()
    return {
      "lemmatizer" => "treetagger"
    }
  end

  ###
  # generalize over POS tags.
  #
  # returns one of:
  #
  # adj:  adjective (phrase)
  # adv:  adverb (phrase)
  # card: numbers, quantity phrases
  # con:  conjunction
  # det:  determiner, including possessive/demonstrative pronouns etc.
  # for:  foreign material
  # noun: noun (phrase), including personal pronouns, proper names, expletives
  # part: particles, truncated words (German compound parts)
  # prep: preposition (phrase)
  # pun:  punctuation, brackets, etc.
  # sent: sentence
  # top:  top node of a sentence
  # verb: verb (phrase)
  # nil:  something went wrong
  #
  # returns: string, or nil
  def TreetaggerInterpreter.category(node) # SynNode
    pt = TreetaggerInterpreter.pt(node)
    if pt.nil?
      # phrase type could not be determined
      return nil
    end

    # keep only the part of the label before the first "-"
    # (strips grammatical-function suffixes); the result is in $1
    pt.to_s.strip() =~ /^([^-]*)/
    # case order matters: the first matching branch wins
    case $1
    when /^JJ/ ,/(WH)?ADJP/, /^PDT/ then return "adj"
    when /^RB/, /(WH)?ADVP/, /^UH/ then return "adv"
    when /^CD/, /^QP/ then return "card"
    when /^CC/, /^WRB/, /^CONJP/ then return "con"
    when /^DT/, /^POS/ then return "det"
    when /^FW/, /^SYM/ then return "for"
    when /^N/, "WHAD", "WDT", /^PRP/ , /^WHNP/, /^EX/, /^WP/ then return "noun"
    when /^IN/ , /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/ then return "prep"
    when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/ then return "pun"
    when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/ then return "sent"
    when /^TOP/ then return "top"
    when /^TRACE/ then return "trace"
    when /^V/ , /^MD/ then return "verb"
    else
      # $stderr.puts "WARNING: Unknown category/POS "+c.to_s + " (English data)"
      return nil
    end
  end
end