shalmaneser-prep 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,384 @@
1
+ ####
2
+ # sp 21 07 05
3
+ #
4
+ # modified ke 30 10 05: adapted to fit into SynInterface
5
+ #
6
+ # represents a file containing Sleepy parses
7
+ #
8
+ # underlying data structure for individual sentences: SalsaTigerSentence
9
+ require 'tempfile'
10
+
11
+ require 'common/SalsaTigerRegXML'
12
+ require 'common/SalsaTigerXMLHelper'
13
+ require 'common/TabFormat'
14
+ require 'common/Counter'
15
+
16
+ require 'common/AbstractSynInterface'
17
+ require 'common/Tiger.rb'
18
+
19
+ ################################################
20
+ # Interface class
21
+ class SleepyInterface < SynInterfaceSTXML
22
+ SleepyInterface.announce_me()
23
+
24
+ ###
25
+ def SleepyInterface.system()
26
+ return "sleepy"
27
+ end
28
+
29
+ ###
30
+ def SleepyInterface.service()
31
+ return "parser"
32
+ end
33
+
34
+ ###
35
+ # initialize to set values for all subsequent processing
36
+ def initialize(program_path, # string: path to system
37
+ insuffix, # string: suffix of tab files
38
+ outsuffix, # string: suffix for parsed files
39
+ stsuffix, # string: suffix for Salsa/TIGER XML files
40
+ var_hash = {}) # optional arguments in a hash
41
+
42
+ super(program_path, insuffix, outsuffix, stsuffix, var_hash)
43
+ unless @program_path =~ /\/$/
44
+ @program_path = @program_path + "/"
45
+ end
46
+
47
+ # new: evaluate var hash
48
+ @pos_suffix = var_hash["pos_suffix"]
49
+ @lemma_suffix = var_hash["lemma_suffix"]
50
+ @tab_dir = var_hash["tab_dir"]
51
+ end
52
+
53
+ ####
54
+ # parse a directory with TabFormat files and write the parse trees to outputdir
55
+ # I assume that the files in inputdir are smaller than
56
+ # the maximum number of sentences that
57
+ # Sleepy can parse in one go (i.e. that they are split)
58
+ def process_dir(in_dir, # string: input directory name
59
+ out_dir) # string: output directory name
60
+
61
+ sleepy_prog = "#{@program_path}sleepy --beam 1000 --model-file #{@program_path}negra.model --parse "
62
+
63
+ Dir[in_dir + "*" + @insuffix].each {|inputfilename|
64
+ STDERR.puts "*** Parsing #{inputfilename} with Sleepy"
65
+ corpusfilename = File.basename(inputfilename, @insuffix)
66
+ parsefilename = out_dir + corpusfilename + @outsuffix
67
+ tempfile = Tempfile.new(corpusfilename)
68
+
69
+ # we need neither lemmata nor POS tags; sleepy can do with the words
70
+ corpusfile = FNTabFormatFile.new(inputfilename,nil, nil)
71
+ corpusfile.each_sentence {|sentence|
72
+ tempfile.puts sentence.to_s
73
+ }
74
+ tempfile.close
75
+ # parse and remove comments in the parser output
76
+ Kernel.system(sleepy_prog+" "+tempfile.path+" 2>&1 | grep -v \"Span:\" > "+parsefilename)
77
+ }
78
+ end
79
+
80
+ ###
81
+ # for a given parsed file:
82
+ # yield each sentence as a pair
83
+ # [SalsaTigerSentence object, FNTabFormatSentence object]
84
+ # of the sentence in SalsaTigerXML and the matching tab format sentence
85
+ #
86
+ # If a parse has failed, returns
87
+ # [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
88
+ # to allow more detailed accounting for failed parses
89
+ # (basically just a flat structure with a failed=true attribute
90
+ # at the sentence node)
91
+ def each_sentence(parsefilename)
92
+ # sanity checks
93
+ unless @tab_dir
94
+ $stderr.puts "SleepyInterface error: Need to set tab directory on initialization"
95
+ exit 1
96
+ end
97
+
98
+ # get matching tab file for this parser output file
99
+ parsefile = File.new(parsefilename)
100
+ tabfilename = @tab_dir+File.basename(parsefilename, @outsuffix)+ @insuffix
101
+ tabfile = FNTabFormatFile.new(tabfilename, @postag_suffix, @lemma_suffix)
102
+
103
+ sentid = 0
104
+
105
+ tabfile.each_sentence {|tab_sent| # iterate over corpus sentences
106
+
107
+ sentence_str = ""
108
+ status = true # error encountered?
109
+
110
+ # assemble next sentence in Sleepy file by reading lines from parsefile
111
+ while true
112
+ line = parsefile.gets
113
+ case line
114
+ when /% Parse failed/
115
+ status = false
116
+ break
117
+ when nil # end of file: nothing more to break
118
+ break
119
+ when /^%/, /^\s*$/ # empty lines, other comments: end of current sentence
120
+ unless sentence_str == "" # only break if you have read something
121
+ break
122
+ end
123
+ else
124
+ sentence_str += line.chomp # collect line of current parse and continue reading
125
+ end
126
+ end
127
+
128
+ # we have reached some kind of end
129
+ sentid +=1
130
+
131
+ # we don't have a sentence: hopefully, this is becase parsing has failed
132
+ # if this is not the case, we are in trouble
133
+ if sentence_str == ""
134
+ case status
135
+
136
+ when false
137
+ # return a SalsaTigerSentence object for the failed sentence
138
+ # with a virtual top node and one terminal per word.
139
+ if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
140
+ my_sent_id = tab_sent.get_sent_id()
141
+ else
142
+ my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
143
+ end
144
+ sent = SleepyInterface.failed_sentence(tab_sent, my_sent_id)
145
+ yield [sent, tab_sent, SleepyInterface.standard_mapping(sent, tab_sent)]
146
+
147
+ else
148
+ # this may not happen: we need some sentence for the current
149
+ # TabFile sentence
150
+ $stderr.puts "SleepyInterface error: premature end of parser file!"
151
+ exit 1
152
+ end
153
+ else
154
+ # if we are here, we have a sentence_str to work on
155
+ # hopefully, our status is OK
156
+ case status
157
+ when true
158
+ if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
159
+ my_sent_id = tab_sent.get_sent_id()
160
+ else
161
+ my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
162
+ end
163
+ st_sent = build_salsatiger(" " + sentence_str + " ", 0,
164
+ Array.new, Counter.new(0),
165
+ Counter.new(500),
166
+ SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
167
+ yield [st_sent, tab_sent, SleepyInterface.standard_mapping(st_sent, tab_sent)]
168
+
169
+ else # i.e. when "failed"
170
+ $stderr.puts "SleepyInterface error: failed parse, but parse tree exists??"
171
+ exit 1
172
+ end
173
+ end
174
+ }
175
+
176
+ # all TabFile sentences are consumed:
177
+ # now we may just encounter comments, garbage, empty lines etc.
178
+
179
+ while not parsefile.eof?
180
+ case parsefile.gets
181
+ when nil, /^%/, /^\s*$/ # empty lines, comments, end of input indicate end of current parse
182
+ else
183
+ $stderr.puts "SleepyInterface error: premature end of tab file"
184
+ exit 1
185
+ end
186
+ end
187
+ end
188
+
189
+
190
+ ###
191
+ # write Salsa/TIGER XML output to file
192
+ def to_stxml_file(infilename, # string: name of parse file
193
+ outfilename) # string: name of output stxml file
194
+
195
+ outfile = File.new(outfilename, "w")
196
+ outfile.puts SalsaTigerXMLHelper.get_header()
197
+ each_sentence(infilename) { |st_sent, tabsent|
198
+ outfile.puts st_sent.get()
199
+ }
200
+ outfile.puts SalsaTigerXMLHelper.get_footer()
201
+ outfile.close()
202
+ end
203
+
204
+
205
+
206
+ ########################
207
+ private
208
+
209
+ ###
210
+ # Recursive function for parsing a Sleepy parse tree and
211
+ # building a SalsaTigerSentence recursively
212
+ #
213
+ # Algorithm: manage stack which contains, for the current constituent,
214
+ # child constituents (if a nonterminal), and the category label.
215
+ # When the end of a constituent is reached, a new SynNode (TigerSalsa node) ist created.
216
+ # All children and the category label are popped from the stack and integrated into the
217
+ # TigerSalsa data structure. The new node is re-pushed onto the stack.
218
+ def build_salsatiger(sentence, # string
219
+ pos, # position in string (index): integer
220
+ stack, # stack with incomplete nodes: Array
221
+ termc, # terminal counter
222
+ nontc, # nonterminal counter
223
+ sent_obj) # SalsaTigerSentence
224
+
225
+
226
+ # main case distinction: match the beginning of our string
227
+ # (i.e. what follows our current position in the string)
228
+
229
+ case sentence[pos..-1]
230
+
231
+ when /^ *$/ # nothing -> whole sentence parsed
232
+ if stack.length == 1
233
+ # sleepy always delivers one "top" node; if we don't get just one
234
+ # node, something has gone wrong
235
+ node = stack.pop
236
+ node.del_attribute("gf")
237
+ return sent_obj
238
+ else
239
+ $stderr.puts "SleepyINterface Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
240
+ exit 1
241
+ end
242
+
243
+ when /^\s*\(([^ )]+) /
244
+ # match the beginning of a new constituent
245
+ # (opening bracket + category + space, may not contain closing bracket)
246
+ cat = $1
247
+ if cat.nil? or cat == ""
248
+ $stderr.puts "SleepyInterface Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
249
+ exit 1
250
+ end
251
+ # STDERR.puts "new const #{cat}"
252
+ stack.push cat # throw the category label on the stack
253
+ return build_salsatiger(sentence,pos+$&.length,stack,termc,nontc,sent_obj)
254
+
255
+ when /^\s*(\S+)\) /
256
+ # match the end of a terminal constituent (something before a closing bracket + space)
257
+ word = $1
258
+ comb_cat = stack.pop
259
+ if comb_cat.to_s == ""
260
+ $stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
261
+ exit 1
262
+ end
263
+ cat,gf = split_cat(comb_cat)
264
+ node = sent_obj.add_syn("t",
265
+ nil, # cat (doesn't matter here)
266
+ SalsaTigerXMLHelper.escape(word), # word
267
+ cat, # pos
268
+ termc.next.to_s)
269
+ node.set_attribute("gf",gf)
270
+ # STDERR.puts "completed terminal #{cat}, #{word}"
271
+ stack.push node
272
+ return build_salsatiger(sentence,pos+$&.length,stack,termc,nontc,sent_obj)
273
+
274
+ when /^\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
275
+ # now collect children:
276
+ # pop items from the stack until you find the category
277
+ children = Array.new
278
+ while true
279
+ if stack.empty?
280
+ $stderr.puts "SleepyInterface Error: stack empty; cannot find more children"
281
+ exit 1
282
+ end
283
+ item = stack.pop
284
+ case item.class.to_s
285
+ when "SynNode" # this is a child
286
+ children.push item
287
+ when "String" # this is the category label
288
+ if item.to_s == ""
289
+ $stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
290
+ exit 1
291
+ end
292
+ cat,gf = split_cat(item)
293
+ break
294
+ else
295
+ $stderr.puts "SleepyInterface Error: unknown item class #{item.class.to_s}"
296
+ exit 1
297
+ end
298
+ end
299
+ # now add a nonterminal node to the sentence object and
300
+ # register the children nodes
301
+ node = sent_obj.add_syn("nt",
302
+ cat, # cat
303
+ nil, # word (doesn't matter)
304
+ nil, # pos (doesn't matter)
305
+ nontc.next.to_s)
306
+ children.each {|child|
307
+ child_gf = child.get_attribute("gf")
308
+ child.del_attribute("gf")
309
+ node.add_child(child,child_gf)
310
+ child.add_parent(node, child_gf)
311
+ }
312
+ node.set_attribute("gf",gf)
313
+ # STDERR.puts "Completed nonterm #{cat}, #{children.length} children."
314
+ stack.push node
315
+ return build_salsatiger(sentence,pos+$&.length, stack,termc,nontc,sent_obj)
316
+ else
317
+
318
+ if sentence =~ /Fatal error: exception Out_of_memory/
319
+ $stderr.puts "SleepyInterface error: Sleepy parser ran out of memory."
320
+ $stderr.puts "Try reducing the max. sentence length"
321
+ $stderr.puts "in the experiment file."
322
+ exit 1
323
+ end
324
+
325
+
326
+ $stderr.puts "SleepyInterface Error: cannot analyse sentence at pos #{pos}:\n #{sentence[pos..-1]}\n Complete sentence: \n#{sentence}"
327
+ exit 1
328
+ end
329
+ end
330
+
331
+ ###
332
+ # Sleepy delivers node labels as "phrase type"-"grammatical function"
333
+ # but the GF may not be present.
334
+
335
+ def split_cat(cat)
336
+
337
+ cat =~ /^([^-]*)(-([^-]*))?$/
338
+ unless $1
339
+ $stderr.puts "SleepyInterface Error: could not identify category in #{cat}"
340
+ exit 1
341
+ end
342
+
343
+ proper_cat = $1
344
+
345
+ if $3
346
+ gf = $3
347
+ else
348
+ gf = ""
349
+ end
350
+
351
+ return [proper_cat,gf]
352
+
353
+ end
354
+ end
355
+
356
+
357
+
358
+ ################################################
359
+ # Interpreter class
360
+ class SleepyInterpreter < Tiger
361
+ SleepyInterpreter.announce_me()
362
+
363
+ ###
364
+ # names of the systems interpreted by this class:
365
+ # returns a hash service(string) -> system name (string),
366
+ # e.g.
367
+ # { "parser" => "collins", "lemmatizer" => "treetagger" }
368
+ def SleepyInterpreter.systems()
369
+ return {
370
+ "parser" => "sleepy"
371
+ }
372
+ end
373
+
374
+ ###
375
+ # names of additional systems that may be interpreted by this class
376
+ # returns a hash service(string) -> system name(string)
377
+ # same as names()
378
+ def SleepyInterpreter.optional_systems()
379
+ return {
380
+ "lemmatizer" => "treetagger"
381
+ }
382
+ end
383
+
384
+ end
@@ -0,0 +1,44 @@
1
+ require "tempfile"
2
+ require "common/AbstractSynInterface"
3
+
4
+ ################################################
5
+ # Interface class
6
+ class TntInterface < SynInterfaceTab
7
+ TntInterface.announce_me()
8
+
9
+ def TntInterface.system()
10
+ return "tnt"
11
+ end
12
+
13
+ def TntInterface.service()
14
+ return "pos_tagger"
15
+ end
16
+
17
+ def process_file(infilename, # string: name of input file
18
+ outfilename) # string: name of output file
19
+
20
+ tempfile = Tempfile.new("Tnt")
21
+ TntInterface.fntab_words_to_file(infilename, tempfile)
22
+ tempfile.close
23
+
24
+ # 1. use grep to remove commentaries from file
25
+ # 2. use sed to extract tags tag list:
26
+ # - match one or more non-spaces
27
+ # - match one or more spaces
28
+ # - match one or more non-spaces and write to outfilename
29
+
30
+ # This assumes that the experiment file entry for pos_tagger_path
31
+ # has the form
32
+ # pos_tagger_path = <program_name> <model>
33
+
34
+ Kernel.system(@program_path + " " + tempfile.path +
35
+ ' | grep -v -E "^%%" | sed -e\'s/^[^ ]\{1,\}[[:space:]]\{1,\}\([^ ]\{1,\}\)/\1/\' > '+outfilename)
36
+
37
+ tempfile.close(true) # delete tempfile
38
+ unless `cat #{infilename} | wc -l`.strip ==
39
+ `cat #{outfilename} | wc -l`.strip
40
+ raise "Error: tagged file has different line number from corpus file!"
41
+ end
42
+ end
43
+ end
44
+
@@ -0,0 +1,327 @@
1
+ # sp 30 11 06
2
+ # extended by TreeTaggerPOSInterface
3
+
4
+ require "tempfile"
5
+ require 'pathname'
6
+ require "common/AbstractSynInterface"
7
+
8
+ ###########
9
+ # KE dec 7, 06
10
+ # common mixin for both Treetagger modules, doing the actual processing
11
+ module TreetaggerModule
12
+ ###
13
+ # Treetagger does both lemmatization and POS-tagging.
14
+ # However, the way the SynInterface system is set up in Shalmaneser,
15
+ # each SynInterface can offer only _one_ service.
16
+ # This means that we cannot do a SynInterface that writes
17
+ # both a POS file and a lemma file.
18
+ # Instead, both will include this module, which does the
19
+ # actual TreeTagger call and then stores the result in a file
20
+ # of its own, similar to the 'outfilename' given to TreetaggerInterface.process_file
21
+ # but with a separate extension.
22
+ # really_process_file checks for existence of this file because,
23
+ # if the TreeTagger lemmatization and POS-tagging classes are called separately,
24
+ # one of them will go first, and the 2nd one will not need to do the
25
+ # TreeTagger call anymore
26
+ #
27
+ # really_process_file returns a filename, the name of the file containing
28
+ # the TreeTagger output with both POS tags and lemma information
29
+ #
30
+ # WARNING: this method assumes that outfilename contains a suffix
31
+ # that can be replaced by .TreeTagger
32
+ def really_process_file(infilename, # string: name of input file
33
+ outfilename,# string: name of file that the caller is to produce
34
+ make_new_outfile_anyway = false) # Boolean: run TreeTagger in any case?
35
+
36
+ # fabricate the filename in which the
37
+ # actual TreeTagger output will be placed:
38
+ # <directory> + <outfilename minus last suffix> + ".TreeTagger"
39
+ current_suffix = outfilename[outfilename.rindex(".")..-1]
40
+ my_outfilename = File.dirname(outfilename) + "/" +
41
+ File.basename(outfilename, current_suffix) +
42
+ ".TreeTagger"
43
+
44
+ ##
45
+ # does it exist? then just return it
46
+ if not(make_new_outfile_anyway) and File.exists?(my_outfilename)
47
+ return my_outfilename
48
+ end
49
+
50
+ ##
51
+ # else construct it, then return it
52
+ tempfile = Tempfile.new("Treetagger")
53
+ TreetaggerInterface.fntab_words_to_file(infilename, tempfile, "<EOS>", "iso")
54
+ tempfile.close
55
+
56
+ # @todo AB: Remove it by my shame :(
57
+ # AB: A very dirty hack of mine:
58
+ # We need the language attribute, but we don't have the FrPrepConfigData,
59
+ # then we'll try to find it in the ObjectSpace since we should have only one.
60
+ lang = ''
61
+ ObjectSpace.each_object(FrPrepConfigData) do |o|
62
+ lang = o.get('language')
63
+ end
64
+
65
+ case lang
66
+ when 'en'
67
+ tt_model = Pathname.new(@program_path).join('lib').join(ENV['SHALM_TREETAGGER_MODEL'] || 'english.par')
68
+ tt_filter = ''
69
+ when 'de'
70
+ tt_model = Pathname.new(@program_path).join('lib').join(ENV['SHALM_TREETAGGER_MODEL'] || 'german.par')
71
+ tt_filter = "| #{Pathname.new(@program_path).join('cmd').join('filter-german-tags')}"
72
+ end
73
+
74
+ # call TreeTagger
75
+ tt_binary = Pathname.new(@program_path).join('bin').join(ENV['SHALM_TREETAGGER_BIN'] || 'tree-tagger')
76
+
77
+ invocation_str = "#{tt_binary} -lemma -token -sgml #{tt_model} #{tempfile.path} #{tt_filter} > #{my_outfilename}"
78
+
79
+ STDERR.puts "*** Tagging and lemmatizing #{tempfile.path} with TreeTagger."
80
+ STDERR.puts invocation_str
81
+
82
+ Kernel.system(invocation_str)
83
+ tempfile.close(true) # delete first tempfile
84
+
85
+ # external problem: sometimes, the treetagger keeps the last <EOS> for itself,
86
+ # resulting on a .tagged file missing the last (blank) line
87
+
88
+ original_length = `cat #{infilename} | wc -l`.strip.to_i
89
+ puts infilename
90
+ lemmatised_length = `cat #{my_outfilename} | wc -l`.strip.to_i
91
+
92
+ # `cp #{tempfile2.path()} /tmp/lout`
93
+
94
+ case original_length - lemmatised_length
95
+ when 0
96
+ # everything ok, don't do anything
97
+ when 1
98
+ # add one more newline to the .tagged file
99
+ `echo "" >> #{my_outfilename}`
100
+ else
101
+ # this is "real" error
102
+ STDERR.puts "Original length: #{original_length}\tLemmatised length: #{lemmatised_length}"
103
+ STDERR.puts "Error: lemmatiser/tagger output for for #{File.basename(infilename)}"
104
+ $stderr.puts "has different line number from corpus file!"
105
+ raise
106
+ end
107
+
108
+
109
+ return my_outfilename
110
+ end
111
+ end
112
+
113
+ #######################################
114
+ class TreetaggerInterface < SynInterfaceTab
115
+ TreetaggerInterface.announce_me()
116
+
117
+ include TreetaggerModule
118
+
119
+ ###
120
+ def self.system
121
+ 'treetagger'
122
+ end
123
+
124
+ ###
125
+ def self.service
126
+ 'lemmatizer'
127
+ end
128
+
129
+ ###
130
+ # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
131
+ # @todo AB: Generalize this method to work with different parsers.
132
+ def convert_to_berkeley(line)
133
+ line.chomp!
134
+ return line.gsub(/\(/,"-LRB-").gsub(/\)/,"-RRB-").gsub(/''/,"\"").gsub(/\`\`/,"\"")
135
+ end
136
+
137
+
138
+ ###
139
+ def process_file(infilename, # string: name of input file
140
+ outfilename) # string: name of output file
141
+
142
+ # KE change here
143
+ ttfilename = really_process_file(infilename, outfilename)
144
+
145
+ # write all output to tempfile2 first, then
146
+ # change ISO to UTF-8 into outputfile
147
+ tempfile2 = Tempfile.new("treetagger")
148
+ tempfile2.close()
149
+
150
+ # 2. use cut to get the actual lemmtisation
151
+
152
+ Kernel.system("cat " + ttfilename +
153
+ ' | sed -e\'s/<EOS>//\' | cut -f3 > '+tempfile2.path())
154
+
155
+ # transform ISO-8859-1 back to UTF-8,
156
+ # write to 'outfilename'
157
+ begin
158
+ outfile = File.new(outfilename, "w")
159
+ rescue
160
+ raise "Could not write to #{outfilename}"
161
+ end
162
+ tempfile2.open
163
+ # AB: Internally all the flow is an utf-8 encoded stream.
164
+ # TreeTagger consumes one byte encodings (but we should provide a
165
+ # utf-8 model for German). So we convert utf-8 to latin1, then
166
+ # process the text and convert it back to utf-8.
167
+ #
168
+ while line = tempfile2.gets
169
+ #outfile.puts UtfIso.from_iso_8859_1(line)
170
+ utf8line = UtfIso.from_iso_8859_1(line)
171
+ outfile.puts convert_to_berkeley(utf8line)
172
+ end
173
+
174
+ # remove second tempfile, finalize output file
175
+ tempfile2.close(true)
176
+ outfile.close()
177
+
178
+ end
179
+ end
180
+
181
+
182
# sp 30 11 06
#
# using TreeTagger for POS tagging of English text
#
# copy-and-paste from lemmatisation
#
# differences:
# 1. use field 2 and not 3 from the output
# 2. convert tags from what Treetagger thinks is the Penn Tagset to what TnT and Collins think is the Penn Tagset
#
# KE 7 12 06
# change interface such that TreeTagger is called only once
# and both POS tags and lemma are read from the same files,
# rather than calling the tagger twice
class TreetaggerPOSInterface < SynInterfaceTab
  TreetaggerPOSInterface.announce_me()
  include TreetaggerModule

  ###
  # system wrapped by this interface
  def self.system
    "treetagger"
  end

  ###
  # service offered by this interface
  def self.service
    "pos_tagger"
  end

  ###
  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
  def convert_to_collins(line)
    line.chomp!
    # apply the tag rewrites one after the other, in order
    [[/^PP/, "PRP"], [/^NP/, "NNP"], [/^VV/, "VB"], [/^VH/, "VB"], [/^SENT/, "."]].inject(line) do |text, (pattern, replacement)|
      text.gsub(pattern, replacement)
    end
  end

  ###
  # Produce a POS-tag-per-line file for infilename in outfilename.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # run TreeTagger unconditionally (make_new_outfile_anyway = true)
    tagger_output = really_process_file(infilename, outfilename, true)

    # stage intermediate output in a scratch file first, then
    # convert ISO to UTF-8 into the output file
    scratch = Tempfile.new("treetagger")
    scratch.close()

    # drop <EOS> markers and keep only column 2 (the POS tag)
    Kernel.system("cat " + tagger_output +
                  ' | sed -e\'s/<EOS>//\' | cut -f2 > ' + scratch.path())

    # transform ISO-8859-1 back to UTF-8, write to 'outfilename'
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to #{outfilename}"
    end

    scratch.open()
    scratch.each_line do |raw_line|
      outfile.puts UtfIso.from_iso_8859_1(convert_to_collins(raw_line))
    end

    # remove the scratch file, finalize the output file
    scratch.close(true)
    outfile.close()
  end
end
252
+
253
+ ###############
254
+ # an interpreter that only has Treetagger, no parser
255
+ class TreetaggerInterpreter < SynInterpreter
256
+ TreetaggerInterpreter.announce_me()
257
+
258
+ ###
259
+ # names of the systems interpreted by this class:
260
+ # returns a hash service(string) -> system name (string),
261
+ # e.g.
262
+ # { "parser" => "collins", "lemmatizer" => "treetagger" }
263
+ def TreetaggerInterpreter.systems()
264
+ return {
265
+ "pos_tagger" => "treetagger",
266
+ }
267
+ end
268
+
269
+ ###
270
+ # names of additional systems that may be interpreted by this class
271
+ # returns a hash service(string) -> system name(string)
272
+ # same as names()
273
+ def TreetaggerInterpreter.optional_systems()
274
+ return {
275
+ "lemmatizer" => "treetagger"
276
+ }
277
+ end
278
+
279
+ ###
280
+ # generalize over POS tags.
281
+ #
282
+ # returns one of:
283
+ #
284
+ # adj: adjective (phrase)
285
+ # adv: adverb (phrase)
286
+ # card: numbers, quantity phrases
287
+ # con: conjunction
288
+ # det: determiner, including possessive/demonstrative pronouns etc.
289
+ # for: foreign material
290
+ # noun: noun (phrase), including personal pronouns, proper names, expletives
291
+ # part: particles, truncated words (German compound parts)
292
+ # prep: preposition (phrase)
293
+ # pun: punctuation, brackets, etc.
294
+ # sent: sentence
295
+ # top: top node of a sentence
296
+ # verb: verb (phrase)
297
+ # nil: something went wrong
298
+ #
299
+ # returns: string, or nil
300
+ def TreetaggerInterpreter.category(node) # SynNode
301
+ pt = TreetaggerInterpreter.pt(node)
302
+ if pt.nil?
303
+ # phrase type could not be determined
304
+ return nil
305
+ end
306
+
307
+ pt.to_s.strip() =~ /^([^-]*)/
308
+ case $1
309
+ when /^JJ/ ,/(WH)?ADJP/, /^PDT/ then return "adj"
310
+ when /^RB/, /(WH)?ADVP/, /^UH/ then return "adv"
311
+ when /^CD/, /^QP/ then return "card"
312
+ when /^CC/, /^WRB/, /^CONJP/ then return "con"
313
+ when /^DT/, /^POS/ then return "det"
314
+ when /^FW/, /^SYM/ then return "for"
315
+ when /^N/, "WHAD", "WDT", /^PRP/ , /^WHNP/, /^EX/, /^WP/ then return "noun"
316
+ when /^IN/ , /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/ then return "prep"
317
+ when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/ then return "pun"
318
+ when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/ then return "sent"
319
+ when /^TOP/ then return "top"
320
+ when /^TRACE/ then return "trace"
321
+ when /^V/ , /^MD/ then return "verb"
322
+ else
323
+ # $stderr.puts "WARNING: Unknown category/POS "+c.to_s + " (English data)"
324
+ return nil
325
+ end
326
+ end
327
+ end