shalmaneser-prep 1.2.0.rc4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,384 @@
1
+ ####
2
+ # sp 21 07 05
3
+ #
4
+ # modified ke 30 10 05: adapted to fit into SynInterface
5
+ #
6
+ # represents a file containing Sleepy parses
7
+ #
8
+ # underlying data structure for individual sentences: SalsaTigerSentence
9
+ require 'tempfile'
10
+
11
+ require 'common/SalsaTigerRegXML'
12
+ require 'common/SalsaTigerXMLHelper'
13
+ require 'common/TabFormat'
14
+ require 'common/Counter'
15
+
16
+ require 'common/AbstractSynInterface'
17
+ require 'common/Tiger.rb'
18
+
19
################################################
# Interface class
#
# Wraps the Sleepy parser: runs it over tab-format corpus files
# (process_dir) and reads the bracketed parser output back into
# SalsaTigerSentence objects (each_sentence, to_stxml_file).
class SleepyInterface < SynInterfaceSTXML
  SleepyInterface.announce_me()

  ###
  # string: name of the wrapped system
  def SleepyInterface.system()
    return "sleepy"
  end

  ###
  # string: service provided by the wrapped system
  def SleepyInterface.service()
    return "parser"
  end

  ###
  # initialize to set values for all subsequent processing
  def initialize(program_path,  # string: path to system
                 insuffix,      # string: suffix of tab files
                 outsuffix,     # string: suffix for parsed files
                 stsuffix,      # string: suffix for Salsa/TIGER XML files
                 var_hash = {}) # optional arguments in a hash:
                                # "pos_suffix", "lemma_suffix", "tab_dir"

    super(program_path, insuffix, outsuffix, stsuffix, var_hash)
    # normalize the program path to end in "/" so that executable
    # and model filenames can simply be appended below
    unless @program_path =~ /\/$/
      @program_path = @program_path + "/"
    end

    # evaluate var hash
    @pos_suffix = var_hash["pos_suffix"]
    @lemma_suffix = var_hash["lemma_suffix"]
    @tab_dir = var_hash["tab_dir"]
  end

  ####
  # parse a directory with TabFormat files and write the parse trees to out_dir
  # I assume that the files in in_dir are smaller than
  # the maximum number of sentences that
  # Sleepy can parse in one go (i.e. that they are split)
  def process_dir(in_dir,  # string: input directory name
                  out_dir) # string: output directory name

    sleepy_prog = "#{@program_path}sleepy --beam 1000 --model-file #{@program_path}negra.model --parse "

    Dir[in_dir + "*" + @insuffix].each { |inputfilename|
      STDERR.puts "*** Parsing #{inputfilename} with Sleepy"
      corpusfilename = File.basename(inputfilename, @insuffix)
      parsefilename = out_dir + corpusfilename + @outsuffix
      tempfile = Tempfile.new(corpusfilename)

      # we need neither lemmata nor POS tags; sleepy can do with the words
      corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
      corpusfile.each_sentence { |sentence|
        tempfile.puts sentence.to_s
      }
      tempfile.close
      # parse and remove comments in the parser output
      Kernel.system(sleepy_prog + " " + tempfile.path + " 2>&1 | grep -v \"Span:\" > " + parsefilename)
    }
  end

  ###
  # for a given parsed file:
  # yield each sentence as a triple
  #   [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
  # of the sentence in SalsaTigerXML, the matching tab format sentence,
  # and the standard mapping between the two.
  #
  # If a parse has failed, yields
  #   [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence, mapping]
  # to allow more detailed accounting for failed parses
  # (basically just a flat structure with a failed=true attribute
  # at the sentence node)
  def each_sentence(parsefilename) # string: name of parser output file
    # sanity checks
    unless @tab_dir
      $stderr.puts "SleepyInterface error: Need to set tab directory on initialization"
      exit 1
    end

    # get matching tab file for this parser output file
    parsefile = File.new(parsefilename)
    tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
    # bugfix: this used to read @postag_suffix, which is never assigned --
    # initialize() stores the POS suffix in @pos_suffix
    tabfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

    sentid = 0

    tabfile.each_sentence { |tab_sent| # iterate over corpus sentences

      sentence_str = ""
      status = true # false: parse failed

      # assemble next sentence in Sleepy file by reading lines from parsefile
      while true
        line = parsefile.gets
        case line
        when /% Parse failed/
          status = false
          break
        when nil # end of file: nothing more to read
          break
        when /^%/, /^\s*$/ # empty lines, other comments: end of current sentence
          unless sentence_str == "" # only break if you have read something
            break
          end
        else
          sentence_str += line.chomp # collect line of current parse and continue reading
        end
      end

      # we have reached some kind of end
      sentid += 1

      # we don't have a sentence: hopefully, this is because parsing has failed
      # if this is not the case, we are in trouble
      if sentence_str == ""
        case status

        when false
          # return a SalsaTigerSentence object for the failed sentence
          # with a virtual top node and one terminal per word.
          if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
            my_sent_id = tab_sent.get_sent_id()
          else
            my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
          end
          sent = SleepyInterface.failed_sentence(tab_sent, my_sent_id)
          yield [sent, tab_sent, SleepyInterface.standard_mapping(sent, tab_sent)]

        else
          # this may not happen: we need some sentence for the current
          # TabFile sentence
          $stderr.puts "SleepyInterface error: premature end of parser file!"
          exit 1
        end
      else
        # if we are here, we have a sentence_str to work on
        # hopefully, our status is OK
        case status
        when true
          if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
            my_sent_id = tab_sent.get_sent_id()
          else
            my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
          end
          st_sent = build_salsatiger(" " + sentence_str + " ", 0,
                                     Array.new, Counter.new(0),
                                     Counter.new(500),
                                     SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
          yield [st_sent, tab_sent, SleepyInterface.standard_mapping(st_sent, tab_sent)]

        else # i.e. when "failed"
          $stderr.puts "SleepyInterface error: failed parse, but parse tree exists??"
          exit 1
        end
      end
    }

    # all TabFile sentences are consumed:
    # now we may just encounter comments, garbage, empty lines etc.
    while not parsefile.eof?
      case parsefile.gets
      when nil, /^%/, /^\s*$/ # empty lines, comments, end of input indicate end of current parse
      else
        $stderr.puts "SleepyInterface error: premature end of tab file"
        exit 1
      end
    end
  end


  ###
  # write Salsa/TIGER XML output to file
  def to_stxml_file(infilename,  # string: name of parse file
                    outfilename) # string: name of output stxml file

    outfile = File.new(outfilename, "w")
    outfile.puts SalsaTigerXMLHelper.get_header()
    each_sentence(infilename) { |st_sent, tabsent|
      outfile.puts st_sent.get()
    }
    outfile.puts SalsaTigerXMLHelper.get_footer()
    outfile.close()
  end



  ########################
  private

  ###
  # Recursive function for parsing a Sleepy parse tree and
  # building a SalsaTigerSentence recursively
  #
  # Algorithm: manage a stack which contains, for the current constituent,
  # child constituents (if a nonterminal), and the category label.
  # When the end of a constituent is reached, a new SynNode (TigerSalsa node)
  # is created. All children and the category label are popped from the stack
  # and integrated into the TigerSalsa data structure. The new node is
  # re-pushed onto the stack.
  def build_salsatiger(sentence, # string
                       pos,      # position in string (index): integer
                       stack,    # stack with incomplete nodes: Array
                       termc,    # terminal counter
                       nontc,    # nonterminal counter
                       sent_obj) # SalsaTigerSentence


    # main case distinction: match the beginning of our string
    # (i.e. what follows our current position in the string)

    case sentence[pos..-1]

    when /^ *$/ # nothing -> whole sentence parsed
      if stack.length == 1
        # sleepy always delivers one "top" node; if we don't get just one
        # node, something has gone wrong
        node = stack.pop
        node.del_attribute("gf")
        return sent_obj
      else
        # typo fix: was "SleepyINterface"
        $stderr.puts "SleepyInterface Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
        exit 1
      end

    when /^\s*\(([^ )]+) /
      # match the beginning of a new constituent
      # (opening bracket + category + space, may not contain closing bracket)
      cat = $1
      if cat.nil? or cat == ""
        $stderr.puts "SleepyInterface Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
        exit 1
      end
      # STDERR.puts "new const #{cat}"
      stack.push cat # throw the category label on the stack
      return build_salsatiger(sentence, pos + $&.length, stack, termc, nontc, sent_obj)

    when /^\s*(\S+)\) /
      # match the end of a terminal constituent (something before a closing bracket + space)
      word = $1
      comb_cat = stack.pop
      if comb_cat.to_s == ""
        $stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
        exit 1
      end
      cat, gf = split_cat(comb_cat)
      node = sent_obj.add_syn("t",
                              nil, # cat (doesn't matter here)
                              SalsaTigerXMLHelper.escape(word), # word
                              cat, # pos
                              termc.next.to_s)
      node.set_attribute("gf", gf)
      # STDERR.puts "completed terminal #{cat}, #{word}"
      stack.push node
      return build_salsatiger(sentence, pos + $&.length, stack, termc, nontc, sent_obj)

    when /^\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
      # now collect children:
      # pop items from the stack until you find the category
      children = Array.new
      while true
        if stack.empty?
          $stderr.puts "SleepyInterface Error: stack empty; cannot find more children"
          exit 1
        end
        item = stack.pop
        case item.class.to_s
        when "SynNode" # this is a child
          children.push item
        when "String" # this is the category label
          if item.to_s == ""
            $stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
            exit 1
          end
          cat, gf = split_cat(item)
          break
        else
          $stderr.puts "SleepyInterface Error: unknown item class #{item.class.to_s}"
          exit 1
        end
      end
      # now add a nonterminal node to the sentence object and
      # register the children nodes
      node = sent_obj.add_syn("nt",
                              cat, # cat
                              nil, # word (doesn't matter)
                              nil, # pos (doesn't matter)
                              nontc.next.to_s)
      children.each { |child|
        child_gf = child.get_attribute("gf")
        child.del_attribute("gf")
        node.add_child(child, child_gf)
        child.add_parent(node, child_gf)
      }
      node.set_attribute("gf", gf)
      # STDERR.puts "Completed nonterm #{cat}, #{children.length} children."
      stack.push node
      return build_salsatiger(sentence, pos + $&.length, stack, termc, nontc, sent_obj)
    else

      if sentence =~ /Fatal error: exception Out_of_memory/
        $stderr.puts "SleepyInterface error: Sleepy parser ran out of memory."
        $stderr.puts "Try reducing the max. sentence length"
        $stderr.puts "in the experiment file."
        exit 1
      end


      $stderr.puts "SleepyInterface Error: cannot analyse sentence at pos #{pos}:\n #{sentence[pos..-1]}\n Complete sentence: \n#{sentence}"
      exit 1
    end
  end

  ###
  # Sleepy delivers node labels as "phrase type"-"grammatical function"
  # but the GF may not be present.
  #
  # returns [category, gf]; gf is "" when no GF part is present
  def split_cat(cat)

    cat =~ /^([^-]*)(-([^-]*))?$/
    unless $1
      $stderr.puts "SleepyInterface Error: could not identify category in #{cat}"
      exit 1
    end

    proper_cat = $1

    if $3
      gf = $3
    else
      gf = ""
    end

    return [proper_cat, gf]

  end
end
355
+
356
+
357
+
358
################################################
# Interpreter class
#
# Declares which systems the Tiger-based interpreter handles
# when Sleepy is the parser.
class SleepyInterpreter < Tiger
  SleepyInterpreter.announce_me()

  ###
  # names of the systems interpreted by this class:
  # returns a hash service(string) -> system name (string),
  # e.g. { "parser" => "collins", "lemmatizer" => "treetagger" }
  def SleepyInterpreter.systems()
    { "parser" => "sleepy" }
  end

  ###
  # names of additional systems that may be interpreted by this class;
  # returns a hash service(string) -> system name(string), same shape
  # as systems()
  def SleepyInterpreter.optional_systems()
    { "lemmatizer" => "treetagger" }
  end
end
@@ -0,0 +1,44 @@
1
+ require "tempfile"
2
+ require "common/AbstractSynInterface"
3
+
4
################################################
# Interface class
#
# Wraps the TnT POS tagger.
class TntInterface < SynInterfaceTab
  TntInterface.announce_me()

  # name of the wrapped system
  def TntInterface.system()
    "tnt"
  end

  # service provided by the wrapped system
  def TntInterface.service()
    "pos_tagger"
  end

  ###
  # POS-tag one FN tab file with TnT, writing one tag per line
  # to outfilename.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # dump just the word column into a scratch file for TnT
    wordfile = Tempfile.new("Tnt")
    TntInterface.fntab_words_to_file(infilename, wordfile)
    wordfile.close

    # 1. use grep to remove commentaries from the tagger output
    # 2. use sed to extract the tag column:
    #    - match one or more non-spaces (the token)
    #    - match one or more spaces
    #    - keep the following run of non-spaces (the tag)
    #
    # This assumes that the experiment file entry for pos_tagger_path
    # has the form
    #   pos_tagger_path = <program_name> <model>
    Kernel.system(@program_path + " " + wordfile.path +
                  ' | grep -v -E "^%%" | sed -e\'s/^[^ ]\{1,\}[[:space:]]\{1,\}\([^ ]\{1,\}\)/\1/\' > ' + outfilename)

    wordfile.close(true) # delete tempfile

    # sanity check: the tagger output must align line-for-line
    # with the corpus file
    if `cat #{infilename} | wc -l`.strip !=
       `cat #{outfilename} | wc -l`.strip
      raise "Error: tagged file has different line number from corpus file!"
    end
  end
end
44
+
@@ -0,0 +1,327 @@
1
+ # sp 30 11 06
2
+ # extended by TreeTaggerPOSInterface
3
+
4
+ require "tempfile"
5
+ require 'pathname'
6
+ require "common/AbstractSynInterface"
7
+
8
###########
# KE dec 7, 06
# common mixin for both Treetagger modules, doing the actual processing
module TreetaggerModule
  ###
  # Treetagger does both lemmatization and POS-tagging.
  # However, the way the SynInterface system is set up in Shalmaneser,
  # each SynInterface can offer only _one_ service.
  # This means that we cannot do a SynInterface that writes
  # both a POS file and a lemma file.
  # Instead, both will include this module, which does the
  # actual TreeTagger call and then stores the result in a file
  # of its own, similar to the 'outfilename' given to TreetaggerInterface.process_file
  # but with a separate extension.
  # really_process_file checks for existence of this file because,
  # if the TreeTagger lemmatization and POS-tagging classes are called separately,
  # one of them will go first, and the 2nd one will not need to do the
  # TreeTagger call anymore
  #
  # really_process_file returns a filename, the name of the file containing
  # the TreeTagger output with both POS tags and lemma information
  #
  # WARNING: this method assumes that outfilename contains a suffix
  # that can be replaced by .TreeTagger
  def really_process_file(infilename,  # string: name of input file
                          outfilename, # string: name of file that the caller is to produce
                          make_new_outfile_anyway = false) # Boolean: run TreeTagger in any case?

    # fabricate the filename in which the
    # actual TreeTagger output will be placed:
    # <directory> + <outfilename minus last suffix> + ".TreeTagger"
    current_suffix = outfilename[outfilename.rindex(".")..-1]
    my_outfilename = File.dirname(outfilename) + "/" +
                     File.basename(outfilename, current_suffix) +
                     ".TreeTagger"

    ##
    # does it exist? then just return it
    # (File.exist? -- File.exists? was deprecated and removed in Ruby 3.2)
    if not(make_new_outfile_anyway) and File.exist?(my_outfilename)
      return my_outfilename
    end

    ##
    # else construct it, then return it
    tempfile = Tempfile.new("Treetagger")
    TreetaggerInterface.fntab_words_to_file(infilename, tempfile, "<EOS>", "iso")
    tempfile.close

    # @todo AB: Remove it by my shame :(
    # AB: A very dirty hack of mine:
    # We need the language attribute, but we don't have the FrPrepConfigData,
    # then we'll try to find it in the ObjectSpace since we should have only one.
    lang = ''
    ObjectSpace.each_object(FrPrepConfigData) do |o|
      lang = o.get('language')
    end

    # pick model and tag filter per language
    # NOTE(review): any language other than 'en'/'de' leaves tt_model and
    # tt_filter nil -- confirm upstream config restricts the language value
    case lang
    when 'en'
      tt_model = Pathname.new(@program_path).join('lib').join(ENV['SHALM_TREETAGGER_MODEL'] || 'english.par')
      tt_filter = ''
    when 'de'
      tt_model = Pathname.new(@program_path).join('lib').join(ENV['SHALM_TREETAGGER_MODEL'] || 'german.par')
      tt_filter = "| #{Pathname.new(@program_path).join('cmd').join('filter-german-tags')}"
    end

    # call TreeTagger
    tt_binary = Pathname.new(@program_path).join('bin').join(ENV['SHALM_TREETAGGER_BIN'] || 'tree-tagger')

    invocation_str = "#{tt_binary} -lemma -token -sgml #{tt_model} #{tempfile.path} #{tt_filter} > #{my_outfilename}"

    STDERR.puts "*** Tagging and lemmatizing #{tempfile.path} with TreeTagger."
    STDERR.puts invocation_str

    Kernel.system(invocation_str)
    tempfile.close(true) # delete first tempfile

    # external problem: sometimes, the treetagger keeps the last <EOS> for itself,
    # resulting in a .tagged file missing the last (blank) line

    original_length = `cat #{infilename} | wc -l`.strip.to_i
    lemmatised_length = `cat #{my_outfilename} | wc -l`.strip.to_i

    case original_length - lemmatised_length
    when 0
      # everything ok, don't do anything
    when 1
      # add one more newline to the .tagged file
      `echo "" >> #{my_outfilename}`
    else
      # this is a "real" error
      STDERR.puts "Original length: #{original_length}\tLemmatised length: #{lemmatised_length}"
      STDERR.puts "Error: lemmatiser/tagger output for #{File.basename(infilename)}"
      $stderr.puts "has different line number from corpus file!"
      # bare `raise` here raised an uninformative RuntimeError; give it a message
      raise "TreeTagger output out of sync with corpus file"
    end


    return my_outfilename
  end
end
112
+
113
#######################################
# Lemmatizer interface to TreeTagger.
class TreetaggerInterface < SynInterfaceTab
  TreetaggerInterface.announce_me()

  include TreetaggerModule

  ###
  # name of the wrapped system
  def self.system
    'treetagger'
  end

  ###
  # service provided by this interface
  def self.service
    'lemmatizer'
  end

  ###
  # Map TreeTagger's bracket/quote tokens onto Penn-style tokens
  # (-LRB-/-RRB-, straight double quotes).
  # @todo AB: Generalize this method to work with different parsers.
  def convert_to_berkeley(line)
    line.chomp!
    line.gsub(/\(/, "-LRB-").gsub(/\)/, "-RRB-").gsub(/''/, "\"").gsub(/\`\`/, "\"")
  end


  ###
  # Lemmatize infilename (an FN tab file): write one lemma per line
  # to outfilename, re-encoded as UTF-8.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # run (or reuse) the shared TreeTagger pass
    ttfilename = really_process_file(infilename, outfilename)

    # stage the cut-out lemma column in a scratch file first,
    # then re-encode it into the real output file
    scratch = Tempfile.new("treetagger")
    scratch.close

    # use cut to extract column 3 (the lemma)
    Kernel.system("cat " + ttfilename +
                  ' | sed -e\'s/<EOS>//\' | cut -f3 > ' + scratch.path())

    # transform ISO-8859-1 back to UTF-8, write to 'outfilename'
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to #{outfilename}"
    end

    # AB: Internally all the flow is an utf-8 encoded stream.
    # TreeTagger consumes one byte encodings (but we should provide a
    # utf-8 model for German). So we convert utf-8 to latin1, then
    # process the text and convert it back to utf-8.
    scratch.open
    while (line = scratch.gets)
      outfile.puts convert_to_berkeley(UtfIso.from_iso_8859_1(line))
    end

    # remove the scratch file, finalize output file
    scratch.close(true)
    outfile.close()
  end
end
180
+
181
+
182
+ # sp 30 11 06
183
+ #
184
+ # using TreeTagger for POS tagging of English text
185
+ #
186
+ # copy-and-paste from lemmatisation
187
+ #
188
+ # differences:
189
+ # 1. use field 2 and not 3 from the output
190
+ # 2. convert tags from what Treetagger thinks is the Penn Tagset to what TnT and Collins think is the Penn Tagset
191
+ #
192
+ # KE 7 12 06
193
+ # change interface such that TreeTagger is called only once
194
+ # and both POS tags and lemma are read from the same files,
195
+ # rather than calling the tagger twice
196
class TreetaggerPOSInterface < SynInterfaceTab
  TreetaggerPOSInterface.announce_me()
  include TreetaggerModule

  ###
  # name of the wrapped system
  def self.system
    "treetagger"
  end

  ###
  # service provided by this interface
  def self.service
    "pos_tagger"
  end

  ###
  # convert TreeTagger's penn tagset into Collins' penn tagset *argh*
  def convert_to_collins(line)
    line.chomp!
    line.gsub(/^PP/, "PRP").gsub(/^NP/, "NNP").gsub(/^VV/, "VB").gsub(/^VH/, "VB").gsub(/^SENT/, ".")
  end

  ###
  # POS-tag infilename (an FN tab file): write one Collins-style tag
  # per line to outfilename, re-encoded as UTF-8.
  def process_file(infilename,  # string: name of input file
                   outfilename) # string: name of output file

    # run the shared TreeTagger pass (forced: make_new_outfile_anyway = true)
    tt_filename = really_process_file(infilename, outfilename, true)

    # stage the cut-out tag column in a scratch file first,
    # then re-encode it into the real output file
    scratch = Tempfile.new("treetagger")
    scratch.close

    # use cut to extract column 2 (the POS tag)
    Kernel.system("cat " + tt_filename +
                  ' | sed -e\'s/<EOS>//\' | cut -f2 > ' + scratch.path())

    # transform ISO-8859-1 back to UTF-8, write to 'outfilename'
    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to #{outfilename}"
    end
    scratch.open()
    while (line = scratch.gets())
      outfile.puts UtfIso.from_iso_8859_1(convert_to_collins(line))
    end

    # remove the scratch file, finalize output file
    scratch.close(true)
    outfile.close()
  end
end
251
+ end
252
+
253
###############
# an interpreter that only has Treetagger, no parser
class TreetaggerInterpreter < SynInterpreter
  TreetaggerInterpreter.announce_me()

  ###
  # names of the systems interpreted by this class:
  # returns a hash service(string) -> system name (string),
  # e.g. { "parser" => "collins", "lemmatizer" => "treetagger" }
  def TreetaggerInterpreter.systems()
    { "pos_tagger" => "treetagger" }
  end

  ###
  # names of additional systems that may be interpreted by this class;
  # returns a hash service(string) -> system name(string), same shape
  # as systems()
  def TreetaggerInterpreter.optional_systems()
    { "lemmatizer" => "treetagger" }
  end

  ###
  # generalize over POS tags.
  #
  # returns one of the coarse categories:
  #
  #   adj:  adjective (phrase)
  #   adv:  adverb (phrase)
  #   card: numbers, quantity phrases
  #   con:  conjunction
  #   det:  determiner, including possessive/demonstrative pronouns etc.
  #   for:  foreign material
  #   noun: noun (phrase), including personal pronouns, proper names, expletives
  #   prep: preposition (phrase), particles
  #   pun:  punctuation, brackets, etc.
  #   sent: sentence
  #   top:  top node of a sentence
  #   trace: trace node
  #   verb: verb (phrase)
  #   nil:  something went wrong / unknown tag
  #
  # returns: string, or nil
  def TreetaggerInterpreter.category(node) # SynNode
    pt = TreetaggerInterpreter.pt(node)
    # phrase type could not be determined
    return nil if pt.nil?

    # strip everything from the first "-" onwards, then classify;
    # clause order matters, so the when-clauses are kept verbatim
    pt.to_s.strip() =~ /^([^-]*)/
    case $1
    when /^JJ/, /(WH)?ADJP/, /^PDT/ then "adj"
    when /^RB/, /(WH)?ADVP/, /^UH/ then "adv"
    when /^CD/, /^QP/ then "card"
    when /^CC/, /^WRB/, /^CONJP/ then "con"
    when /^DT/, /^POS/ then "det"
    when /^FW/, /^SYM/ then "for"
    when /^N/, "WHAD", "WDT", /^PRP/, /^WHNP/, /^EX/, /^WP/ then "noun"
    when /^IN/, /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/ then "prep"
    when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/ then "pun"
    when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/ then "sent"
    when /^TOP/ then "top"
    when /^TRACE/ then "trace"
    when /^V/, /^MD/ then "verb"
    else
      # unknown category/POS
      nil
    end
  end
end