shalmaneser 0.0.1.alpha → 1.2.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76)
  1. checksums.yaml +7 -0
  2. data/.yardopts +2 -2
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +49 -0
  6. data/bin/fred +18 -0
  7. data/bin/frprep +34 -0
  8. data/bin/rosy +17 -0
  9. data/lib/common/AbstractSynInterface.rb +35 -33
  10. data/lib/common/Mallet.rb +236 -0
  11. data/lib/common/Maxent.rb +26 -12
  12. data/lib/common/Parser.rb +5 -5
  13. data/lib/common/SynInterfaces.rb +13 -6
  14. data/lib/common/TabFormat.rb +7 -6
  15. data/lib/common/Tiger.rb +4 -4
  16. data/lib/common/Timbl.rb +144 -0
  17. data/lib/common/{FrprepHelper.rb → frprep_helper.rb} +14 -8
  18. data/lib/common/headz.rb +1 -1
  19. data/lib/common/ruby_class_extensions.rb +3 -3
  20. data/lib/fred/FredBOWContext.rb +14 -2
  21. data/lib/fred/FredDetermineTargets.rb +4 -9
  22. data/lib/fred/FredEval.rb +1 -1
  23. data/lib/fred/FredFeatureExtractors.rb +4 -3
  24. data/lib/fred/FredFeaturize.rb +1 -1
  25. data/lib/frprep/CollinsInterface.rb +6 -6
  26. data/lib/frprep/MiniparInterface.rb +5 -5
  27. data/lib/frprep/SleepyInterface.rb +7 -7
  28. data/lib/frprep/TntInterface.rb +1 -1
  29. data/lib/frprep/TreetaggerInterface.rb +29 -5
  30. data/lib/frprep/do_parses.rb +1 -0
  31. data/lib/frprep/frprep.rb +36 -32
  32. data/lib/{common/BerkeleyInterface.rb → frprep/interfaces/berkeley_interface.rb} +69 -95
  33. data/lib/frprep/interfaces/stanford_interface.rb +353 -0
  34. data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
  35. data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
  36. data/lib/frprep/opt_parser.rb +2 -2
  37. data/lib/rosy/AbstractFeatureAndExternal.rb +5 -3
  38. data/lib/rosy/RosyIterator.rb +11 -10
  39. data/lib/rosy/rosy.rb +1 -0
  40. data/lib/shalmaneser/version.rb +1 -1
  41. data/test/functional/sample_experiment_files/fred_test.salsa.erb +1 -1
  42. data/test/functional/sample_experiment_files/fred_train.salsa.erb +1 -1
  43. data/test/functional/sample_experiment_files/prp_test.salsa.erb +2 -2
  44. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +2 -2
  45. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +2 -2
  46. data/test/functional/sample_experiment_files/prp_train.salsa.erb +2 -2
  47. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +2 -2
  48. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +2 -2
  49. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +1 -1
  50. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +7 -7
  51. data/test/functional/test_frprep.rb +3 -3
  52. data/test/functional/test_rosy.rb +20 -0
  53. metadata +215 -224
  54. data/CHANGELOG.rdoc +0 -0
  55. data/LICENSE.rdoc +0 -0
  56. data/README.rdoc +0 -0
  57. data/lib/common/CollinsInterface.rb +0 -1165
  58. data/lib/common/MiniparInterface.rb +0 -1388
  59. data/lib/common/SleepyInterface.rb +0 -384
  60. data/lib/common/TntInterface.rb +0 -44
  61. data/lib/common/TreetaggerInterface.rb +0 -303
  62. data/lib/frprep/AbstractSynInterface.rb +0 -1227
  63. data/lib/frprep/BerkeleyInterface.rb +0 -375
  64. data/lib/frprep/ConfigData.rb +0 -694
  65. data/lib/frprep/FixSynSemMapping.rb +0 -196
  66. data/lib/frprep/FrPrepConfigData.rb +0 -66
  67. data/lib/frprep/FrprepHelper.rb +0 -1324
  68. data/lib/frprep/ISO-8859-1.rb +0 -24
  69. data/lib/frprep/Parser.rb +0 -213
  70. data/lib/frprep/SalsaTigerRegXML.rb +0 -2347
  71. data/lib/frprep/SalsaTigerXMLHelper.rb +0 -99
  72. data/lib/frprep/SynInterfaces.rb +0 -275
  73. data/lib/frprep/TabFormat.rb +0 -720
  74. data/lib/frprep/Tiger.rb +0 -1448
  75. data/lib/frprep/Tree.rb +0 -61
  76. data/lib/frprep/headz.rb +0 -338
@@ -1,384 +0,0 @@
1
- ####
2
- # sp 21 07 05
3
- #
4
- # modified ke 30 10 05: adapted to fit into SynInterface
5
- #
6
- # represents a file containing Sleepy parses
7
- #
8
- # underlying data structure for individual sentences: SalsaTigerSentence
9
- require "tempfile"
10
-
11
- require "common/SalsaTigerRegXML"
12
- require "common/SalsaTigerXMLHelper"
13
- require "common/TabFormat"
14
- require "common/Counter"
15
-
16
- require "common/AbstractSynInterface"
17
- require "common/Tiger.rb"
18
-
19
################################################
# sp 21 07 05
#
# modified ke 30 10 05: adapted to fit into SynInterface
#
# Represents a file containing Sleepy parses.
#
# Underlying data structure for individual sentences: SalsaTigerSentence.
class SleepyInterface < SynInterfaceSTXML
  SleepyInterface.announce_me()

  ###
  # Name under which this parser is registered.
  def SleepyInterface.system()
    return "sleepy"
  end

  ###
  # Service provided by this interface.
  def SleepyInterface.service()
    return "parser"
  end

  ###
  # initialize to set values for all subsequent processing
  #
  # program_path - string: path to system
  # insuffix     - string: suffix of tab files
  # outsuffix    - string: suffix for parsed files
  # stsuffix     - string: suffix for Salsa/TIGER XML files
  # var_hash     - hash: optional arguments
  #                ("pos_suffix", "lemma_suffix", "tab_dir")
  def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
    super(program_path, insuffix, outsuffix, stsuffix, var_hash)
    # make sure the program path ends in a slash
    unless @program_path =~ /\/$/
      @program_path = @program_path + "/"
    end

    # new: evaluate var hash
    @pos_suffix = var_hash["pos_suffix"]
    @lemma_suffix = var_hash["lemma_suffix"]
    @tab_dir = var_hash["tab_dir"]
  end

  ####
  # Parse a directory with TabFormat files and write the parse trees to out_dir.
  # I assume that the files in in_dir are smaller than
  # the maximum number of sentences that
  # Sleepy can parse in one go (i.e. that they are split).
  #
  # in_dir  - string: input directory name
  # out_dir - string: output directory name
  def process_dir(in_dir, out_dir)
    sleepy_prog = "#{@program_path}sleepy --beam 1000 --model-file #{@program_path}negra.model --parse "

    Dir[in_dir + "*" + @insuffix].each { |inputfilename|
      STDERR.puts "*** Parsing #{inputfilename} with Sleepy"
      corpusfilename = File.basename(inputfilename, @insuffix)
      parsefilename = out_dir + corpusfilename + @outsuffix
      tempfile = Tempfile.new(corpusfilename)

      # we need neither lemmata nor POS tags; sleepy can do with the words
      corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
      corpusfile.each_sentence { |sentence|
        tempfile.puts sentence.to_s
      }
      tempfile.close
      # parse and remove comments in the parser output
      Kernel.system(sleepy_prog + " " + tempfile.path + " 2>&1 | grep -v \"Span:\" > " + parsefilename)
    }
  end

  ###
  # For a given parsed file:
  # yield each sentence as a triple
  # [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
  # of the sentence in SalsaTigerXML and the matching tab format sentence.
  #
  # If a parse has failed, yields
  # [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence, mapping]
  # to allow more detailed accounting for failed parses
  # (basically just a flat structure with a failed=true attribute
  # at the sentence node).
  def each_sentence(parsefilename)
    # sanity checks
    unless @tab_dir
      $stderr.puts "SleepyInterface error: Need to set tab directory on initialization"
      exit 1
    end

    # get matching tab file for this parser output file
    parsefile = File.new(parsefilename)
    tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
    # bugfix: this used to read @postag_suffix, which is never assigned
    # anywhere (initialize stores the value in @pos_suffix), so the POS
    # column of the tab file was always silently ignored
    tabfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

    sentid = 0

    tabfile.each_sentence { |tab_sent| # iterate over corpus sentences

      sentence_str = ""
      status = true # error encountered?

      # assemble next sentence in Sleepy file by reading lines from parsefile
      while true
        line = parsefile.gets
        case line
        when /% Parse failed/
          status = false
          break
        when nil # end of file: nothing more to read
          break
        when /^%/, /^\s*$/ # empty lines, other comments: end of current sentence
          unless sentence_str == "" # only break if you have read something
            break
          end
        else
          sentence_str += line.chomp # collect line of current parse and continue reading
        end
      end

      # we have reached some kind of end
      sentid += 1

      # we don't have a sentence: hopefully, this is because parsing has failed
      # if this is not the case, we are in trouble
      if sentence_str == ""
        case status

        when false
          # return a SalsaTigerSentence object for the failed sentence
          # with a virtual top node and one terminal per word.
          if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
            my_sent_id = tab_sent.get_sent_id()
          else
            my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
          end
          sent = SleepyInterface.failed_sentence(tab_sent, my_sent_id)
          yield [sent, tab_sent, SleepyInterface.standard_mapping(sent, tab_sent)]

        else
          # this may not happen: we need some sentence for the current
          # TabFile sentence
          $stderr.puts "SleepyInterface error: premature end of parser file!"
          exit 1
        end
      else
        # if we are here, we have a sentence_str to work on
        # hopefully, our status is OK
        case status
        when true
          if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
            my_sent_id = tab_sent.get_sent_id()
          else
            my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
          end
          st_sent = build_salsatiger(" " + sentence_str + " ", 0,
                                     Array.new, Counter.new(0),
                                     Counter.new(500),
                                     SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
          yield [st_sent, tab_sent, SleepyInterface.standard_mapping(st_sent, tab_sent)]

        else # i.e. when "failed"
          $stderr.puts "SleepyInterface error: failed parse, but parse tree exists??"
          exit 1
        end
      end
    }

    # all TabFile sentences are consumed:
    # now we may just encounter comments, garbage, empty lines etc.
    while not parsefile.eof?
      case parsefile.gets
      when nil, /^%/, /^\s*$/ # empty lines, comments, end of input indicate end of current parse
      else
        $stderr.puts "SleepyInterface error: premature end of tab file"
        exit 1
      end
    end
  end

  ###
  # Write Salsa/TIGER XML output to file.
  #
  # infilename  - string: name of parse file
  # outfilename - string: name of output stxml file
  def to_stxml_file(infilename, outfilename)
    outfile = File.new(outfilename, "w")
    outfile.puts SalsaTigerXMLHelper.get_header()
    each_sentence(infilename) { |st_sent, tabsent|
      outfile.puts st_sent.get()
    }
    outfile.puts SalsaTigerXMLHelper.get_footer()
    outfile.close()
  end

  ########################
  private

  ###
  # Recursive function for parsing a Sleepy parse tree and
  # building a SalsaTigerSentence recursively.
  #
  # Algorithm: manage stack which contains, for the current constituent,
  # child constituents (if a nonterminal), and the category label.
  # When the end of a constituent is reached, a new SynNode (TigerSalsa node) is created.
  # All children and the category label are popped from the stack and integrated into the
  # TigerSalsa data structure. The new node is re-pushed onto the stack.
  #
  # sentence - string: the bracketed parse
  # pos      - integer: position in string (index)
  # stack    - Array: stack with incomplete nodes
  # termc    - terminal counter
  # nontc    - nonterminal counter
  # sent_obj - SalsaTigerSentence being built
  def build_salsatiger(sentence, pos, stack, termc, nontc, sent_obj)

    # main case distinction: match the beginning of our string
    # (i.e. what follows our current position in the string)
    case sentence[pos..-1]

    when /^ *$/ # nothing -> whole sentence parsed
      if stack.length == 1
        # sleepy always delivers one "top" node; if we don't get just one
        # node, something has gone wrong
        node = stack.pop
        node.del_attribute("gf")
        return sent_obj
      else
        # note: fixed typo in the error message ("SleepyINterface")
        $stderr.puts "SleepyInterface Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
        exit 1
      end

    when /^\s*\(([^ )]+) /
      # match the beginning of a new constituent
      # (opening bracket + category + space, may not contain closing bracket)
      cat = $1
      if cat.nil? or cat == ""
        $stderr.puts "SleepyInterface Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
        exit 1
      end
      stack.push cat # throw the category label on the stack
      return build_salsatiger(sentence, pos + $&.length, stack, termc, nontc, sent_obj)

    when /^\s*(\S+)\) /
      # match the end of a terminal constituent (something before a closing bracket + space)
      word = $1
      comb_cat = stack.pop
      if comb_cat.to_s == ""
        $stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
        exit 1
      end
      cat, gf = split_cat(comb_cat)
      node = sent_obj.add_syn("t",
                              nil, # cat (doesn't matter here)
                              SalsaTigerXMLHelper.escape(word), # word
                              cat, # pos
                              termc.next.to_s)
      node.set_attribute("gf", gf)
      stack.push node
      return build_salsatiger(sentence, pos + $&.length, stack, termc, nontc, sent_obj)

    when /^\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
      # now collect children:
      # pop items from the stack until you find the category
      children = Array.new
      while true
        if stack.empty?
          $stderr.puts "SleepyInterface Error: stack empty; cannot find more children"
          exit 1
        end
        item = stack.pop
        case item.class.to_s
        when "SynNode" # this is a child
          children.push item
        when "String" # this is the category label
          if item.to_s == ""
            $stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
            exit 1
          end
          cat, gf = split_cat(item)
          break
        else
          $stderr.puts "SleepyInterface Error: unknown item class #{item.class.to_s}"
          exit 1
        end
      end
      # now add a nonterminal node to the sentence object and
      # register the children nodes
      node = sent_obj.add_syn("nt",
                              cat, # cat
                              nil, # word (doesn't matter)
                              nil, # pos (doesn't matter)
                              nontc.next.to_s)
      children.each { |child|
        child_gf = child.get_attribute("gf")
        child.del_attribute("gf")
        node.add_child(child, child_gf)
        child.add_parent(node, child_gf)
      }
      node.set_attribute("gf", gf)
      stack.push node
      return build_salsatiger(sentence, pos + $&.length, stack, termc, nontc, sent_obj)
    else

      if sentence =~ /Fatal error: exception Out_of_memory/
        $stderr.puts "SleepyInterface error: Sleepy parser ran out of memory."
        $stderr.puts "Try reducing the max. sentence length"
        $stderr.puts "in the experiment file."
        exit 1
      end

      $stderr.puts "SleepyInterface Error: cannot analyse sentence at pos #{pos}:\n #{sentence[pos..-1]}\n Complete sentence: \n#{sentence}"
      exit 1
    end
  end

  ###
  # Sleepy delivers node labels as "phrase type"-"grammatical function"
  # but the GF may not be present.
  # Returns [category, gf] where gf is "" when absent.
  def split_cat(cat)
    cat =~ /^([^-]*)(-([^-]*))?$/
    unless $1
      $stderr.puts "SleepyInterface Error: could not identify category in #{cat}"
      exit 1
    end

    proper_cat = $1

    if $3
      gf = $3
    else
      gf = ""
    end

    return [proper_cat, gf]
  end
end
355
-
356
-
357
-
358
################################################
# Interpreter class for Sleepy parses.
class SleepyInterpreter < Tiger
  SleepyInterpreter.announce_me()

  ###
  # Names of the systems interpreted by this class:
  # a hash mapping service (string) to system name (string), e.g.
  #   { "parser" => "collins", "lemmatizer" => "treetagger" }
  def self.systems
    { "parser" => "sleepy" }
  end

  ###
  # Names of additional systems that may be interpreted by this class;
  # a hash of the same shape as systems().
  def self.optional_systems
    { "lemmatizer" => "treetagger" }
  end
end
@@ -1,44 +0,0 @@
1
- require "tempfile"
2
- require "common/AbstractSynInterface"
3
-
4
################################################
# Interface class for the TnT part-of-speech tagger.
class TntInterface < SynInterfaceTab
  TntInterface.announce_me()

  # Registered system name.
  def self.system
    "tnt"
  end

  # Service offered by this interface.
  def self.service
    "pos_tagger"
  end

  ###
  # Tag the words of a FNTab-format file and write one tag per line
  # to outfilename.
  #
  # infilename  - string: name of input file
  # outfilename - string: name of output file
  def process_file(infilename, outfilename)
    # stage the bare words in a temp file for TnT to read
    wordfile = Tempfile.new("Tnt")
    TntInterface.fntab_words_to_file(infilename, wordfile)
    wordfile.close

    # 1. grep removes commentary lines from the TnT output
    # 2. sed keeps only the tag column:
    #    - match one or more non-spaces (the word)
    #    - match one or more spaces
    #    - capture one or more non-spaces (the tag) and keep it
    #
    # This assumes that the experiment file entry for pos_tagger_path
    # has the form
    #  pos_tagger_path = <program_name> <model>
    Kernel.system(@program_path + " " + wordfile.path +
                  ' | grep -v -E "^%%" | sed -e\'s/^[^ ]\{1,\}[[:space:]]\{1,\}\([^ ]\{1,\}\)/\1/\' > ' + outfilename)

    wordfile.close(true) # delete tempfile

    # sanity check: tagger output must align line-for-line with the input
    unless `cat #{infilename} | wc -l`.strip ==
           `cat #{outfilename} | wc -l`.strip
      raise "Error: tagged file has different line number from corpus file!"
    end
  end
end
44
-
@@ -1,303 +0,0 @@
1
- # sp 30 11 06
2
- # extended by TreeTaggerPOSInterface
3
-
4
- require "tempfile"
5
-
6
- require "common/AbstractSynInterface"
7
-
8
###########
# KE dec 7, 06
# Common mixin for both Treetagger interfaces, doing the actual processing.
module TreetaggerModule
  ###
  # Treetagger does both lemmatization and POS-tagging.
  # However, the way the SynInterface system is set up in Shalmaneser,
  # each SynInterface can offer only _one_ service.
  # This means that we cannot do a SynInterface that writes
  # both a POS file and a lemma file.
  # Instead, both will include this module, which does the
  # actual TreeTagger call and then stores the result in a file
  # of its own, similar to the 'outfilename' given to TreetaggerInterface.process_file
  # but with a separate extension.
  # really_process_file checks for existence of this file because,
  # if the TreeTagger lemmatization and POS-tagging classes are called separately,
  # one of them will go first, and the 2nd one will not need to do the
  # TreeTagger call anymore.
  #
  # really_process_file returns a filename, the name of the file containing
  # the TreeTagger output with both POS tags and lemma information.
  #
  # WARNING: this method assumes that outfilename contains a suffix
  # that can be replaced by .TreeTagger
  #
  # infilename              - string: name of input file
  # outfilename             - string: name of file that the caller is to produce
  # make_new_outfile_anyway - Boolean: run TreeTagger even if the file exists?
  def really_process_file(infilename, outfilename, make_new_outfile_anyway = false)
    # fabricate the filename in which the
    # actual TreeTagger output will be placed:
    # <directory> + <outfilename minus last suffix> + ".TreeTagger"
    current_suffix = outfilename[outfilename.rindex(".")..-1]
    my_outfilename = File.dirname(outfilename) + "/" +
                     File.basename(outfilename, current_suffix) +
                     ".TreeTagger"

    ##
    # does it exist? then just return it
    # (File.exist? replaces the deprecated File.exists? alias)
    if not(make_new_outfile_anyway) and File.exist?(my_outfilename)
      return my_outfilename
    end

    ##
    # else construct it, then return it
    tempfile = Tempfile.new("Treetagger")
    TreetaggerInterface.fntab_words_to_file(infilename, tempfile, "<EOS>", "iso")
    tempfile.close

    # call TreeTagger
    Kernel.system(@program_path + " " + tempfile.path +
                  " > " + my_outfilename)
    tempfile.close(true) # delete first tempfile

    # external problem: sometimes, the treetagger keeps the last <EOS> for itself,
    # resulting on a .tagged file missing the last (blank) line
    # (removed a stray debug `puts infilename` that polluted stdout here)
    original_length = `cat #{infilename} | wc -l`.strip.to_i
    lemmatised_length = `cat #{my_outfilename} | wc -l`.strip.to_i

    case original_length - lemmatised_length
    when 0
      # everything ok, don't do anything
    when 1
      # add one more newline to the .tagged file
      `echo "" >> #{my_outfilename}`
    else
      # this is a "real" error
      # (fixed doubled word "for for" in the message below)
      STDERR.puts "Original length: #{original_length}\tLemmatised length: #{lemmatised_length}"
      STDERR.puts "Error: lemmatiser/tagger output for #{File.basename(infilename)}"
      $stderr.puts "has different line number from corpus file!"
      # bare `raise` outside a rescue raised an empty RuntimeError;
      # give it a message (still a RuntimeError, so callers are unaffected)
      raise "TreeTagger output out of sync with corpus file"
    end

    return my_outfilename
  end
end
88
-
89
#######################################
# Lemmatizer interface built on TreeTagger output.
class TreetaggerInterface < SynInterfaceTab
  TreetaggerInterface.announce_me()

  include TreetaggerModule

  ###
  # Registered system name.
  def self.system
    "treetagger"
  end

  ###
  # Service offered by this interface.
  def self.service
    "lemmatizer"
  end

  ###
  # Convert TreeTagger's Penn tagset into the variant the Berkeley
  # parser expects (bracket tokens, quote style).
  # NOTE: mutates its argument via chomp!, as the original did.
  def convert_to_berkeley(line)
    line.chomp!
    line.gsub(/\(/, "-LRB-").gsub(/\)/, "-RRB-").gsub(/''/, "\"").gsub(/\`\`/, "\"")
  end

  ###
  # Run TreeTagger (via the shared mixin) on infilename and write the
  # lemma column, re-encoded as UTF-8, to outfilename.
  #
  # infilename  - string: name of input file
  # outfilename - string: name of output file
  def process_file(infilename, outfilename)
    # the shared helper produces (or reuses) the raw TreeTagger output
    tagged = really_process_file(infilename, outfilename)

    # stage the lemma column in a scratch file first,
    # then recode ISO-8859-1 to UTF-8 into the output file
    scratch = Tempfile.new("treetagger")
    scratch.close

    # strip <EOS> markers and keep only column 3 (the lemma)
    Kernel.system("cat " + tagged +
                  ' | sed -e\'s/<EOS>//\' | cut -f3 > ' + scratch.path)

    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to #{outfilename}"
    end

    # AB: Internally all the flow is an utf-8 encoded stream.
    # TreeTagger consumes one-byte encodings (but we should provide a
    # utf-8 model for German). So we convert utf-8 to latin1, then
    # process the text and convert it back to utf-8.
    scratch.open
    while line = scratch.gets
      outfile.puts convert_to_berkeley(UtfIso.from_iso_8859_1(line))
    end

    # remove the scratch file, finalize the output file
    scratch.close(true)
    outfile.close
  end
end
156
-
157
-
158
# sp 30 11 06
#
# Using TreeTagger for POS tagging of English text.
#
# Copy-and-paste from lemmatisation; differences:
# 1. use field 2 and not 3 from the output
# 2. convert tags from what TreeTagger thinks is the Penn Tagset to
#    what TnT and Collins think is the Penn Tagset
#
# KE 7 12 06
# Interface changed such that TreeTagger is called only once
# and both POS tags and lemma are read from the same files,
# rather than calling the tagger twice.
class TreetaggerPOSInterface < SynInterfaceTab
  TreetaggerPOSInterface.announce_me()
  include TreetaggerModule

  ###
  # Registered system name.
  def self.system
    "treetagger"
  end

  ###
  # Service offered by this interface.
  def self.service
    "pos_tagger"
  end

  ###
  # Convert TreeTagger's Penn tagset into Collins' Penn tagset.
  # NOTE: mutates its argument via chomp!, as the original did.
  def convert_to_collins(line)
    line.chomp!
    line.gsub(/^PP/, "PRP").gsub(/^NP/, "NNP").gsub(/^VV/, "VB").gsub(/^VH/, "VB").gsub(/^SENT/, ".")
  end

  ###
  # Run TreeTagger (via the shared mixin, forcing a fresh run) on
  # infilename and write the converted POS column, re-encoded as
  # UTF-8, to outfilename.
  #
  # infilename  - string: name of input file
  # outfilename - string: name of output file
  def process_file(infilename, outfilename)
    # the shared helper produces the raw TreeTagger output file
    tagged = really_process_file(infilename, outfilename, true)

    # stage the tag column in a scratch file first,
    # then recode ISO-8859-1 to UTF-8 into the output file
    scratch = Tempfile.new("treetagger")
    scratch.close

    # strip <EOS> markers and keep only column 2 (the POS tag)
    Kernel.system("cat " + tagged +
                  ' | sed -e\'s/<EOS>//\' | cut -f2 > ' + scratch.path)

    begin
      outfile = File.new(outfilename, "w")
    rescue
      raise "Could not write to #{outfilename}"
    end

    # TreeTagger output is Latin-1; convert each line back to UTF-8
    scratch.open
    while line = scratch.gets
      outfile.puts UtfIso.from_iso_8859_1(convert_to_collins(line))
    end

    # remove the scratch file, finalize the output file
    scratch.close(true)
    outfile.close
  end
end
228
-
229
###############
# An interpreter that only has Treetagger, no parser.
class TreetaggerInterpreter < SynInterpreter
  TreetaggerInterpreter.announce_me()

  ###
  # Names of the systems interpreted by this class:
  # a hash mapping service (string) to system name (string), e.g.
  #   { "parser" => "collins", "lemmatizer" => "treetagger" }
  def self.systems
    { "pos_tagger" => "treetagger" }
  end

  ###
  # Names of additional systems that may be interpreted by this class;
  # a hash of the same shape as systems().
  def self.optional_systems
    { "lemmatizer" => "treetagger" }
  end

  ###
  # Generalize over POS tags.
  #
  # Returns one of the strings:
  #   adj, adv, card, con, det, for, noun, prep, pun, sent, top,
  #   trace, verb
  # or nil when the phrase type cannot be determined or is unknown.
  #
  # node - SynNode
  def self.category(node)
    pt = TreetaggerInterpreter.pt(node)
    # phrase type could not be determined
    return nil if pt.nil?

    pt.to_s.strip() =~ /^([^-]*)/
    case $1
    when /^JJ/, /(WH)?ADJP/, /^PDT/ then "adj"
    when /^RB/, /(WH)?ADVP/, /^UH/ then "adv"
    when /^CD/, /^QP/ then "card"
    when /^CC/, /^WRB/, /^CONJP/ then "con"
    when /^DT/, /^POS/ then "det"
    when /^FW/, /^SYM/ then "for"
    when /^N/, "WHAD", "WDT", /^PRP/, /^WHNP/, /^EX/, /^WP/ then "noun"
    when /^IN/, /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/ then "prep"
    when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/ then "pun"
    when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/ then "sent"
    when /^TOP/ then "top"
    when /^TRACE/ then "trace"
    when /^V/, /^MD/ then "verb"
    else
      # unknown category/POS
      nil
    end
  end
end