shalmaneser-frappe 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/frappe/Ampersand.rb +41 -0
- data/lib/frappe/file_parser.rb +126 -0
- data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
- data/lib/frappe/frappe.rb +217 -0
- data/lib/frappe/frappe_flat_syntax.rb +89 -0
- data/lib/frappe/frappe_read_stxml.rb +48 -0
- data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
- data/lib/frappe/interfaces/collins_interface.rb +340 -0
- data/lib/frappe/interfaces/counter.rb +19 -0
- data/lib/frappe/interfaces/stanford_interface.rb +353 -0
- data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
- data/lib/frappe/interfaces/treetagger_module.rb +111 -0
- data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
- data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
- data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
- data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
- data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
- data/lib/frappe/interpreters/headz.rb +265 -0
- data/lib/frappe/interpreters/headz_helpers.rb +54 -0
- data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
- data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
- data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
- data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
- data/lib/frappe/one_parsed_file.rb +31 -0
- data/lib/frappe/opt_parser.rb +92 -0
- data/lib/frappe/path.rb +199 -0
- data/lib/frappe/plain_converter.rb +59 -0
- data/lib/frappe/salsa_tab_converter.rb +154 -0
- data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
- data/lib/frappe/stxml_converter.rb +666 -0
- data/lib/frappe/syn_interface.rb +76 -0
- data/lib/frappe/syn_interface_stxml.rb +173 -0
- data/lib/frappe/syn_interface_tab.rb +39 -0
- data/lib/frappe/utf_iso.rb +27 -0
- data/lib/shalmaneser/frappe.rb +1 -0
- metadata +130 -0
@@ -0,0 +1,217 @@
|
|
1
|
+
require 'frappe/utf_iso'
|
2
|
+
|
3
|
+
# For FN input.
|
4
|
+
require 'framenet_format/fn_corpus_xml_file' # !
|
5
|
+
require 'framenet_format/fn_database' # !
|
6
|
+
|
7
|
+
require 'logging' # !
|
8
|
+
|
9
|
+
require 'frappe/stxml_converter'
|
10
|
+
require 'frappe/plain_converter'
|
11
|
+
require 'frappe/salsa_tab_converter'
|
12
|
+
require 'frappe/salsa_tab_with_pos_converter'
|
13
|
+
|
14
|
+
##############################
|
15
|
+
# The class that does all the work
|
16
|
+
module Shalmaneser
  module Frappe
    # Driver of the preprocessing pipeline.
    #
    # Starting from the configured input format it converts the corpus step by
    # step (Plain / FNXml / FNCorpusXml -> SalsaTab -> SalsaTabWithPos ->
    # SalsaTigerXML) until the requested output has been written.
    class Frappe
      # @param exp [FrprepConfigData] Configuration object
      def initialize(exp)
        @exp = exp
      end

      # Main processing method.
      #
      # Creates the experiment directory, optionally re-encodes the input to
      # UTF-8 and then runs the converters until the output format is reached.
      # The output is SalsaTigerXML by default, or the tabular format when
      # "tabformat_output" is set in the configuration.
      #
      # @raise [RuntimeError] if the configured "format" is not handled below
      #   (previously this case looped forever).
      # @note The original documentation also lists ConfigurationError;
      #   presumably raised by the configuration object - not visible here.
      def transform
        # experiment directory:
        # frprep internal data directory, subdir according to experiment ID
        # @todo Move it to a separate method.
        File.new_dir(@exp.get("frprep_directory"), @exp.get("prep_experiment_ID"))

        # input and output directories.
        input_dir = File.existing_dir(@exp.get("directory_input"))
        output_dir = File.new_dir(@exp.get("directory_preprocessed"))

        # With tabular output the lemmatized/tagged files are the final result,
        # so they go straight into the output directory.
        split_dir = @exp.get("tabformat_output") ? output_dir : frprep_dirname("split", "new")

        # @todo Use standard Ruby transcoding mechanics.
        # transform data to UTF-8
        if @exp.convert_encoding?
          # transform ISO -> UTF-8 or Hex -> UTF-8,
          # write result to encoding_dir,
          # then set encoding_dir to be the new input_dir
          encoding_dir = frprep_dirname("encoding", "new")

          LOGGER.info "Frappe: Transforming to UTF-8."

          Dir[input_dir + "*"].each do |filename|
            # not a file? then skip
            next unless File.file? filename

            outfilename = encoding_dir + File.basename(filename)
            to_utf8_file(filename, outfilename, @exp.get("encoding"))
          end

          input_dir = encoding_dir
        end

        # transform data all the way to the output format,
        # which is SalsaTigerXML by default,
        # except when tabformat_output has been set, in which case it's
        # Tab format.
        current_dir = input_dir
        current_format = @exp.get("format")

        # @todo Change the configuration to input_format vs. output_format.
        # Output Formats: STXML (default), TABULAR
        loop do
          case current_format
          when "Plain"
            tab_dir = frprep_dirname("tab", "new")

            LOGGER.info "Frappe: Transforming plain text to SalsaTab format."
            LOGGER.debug "Frappe: Transforming plain text in #{current_dir} to SalsaTab format.\n"\
                         "Storing the result in #{tab_dir}.\n"\
                         "Expecting one sentence per line.\n"

            transformer = PlainConverter.new
            transformer.transform_plain_dir(current_dir, tab_dir)

            current_dir = tab_dir
            current_format = "SalsaTab"

          when "FNXml"
            # transform to tab format
            tab_dir = frprep_dirname("tab", "new")

            LOGGER.info 'Frappe: Transforming FN Data to the tabular format.'
            LOGGER.debug "Frappe: Transforming FN data in #{current_dir} to the "\
                         "tabular format. Storing the result in #{tab_dir}"

            fndata = FNDatabase.new(current_dir)
            fndata.extract_everything(tab_dir)

            current_dir = tab_dir
            current_format = "SalsaTab"

          when "FNCorpusXml"
            # transform to tab format
            tab_dir = frprep_dirname("tab", "new")

            LOGGER.info 'Frappe: Transforming FrameNet data to the tabular format.'
            LOGGER.debug "Frprep: Transforming FN data in #{current_dir} to tabular format.\n"\
                         "Storing the result in: #{tab_dir}.\n"

            # assuming that all XML files in the current directory are FN Corpus XML files
            Dir[current_dir + "*.xml"].each do |fncorpusfilename|
              corpus = FNCorpusXMLFile.new(fncorpusfilename)
              output_file = "#{tab_dir}#{File.basename(fncorpusfilename, '.xml')}.tab"
              File.open(output_file, 'w') { |f| corpus.print_conll_style(f) }
            end

            current_dir = tab_dir
            current_format = "SalsaTab"

          when "SalsaTab"
            LOGGER.info "#{PROGRAM_NAME}: I'm Lemmatizing and Parsing texts."
            LOGGER.debug "#{PROGRAM_NAME}: Lemmatizing and parsing text in #{current_dir}.\n"\
                         "Storing the result in #{split_dir}.\n"

            transformer = SalsaTabConverter.new(@exp)
            transformer.transform_pos_and_lemmatize(current_dir, split_dir)

            # Tabular output requested: the files just written are the result.
            break if @exp.get("tabformat_output")

            current_format = 'SalsaTabWithPos'
            current_dir = split_dir

          when "SalsaTabWithPos"
            parse_dir = frprep_dirname("parse", "new")

            LOGGER.info 'Frappe: Trasforming the tabular format into the STXML format.'
            LOGGER.debug "Frprep: Transforming tabular format text in #{current_dir} to SalsaTigerXML format. "\
                         "Storing the result in #{parse_dir}."

            transformer = SalsaTabWithPOSConverter.new(@exp)
            transformer.transform_salsatab_dir(current_dir, parse_dir, output_dir)
            break

          when "SalsaTigerXML"
            parse_dir = frprep_dirname("parse", "new")
            LOGGER.info "#{PROGRAM_NAME}: Transforming parser output into STXML format."
            transformer = STXMLConverter.new(@exp)
            transformer.transform_stxml_dir(parse_dir, split_dir, input_dir, output_dir)
            break

          else
            # Without this branch an unrecognised format never changes and the
            # loop never terminates.
            raise "Frappe: cannot preprocess unknown input format #{current_format.inspect}."
          end
        end

        LOGGER.info "#{PROGRAM_NAME} is ready! Preprocessing of all the texts is finished."
      end

      private

      # Make the directory name for frprep-internal data of the kind described
      # by +subdir+.
      #
      # frprep_directory has one subdirectory for each experiment ID,
      # and below that there is one subdir per subtask.
      #
      # @param subdir [String] designator of a subdirectory
      # @param neu [Object, nil] non-nil: this may be a new directory (passed to
      #   File.new_dir); nil: the result is passed to File.existing_dir instead.
      # @return whatever File.new_dir / File.existing_dir return
      #   (project extensions of File; presumably the directory name - confirm).
      def frprep_dirname(subdir, neu = nil)
        dirname = File.new_dir(@exp.get("frprep_directory"), @exp.get("prep_experiment_ID"), subdir)

        neu ? File.new_dir(dirname) : File.existing_dir(dirname)
      end

      # Transform a file to UTF-8 from a given encoding, line by line.
      #
      # @note Is used.
      # @param input_filename [String] name of input file
      # @param output_filename [String] name of output file
      # @param encoding [String] "iso" or "hex"
      # @raise [RuntimeError] if either file cannot be opened, or if a line is
      #   read while +encoding+ is neither "iso" nor "hex"
      def to_utf8_file(input_filename, output_filename, encoding)
        infile = nil
        outfile = nil

        begin
          infile = File.new(input_filename)
          outfile = File.new(output_filename, "w")
        rescue StandardError
          raise "Could not read #{input_filename}, or could not write to #{output_filename}."
        end

        infile.each_line do |line|
          case encoding
          when "iso"
            outfile.puts UtfIso.from_iso_8859_1(line)
          when "hex"
            outfile.puts UtfIso.from_iso_8859_1(Ampersand.hex_to_iso(line))
          else
            raise "Shouldn't be here."
          end
        end
      ensure
        # Close whatever was opened, also when opening the second file or the
        # conversion itself failed.
        infile.close if infile
        outfile.close if outfile
      end
    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require_relative 'syn_interface_stxml'
|
2
|
+
|
3
|
+
require 'tabular_format/fn_tab_format_file'
|
4
|
+
require 'salsa_tiger_xml/salsa_tiger_sentence'
|
5
|
+
require 'salsa_tiger_xml/salsa_tiger_xml_helper'
|
6
|
+
|
7
|
+
############################################
|
8
|
+
# Class FrappeFlatSyntax:
|
9
|
+
#
|
10
|
+
# given a FNTabFormat file,
|
11
|
+
# yield each of its sentences in SalsaTigerXML,
|
12
|
+
# constructing a flat syntax
|
13
|
+
module Shalmaneser
  module Frappe
    # Given a FNTabFormat file, yields each of its sentences as SalsaTigerXML
    # with a flat syntax: one nonterminal "S" node whose children are the words.
    class FrappeFlatSyntax
      # @param tabfilename [String] name of tab file
      # @param postag_suffix [String, nil] postag file suffix
      # @param lemma_suffix [String, nil] lemmatisation file suffix
      def initialize(tabfilename, postag_suffix, lemma_suffix)
        @tabfilename = tabfilename
        @pos_suffix = postag_suffix
        @lemma_suffix = lemma_suffix
      end

      # Yield each non-parse sentence as a tuple
      #   [salsa/tiger xml sentence, tab format sentence, mapping]
      # of a SalsaTigerSentence object, a FNTabSentence object,
      # and a hash: FNTab sentence lineno(integer) -> array:SynNode
      # pointing each tab word to one or more SalsaTigerSentence terminals.
      #
      # @param dummy ignored
      def each_sentence(dummy)
        # read tab file with lemma and POS info
        FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix).each_sentence do |tabsent|
          # The new sentence carries the ID of the corresponding TabFormat
          # sentence; when that is missing (nil, empty or only dashes) the
          # words are reported on stderr and a timestamp is used instead.
          sentid = tabsent.get_sent_id
          if sentid.nil? || sentid =~ /^-*$/
            $stderr.puts "No sentence ID for sentence:"
            tabsent.each_line_parsed { |l| $stderr.print l.get("word"), " " }
            $stderr.puts
            # @todo AB: [2015-12-16 Wed 18:24]
            #   Change this!!!
            sentid = Time.new.to_f.to_s
          end

          # start new, empty sentence with "failed" attribute (i.e. no parse)
          sent = STXML::SalsaTigerSentence.new(
            %(<s id="#{STXML::SalsaTigerXMLHelper.escape(sentid)}" failed="true"></s>)
          )

          # add single nonterminal node, category "S" (no word, no pos)
          vroot = sent.add_syn("nt", "S", nil, nil,
                               STXML::SalsaTigerXMLHelper.escape("#{sentid}_NT"))

          # add one terminal per tab line, built from the tab sentence info
          tabsent.each_line_parsed do |line_obj|
            node_id = "#{sentid}_#{line_obj.get('lineno')}"
            word = STXML::SalsaTigerXMLHelper.escape(line_obj.get("word") || "")
            pos = STXML::SalsaTigerXMLHelper.escape(line_obj.get("pos") || "")

            # terminals have no category
            terminal = sent.add_syn("t", nil, word, pos, node_id)

            if line_obj.get("lemma")
              terminal.set_attribute("lemma", STXML::SalsaTigerXMLHelper.escape(line_obj.get("lemma")))
            end

            # add new terminal as child of vroot
            vroot.add_child(terminal, nil)
            terminal.add_parent(vroot, nil)
          end

          # yield newly constructed SalsaTigerXML sentence plus tab sentence
          yield [sent, tabsent, SynInterfaceSTXML.standard_mapping(sent, tabsent)]
        end
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require_relative 'syn_interface_stxml'
|
2
|
+
require 'tabular_format/fn_tab_format_file'
|
3
|
+
require 'salsa_tiger_xml/salsa_tiger_sentence'
|
4
|
+
require 'salsa_tiger_xml/file_parts_parser'
|
5
|
+
|
6
|
+
#
|
7
|
+
# given a STXML file,
|
8
|
+
# yield each of its sentences
|
9
|
+
module Shalmaneser
  module Frappe
    # Given a STXML file, yields each of its sentences, paired with the
    # sentence at the same position of the corresponding tab file (if any).
    class FrappeReadStxml
      # @param stxmlfilename [String] name of SalsaTigerXML file
      # @param tabfilename [String, nil] name of corresponding tab file
      # @param postag_suffix [String, nil] POS tag file suffix
      # @param lemma_suffix [String, nil] lemmatization file suffix
      def initialize(stxmlfilename, tabfilename, postag_suffix, lemma_suffix)
        @stxmlfilename = stxmlfilename
        @tabfilename = tabfilename
        @pos_suffix = postag_suffix
        @lemma_suffix = lemma_suffix
      end

      # Yield each sentence as a tuple
      #   [salsa/tiger xml sentence, tab format sentence, mapping]
      # of a SalsaTigerSentence object, a FNTabSentence object,
      # and a hash: FNTab sentence lineno(integer) -> array:SynNode
      # pointing each tab word to one or more SalsaTigerSentence terminals.
      # The tab sentence is nil when there is no tab file or it has fewer
      # sentences than the STXML file.
      #
      # @todo AB: [2015-12-17 Thu 20:22]
      #   Remove this dummy argument.
      # @param dummy ignored
      def each_sentence(dummy)
        # Read the corresponding tab file, if one was given and exists.
        # The name may be nil, which File.exist? does not accept.
        tab_sents = []
        if @tabfilename && File.exist?(@tabfilename)
          tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)
          tabfile.each_sentence { |tabsent| tab_sents << tabsent }
        end

        # read STXML file; sentences are matched to tab sentences by position
        infile = STXML::FilePartsParser.new(@stxmlfilename)
        index = 0
        infile.scan_s do |sent_string|
          sent = STXML::SalsaTigerSentence.new(sent_string)
          tab_sent = tab_sents.at(index)
          yield [sent, tab_sent, SynInterfaceSTXML.standard_mapping(sent, tab_sent)]
          index += 1
        end
      end
    end
  end
end
|
@@ -0,0 +1,380 @@
|
|
1
|
+
#-*- coding: utf-8 -*-
|
2
|
+
####
|
3
|
+
# sp 21 07 05
|
4
|
+
#
|
5
|
+
# modified ke 30 10 05: adapted to fit into SynInterface
|
6
|
+
#
|
7
|
+
# represents a file containing Berkeley parses
|
8
|
+
#
|
9
|
+
# underlying data structure for individual sentences: SalsaTigerSentence
|
10
|
+
require_relative 'counter'
|
11
|
+
|
12
|
+
require 'salsa_tiger_xml/salsa_tiger_sentence'
|
13
|
+
require 'salsa_tiger_xml/salsa_tiger_xml_helper'
|
14
|
+
require 'tabular_format/fn_tab_format_file'
|
15
|
+
require 'logging'
|
16
|
+
|
17
|
+
require "tempfile"
|
18
|
+
|
19
|
+
################################################
|
20
|
+
# Interface class
|
21
|
+
module Shalmaneser
|
22
|
+
module Frappe
|
23
|
+
class BerkeleyInterface < SynInterfaceSTXML
|
24
|
+
LOGGER.debug 'Announcing Berkeley Interface'
|
25
|
+
BerkeleyInterface.announce_me
|
26
|
+
|
27
|
+
def self.system
  # Identifier of the wrapped external system.
  "berkeley"
end
|
30
|
+
|
31
|
+
def self.service
  # Kind of service the wrapped system provides.
  "parser"
end
|
34
|
+
|
35
|
+
###
|
36
|
+
# initialize to set values for all subsequent processing
|
37
|
+
# @param program_path [String] path to a system
|
38
|
+
# @param insuffix [String] suffix of tab files
|
39
|
+
# @param outsuffix [String] suffix of parsed files
|
40
|
+
# @param stsuffix [String] suffix of Salsa/TigerXML files
|
41
|
+
# @param var_hash [Hash] optional arguments
|
42
|
+
def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
  super(program_path, insuffix, outsuffix, stsuffix, var_hash)

  # @todo AB: This should be checked in the OptionParser.
  # The path is later used as a plain string prefix, so it must end in a slash.
  @program_path += '/' unless @program_path =~ %r{/$}

  # new: evaluate var hash
  @pos_suffix, @lemma_suffix, @tab_dir =
    var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")
end
|
55
|
+
|
56
|
+
####
|
57
|
+
# parse a directory with TabFormat files and write the parse trees to outputdir
|
58
|
+
# I assume that the files in inputdir are smaller than
|
59
|
+
# the maximum number of sentences that
|
60
|
+
# Berkeley can parse in one go (i.e. that they are split)
|
61
|
+
# string: input directory name
|
62
|
+
# string: output directory name
|
63
|
+
def process_dir(in_dir, out_dir)
  # Jar, grammar and extra options can be overridden through the environment;
  # jar and grammar are looked up below @program_path.
  parser = ENV['SHALM_BERKELEY_BIN'] || 'berkeleyParser.jar'
  grammar = ENV['SHALM_BERKELEY_MODEL'] || 'grammar.gr'
  options = ENV['SHALM_BERKELEY_OPTIONS']

  berkeley_prog = "java -jar #{@program_path}#{parser} #{options} -gr #{@program_path}#{grammar}"

  Dir[in_dir + "*" + @insuffix].each do |inputfilename|
    LOGGER.info "Parsing #{inputfilename} with Berkeley Parser."
    corpusfilename = File.basename(inputfilename, @insuffix)
    parsefilename = out_dir + corpusfilename + @outsuffix
    tempfile = Tempfile.new(corpusfilename)

    begin
      # we need neither lemmata nor POS tags; berkeley can do with the words
      corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)

      # Write each FNTabSentence as a String.
      # @todo AB: I don't know why the Berkeley Parser wants this.
      #   Investigate if every Grammar needs this conversion.
      #   Try to move this conversion from FrappeHelper.
      #     sentence.gsub!(/\(/, "*LRB*")
      #     sentence.gsub!(/\)/, "*RRB*")
      #     sentence.gsub!(/``/, '"')
      #     sentence.gsub!(/''/, '"')
      corpusfile.each_sentence { |sentence| tempfile.puts sentence.to_s }
      tempfile.close

      # Log the command in its familiar shell form, but let Ruby set up the
      # redirections so that the file names never pass through the shell.
      LOGGER.debug "#{berkeley_prog} < #{tempfile.path} > #{parsefilename}"
      parsed = system(berkeley_prog, :in => tempfile.path, :out => parsefilename)

      # AB: Testing for return value (nil: not started, false: non-zero exit).
      fail 'Berkeley Parser failed to parse our files!' unless parsed
    ensure
      # Remove the temporary input file right away, also on failure.
      tempfile.close!
    end
  end
end
|
110
|
+
|
111
|
+
###
|
112
|
+
# for a given parsed file:
|
113
|
+
# yield each sentence as a pair
|
114
|
+
# [SalsaTigerSentence object, FNTabFormatSentence object]
|
115
|
+
# of the sentence in SalsaTigerXML and the matching tab format sentence
|
116
|
+
#
|
117
|
+
# If a parse has failed, returns
|
118
|
+
# [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence]
|
119
|
+
# to allow more detailed accounting for failed parses
|
120
|
+
# (basically just a flat structure with a failed=true attribute
|
121
|
+
# at the sentence node)
|
122
|
+
def each_sentence(parsefilename)
  # sanity checks
  raise "Need to set tab directory on initialization" unless @tab_dir

  # The constructor of this class stores the POS suffix in @pos_suffix; fall
  # back to it when @postag_suffix has not been set elsewhere.
  postag_suffix = @postag_suffix || @pos_suffix

  # Block form so that the parser output file is closed on every exit path.
  File.open(parsefilename) do |parsefile|
    # get matching tab file for this parser output file
    tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
    tabfile = FNTabFormatFile.new(tabfilename, postag_suffix, @lemma_suffix)

    # NOTE(review): this counter is only advanced for skipped lines, not per
    # sentence, so generated fallback IDs may repeat - confirm the intent.
    sentid = 0

    tabfile.each_sentence do |tab_sent| # iterate over corpus sentences
      # Skip forward to the next "relevant" line of the parser output:
      # a parse beginning with <( (>, <( (TOP>, <( (VROOT> etc.
      #   TOP    - Negra Grammars
      #   VROOT  - Tiger Grammars
      #   PSEUDO - Original BP Grammars
      #   ROOT   - some english grammars
      #   empty identifiers for older Tiger grammars
      # or a failed parse <(()>.
      while (line = parsefile.gets)
        break if line =~ /^(\( *)?\((PSEUDO|TOP|ROOT|VROOT)? / || line =~ /^\(\(\)/

        sentid += 1
      end

      # @todo AB: Check if this condition is valid.
      # while we search a parse, the parse file is over...
      raise "Error: premature end of parser file!" if line.nil?

      # Insert a top node <VROOT> if missing.
      # Some grammars trained on older Tiger Versions expose this problem.
      line.sub!(/^(\(\s+\()(\s+)/, '\1VROOT\2')

      # Remove leading and trailing top level brackets.
      line.sub!(/^\( */, '')
      line.sub!(/ *\) *$/, '')

      # Split consecutive closing brackets. Two passes are needed because
      # matches do not overlap: ")))" -> ") ))" -> ") ) )".
      line.gsub!(/\)\)/, ') )')
      line.gsub!(/\)\)/, ') )')

      # Change CAT_FUNC delimiter from <_> to <->.
      line.gsub!(/(\([A-Z]+)_/, '\1-')

      # Non-destructive chomp: chomp! returns nil when the line has no
      # trailing newline (e.g. the last line of the file).
      sentence_str = line.chomp

      if tab_sent.get_sent_id and tab_sent.get_sent_id != "--"
        my_sent_id = tab_sent.get_sent_id
      else
        my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
      end

      st_sent = build_salsatiger(" " + sentence_str + " ", 0,
                                 [], Counter.new(0),
                                 Counter.new(500),
                                 STXML::SalsaTigerSentence.empty_sentence(my_sent_id.to_s))

      # nil: no tree could be built for this sentence; nothing is yielded.
      next if st_sent.nil?

      yield [st_sent, tab_sent, BerkeleyInterface.standard_mapping(st_sent, tab_sent)]
    end

    # all TabFile sentences are consumed:
    # now we may just encounter comments, garbage, empty lines etc.
    until parsefile.eof?
      leftover = parsefile.gets
      # empty lines, comments, end of input are fine
      next if leftover.nil? || leftover =~ /^%/ || leftover =~ /^\s*$/

      raise "Error: premature end of tab file! Found line: #{leftover}"
    end
  end
end
|
222
|
+
|
223
|
+
|
224
|
+
###
|
225
|
+
# write Salsa/TIGER XML output to file
|
226
|
+
# string: name of parse file
|
227
|
+
# string: name of output stxml file
|
228
|
+
def to_stxml_file(infilename, outfilename)
  # Header, one entry per sentence of the parse file, footer.
  File.open(outfilename, 'w') do |stxml|
    stxml.puts STXML::SalsaTigerXMLHelper.get_header
    each_sentence(infilename) { |st_sent, _tab_sent| stxml.puts st_sent.get }
    stxml.puts STXML::SalsaTigerXMLHelper.get_footer
  end
end
|
237
|
+
|
238
|
+
########################
|
239
|
+
private
|
240
|
+
|
241
|
+
###
|
242
|
+
# Recursive function for parsing a Berkeley parse tree and
|
243
|
+
# building a SalsaTigerSentence recursively
|
244
|
+
#
|
245
|
+
# Algorithm: manage stack which contains, for the current constituent,
|
246
|
+
# child constituents (if a nonterminal), and the category label.
|
247
|
+
# When the end of a constituent is reached, a new SynNode (TigerSalsa node) is created.
|
248
|
+
# All children and the category label are popped from the stack and integrated into the
|
249
|
+
# TigerSalsa data structure. The new node is re-pushed onto the stack.
|
250
|
+
def build_salsatiger(sentence, pos, stack, termc, nontc, sent_obj)
  # sentence: String, the bracketed parse
  # pos:      Integer, position in the string at which to continue
  # stack:    Array with incomplete nodes (category labels and finished nodes)
  # termc:    terminal counter
  # nontc:    nonterminal counter
  # sent_obj: SalsaTigerSentence that receives the nodes
  #
  # Returns sent_obj once the whole string is consumed, or nil when the
  # string contains an empty bracket pair "()".
  #
  # Every step of the original was a tail call, so the work is done in a loop
  # here; the call depth no longer grows with the length of the sentence.
  return nil if sentence =~ /\(\)/

  loop do
    # main case distinction: match the beginning of what follows our
    # current position in the string
    case sentence[pos..-1]

    when /^ *$/ # nothing -> whole sentence parsed
      # exactly one "top" node must be left; otherwise something has gone wrong
      unless stack.length == 1
        raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
      end

      node = stack.pop
      node.del_attribute("gf")
      return sent_obj

    when /^\s*\(([^ )]+) /
      # match the beginning of a new constituent
      # (opening bracket + category + space, may not contain closing bracket)
      consumed = $&.length
      cat = $1
      if cat.nil? or cat == ""
        raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
      end

      stack.push cat # throw the category label on the stack
      pos += consumed

    when /^\s*(\S+)\) /
      # match the end of a terminal constituent (something before a closing bracket + space)
      consumed = $&.length
      word = $1

      comb_cat = stack.pop
      if comb_cat.to_s == ""
        raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
      end

      cat, gf = split_cat(comb_cat)
      node = sent_obj.add_syn("t",
                              nil, # cat (doesn't matter here)
                              STXML::SalsaTigerXMLHelper.escape(word), # word
                              cat, # pos
                              termc.next.to_s)
      # The grammatical function is parked on the node until its parent is built.
      node.set_attribute("gf", gf)
      stack.push node
      pos += consumed

    when /^\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
      consumed = $&.length

      # now collect children:
      # pop items from the stack until you find the category
      children = []
      cat = gf = nil
      loop do
        raise "Error: stack empty; cannot find more children" if stack.empty?

        item = stack.pop

        case item
        when STXML::SynNode # this is a child
          children.push item
        when String
          # this is the category label
          if item.to_s == ""
            raise "Empty cat at position #{sentence[pos, 10]}, full sentence\n#{sentence}"
          end
          cat, gf = split_cat(item)
          break
        else
          raise "Error: unknown item class #{item.class}."
        end
      end

      # now add a nonterminal node to the sentence object and
      # register the children nodes
      node = sent_obj.add_syn("nt",
                              cat, # cat
                              nil, # word (doesn't matter)
                              nil, # pos (doesn't matter)
                              nontc.next.to_s)

      # Children come off the stack last-first and are attached in that order.
      children.each do |child|
        child_gf = child.get_attribute("gf")
        child.del_attribute("gf")
        node.add_child(child, child_gf)
        child.add_parent(node, child_gf)
      end

      node.set_attribute("gf", gf)
      stack.push node
      pos += consumed

    else
      raise "Error: cannot analyse sentence at pos #{pos}: <#{sentence[pos..-1]}>. Complete sentence: \n#{sentence}"
    end
  end
end
|
360
|
+
|
361
|
+
###
|
362
|
+
# BerkeleyParser delivers node labels in different forms:
|
363
|
+
# - "phrase type"-"grammatical function",
|
364
|
+
# - "phrase type"_"grammatical function",
|
365
|
+
# - "phrase type":"grammatical function",
|
366
|
+
# but the GF may be absent.
|
367
|
+
# @param cat [String]
|
368
|
+
# @return [Array<String>]
|
369
|
+
def split_cat(cat)
  # A label is a category, optionally followed by one of the delimiters
  # <->, <:>, <_> and a grammatical function.
  md = cat.match(/^([^-:_]*)([-:_]([^-:_]*))?$/)

  # Raises RuntimeError for labels that do not have this shape (e.g. more than
  # one delimiter); match returns nil for them, so md must be checked before
  # it is indexed.
  raise "Error: Could not identify category in #{cat}!" unless md && md[1]

  proper_cat = md[1]
  # the GF may be absent
  gf = md[3] || ''

  [proper_cat, gf]
end
|
378
|
+
end
|
379
|
+
end
|
380
|
+
end
|