frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
FixSynSemMapping.rb
@@ -0,0 +1,196 @@
+###
+# FixSynSemMapping:
+# Given a SalsaTigerRegXML sentence with semantic role annotation,
+# simplify the mapping of semantic roles to syntactic constituents
+#
+# The following is lifted from the LREC06 paper on Shalmaneser:
+# During preprocessing, the span of semantic roles in the training corpora is
+# projected onto the output of the syntactic parser by assigning each
+# role to the set of maximal constituents covering its word span.
+# If the word span of a role does not coincide
+# with parse tree constituents, e.g. due to misparses,
+# the role is ``spread out'' across several constituents. This leads to
+# idiosyncratic paths between predicate and semantic role in the parse
+# tree.
+#
+# [The following span standardization algorithm is used to make the
+# syntax-semantics mapping more uniform:]
+# Given a role r that has been assigned, let N be the set of
+# terminal nodes of the syntactic structure that are covered by r.
+#
+# Iteratively compute the maximal projection of N in the syntactic
+# structure:
+# 1) If n is a node such that all of n's children are in N,
+#    then remove n's children from N and add n instead.
+# 2) If n is a node with 3 or more children, and all of n's
+#    children except one are in N, then remove n's children from N
+#    and add n instead.
+# 3) If n is an NP with 2 children, and one of them, another NP,
+#    is in N, and the other, a relative clause, is not, then remove
+#    n's children from N and add n instead.
+#
+# If none of the rules is applicable to N anymore, assign r to the
+# nodes in N.
+#
+# Rule 1 implements normal maximal projection. Rule 2 ``repairs'' parser
+# errors where all children of a node but one have been assigned the
+# same role. Rule 3 addresses a problem of the FrameNet data, where
+# relative clauses have been omitted from roles assigned to NPs.
+
+# KE Feb 08: rule 3 currently out of commission!
+
+require "common/SalsaTigerRegXML"
+
+module FixSynSemMapping
+  ##
+  # fix it
+  #
+  # relevant settings in the experiment file:
+  #
+  # fe_syn_repair:
+  # If there is a node that would be a max. constituent for the
+  # words covered by the given FE, except that it has one child
+  # whose words are not in the FE, use the node as max constituent anyway.
+  # This is to repair cases where the parser has made an attachment choice
+  # that differs from the one in the gold annotation.
+  #
+  # fe_rel_repair:
+  # If there is an NP such that all of its children except one have been
+  # assigned the same FE, and that missing child is a relative clause
+  # depending on one of the other children, then take the complete NP as
+  # that FE.
+  def FixSynSemMapping.fixit(sent,              # SalsaTigerSentence object
+                             exp,               # experiment file object
+                             interpreter_class) # SynInterpreter class
+
+
+    unless exp.get("fe_syn_repair") or exp.get("fe_rel_repair")
+      return
+    end
+
+    if sent.nil?
+      return
+    end
+
+    # "repair" FEs:
+    sent.each_frame { |frame|
+
+      frame.each_child { |fe_or_target|
+
+        # repair only if the FE currently
+        # points to more than one syn node
+        if fe_or_target.children.length() < 2
+          next
+        end
+
+        if exp.get("fe_rel_repair")
+          lastfe = fe_or_target.children.last()
+          if lastfe and interpreter_class.simplified_pt(lastfe) =~ /^(WDT)|(WP\$?)|(WRB)/
+
+            # remove syn nodes that the FE points to
+            old_fe_syn = fe_or_target.children()
+            old_fe_syn.each { |child|
+              fe_or_target.remove_child(child)
+            }
+
+            # set it to point only to the last previous node, the relative pronoun
+            fe_or_target.add_child(lastfe)
+          end
+        end
+
+        if exp.get("fe_syn_repair")
+          # remove syn nodes that the FE points to
+          old_fe_syn = fe_or_target.children()
+          old_fe_syn.each { |child|
+            fe_or_target.remove_child(child)
+          }
+
+          # and recompute
+          new_fe_syn = interpreter_class.max_constituents(old_fe_syn.map { |t|
+                                                            t.yield_nodes
+                                                          }.flatten.uniq,
+                                                          sent,
+                                                          exp.get("fe_syn_repair"))
+
+          # make the FE point to the new nodes
+          new_fe_syn.each { |syn_node|
+            fe_or_target.add_child(syn_node)
+          }
+        end
+      } # each FE
+    } # each frame
+  end # def fixit
+end # module
+
+
+#########
+# old code
+
+# if exp.get("fe_rel_repair")
+#   # repair relative clauses:
+#   # then make a procedure to pass on to max constituents
+#   # that will recognize the relevant cases
+
+#   accept_anyway_proc = Proc.new { |node, children_in, children_out|
+
+#     # node: SynNode
+#     # children_in, children_out: array:SynNode. children_in are the children
+#     # that are already covered by the FE, children_out the ones that aren't
+
+#     # if node is an NP,
+#     # and only one of its children is out,
+#     # and one node in children_in is an NP, and the missing child is an SBAR
+#     # with a child that is a relative pronoun, then consider the child in children_out as covered
+#     if interpreter_class.category(node) == "noun" and
+#        children_out.length() == 1 and
+#        children_in.select { |n| interpreter_class.category(n) == "noun" } and
+#        interpreter_class.category(children_out.first) == "sent" and
+#        (ch = children_out.first.children) and
+#        ch.select { |n| interpreter_class.relative_pronoun?(n) }
+#       true
+#     else
+#       false
+#     end
+#   }
+
+# else
+#   accept_anyway_proc = nil
+# end
+
+
+# # "repair" FEs:
+# sent.each_frame { |frame|
+
+#   frame.each_child { |fe_or_target|
+
+#     # repair only if the FE currently
+#     # points to more than one syn node, or
+#     # if it is a noun with a non-covered sentence sister
+#     if fe_or_target.children.length() > 1 or
+#        (exp.get("fe_rel_repair") and (curr_marked = fe_or_target.children.first()) and
+#         interpreter_class.category(curr_marked) == "noun" and
+#         (p = curr_marked.parent) and
+#         p.children.select { |n| n != curr_marked and interpreter_class.category(n) == "sent" } )
+
+#       # remember nodes covered by the FE
+#       old_fe_syn = fe_or_target.children()
+
+#       # remove syn nodes that the FE points to
+#       old_fe_syn.each { |child|
+#         fe_or_target.remove_child(child)
+#       }
+
+#       # and recompute
+#       new_fe_syn = interpreter_class.max_constituents(old_fe_syn.map { |t| t.yield_nodes}.flatten.uniq,
+#                                                       sent,
+#                                                       exp.get("fe_syn_repair"),
+#                                                       accept_anyway_proc)
+
+#       # make the FE point to the new nodes
+#       new_fe_syn.each { |syn_node|
+#         fe_or_target.add_child(syn_node)
+#       }
+
+#     end # if FE points to more than one syn node
+#   } # each FE
+# } # each frame
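
The span standardization rules in the header comment above amount to a fixpoint computation over the covered-node set N. Below is a minimal standalone sketch of Rules 1 and 2 (Rule 3 is out of commission, as the comment notes); the Node struct and the traversal helper are invented stand-ins for illustration, not the package's SynNode or SalsaTigerRegXML classes.

# Toy stand-in for a parse tree node (hypothetical, not the package's SynNode).
Node = Struct.new(:label, :children) do
  def each_node(&block) # pre-order traversal
    yield self
    children.each { |c| c.each_node(&block) }
  end
end

# Fixpoint computation over the covered-node set, per Rules 1 and 2 above.
def maximal_projection(covered, root)
  covered = covered.dup
  loop do
    changed = false
    root.each_node do |n|
      next if n.children.empty? || covered.include?(n)
      inside = n.children.count { |c| covered.include?(c) }
      # Rule 1: all children in N. Rule 2: >= 3 children, all but one in N.
      if inside == n.children.length ||
         (n.children.length >= 3 && inside == n.children.length - 1)
        covered -= n.children
        covered << n
        changed = true
      end
    end
    break unless changed
  end
  covered
end

# Example: a flat NP over three terminals, two of them covered by a role.
# Rule 2 promotes the whole NP.
t1 = Node.new("the", []); t2 = Node.new("old", []); t3 = Node.new("house", [])
np = Node.new("NP", [t1, t2, t3])
p maximal_projection([t1, t3], np).map(&:label) # => ["NP"]
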
FrPrepConfigData.rb
@@ -0,0 +1,66 @@
+# FrPrepConfigData
+# Katrin Erk July 05
+#
+# Preprocessing for Fred and Rosy:
+# access to a configuration and experiment description file
+
+require "common/ConfigData"
+
+##############################
+# Class FrPrepConfigData
+#
+# inherits from ConfigData,
+# sets variable names appropriate to preprocessing task
+
+class FrPrepConfigData < ConfigData
+  def initialize(filename)
+
+    # initialize config data object
+    super(filename, # config file
+          { "prep_experiment_ID" => "string", # experiment identifier
+
+            "frprep_directory" => "string", # dir for frprep internal data
+
+            # information about the dataset
+            "language" => "string", # en, de
+            "origin" => "string",   # FrameNet, Salsa, or nothing
+            "format" => "string",   # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
+            "encoding" => "string", # utf8, iso, hex, or nothing
+
+
+            # directories
+            "directory_input" => "string",        # dir with input data
+            "directory_preprocessed" => "string", # dir with output Salsa/Tiger XML data
+            "directory_parserout" => "string",    # dir with parser output for the parser named below
+
+            # syntactic processing
+            "pos_tagger" => "string",           # name of POS tagger
+            "lemmatizer" => "string",           # name of lemmatizer
+            "parser" => "string",               # name of parser
+            "pos_tagger_path" => "string",      # path to POS tagger
+            "lemmatizer_path" => "string",      # path to lemmatizer
+            "parser_path" => "string",          # path to parser
+            "parser_max_sent_num" => "integer", # max number of sentences per parser input file
+            "parser_max_sent_len" => "integer", # max sentence length the parser handles
+
+            "do_parse" => "bool",     # use parser?
+            "do_lemmatize" => "bool", # use lemmatizer?
+            "do_postag" => "bool",    # use POS tagger?
+
+            # output format: if tabformat_output == true,
+            # output in Tab format rather than Salsa/Tiger XML
+            # (this will not work if do_parse == true)
+            "tabformat_output" => "bool",
+
+            # syntactic repairs, dependent on existing semantic role annotation
+            "fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
+            "fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
+          },
+          [ ] # variables
+         )
+
+  end
+end
+
+
+
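
For orientation, here is a minimal usage sketch for the class above. The file name and all values are invented, and the key = value line format is an assumption about what ConfigData parses (compare the sample experiment files under data/test/functional/sample_experiment_files).

# Hypothetical experiment file prp_example.salsa (values invented):
#
#   prep_experiment_ID = prp_example
#   language = en
#   format = SalsaTigerXML
#   do_parse = true
#   parser = berkeley
#   parser_max_sent_num = 2000
#
exp = FrPrepConfigData.new("prp_example.salsa")
if exp.get("do_parse")
  puts "Parsing with #{exp.get('parser')}, " \
       "#{exp.get('parser_max_sent_num')} sentences per parser input file."
end
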
FrprepHelper.rb
@@ -0,0 +1,1324 @@
+# Salsa packages
+require "common/ISO-8859-1"
+require "common/Parser"
+require "common/RegXML"
+require "common/SalsaTigerRegXML"
+require "common/SalsaTigerXMLHelper"
+require "common/TabFormat"
+require "common/ruby_class_extensions"
+require "common/AbstractSynInterface"
+
+############################################
+# Module FrprepHelper:
+#
+# diverse transformation methods for frprep.rb
+# moved over here to make the main file less crowded
+module FrprepHelper
+
+  ####
+  # transform a file to UTF-8 from a given encoding
+  def FrprepHelper.to_utf8_file(input_filename,  # string: name of input file
+                                output_filename, # string: name of output file
+                                encoding)        # string: "iso", "hex"
+    begin
+      infile = File.new(input_filename)
+      outfile = File.new(output_filename, "w")
+    rescue
+      raise "Could not read #{input_filename}, or could not write to #{output_filename}."
+    end
+
+    while (line = infile.gets())
+      case encoding
+      when "iso"
+        outfile.puts UtfIso.from_iso_8859_1(line)
+      when "hex"
+        outfile.puts UtfIso.from_iso_8859_1(Ampersand.hex_to_iso(line))
+      else
+        raise "Shouldn't be here."
+      end
+    end
+    infile.close()
+    outfile.close()
+  end
+
+  ####
+  # transform BNC format file to plaintext file
+  def FrprepHelper.bnc_to_plain_file(input_filename,  # string: name of input file
+                                     output_filename) # string: name of output file
+    begin
+      infile = File.new(input_filename)
+      outfile = File.new(output_filename, "w")
+    rescue
+      raise "Could not read #{input_filename}, or could not write to #{output_filename}."
+    end
+
+    infile.each { |line|
+      # does this line contain a sentence?
+      if line =~ /^\s*<s\s+n=/
+        # remove all tags, replace by spaces,
+        # then remove superfluous spaces
+        textline = line.gsub(/<.+?>/, " ").strip().squeeze(" ")
+
+        # replace BNC SGML entities by ASCII approximations
+        textline.gsub!(/&bquo;/, '"')
+        textline.gsub!(/&equo;/, '"')
+        textline.gsub!(/&mdash;/, "-")
+        textline.gsub!(/&ndash;/, "-")
+        textline.gsub!(/&percnt;/, "%")
+        textline.gsub!(/&pound;/, " pounds ")
+        textline.gsub!(/&amp;/, " and ")
+        textline.gsub!(/&hellip;/, "...")
+        textline.gsub!(/&copy;/, "(copyright)")
+        textline.gsub!(/&eacute;/, "e")
+        textline.gsub!(/&bull;/, "*")
+        textline.gsub!(/&dollar;/, "$")
+        textline.gsub!(/&deg;/, " degree ")
+
+        textline.gsub!(/&frac12;/, "1/2")
+        textline.gsub!(/&frac34;/, "3/4")
+
+        textline.gsub!(/&lsqb;/, "[")
+        textline.gsub!(/&rsqb;/, "]")
+
+        textline.gsub!(/&ins;/, "i")
+        textline.gsub!(/&ft;/, "ft")
+
+        textline.gsub!(/&rarr;/, ">")
+        textline.gsub!(/&larr;/, "<")
+
+
+        textline.gsub!(/&aacute;/, "a")
+        textline.gsub!(/&auml;/, "a")
+        textline.gsub!(/&agrave;/, "a")
+        textline.gsub!(/&atilde;/, "a")
+        textline.gsub!(/&acirc;/, "a")
+        textline.gsub!(/&Aacute;/, "A")
+        textline.gsub!(/&Auml;/, "A")
+        textline.gsub!(/&Agrave;/, "A")
+        textline.gsub!(/&Atilde;/, "A")
+        textline.gsub!(/&Acirc;/, "A")
+
+        textline.gsub!(/&eacute;/, "e")
+        textline.gsub!(/&egrave;/, "e")
+        textline.gsub!(/&ecirc;/, "e")
+        textline.gsub!(/&euml;/, "e")
+        textline.gsub!(/&Eacute;/, "E")
+        textline.gsub!(/&Egrave;/, "E")
+        textline.gsub!(/&Ecirc;/, "E")
+        textline.gsub!(/&Euml;/, "E")
+
+        textline.gsub!(/&iacute;/, "i")
+        textline.gsub!(/&igrave;/, "i")
+        textline.gsub!(/&icirc;/, "i")
+        textline.gsub!(/&iuml;/, "i")
+        textline.gsub!(/&Iacute;/, "I")
+        textline.gsub!(/&Igrave;/, "I")
+        textline.gsub!(/&Icirc;/, "I")
+
+        textline.gsub!(/&oacute;/, "o")
+        textline.gsub!(/&ograve;/, "o")
+        textline.gsub!(/&ocirc;/, "o")
+        textline.gsub!(/&ouml;/, "o")
+        textline.gsub!(/&Oacute;/, "O")
+        textline.gsub!(/&Ograve;/, "O")
+        textline.gsub!(/&Ocirc;/, "O")
+        textline.gsub!(/&Ouml;/, "O")
+
+        textline.gsub!(/&uacute;/, "u")
+        textline.gsub!(/&ugrave;/, "u")
+        textline.gsub!(/&ucirc;/, "u")
+        textline.gsub!(/&uuml;/, "u")
+        textline.gsub!(/&Uacute;/, "U")
+        textline.gsub!(/&Ugrave;/, "U")
+        textline.gsub!(/&Ucirc;/, "U")
+        textline.gsub!(/&Uuml;/, "U")
+
+        textline.gsub!(/&yuml;/, "y")
+        textline.gsub!(/&Yuml;/, "Y")
+
+        textline.gsub!(/&ntilde;/, "n")
+        textline.gsub!(/&Ntilde;/, "N")
+
+        textline.gsub!(/&ccedil;/, "c")
+        textline.gsub!(/&Ccedil;/, "C")
+
+
+        outfile.puts textline
+      end
+    }
+    infile.close()
+    outfile.close()
+  end
+
+
+  ####
+  # transform plaintext file to Tab format file
+  def FrprepHelper.plain_to_tab_file(input_filename,  # string: name of input file
+                                     output_filename) # string: name of output file
+    begin
+      infile = File.new(input_filename)
+      outfile = File.new(output_filename, "w")
+    rescue
+      raise "Could not read #{input_filename}, or could not write to #{output_filename}."
+    end
+
+    filename_core = File.basename(input_filename, "txt")
+
+    # array(string): keep the words of each sentence
+    sentence = Array.new
+    # sentence number for making the sentence ID:
+    # global count, over all input files
+    sentno = 0
+
+    while (line = infile.gets())
+
+      # make a sentence ID for the next sentence: running number
+      sentid = filename_core + "_" + sentno.to_s
+      sentno += 1
+
+      # read words into the sentence array,
+      # separating out punctuation attached to the beginning or end of words
+      sentence.clear()
+      line.split.each { |word|
+        # punctuation at the beginning of the word
+        #if word =~ /^([\(\[`'\"-]+)(.*)$/
+        if word =~ /^([\(\[`\"-]+)(.*)$/
+          punct = $1
+          word = $2
+          punct.scan(/./) { |single_punct|
+            sentence << single_punct
+          }
+
+        end
+        # punctuation at the end of the word
+        #if word =~ /[,:;-\`?!'\"\.\)\]]+$/
+        if word =~ /[,:;-\`?!\"\.\)\]]+$/
+          sentence << $` # part before the match: the word
+          punct = $&
+          punct.scan(/./) { |single_punct|
+            sentence << single_punct
+          }
+
+        else
+          # no punctuation recognized
+          sentence << word
+        end
+      }
+
+
+      # remove empty words
+      sentence.reject! { |word| word.nil? or word.strip.empty? }
+
+      # write words to tab file
+      # KE Dec 06: TabFormat changed
+      sentence.each { |word|
+        # for each word, one line, entries in the line tab-separated
+        # the 'word' entry is the word, the 'sent_id' entry is the sentence ID sentid,
+        # all other entries (gf, pt, frame etc.) are not set
+        outfile.puts FNTabFormatFile.format_str({
+                                                  "word" => word,
+                                                  "sent_id" => sentid
+                                                })
+      }
+      outfile.puts
+    end
+    outfile.close()
+  end
+
+  ###########
+  #
+  # class method split_dir:
+  # read all files in one directory and produce chunk files *#{suffix} in outdir
+  # with a certain number of sentences in them (sent_num).
+  # Optionally, cut off all sentences longer than sent_leng
+  #
+  # produces output files 0.<suffix>, 1.<suffix>, etc.
+  #
+  # assumes TabFormat sentences
+  #
+  # example: split_dir("/tmp/in", "/tmp/out", ".tab", 2000, 80)
+
+  def FrprepHelper.split_dir(indir,
+                             outdir,
+                             suffix,
+                             sent_num,
+                             sent_leng = nil)
+
+    unless indir[-1,1] == "/"
+      indir += "/"
+    end
+    unless outdir[-1,1] == "/"
+      outdir += "/"
+    end
+
+    outfile_counter = 0
+    line_stack = Array.new
+    sent_stack = Array.new
+
+    Dir[indir + "*#{suffix}"].each { |infilename|
+      STDERR.puts "Now splitting #{infilename}"
+      infile = File.new(infilename)
+
+      while line = infile.gets
+        line.chomp!
+        case line
+        when "" # end of sentence
+          if !(sent_leng.nil? or line_stack.length < sent_leng) # record sentence
+            # suppress multiple empty lines
+            # to avoid problems with lemmatiser
+            # only record sent_stack if it is not empty.
+
+            # change (sp 15 01 07): just cut off sentence at sent_leng.
+
+            STDERR.puts "Cutting off long sentence #{line_stack.last.split("\t").last}"
+            line_stack = line_stack[0..sent_leng-1]
+          end
+          unless line_stack.empty?
+            sent_stack << line_stack
+            # reset line_stack
+            line_stack = Array.new
+          end
+
+
+          # check if we have to empty the sent stack
+          if sent_stack.length == sent_num # enough sentences for new outfile?
+            outfile = File.new(outdir + outfile_counter.to_s + "#{suffix}", "w")
+            sent_stack.each { |l_stack|
+              outfile.puts l_stack.join("\n")
+              outfile.puts
+            }
+            outfile.close
+            outfile_counter += 1
+            sent_stack = Array.new
+          end
+
+        else # for any other line
+          line_stack << line
+        end
+      end
+      infile.close
+    }
+    # the last remaining sentences
+    unless sent_stack.empty?
+      outfile = File.new(outdir + outfile_counter.to_s + "#{suffix}", "w")
+      sent_stack.each { |l_stack|
+        l_stack << "\n"
+        outfile.puts l_stack.join("\n")
+      }
+      outfile.close
+    end
+  end
+
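
As a usage illustration for split_dir as documented above (a sketch: the directory names are invented, and the require path is an assumption about the load path):

require "common/FrprepHelper"

# Chunk all *.tab files from /tmp/in into /tmp/out, 2000 sentences per
# chunk file, cutting off sentences longer than 80 lines (tokens).
FrprepHelper.split_dir("/tmp/in", "/tmp/out", ".tab", 2000, 80)
# /tmp/out now holds 0.tab, 1.tab, ... with blank-line-separated sentences.
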
|
312
|
+
####
|
313
|
+
# note salsa targetlemma
|
314
|
+
#
|
315
|
+
# old_dir contains xml files whose name starts with the
|
316
|
+
# target lemma for all frames in the file
|
317
|
+
# record that target lemma in the <target> element of each frame
|
318
|
+
def FrprepHelper.note_salsa_targetlemma(old_dir, # string ending in /
|
319
|
+
new_dir) # string ending in /
|
320
|
+
|
321
|
+
|
322
|
+
# each input file: extract target lemma from filename,
|
323
|
+
# not this lemma in the <target> element of each frame
|
324
|
+
Dir[old_dir + "*.xml"].each { |filename|
|
325
|
+
changedfilename = new_dir + File.basename(filename)
|
326
|
+
|
327
|
+
if File.basename(filename) =~ /^(.*?)[_\.]/
|
328
|
+
lemma = $1
|
329
|
+
|
330
|
+
infile = FilePartsParser.new(filename)
|
331
|
+
outfile = File.new(changedfilename, "w")
|
332
|
+
|
333
|
+
# write header
|
334
|
+
outfile.puts infile.head()
|
335
|
+
|
336
|
+
# iterate through sentences, yield as SalsaTigerSentence objects
|
337
|
+
infile.scan_s() { |sent_string|
|
338
|
+
sent = SalsaTigerSentence.new(sent_string)
|
339
|
+
sent.each_frame { |frame|
|
340
|
+
frame.target.set_attribute("lemma", lemma)
|
341
|
+
}
|
342
|
+
|
343
|
+
# write changed sentence
|
344
|
+
outfile.puts sent.get()
|
345
|
+
} # each sentence
|
346
|
+
|
347
|
+
# write footer
|
348
|
+
outfile.puts infile.tail()
|
349
|
+
infile.close()
|
350
|
+
outfile.close()
|
351
|
+
|
352
|
+
else
|
353
|
+
# couldn't determine lemma
|
354
|
+
# just copy the file
|
355
|
+
`cp #{filename} #{changedfilename}`
|
356
|
+
end
|
357
|
+
}
|
358
|
+
end
|
359
|
+
|
360
|
+
####
|
361
|
+
# stxml_split_dir
|
362
|
+
#
|
363
|
+
# split SalsaTigerXML files into new files of given length,
|
364
|
+
# skipping sentences that are too long
|
365
|
+
#
|
366
|
+
# At the same time, sentences that occur several times (i.e. sentences which are
|
367
|
+
# annotated by SALSA for more than one predicate) are compacted into one occurrence
|
368
|
+
# with combined semantics.
|
369
|
+
#
|
370
|
+
# assumes that all files in input_dir with
|
371
|
+
# extension .xml are SalsaTigerXMl files
|
372
|
+
def FrprepHelper.stxml_split_dir(input_dir, # string: input directory with STXML files
|
373
|
+
split_dir, # string: output directory
|
374
|
+
max_sentnum, # integer: max num of sentences per file
|
375
|
+
max_sentlen) # integer: max num of terminals per sentence
|
376
|
+
|
377
|
+
filenames = Dir[input_dir+"*.xml"].to_a
|
378
|
+
|
379
|
+
graph_hash = Hash.new # for each sentence id, keep <s...</graph>
|
380
|
+
frame_hash = Hash.new # for each sentence id , keep the <frame... </frame> string
|
381
|
+
uspfes_hash = Hash.new # for each sentence id, keep the uspfes stuff
|
382
|
+
uspframes_hash = Hash.new # for each sentence id, keep the uspframes stuff
|
383
|
+
|
384
|
+
########################
|
385
|
+
# Traverse of file(s): compute an index of all frames for each sentence, with unique identifiers
|
386
|
+
|
387
|
+
filenames.each {|filename|
|
388
|
+
|
389
|
+
infile = FilePartsParser.new(filename)
|
390
|
+
infile.scan_s {|sent_str|
|
391
|
+
|
392
|
+
sentlen = 0
|
393
|
+
sent_str.delete("\n").scan(/<t\s/) { |occ| sentlen += 1}
|
394
|
+
if sentlen > max_sentlen
|
395
|
+
sent = RegXML.new(sent_str)
|
396
|
+
# revisit handling of long sentences
|
397
|
+
# $stderr.puts "I would have skipped overly long sentence " + sent.attributes["id"]+" but Sebastian forbade me.".to_s
|
398
|
+
# next
|
399
|
+
end
|
400
|
+
|
401
|
+
# substitute old frame identifiers with new, unique ones
|
402
|
+
|
403
|
+
# problem: we may have several frames per sentence, and need to keep track of them
|
404
|
+
# if we rename etc sxx_f1 to sxx_f2 and there is already a sxx_f2, then
|
405
|
+
# we cannot distinguish between these frames
|
406
|
+
|
407
|
+
# therefore, we substitute temporary identifiers until we have substituted
|
408
|
+
# all ids with temporary ones, and re-substitute final ones at the end.
|
409
|
+
|
410
|
+
this_frames = Array.new
|
411
|
+
|
412
|
+
temp_subs = Array.new
|
413
|
+
final_subs = Array.new
|
414
|
+
|
415
|
+
sent = RegXML.new(sent_str)
|
416
|
+
sentid = sent.attributes["id"].to_s
|
417
|
+
if sentid.nil?
|
418
|
+
STDERR.puts "[frprep] Warning: cannot find sentence id, skipping sentence:"
|
419
|
+
STDERR.puts sent_str
|
420
|
+
# strange sentence, no ID? skip
|
421
|
+
next
|
422
|
+
end
|
423
|
+
|
424
|
+
unless frame_hash.key? sentid
|
425
|
+
frame_hash[sentid] = Array.new
|
426
|
+
uspfes_hash[sentid] = Array.new
|
427
|
+
uspframes_hash[sentid] = Array.new
|
428
|
+
end
|
429
|
+
|
430
|
+
# find everything up to and including the graph
|
431
|
+
sent_children = sent.children_and_text()
|
432
|
+
graph = sent_children.detect { |child| child.name == "graph" }
|
433
|
+
graph_hash[sentid] = "<s " +
|
434
|
+
sent.attributes.to_a.map { |at, val| "#{at}=\'#{val}\'" }.join(" ") +
|
435
|
+
">" +
|
436
|
+
graph.to_s
|
437
|
+
|
438
|
+
# find the usp block
|
439
|
+
|
440
|
+
sem = sent_children.detect { |child| child.name == "sem"}
|
441
|
+
usp = ""
|
442
|
+
if sem
|
443
|
+
usp = sem.children_and_text.detect { |child| child.name == "usp" }
|
444
|
+
usp = usp.to_s
|
445
|
+
end
|
446
|
+
|
447
|
+
# find all frames
|
448
|
+
if sem
|
449
|
+
frames = sem.children_and_text.detect { |child| child.name == "frames" }
|
450
|
+
if frames
|
451
|
+
frames.children_and_text.each { |frame|
|
452
|
+
unless frame.name == "frame"
|
453
|
+
next
|
454
|
+
end
|
455
|
+
frameid = frame.attributes["id"]
|
456
|
+
|
457
|
+
temp_frameid = "#{sentid}_temp_f#{frame_hash[sentid].length+this_frames.length+1}"
|
458
|
+
final_frameid = "#{sentid}_f#{frame_hash[sentid].length+this_frames.length+1}"
|
459
|
+
|
460
|
+
temp_subs << [frameid,temp_frameid]
|
461
|
+
final_subs << [temp_frameid,final_frameid]
|
462
|
+
|
463
|
+
this_frames << frame.to_s
|
464
|
+
}
|
465
|
+
end
|
466
|
+
end
|
467
|
+
|
468
|
+
# now first rename all the frames to temporary names
|
469
|
+
|
470
|
+
temp_subs.each {|orig_frameid, temp_frameid|
|
471
|
+
this_frames.map! {|frame_str|
|
472
|
+
#print "orig ", orig_frameid, " temp ", temp_frameid, "\n"
|
473
|
+
frame_str.gsub(orig_frameid,temp_frameid)
|
474
|
+
}
|
475
|
+
|
476
|
+
usp.gsub!(orig_frameid,temp_frameid)
|
477
|
+
}
|
478
|
+
|
479
|
+
# and re-rename the temporary names
|
480
|
+
|
481
|
+
final_subs.each {|temp_frameid, final_frameid|
|
482
|
+
this_frames.map! {|frame_str|
|
483
|
+
frame_str.gsub(temp_frameid,final_frameid)
|
484
|
+
}
|
485
|
+
usp.gsub!(temp_frameid, final_frameid)
|
486
|
+
}
|
487
|
+
|
488
|
+
# store frames in data structure
|
489
|
+
this_frames.each {|frame_str|
|
490
|
+
frame_hash[sentid] << frame_str
|
491
|
+
}
|
492
|
+
|
493
|
+
# store uspfes in data structure
|
494
|
+
unless usp.empty?
|
495
|
+
usp_elt = RegXML.new(usp)
|
496
|
+
uspfes = usp_elt.children_and_text.detect { |child| child.name == "uspfes" }
|
497
|
+
uspfes.children_and_text.each { |child|
|
498
|
+
unless child.name == "uspblock"
|
499
|
+
next
|
500
|
+
end
|
501
|
+
uspfes_hash[sentid] << child.to_s
|
502
|
+
}
|
503
|
+
|
504
|
+
# store uspframes in data structure
|
505
|
+
uspframes = usp_elt.children_and_text.detect { |child| child.name == "uspframes" }
|
506
|
+
uspframes.children_and_text.each { |child|
|
507
|
+
unless child.name == "uspblock"
|
508
|
+
next
|
509
|
+
end
|
510
|
+
uspframes_hash[sentid] << child.to_s
|
511
|
+
}
|
512
|
+
end
|
513
|
+
}
|
514
|
+
}
|
515
|
+
|
516
|
+
# now write everything in the data structure back to a file
|
517
|
+
|
518
|
+
filecounter = 0
|
519
|
+
sentcounter = 0
|
520
|
+
outfile = nil
|
521
|
+
sent_stack = Array.new
|
522
|
+
|
523
|
+
graph_hash.sort {|a,b| a[0].to_i <=> b[0].to_i}.each {|sentid,graph_str|
|
524
|
+
|
525
|
+
if sentcounter == max_sentnum
|
526
|
+
outfile.puts SalsaTigerXMLHelper.get_footer
|
527
|
+
outfile.close
|
528
|
+
outfile = nil
|
529
|
+
end
|
530
|
+
|
531
|
+
unless outfile
|
532
|
+
outfile = File.new(split_dir+filecounter.to_s+".xml","w")
|
533
|
+
outfile.puts SalsaTigerXMLHelper.get_header
|
534
|
+
filecounter +=1
|
535
|
+
sentcounter = 0
|
536
|
+
end
|
537
|
+
|
538
|
+
xml = Array.new
|
539
|
+
xml << graph_str
|
540
|
+
xml << "<sem>"
|
541
|
+
xml << "<globals>"
|
542
|
+
xml << "</globals>"
|
543
|
+
xml << "<frames>"
|
544
|
+
frame_hash[sentid].each {|frame_str|
|
545
|
+
xml << frame_str
|
546
|
+
}
|
547
|
+
xml << "</frames>"
|
548
|
+
xml << "<usp>"
|
549
|
+
xml << "<uspframes>"
|
550
|
+
uspframes_hash[sentid].each {|uspblock_str|
|
551
|
+
xml << uspblock_str
|
552
|
+
}
|
553
|
+
xml << "</uspframes>"
|
554
|
+
xml << "<uspfes>"
|
555
|
+
uspfes_hash[sentid].each {|uspblock_str|
|
556
|
+
xml << uspblock_str
|
557
|
+
}
|
558
|
+
xml << "</uspfes>"
|
559
|
+
xml << "</usp>"
|
560
|
+
xml << "</sem>"
|
561
|
+
xml << "</s>"
|
562
|
+
|
563
|
+
outfile.puts xml.join("\n")
|
564
|
+
sentcounter += 1
|
565
|
+
}
|
566
|
+
|
567
|
+
if outfile
|
568
|
+
outfile.puts SalsaTigerXMLHelper.get_footer
|
569
|
+
outfile.close
|
570
|
+
outfile = nil
|
571
|
+
end
|
572
|
+
|
573
|
+
end
|
574
|
+
|
575
|
+
|
576
|
+
####
|
577
|
+
# transform SalsaTigerXML file to Tab format file
|
578
|
+
def FrprepHelper.stxml_to_tab_file(input_filename, # string: name of input file
|
579
|
+
output_filename, # string: name of output file
|
580
|
+
exp) # FrprepConfigData
|
581
|
+
infile = FilePartsParser.new(input_filename)
|
582
|
+
begin
|
583
|
+
outfile = File.new(output_filename,"w")
|
584
|
+
rescue
|
585
|
+
raise "Stxml to tab: could not write to tab file #{output_filename}"
|
586
|
+
end
|
587
|
+
|
588
|
+
infile.scan_s {|sent_string|
|
589
|
+
|
590
|
+
# determine sentence ID
|
591
|
+
sentid = RegXML.new(sent_string).attributes["id"]
|
592
|
+
unless sentid
|
593
|
+
$stderr.puts "No sentence ID in sentence:\n "+ sent_string
|
594
|
+
$stderr.puts "Making a new one up."
|
595
|
+
sentid = Time.new().to_f.to_s
|
596
|
+
end
|
597
|
+
|
598
|
+
# find terminals and process them
|
599
|
+
unless sent_string.delete("\n") =~ /<terminals[ >].+<\/terminals>/
|
600
|
+
$stderr.puts "Warning: could not find terminals in sentence:"
|
601
|
+
$stderr.puts sent_string
|
602
|
+
$stderr.puts "Skipping"
|
603
|
+
next
|
604
|
+
end
|
605
|
+
|
606
|
+
# modified by ines, 27/08/08
|
607
|
+
# for Berkeley => convert ( ) to -LRB- -RRB-
|
608
|
+
|
609
|
+
text = $&
|
610
|
+
if exp.get("parser") == "berkeley"
|
611
|
+
text.gsub!(/word='\('/, "word='*LRB*'")
|
612
|
+
text.gsub!(/word='\)'/, "word='*RRB*'")
|
613
|
+
text.gsub!(/word=['"]``['"]/, "word='\"'")
|
614
|
+
text.gsub!(/word=['"]''['"]/, "word='\"'")
|
615
|
+
text.gsub!(/word=['"]\'\'['"]/, "word='\"'")
|
616
|
+
#text.gsub!(/word=['"]\(['"]/, "word='-LRB-'")
|
617
|
+
#text.gsub!(/word=['"]\)['"]/, "word='-RRB-'")
|
618
|
+
|
619
|
+
end
|
620
|
+
terminals = text
|
621
|
+
#terminals = sent_string
|
622
|
+
terminals = RegXML.new(terminals)
|
623
|
+
terminals.children_and_text.each { |terminal|
|
624
|
+
|
625
|
+
unless terminal.name == "t"
|
626
|
+
# not a terminal after all
|
627
|
+
next
|
628
|
+
end
|
629
|
+
|
630
|
+
|
631
|
+
outfile.puts FNTabFormatFile.format_str({
|
632
|
+
"word" => SalsaTigerXMLHelper.unescape(terminal.attributes["word"]),
|
633
|
+
"sent_id" => sentid
|
634
|
+
})
|
635
|
+
} # each terminal
|
636
|
+
outfile.puts
|
637
|
+
} # each sentence
|
638
|
+
outfile.close
|
639
|
+
end
|
640
|
+
|
641
|
+
###
|
642
|
+
# add semantics from tab:
|
643
|
+
#
|
644
|
+
# add information about semantics from a FN tab sentence
|
645
|
+
# to a SalsaTigerSentence object:
|
646
|
+
# - frames (one frame per sentence)
|
647
|
+
# - roles
|
648
|
+
# - FrameNet grammatical functions
|
649
|
+
# - FrameNet POS of target
|
650
|
+
def FrprepHelper.add_semantics_from_tab(st_sent, # SalsaTigerSentence object
|
651
|
+
tab_sent, # FNTabFormatSentence object
|
652
|
+
mapping, # hash: tab lineno -> array:SynNode
|
653
|
+
interpreter_class, # SynInterpreter class
|
654
|
+
exp) # FrprepConfigData
|
655
|
+
|
656
|
+
if tab_sent.nil?
|
657
|
+
# tab sentence not found
|
658
|
+
return
|
659
|
+
end
|
660
|
+
|
661
|
+
# iterate through frames in the tabsent
|
662
|
+
frame_index = 0
|
663
|
+
tab_sent.each_frame { |tab_frame_obj|
|
664
|
+
frame_name = tab_frame_obj.get_frame() # string
|
665
|
+
|
666
|
+
if frame_name.nil? or frame_name =~ /^-*$/
|
667
|
+
# weird: a frame without a frame
|
668
|
+
$stderr.puts "Warning: frame entry without a frame in tab sentence #{st_sent.id}."
|
669
|
+
$stderr.puts "Skipping"
|
670
|
+
next
|
671
|
+
end
|
672
|
+
|
673
|
+
frame_node = st_sent.add_frame(frame_name, tab_sent.get_sent_id() + "_f#{frame_index}")
|
674
|
+
frame_index += 1
|
675
|
+
|
676
|
+
# target
|
677
|
+
target_nodes = Array.new
|
678
|
+
tab_frame_obj.get_target_indices.each {|terminal_id|
|
679
|
+
if mapping[terminal_id]
|
680
|
+
target_nodes.concat mapping[terminal_id]
|
681
|
+
end
|
682
|
+
}
|
683
|
+
|
684
|
+
# let the interpreter class decide on how to determine the maximum constituents
|
685
|
+
target_maxnodes = interpreter_class.max_constituents(target_nodes, st_sent)
|
686
|
+
if target_maxnodes.empty?
|
687
|
+
# HIEr
|
688
|
+
STDERR.puts "Warning: no target in frame entry, sentence #{st_sent.id}."
|
689
|
+
$stderr.puts "frame is #{frame_name}, frame no #{frame_index}"
|
690
|
+
$stderr.puts "Skipping."
|
691
|
+
$stderr.puts "target indices: " + tab_frame_obj.get_target_indices.join(", ")
|
692
|
+
#tab_sent.each_line { |line|
|
693
|
+
# $stderr.puts line
|
694
|
+
# $stderr.puts "--"
|
695
|
+
#}
|
696
|
+
next
|
697
|
+
end
|
698
|
+
frame_node.add_fe("target",target_maxnodes)
|
699
|
+
|
700
|
+
# set features on target: target lemma, target POS
|
701
|
+
target_lemma = tab_frame_obj.get_target()
|
702
|
+
target_pos = nil
|
703
|
+
if target_lemma
|
704
|
+
if exp.get("origin") == "FrameNet"
|
705
|
+
# FrameNet data: here the lemma in the tab file has the form
|
706
|
+
# <lemma>.<POS>
|
707
|
+
# separate the two
|
708
|
+
if target_lemma =~ /^(.*)\.(.*)$/
|
709
|
+
target_lemma = $1
|
710
|
+
target_pos = $2
|
711
|
+
end
|
712
|
+
end
|
713
|
+
frame_node.target.set_attribute("lemma", target_lemma)
|
714
|
+
if target_pos
|
715
|
+
frame_node.target.set_attribute("pos", target_pos)
|
716
|
+
end
|
717
|
+
end
|
718
|
+
|
719
|
+
# roles, GF, PT
|
720
|
+
# synnode_markable_label:
|
721
|
+
# hash "role" | "gf" | "pt" -> SynNode -> array: label(string)
|
722
|
+
layer_synnode_label = Hash.new
|
723
|
+
["gf", "pt", "role"].each {|layer|
|
724
|
+
termids2labels = tab_frame_obj.markables(layer)
|
725
|
+
|
726
|
+
unless layer_synnode_label[layer]
|
727
|
+
layer_synnode_label[layer] = Hash.new
|
728
|
+
end
|
729
|
+
|
730
|
+
termids2labels.each {|terminal_indices, label|
|
731
|
+
terminal_indices.each { |t_i|
|
732
|
+
|
733
|
+
if (nodes = mapping[t_i])
|
734
|
+
|
735
|
+
nodes.each { |node|
|
736
|
+
unless layer_synnode_label[layer][node]
|
737
|
+
layer_synnode_label[layer][node] = Array.new
|
738
|
+
end
|
739
|
+
|
740
|
+
layer_synnode_label[layer][node] << label
|
741
|
+
} # each node that t_i maps to
|
742
|
+
end # if t_i maps to anything
|
743
|
+
|
744
|
+
} # each terminal index
|
745
|
+
} # each mapping terminal indices -> label
|
746
|
+
} # each layer
|
747
|
+
|
748
|
+
# 'stuff' (Support and other things)
|
749
|
+
layer_synnode_label["stuff"] = Hash.new
|
750
|
+
tab_frame_obj.each_line_parsed { |line_obj|
|
751
|
+
if (label = line_obj.get("stuff")) != "-"
|
752
|
+
if (nodes = mapping[line_obj.get("lineno")])
|
753
|
+
nodes.each { |node|
|
754
|
+
unless layer_synnode_label["stuff"][node]
|
755
|
+
layer_synnode_label["stuff"][node] = Array.new
|
756
|
+
end
|
757
|
+
layer_synnode_label["stuff"][node] << label
|
758
|
+
}
|
759
|
+
end
|
760
|
+
end
|
761
|
+
}
|
762
|
+
|
763
|
+
# reencode:
|
764
|
+
# hash role_label(string) -> array of tuples [synnodes, gflabels, ptlabels]
|
765
|
+
# synnodes: array:SynNode. gflabels, ptlabels: array:String
|
766
|
+
#
|
767
|
+
# note that in this step, any gf or pt labels that have been
|
768
|
+
# assigned to a SynNode that has not also been assigned a role
|
769
|
+
# will be lost
|
770
|
+
role2nodes_labels = Hash.new
|
771
|
+
layer_synnode_label["role"].each_pair { |synnode, labels|
|
772
|
+
labels.each { | rolelabel|
|
773
|
+
unless role2nodes_labels[rolelabel]
|
774
|
+
role2nodes_labels[rolelabel] = Array.new
|
775
|
+
end
|
776
|
+
|
777
|
+
role2nodes_labels[rolelabel] << [
|
778
|
+
synnode,
|
779
|
+
layer_synnode_label["gf"][synnode],
|
780
|
+
layer_synnode_label["pt"][synnode]
|
781
|
+
]
|
782
|
+
} # each role label
|
783
|
+
} # each pair SynNode/role labels
|
784
|
+
|
785
|
+
# reencode "stuff", but only the support cases
|
786
|
+
role2nodes_labels["Support"] = Array.new()
|
787
|
+
|
788
|
+
layer_synnode_label["stuff"].each_pair { |synnode, labels|
|
789
|
+
labels.each { |stufflabel|
|
790
|
+
if stufflabel =~ /Supp/
|
791
|
+
# some sort of support
|
792
|
+
role2nodes_labels["Support"] << [synnode, nil, nil]
|
793
|
+
end
|
794
|
+
}
|
795
|
+
}
|
796
|
+
|
797
|
+
##
|
798
|
+
# each role label:
|
799
|
+
# make FeNode for the current frame
|
800
|
+
role2nodes_labels.each_pair { |rolelabel, node_gf_pt|
|
801
|
+
|
802
|
+
# get list of syn nodes, GF and PT labels for this role
|
803
|
+
# shortcut for GF and PT labels: take any labels that have
|
804
|
+
# been assigned for _some_ Synnode of this role
|
805
|
+
synnodes = node_gf_pt.map { |ngp| ngp[0] }
|
806
|
+
gflabels = node_gf_pt.map { |ngp| ngp[1] }.compact.flatten.uniq
|
807
|
+
ptlabels = node_gf_pt.map { |ngp| ngp[2] }.compact.flatten.uniq
|
808
|
+
|
809
|
+
|
810
|
+
# let the interpreter class decide on how to
|
811
|
+
# determine the maximum constituents
|
812
|
+
maxnodes = interpreter_class.max_constituents(synnodes, st_sent)
|
813
|
+
|
814
|
+
fe_node = st_sent.add_fe(frame_node, rolelabel, maxnodes)
|
815
|
+
unless gflabels.empty?
|
816
|
+
fe_node.set_attribute("gf", gflabels.join(","))
|
817
|
+
end
|
818
|
+
unless ptlabels.empty?
|
819
|
+
fe_node.set_attribute("pt", ptlabels.join(","))
|
820
|
+
end
|
821
|
+
} # each role label
|
822
|
+
} # each frame
|
823
|
+
end
|
824
|
+
|
825
|
+
|
826
|
+
######
|
827
|
+
# handle multiword targets:
|
828
|
+
# if you find a verb with a separate prefix,
|
829
|
+
# change the verb's lemma information accordingly
|
830
|
+
# and add an attribute "other_words" to the verb node
|
831
|
+
# pointing to the other node
|
832
|
+
#
|
833
|
+
# In general, it will be assumed that "other_words" contains
|
834
|
+
# a list of node IDs for other nodes belonging to the same
|
835
|
+
# group, node IDs separated by spaces, and that
|
836
|
+
# each node of a group has the "other_words" attribute.
|
837
|
+
#
|
838
|
+
def FrprepHelper.handle_multiword_targets(sent, # SalsaTigerSentence object
|
839
|
+
interpreter, # SynInterpreter object
|
840
|
+
language) # string: en, de
|
841
|
+
##
|
842
|
+
# only retain the interesting words of the sentence:
|
843
|
+
# content words and prepositions
|
844
|
+
if sent.nil?
|
845
|
+
return
|
846
|
+
end
|
847
|
+
|
848
|
+
nodes = sent.terminals.select { |node|
|
849
|
+
[
|
850
|
+
"adj", "adv", "card", "noun", "part", "prep", "verb"
|
851
|
+
].include? interpreter.category(node)
|
852
|
+
}
|
853
|
+
|
854
|
+
##
|
855
|
+
# group:
|
856
|
+
# group verbs with their separate particles
|
857
|
+
# (at a later point, other types of grouping can be inserted here)
|
858
|
+
groups = FrprepHelper.group_words(nodes, interpreter)
|
859
|
+
|
860
|
+
##
|
861
|
+
# record grouping information as attributes on the terminals.
|
862
|
+
groups.each { |descr, group_of_nodes|
|
863
|
+
case descr
|
864
|
+
when "none"
|
865
|
+
# no grouping
|
866
|
+
when "part"
|
867
|
+
# separate particle belonging to a verb
|
868
|
+
|
869
|
+
# group_of_nodes is a pair [verb, particle]
|
870
|
+
verb, particle = group_of_nodes
|
871
|
+
|
872
|
+
verb.set_attribute("other_words", particle.id())
|
873
|
+
particle.set_attribute("other_words", verb.id())
|
874
|
+
|
875
|
+
if verb.get_attribute("lemma") and particle.get_attribute("lemma")
|
876
|
+
case language
|
877
|
+
when "de"
|
878
|
+
# German: prepend SVP to get the real lemma of the verb
|
879
|
+
verb.set_attribute("lemma",
|
880
|
+
particle.get_attribute("lemma") +
|
881
|
+
verb.get_attribute("lemma"))
|
882
|
+
when "en"
|
883
|
+
# English: append particle as separate word after the lemma of the verb
|
884
|
+
verb.set_attribute("lemma",
|
885
|
+
verb.get_attribute("lemma") + " " +
|
886
|
+
particle.get_attribute("lemma"))
|
887
|
+
else
|
888
|
+
# default
|
889
|
+
verb.set_attribute("lemma",
|
890
|
+
verb.get_attribute("lemma") + " " +
|
891
|
+
particle.get_attribute("lemma"))
|
892
|
+
end
|
893
|
+
end
|
894
|
+
|
895
|
+
else
|
896
|
+
raise "Shouldn't be here: unexpected description #{descr}"
|
897
|
+
end
|
898
|
+
}
|
899
|
+
end
|
900
|
+
|
901
|
+
########################
|
902
|
+
# group_words
|
903
|
+
#
|
904
|
+
# auxiliary of transform_multiword targets
|
905
|
+
#
|
906
|
+
# Group terminals:
|
907
|
+
# At the moment, just find separate prefixes and particles
|
908
|
+
# for verbs
|
909
|
+
#
|
910
|
+
# returns: list of pairs [descr, nodes]
|
911
|
+
# descr: string, "none" (no group), "part" (separate verb particle)
|
912
|
+
# nodes: array:SynNode
|
913
|
+
def FrprepHelper.group_words(nodes, # array: SynNode
|
914
|
+
interpreter) # SynInterpreter object
|
915
|
+
|
916
|
+
retv = Array.new # array of groups, array:array:SynNode
|
917
|
+
done = Array.new # remember nodes already covered
|
918
|
+
|
919
|
+
nodes.each { |terminal_node|
|
920
|
+
if done.include? terminal_node
|
921
|
+
# we have already included this node in one of the groups
|
922
|
+
next
|
923
|
+
end
|
924
|
+
|
925
|
+
if (svp = interpreter.particle_of_verb(terminal_node, nodes))
|
926
|
+
retv << ["part", [terminal_node, svp]]
|
927
|
+
done << terminal_node
|
928
|
+
done << svp
|
929
|
+
else
|
930
|
+
retv << ["none", [terminal_node]]
|
931
|
+
done << terminal_node
|
932
|
+
end
|
933
|
+
|
934
|
+
}
|
935
|
+
|
936
|
+
return retv
|
937
|
+
end
|
938
|
+
|
939
|
+
|
940
|
+
######
|
941
|
+
# handle unknown framenames
|
942
|
+
#
|
943
|
+
# For all frames with names matching Unknown\d+,
|
944
|
+
# rename them to <lemma>_Unknown\d+
|
945
|
+
def FrprepHelper.handle_unknown_framenames(sent, # SalsaTigerSentence
|
946
|
+
interpreter) # SynInterpreter class
|
947
|
+
if sent.nil?
|
948
|
+
return
|
949
|
+
end
|
950
|
+
|
951
|
+
sent.each_frame { |frame|
|
952
|
+
if frame.name() =~ /^Unknown/
|
953
|
+
if frame.target
|
954
|
+
maintarget = interpreter.main_node_of_expr(frame.target.children(), "no_mwe")
|
955
|
+
else
|
956
|
+
maintarget = nil
|
957
|
+
end
|
958
|
+
unless maintarget
|
959
|
+
$stderr.puts "Warning: Unknown frame, and I could not determine the target lemma: Frame #{frame.id()}"
|
960
|
+
$stderr.puts "Cannot repair frame name, leaving it as is."
|
961
|
+
return
|
962
|
+
end
|
963
|
+
|
964
|
+
# get lemma, if it exists, otherwise get word
|
965
|
+
# also, if the lemmatizer has returned a disjunction of lemmas,
|
966
|
+
# get the first disjunct
|
967
|
+
lemma = interpreter.lemma_backoff(maintarget)
|
968
|
+
if lemma
|
969
|
+
# we have a lemma
|
970
|
+
frame.set_name(lemma + "_" + frame.name())
|
971
|
+
else
|
972
|
+
# the main target word has no lemma attribute,
|
973
|
+
# and somehow I couldn't even get the target word
|
974
|
+
$stderr.puts "Warning: Salsa 'Unknown' frame."
|
975
|
+
$stderr.puts "Trying to make its lemma-specificity explicit, but"
|
976
|
+
$stderr.puts "I could not determine the target lemma nor the target word: frame #{frame.id()}"
|
977
|
+
$stderr.puts "Leaving 'Unknown' as it is."
|
978
|
+
end
|
979
|
+
end
|
980
|
+
}
|
981
|
+
end
|
982
|
+
|
983
|
+
|
984
|
+
#####################
|
985
|
+
#
|
986
|
+
# Integrate the semantic annotation of an old sentence
|
987
|
+
# into the corresponding new sentence
|
988
|
+
# At the same time, integrate the lemma information from the
|
989
|
+
# old sentence into the new sentence
|
990
|
+
def FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent,
|
991
|
+
newsent,
|
992
|
+
interpreter_class,
|
993
|
+
exp)
|
994
|
+
if oldsent.nil? or newsent.nil?
|
995
|
+
return
|
996
|
+
end
|
997
|
+
##
|
998
|
+
# match old and new sentence via terminals
|
999
|
+
newterminals = newsent.terminals_sorted()
|
1000
|
+
oldterminals = oldsent.terminals_sorted()
|
1001
|
+
# sanity check: exact match on terminals?
|
1002
|
+
newterminals.interleave(oldterminals).each { |newnode, oldnode|
|
1003
|
+
#print "old ", oldnode.word, " ", newnode.word, "\n"
|
1004
|
+
# new and old word: use both unescaped and escaped variant
|
1005
|
+
if newnode
|
1006
|
+
newwords = [ newnode.word, SalsaTigerXMLHelper.escape(newnode.word) ]
|
1007
|
+
else
|
1008
|
+
newwords = [nil, nil]
|
1009
|
+
end
|
1010
|
+
if oldnode
|
1011
|
+
oldwords = [ oldnode.word, SalsaTigerXMLHelper.escape(oldnode.word) ]
|
1012
|
+
else
|
1013
|
+
oldwords = [ nil, nil]
|
1014
|
+
end
|
1015
|
+
|
1016
|
+
if (newwords & oldwords).empty?
|
1017
|
+
# old and new word don't match, either escaped or non-escaped
|
1018
|
+
|
1019
|
+
$stderr.puts "Warning: could not match terminals of sentence #{newsent.id()}"
|
1020
|
+
$stderr.puts "This means that I cannot match the semantic annotation"
|
1021
|
+
$stderr.puts "to the newly parsed sentence. Skipping."
|
1022
|
+
#$stderr.puts "Old sentence: "
|
1023
|
+
#$stderr.puts oldterminals.map { |n| n.word }.join("--")
|
1024
|
+
#$stderr.puts "New sentence: "
|
1025
|
+
#$stderr.puts newterminals.map { |n| n.word }.join("--")
|
1026
|
+
return false
|
1027
|
+
end
|
1028
|
+
}
|
1029
|
+
|
1030
|
+
##
|
1031
|
+
# copy lemma information
|
1032
|
+
oldterminals.each_with_index { |oldnode, ix|
|
1033
|
+
newnode = newterminals[ix]
|
1034
|
+
if oldnode.get_attribute("lemma")
|
1035
|
+
newnode.set_attribute("lemma", oldnode.get_attribute("lemma"))
|
1036
|
+
end
|
1037
|
+
}
|
1038
|
+
|
1039
|
+
##
|
1040
|
+
# copy frames
|
1041
|
+
oldsent.each_frame { |oldframe|
|
1042
|
+
# make new frame with same ID
|
1043
|
+
newframe = newsent.add_frame(oldframe.name, oldframe.id())
|
1044
|
+
# copy FEs
|
1045
|
+
oldframe.each_child { |oldfe|
|
1046
|
+
# new nodes: map old terminals to new terminals,
|
1047
|
+
# then find max constituents covering them
|
1048
|
+
newnodes = oldfe.descendants.select { |n|
|
1049
|
+
n.is_terminal?
|
1050
|
+
}.map { |n|
|
1051
|
+
oldterminals.index(n)
|
1052
|
+
}.map { |ix|
|
1053
|
+
newterminals[ix]
|
1054
|
+
}
|
1055
|
+
|
1056
|
+
# let the interpreter class decide on how to determine the maximum constituents
|
1057
|
+
newnodes = interpreter_class.max_constituents(newnodes, newsent)
|
1058
|
+
|
1059
|
+
# make new FE with same ID
|
1060
|
+
new_fe = newsent.add_fe(newframe, oldfe.name(), newnodes, oldfe.id())
|
1061
|
+
# keep all attributes of the FE
|
1062
|
+
if oldfe.get_f("attributes")
|
1063
|
+
oldfe.get_f("attributes").each_pair { |attr, value|
|
1064
|
+
new_fe.set_attribute(attr, value)
|
1065
|
+
}
|
1066
|
+
end
|
1067
|
+
}
|
1068
|
+
}
|
1069
|
+
|
1070
|
+
##
|
1071
|
+
### changed by ines => appears twice in stxml file
|
1072
|
+
|
1073
|
+
# copy underspecification
|
1074
|
+
# keep as is, since we've kept all frame and FE IDs
|
1075
|
+
oldsent.each_usp_frameblock { |olduspframe|
|
1076
|
+
newuspframe = newsent.add_usp("frame")
|
1077
|
+
olduspframe.each_child { |oldnode|
|
1078
|
+
newnode = newsent.sem_node_with_id(oldnode.id())
|
1079
|
+
if newnode
|
1080
|
+
newuspframe.add_child(newnode)
|
1081
|
+
else
|
1082
|
+
$stderr.puts "Error: unknown frame with ID #{oldnode.id()}"
|
1083
|
+
end
|
1084
|
+
}
|
1085
|
+
}
|
1086
|
+
oldsent.each_usp_feblock { |olduspfe|
|
1087
|
+
newuspfe = newsent.add_usp("fe")
|
1088
|
+
olduspfe.each_child { |oldnode|
|
1089
|
+
newnode = newsent.sem_node_with_id(oldnode.id())
|
1090
|
+
if newnode
|
1091
|
+
newuspfe.add_child(newnode)
|
1092
|
+
else
|
1093
|
+
$stderr.puts "Error: unknown FE with ID #{oldnode.id()}"
|
1094
|
+
end
|
1095
|
+
}
|
1096
|
+
}
|
1097
|
+
|
1098
|
+
end
|
1099
|
+
|
1100
|
+
####################
|
1101
|
+
# add head attributes to each nonterminal in each
|
1102
|
+
# SalsaTigerXML file in a directory
|
1103
|
+
|
1104
|
+
def FrprepHelper.add_head_attributes(st_sent, # SalsaTigerSentence object
|
1105
|
+
interpreter) # SynInterpreter class
|
1106
|
+
st_sent.each_nonterminal {|nt_node|
|
1107
|
+
head_term = interpreter.head_terminal(nt_node)
|
1108
|
+
if head_term and head_term.word()
|
1109
|
+
nt_node.set_attribute("head", head_term.word())
|
1110
|
+
else
|
1111
|
+
nt_node.set_attribute("head", "--")
|
1112
|
+
end
|
1113
|
+
} # each nonterminal
|
1114
|
+
end
|
1115
|
+
|
1116
|
+
  # add lemma information to each terminal in a given SalsaTigerSentence object
  def FrprepHelper.add_lemmas_from_tab(st_sent,  # SalsaTigerSentence object
                                       tab_sent, # FNTabFormatSentence object
                                       mapping)  # hash: tab lineno -> array:SynNode
    if tab_sent.nil?
      # tab sentence not found
      return
    end

    # produce a list of word/lemma pairs
    lemmat = Array.new
    tab_sent.each_line_parsed { |line|
      word = line.get("word")
      lemma = line.get("lemma")
      lemmat << [word, lemma]
    }

    # match with the st_sent terminal list and add lemma attributes.
    # KE Jan 07: on a word mismatch,
    # use the Lemmatizer file version,
    # but count the mismatches
    word_mismatches = Array.new()

    st_sent.each_terminal_sorted { |t|
      matching_lineno = (0..lemmat.length()-1).to_a.detect { |tab_lineno|
        mapping[tab_lineno].include? t
      }
      unless matching_lineno
        next
      end
      word, lemma = lemmat[matching_lineno]

      # transform characters to an XML-friendly form,
      # for comparison with st_word, which is also escaped
      word = SalsaTigerXMLHelper.escape(word)
      st_word = t.word()
      if word != st_word and
         word != SalsaTigerXMLHelper.escape(st_word)
        # true mismatch:
        # use the Lemmatizer version of the word, remember the mismatch
        word_mismatches << [st_word, word]
        t.set_attribute("word", word)
      end

      if lemma
        # we actually do have lemma information
        lemmatised_head = SalsaTigerXMLHelper.escape(lemma)
        t.set_attribute("lemma", lemmatised_head)
      end
    } # each terminal

    # did we have mismatches? then report them
    unless word_mismatches.empty?
      $stderr.puts "Warning: word mismatches found between the Lemmatizer file and the SalsaTigerXML file generated from parser output."
      $stderr.puts "(May be due to failed re-encoding of special characters in the parser output.)"
      $stderr.puts "I am using the Lemmatizer version by default."
      $stderr.puts "Version used:"
      $stderr.print "\t"
      st_sent.each_terminal_sorted { |t| $stderr.print ">>#{t}<<" }
      $stderr.puts
      $stderr.print "SalsaTigerXML file had: "
      $stderr.print word_mismatches.map { |st_word, tab_word|
        "#{st_word} instead of #{tab_word}"
      }.join(", ")
      $stderr.puts
    end
  end

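  # Usage sketch (editor's addition): the mapping argument is the same
  # lineno-to-terminals hash produced by SynInterfaceSTXML.standard_mapping,
  # as used by the reader classes below:
  #
  #   mapping = SynInterfaceSTXML.standard_mapping(st_sent, tab_sent)
  #   FrprepHelper.add_lemmas_from_tab(st_sent, tab_sent, mapping)
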
  ####################
  # given a SalsaTigerSentence,
  # look for FrameNet frames that are
  # test frames, and remove them
  def FrprepHelper.remove_deprecated_frames(sent, # SalsaTigerSentence
                                            exp)  # FrprepConfigData

    unless exp.get("origin") == "FrameNet"
      return
    end

    sent.frames.each { |frame_obj|
      if frame_obj.name() == "Boulder" or
         frame_obj.name() =~ /^Test/
        sent.remove_frame(frame_obj)
      end
    }
  end

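  # Usage sketch (editor's addition): this is a no-op unless the experiment
  # configuration sets origin = FrameNet; the config file name and the
  # constructor call are hypothetical:
  #
  #   exp = FrprepConfigData.new("prep.cfg")
  #   FrprepHelper.remove_deprecated_frames(sent, exp)
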
end

############################################
# Class FrprepFlatSyntax:
#
# given a FNTabFormat file,
# yield each of its sentences in SalsaTigerXML,
# constructing a flat syntax
class FrprepFlatSyntax
  def initialize(tabfilename,   # string: name of tab file
                 postag_suffix, # POS tag file suffix (or nil)
                 lemma_suffix)  # lemmatisation file suffix (or nil)

    @tabfilename = tabfilename
    @pos_suffix = postag_suffix
    @lemma_suffix = lemma_suffix
  end

  # yield each non-parse sentence as a tuple
  # [salsa/tiger xml sentence, tab format sentence, mapping]
  # of a SalsaTigerSentence object, an FNTabSentence object,
  # and a hash: FNTab sentence lineno (integer) -> array:SynNode,
  # pointing each tab word to one or more SalsaTigerSentence terminals
  def each_sentence(dummy)

    # read tab file with lemma and POS info
    tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)

    tabfile.each_sentence() { |tabsent|
      # start a new, empty sentence with a "failed" attribute (i.e. no parse)
      # and with the ID of the corresponding TabFormat sentence
      sentid = tabsent.get_sent_id()
      if sentid.nil? or sentid =~ /^-*$/
        $stderr.puts "No sentence ID for sentence:"
        tabsent.each_line_parsed { |l| $stderr.print l.get("word"), " " }
        $stderr.puts
        sentid = Time.new().to_f.to_s
      end
      sent = SalsaTigerSentence.new("<s id=\"#{SalsaTigerXMLHelper.escape(sentid)}\" failed=\"true\"></s>")

      # add a single nonterminal node, category "S"
      single_nonterminal_id = SalsaTigerXMLHelper.escape(sentid.to_s + "_NT")
      vroot = sent.add_syn("nt", "S", # category
                           nil,       # word
                           nil,       # pos
                           single_nonterminal_id)

      # add terminals
      tabsent.each_line_parsed() { |line_obj|
        # make a terminal node with tab sentence info
        node_id = sentid.to_s + "_" + line_obj.get("lineno").to_s
        word = line_obj.get("word")
        unless word
          word = ""
        end
        word = SalsaTigerXMLHelper.escape(word)
        pos = line_obj.get("pos")
        unless pos
          pos = ""
        end
        pos = SalsaTigerXMLHelper.escape(pos)
        terminal = sent.add_syn("t", nil, # category
                                word, pos,
                                node_id)

        if line_obj.get("lemma")
          # lemma
          terminal.set_attribute("lemma", SalsaTigerXMLHelper.escape(line_obj.get("lemma")))
        end

        # add the new terminal as a child of vroot
        vroot.add_child(terminal, nil)
        terminal.add_parent(vroot, nil)
      } # each line of tab file

      # yield the newly constructed SalsaTigerXML sentence plus the tab sentence
      yield [sent, tabsent, SynInterfaceSTXML.standard_mapping(sent, tabsent)]
    }
  end
end

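# Usage sketch (editor's addition): iterating the flat-syntax reader; the
# file name and suffixes are hypothetical, the tuple layout is as documented
# on each_sentence above:
#
#   reader = FrprepFlatSyntax.new("corpus.tab", ".pos", ".lemma")
#   reader.each_sentence(nil) { |st_sent, tabsent, mapping|
#     # st_sent: SalsaTigerSentence, tabsent: FNTabSentence,
#     # mapping: tab line number -> array of terminal SynNodes
#   }
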
############################################
# Class FrprepReadStxml
#
# given an STXML file,
# yield each of its sentences
class FrprepReadStxml
  def initialize(stxmlfilename, # string: name of SalsaTigerXML file
                 tabfilename,   # string: name of corresponding tab file (or nil)
                 postag_suffix, # POS tag file suffix (or nil)
                 lemma_suffix)  # lemmatisation file suffix (or nil)

    @stxmlfilename = stxmlfilename
    @tabfilename = tabfilename
    @pos_suffix = postag_suffix
    @lemma_suffix = lemma_suffix
  end

  # yield each non-parse sentence as a tuple
  # [salsa/tiger xml sentence, tab format sentence, mapping]
  # of a SalsaTigerSentence object, an FNTabSentence object,
  # and a hash: FNTab sentence lineno (integer) -> array:SynNode,
  # pointing each tab word to one or more SalsaTigerSentence terminals
  def each_sentence(dummy)
    # read the corresponding tab file?
    tab_sents = Array.new()
    # nil guard added: @tabfilename may be nil (see initialize)
    if @tabfilename and File.exists? @tabfilename
      tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)
      tabfile.each_sentence { |tabsent|
        tab_sents << tabsent
      }
    end

    # read the STXML file
    infile = FilePartsParser.new(@stxmlfilename)
    index = 0
    infile.scan_s { |sent_string|
      sent = SalsaTigerSentence.new(sent_string)
      yield [sent, tab_sents.at(index), SynInterfaceSTXML.standard_mapping(sent, tab_sents.at(index))]
      index += 1
    }
  end
end
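
# Usage sketch (editor's addition): FrprepReadStxml shares the each_sentence
# interface with FrprepFlatSyntax, so downstream code can treat the two
# readers interchangeably; the file names and interpreter_class are
# hypothetical:
#
#   reader = FrprepReadStxml.new("corpus.xml", "corpus.tab", ".pos", ".lemma")
#   reader.each_sentence(nil) { |st_sent, tabsent, mapping|
#     FrprepHelper.add_head_attributes(st_sent, interpreter_class)
#   }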