frprep 0.0.1.prealpha
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
data/lib/frprep/FNDatabase.rb
@@ -0,0 +1,144 @@
+# sp 28 06 04
+#
+# this module offers methods to extract lemma corpora from the FrameNet database
+
+require 'FrameXML'
+
+class FNDatabase
+
+  def each_matching_sentence(file_pred,sent_pred)
+    # fundamental access function to FrameXML files
+
+    # yields FrameXMLSentence objects where
+    # the sentence matches sent_pred
+    # (their FrameXMLFile, accessed through FrameXMLSentence.get_file_obj, matches file_pred)
+    each_matching_file(file_pred) {|frameNetFile|
+      frameNetFile.each_sentence {|frameNetSent|
+        if sent_pred.call(frameNetSent)
+          frameNetSent.verify_annotation
+          yield frameNetSent
+        end
+      }
+    }
+  end
+
+  def each_matching_file(file_pred)
+    # fundamental access function to FrameXML files
+
+    # yields file (FrameXMLFile) objects which match file_pred
+    each_framexml_file {|frameNetFile|
+      if file_pred.call(frameNetFile)
+        yield frameNetFile
+      end
+      frameNetFile.close
+    }
+  end
+
+  def extract_frame(frame,outfile)
+    each_matching_sentence(Proc.new{|fnfile| fnfile.get_frame == frame},
+                           Proc.new{|fnsent| true}) {|fnsent|
+      if fnsent.contains_FE_annotation_and_target
+        fnsent.print_conll_style_to(outfile)
+      end
+    }
+  end
+
+  def extract_lemma(lemma,outfile)
+    each_matching_sentence(Proc.new{|fnfile| fnfile.get_lu == lemma},
+                           Proc.new{|fnsent| true}) {|fnsent|
+      if fnsent.contains_FE_annotation_and_target
+        fnsent.print_conll_style_to(outfile)
+      end
+    }
+  end
+
+  def extract_everything(outdirectory)
+    unless outdirectory[-1,1] == "/"
+      outdirectory += "/"
+    end
+
+    outfiles = Hash.new
+    each_matching_sentence(Proc.new{|fnfile| true},
+                           Proc.new{|fnsent| true}) {|fnsent|
+      frame = fnsent.get_file_obj.get_frame
+      unless outfiles.key?(frame)
+        outfiles[frame] = File.new(outdirectory+frame+".tab","w")
+      end
+      if fnsent.contains_FE_annotation_and_target
+        fnsent.print_conll_style_to(outfiles[frame])
+      end
+    }
+    # close output files
+    outfiles.each_value {|file|
+      file.close
+    }
+    # remove zero-size files
+    Dir[outdirectory+"*"].each {|filename|
+      if FileTest.zero?(filename)
+        File.unlink(filename)
+      end
+    }
+  end
+
+
+  def initialize(fn_path)
+    unless fn_path[-1,1] == "/"
+      fn_path += "/"
+    end
+    @fn = fn_path
+  end
+
+  private
+
+  def each_framexml_file
+    # files might be zipped
+    Dir[@fn+"lu*.xml.gz"].each {|gzfile|
+      Kernel.system("cp "+gzfile+" /tmp/")
+      Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
+      gzfile =~ /(.+)\.gz/
+      yield FrameXMLFile.new("/tmp/"+File.basename($1))
+    }
+    # or might not
+    Dir[@fn+"lu*.xml"].each {|filename|
+      yield FrameXMLFile.new(filename)
+    }
+  end
+
+  # I don't really remember what this was good for ;-)
+
+  # def browse_everything(allFiles)
+  #   if allFiles
+  #     Dir[fn+"*.xml.gz"].each {|gzfile|
+  #       Kernel.system("cp "+gzfile+" /tmp/")
+  #       Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
+  #       gzfile =~ /(.+)\.gz/
+  #       # STDERR.puts File.basename($1)
+  #       # STDERR.print "."
+  #       ff = FrameXMLFile.new("/tmp/"+File.basename($1))
+  #       ff.each_sentence {|s|
+  #         if s.contains_FE_annotation_and_target
+  #           s.verify_annotation
+  #           if s.verify_annotation
+  #             puts "****************** Error: Still problems after 2nd verification!"
+  #           end
+  #           s.print_conll_style
+  #         end
+  #       }
+  #     }
+  #   else
+  #     ff = FrameXMLFile.new("/tmp/lu1870.xml")
+  #     ff.each_sentence {|s|
+  #       if s.contains_FE_annotation_and_target
+  #         s.verify_annotation
+  #         if s.verify_annotation
+  #           puts "****************** Error: Still problems after 2nd verification!"
+  #         end
+  #         # s.print_layers
+  #         s.print_conll_style
+  #       end
+  #     }
+  #   end
+  # end
+
+end
+
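A minimal usage sketch for FNDatabase (not part of the gem; the corpus path, frame name, and output file are hypothetical, the calls are the ones defined above):

    require 'FNDatabase'

    # Point FNDatabase at a directory of FrameNet lu*.xml(.gz) files.
    # "/corpora/framenet/lu" and "Awareness" are placeholder values.
    db = FNDatabase.new("/corpora/framenet/lu")
    File.open("/tmp/Awareness.tab", "w") { |out|
      # writes one FNTab-format block per sentence of the frame
      # that carries both FE annotation and a target
      db.extract_frame("Awareness", out)
    }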
data/lib/frprep/FixSynSemMapping.rb
@@ -0,0 +1,196 @@
+###
+# FixSynSemMapping:
+# Given a SalsaTigerRegXML sentence with semantic role annotation,
+# simplify the mapping of semantic roles to syntactic constituents
+#
+# The following is lifted from the LREC06 paper on Shalmaneser:
+# During preprocessing, the span of semantic roles in the training corpora is
+# projected onto the output of the syntactic parser by assigning each
+# role to the set of maximal constituents covering its word span.
+# If the word span of a role does not coincide
+# with parse tree constituents, e.g. due to misparses,
+# the role is ``spread out'' across several constituents. This leads to
+# idiosyncratic paths between predicate and semantic role in the parse
+# tree.
+#
+# [The following span standardization algorithm is used to make the
+# syntax-semantics mapping more uniform:]
+# Given a role r that has been assigned, let N be the set of
+# terminal nodes of the syntactic structure that are covered by r.
+#
+# Iteratively compute the maximal projection of N in the syntactic
+# structure:
+# 1) If n is a node such that all of n's children are in N,
+#    then remove n's children from N and add n instead.
+# 2) If n is a node with 3 or more children, and all of n's
+#    children except one are in N, then remove n's children from N
+#    and add n instead.
+# 3) If n is an NP with 2 children, and one of them, another NP,
+#    is in N, and the other, a relative clause, is not, then remove
+#    n's children from N and add n instead.
+#
+# If none of the rules is applicable to N anymore, assign r to the
+# nodes in N.
+#
+# Rule 1 implements normal maximal projection. Rule 2 ``repairs'' parser
+# errors where all children of a node but one have been assigned the
+# same role. Rule 3 addresses a problem of the FrameNet data, where
+# relative clauses have been omitted from roles assigned to NPs.
+
+# KE Feb 08: rule 3 currently out of commission!
+
+require "frprep/SalsaTigerRegXML"
+
+module FixSynSemMapping
+  ##
+  # fix it
+  #
+  # relevant settings in the experiment file:
+  #
+  # fe_syn_repair:
+  # If there is a node that would be a max. constituent for the
+  # words covered by the given FE, except that it has one child
+  # whose words are not in the FE, use the node as max constituent anyway.
+  # This is to repair cases where the parser has made an attachment choice
+  # that differs from the one in the gold annotation
+  #
+  # fe_rel_repair:
+  # If there is an NP such that all of its children except one have been
+  # assigned the same FE, and that missing child is a relative clause
+  # depending on one of the other children, then take the complete NP as
+  # that FE
+  def FixSynSemMapping.fixit(sent,               # SalsaTigerSentence object
+                             exp,                # experiment file object
+                             interpreter_class)  # SynInterpreter class
+
+
+    unless exp.get("fe_syn_repair") or exp.get("fe_rel_repair")
+      return
+    end
+
+    if sent.nil?
+      return
+    end
+
+    # "repair" FEs:
+    sent.each_frame { |frame|
+
+      frame.each_child { |fe_or_target|
+
+        # repair only if the FE currently
+        # points to more than one syn node
+        if fe_or_target.children.length() < 2
+          next
+        end
+
+        if exp.get("fe_rel_repair")
+          lastfe = fe_or_target.children.last()
+          if lastfe and interpreter_class.simplified_pt(lastfe) =~ /^(WDT|WP\$?|WRB)/
+
+            # remove syn nodes that the FE points to
+            old_fe_syn = fe_or_target.children()
+            old_fe_syn.each { |child|
+              fe_or_target.remove_child(child)
+            }
+
+            # set it to point only to the last previous node, the relative pronoun
+            fe_or_target.add_child(lastfe)
+          end
+        end
+
+        if exp.get("fe_syn_repair")
+          # remove syn nodes that the FE points to
+          old_fe_syn = fe_or_target.children()
+          old_fe_syn.each { |child|
+            fe_or_target.remove_child(child)
+          }
+
+          # and recompute
+          new_fe_syn = interpreter_class.max_constituents(old_fe_syn.map { |t|
+                                                            t.yield_nodes
+                                                          }.flatten.uniq,
+                                                          sent,
+                                                          exp.get("fe_syn_repair"))
+
+          # make the FE point to the new nodes
+          new_fe_syn.each { |syn_node|
+            fe_or_target.add_child(syn_node)
+          }
+        end
+      } # each FE
+    } # each frame
+  end # def fixit
+end # module
+
+
+#########
+# old code
+
+# if exp.get("fe_rel_repair")
+#   # repair relative clauses:
+#   # then make a procedure to pass on to max constituents
+#   # that will recognize the relevant cases
+
+#   accept_anyway_proc = Proc.new { |node, children_in, children_out|
+
+#     # node: SynNode
+#     # children_in, children_out: array:SynNode. children_in are the children
+#     # that are already covered by the FE, children_out the ones that aren't
+
+#     # if node is an NP,
+#     # and only one of its children is out,
+#     # and one node in children_in is an NP, and the missing child is an SBAR
+#     # with a child that is a relative pronoun, then consider the child in children_out as covered
+#     if interpreter_class.category(node) == "noun" and
+#        children_out.length() == 1 and
+#        children_in.select { |n| interpreter_class.category(n) == "noun" } and
+#        interpreter_class.category(children_out.first) == "sent" and
+#        (ch = children_out.first.children) and
+#        ch.select { |n| interpreter_class.relative_pronoun?(n) }
+#       true
+#     else
+#       false
+#     end
+#   }
+
+# else
+#   accept_anyway_proc = nil
+# end
+
+
+# # "repair" FEs:
+# sent.each_frame { |frame|
+
+#   frame.each_child { |fe_or_target|
+
+#     # repair only if the FE currently
+#     # points to more than one syn node, or
+#     # if it is a noun with a non-covered sentence sister
+#     if fe_or_target.children.length() > 1 or
+#        (exp.get("fe_rel_repair") and (curr_marked = fe_or_target.children.first()) and
+#         interpreter_class.category(curr_marked) == "noun" and
+#         (p = curr_marked.parent) and
+#         p.children.select { |n| n != curr_marked and interpreter_class.category(n) == "sent" } )
+
+#       # remember nodes covered by the FE
+#       old_fe_syn = fe_or_target.children()
+
+#       # remove syn nodes that the FE points to
+#       old_fe_syn.each { |child|
+#         fe_or_target.remove_child(child)
+#       }
+
+#       # and recompute
+#       new_fe_syn = interpreter_class.max_constituents(old_fe_syn.map { |t| t.yield_nodes}.flatten.uniq,
+#                                                       sent,
+#                                                       exp.get("fe_syn_repair"),
+#                                                       accept_anyway_proc)
+
+#       # make the FE point to the new nodes
+#       new_fe_syn.each { |syn_node|
+#         fe_or_target.add_child(syn_node)
+#       }
+
+#     end # if FE points to more than one syn node
+#   } # each FE
+# } # each frame
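To make the span standardization rules above concrete, here is a self-contained toy sketch of Rule 1 (maximal projection) only. It is an illustration, not Shalmaneser code: Node is a hypothetical stand-in for SynNode, and inside fixit the actual projection is done by interpreter_class.max_constituents.

    # Toy Rule 1: keep replacing any node whose children are all in N
    # by the node itself, until nothing changes.
    Node = Struct.new(:label, :children) do
      def terminal?
        children.empty?
      end
    end

    def each_node(node, &block)
      block.call(node)
      node.children.each { |c| each_node(c, &block) }
    end

    def maximal_projection(n_set, root)
      changed = true
      while changed
        changed = false
        each_node(root) { |node|
          next if node.terminal? or n_set.include?(node)
          if node.children.all? { |c| n_set.include?(c) }
            node.children.each { |c| n_set.delete(c) }
            n_set << node
            changed = true
          end
        }
      end
      n_set
    end

    # An NP covering exactly the role's terminals replaces the terminals:
    det  = Node.new("the", [])
    noun = Node.new("parser", [])
    np   = Node.new("NP", [det, noun])
    p maximal_projection([det, noun], np).map(&:label)  # => ["NP"]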
data/lib/frprep/FrPrepConfigData.rb
@@ -0,0 +1,66 @@
+# FrPrepConfigData
+# Katrin Erk July 05
+#
+# Preprocessing for Fred and Rosy:
+# access to a configuration and experiment description file
+
+require "frprep/ConfigData"
+
+##############################
+# Class FrPrepConfigData
+#
+# inherits from ConfigData,
+# sets variable names appropriate to preprocessing task
+
+class FrPrepConfigData < ConfigData
+  def initialize(filename)
+
+    # initialize config data object
+    super(filename, # config file
+          { "prep_experiment_ID" => "string", # experiment identifier
+
+            "frprep_directory" => "string", # dir for frprep internal data
+
+            # information about the dataset
+            "language" => "string", # en, de
+            "origin" => "string", # FrameNet, Salsa, or nothing
+            "format" => "string", # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
+            "encoding" => "string", # utf8, iso, hex, or nothing
+
+
+            # directories
+            "directory_input" => "string", # dir with input data
+            "directory_preprocessed" => "string", # dir with output Salsa/Tiger XML data
+            "directory_parserout" => "string", # dir with parser output for the parser named below
+
+            # syntactic processing
+            "pos_tagger" => "string", # name of POS tagger
+            "lemmatizer" => "string", # name of lemmatizer
+            "parser" => "string", # name of parser
+            "pos_tagger_path" => "string", # path to POS tagger
+            "lemmatizer_path" => "string", # path to lemmatizer
+            "parser_path" => "string", # path to parser
+            "parser_max_sent_num" => "integer", # max number of sentences per parser input file
+            "parser_max_sent_len" => "integer", # max sentence length the parser handles
+
+            "do_parse" => "bool", # use parser?
+            "do_lemmatize" => "bool", # use lemmatizer?
+            "do_postag" => "bool", # use POS tagger?
+
+            # output format: if tabformat_output == true,
+            # output in Tab format rather than Salsa/Tiger XML
+            # (this will not work if do_parse == true)
+            "tabformat_output" => "bool",
+
+            # syntactic repairs, dependent on existing semantic role annotation
+            "fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
+            "fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
+          },
+          [ ] # variables
+         )
+
+  end
+end
+
+
+
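A short sketch of how this class is read from client code (the file name and its contents are made up; exp.get is the accessor used elsewhere in frprep, e.g. in FixSynSemMapping.fixit above, and the sample *.salsa.erb files under test/functional show the key = value experiment-file format):

    require "frprep/FrPrepConfigData"

    # hypothetical experiment file prp_train.salsa containing, e.g.:
    #   prep_experiment_ID = prp_train_1
    #   language = en
    #   format = FNXml
    #   do_parse = true
    exp = FrPrepConfigData.new("prp_train.salsa")

    exp.get("language")  # => "en" (declared "string" above)
    exp.get("do_parse")  # => true, assuming "bool" keys come back as booleans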
data/lib/frprep/FrameXML.rb
@@ -0,0 +1,513 @@
+# sp 18 06 2004
+#
+# access to FrameNet XML files, sentences, and annotation.
+#
+# sp 10 11 04: only data from the first layer with name XY is
+# used for output. Other data is saved in layer XY.2nd, but is
+# currently not processed.
+#
+# sp 22 05 04: also, if two labels exist which cover the same span
+# (ie there is a double annotation within the same layer), ignore
+# all but the first label.
+#
+# ke 13 07 05:
+# - changed to RegXML.rb
+# - fixed two problems in analyse_layer:
+#   - Deleting problematic labels:
+#     For some reason, thisLayer[i+1..-1].each_index {|other_i|
+#     included the index 0 in any case, resulting in the 1st
+#     label being deleted in any case.
+#   - Deleting problematic labels, checking for label overlap:
+#     The old formulation worked only if labels occurred in the array
+#     in the order they occurred in the sentence, but that was not the case.
+# - Change in deleting problematic labels:
+#   No longer delete duplicate labels, since e.g. in the PT level there
+#   may be more than one NP label, and we want to keep those
+#
+# KE January 2007:
+# write new adapted FNTab format
+# ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
+
+require 'Ampersand'
+require 'ISO-8859-1'
+require 'RegXML'
+
+class FrameXMLFile # only verified to work for FrameNet v1.1
+
+  def initialize(filename)
+    @filename = filename
+    file = File.new(filename)
+    counter = 0
+    while true
+      counter += 1
+      line = file.gets
+      if line =~ /<lexunit/
+        break
+      end
+      if counter > 3
+        STDERR.puts "Error: File "+filename+" does not conform to FrameNet v1.1 standard (lexunit in 3rd line)"
+        Kernel.exit
+      end
+    end
+    # found lexunit
+    string = line
+    while (line = file.gets())
+      string << line
+    end
+    @lexunit = RegXML.new(string)
+    attributes = @lexunit.attributes()
+    @id = attributes["ID"]
+    attributes["name"] =~ /^([^.]+)\.([^.]+)$/
+    @lu = $1
+    @pos = $2 ? $2.upcase : nil
+    if @lu.nil?
+      raise "[framexml] no lemma in header of file #{@filename}"
+    elsif @pos.nil?
+      raise "[framexml] no pos in header of file #{@filename}"
+    end
+    @frame = attributes["frame"]
+  end
+
+  def get_lu
+    return @lu.gsub(" ","_")
+  end
+
+  def get_lu_id
+    return @id
+  end
+
+  def get_filename
+    return @filename
+  end
+
+  def get_pos
+    return @pos
+  end
+
+  def get_frame
+    return @frame
+  end
+
+  def close
+  end
+
+  def each_sentence
+    @lexunit.children_and_text().each { |subcorpus|
+      subcorpus.children_and_text().each { |annotationSet|
+        if annotationSet.name == "annotationSet"
+          # sentence found
+          yield FrameXMLSentence.new(annotationSet,self)
+        end
+      }
+    }
+  end
+end
+
+class FrameXMLSentence
+  def initialize(annotationSet,file_obj)
+    @file_obj = file_obj
+
+    # layers: hash layer_name -> array:[name, start, stop]
+    # name: name of the element, string
+    # start: start character, integer
+    # stop: end character, integer
+    @layers = Hash.new
+
+    annotationSet.children_and_text().each { |sentence_or_layer_elt|
+
+      case sentence_or_layer_elt.name
+      when "sentence"
+        # sentence: has ID, its child is <text>[text]</text>
+        @sent_id = sentence_or_layer_elt.attributes["ID"]
+        text_elt = sentence_or_layer_elt.children_and_text().detect { |child|
+          child.name == "text"
+        }
+        if text_elt
+          # found the text element. its only child should be the text
+          @orig_text = text_elt.children_and_text().detect { |child|
+            child.text?
+          }
+          if @orig_text
+            # take text out of RegXML object
+            @orig_text = @orig_text.to_s()
+          end
+        end
+
+      when "layers"
+        # contains annotation layers
+        sentence_or_layer_elt.children_and_text().each { |layer|
+          unless layer.name == "layer"
+            # additional material, ignore
+            next
+          end
+
+          name = layer.attributes["name"]
+          unless name
+            raise "layer without a name"
+          end
+          unless @layers.key?(name)
+            @layers[name] = analyse_layer(layer, name)
+          end
+        }
+      end
+    }
+
+    @pos_text = UtfIso.to_iso_8859_1(@orig_text).split(" ") # text with special characters replaced by iso8859 characters
+    @text = Ampersand.utf8_to_hex(@orig_text).split(" ") # text with special characters replaced by &...; sequences
+
+    # all text and pos_text have the same number of elements!
+    @start_is = Hash.new # map char indices (start of words) onto word indices
+    @stop_is = Hash.new # map char indices (end of words) onto word indices
+    @charidx = Array.new # maps word indices on [start,stop]
+
+    @double_space = Array.new
+    pos = 0
+    while (match = @orig_text.index(/(\s\s+)/,pos))
+      @double_space << match
+      pos = match+1
+    end
+
+
+    # fill start, stop and charidx arrays
+    char_i = 0
+    @pos_text.each_index {|word_i|
+      @start_is[char_i] = word_i
+      startchar = char_i
+      # puts "Remembering "+char_i.to_s+" as start index of word "+word_i.to_s
+      char_i += our_length(@pos_text[word_i])
+      @stop_is[char_i-1] = word_i
+
+      stopchar = char_i-1
+
+      # puts "Remembering "+(char_i-1).to_s+" as stop index of word "+word_i.to_s
+
+      @charidx << [startchar,stopchar]
+
+      # separators
+      if @double_space.include?(char_i) then
+        char_i += 2
+      else
+        char_i += 1
+      end
+    }
+  end
+
+  def get_file_obj
+    return @file_obj
+  end
+
+  def get_sent_id
+    return @sent_id
+  end
+
+  def print_text
+    puts "("+@sent_id+")\t"+@pos_text.join(" ")
+  end
+
+  def contains_FE_annotation_and_target
+    target_info = @layers["Target"][0]
+    unless target_info[0] == "Target"
+      STDERR.puts "Error in sentence from "+@file_obj.get_filename+": No target" # strictly speaking, no target at pos 0 in @layers["Target"]
+      STDERR.puts "Sentence: "+@pos_text.join(" ")
+      return false
+    else
+      return (@layers.key?("FE") and target_info[2] != 0)
+    end
+  end
+
+  # we only verify the interesting layers (FE,GF,Target)
+  # if there is weird stuff going on on e.g. the Noun or Adj layer, we don't care.
+
+  def verify_annotation # returns true if some change has taken place
+    change = false
+    @layers.each_pair {|layername,l|
+
+      if layername=="FE" or layername=="GF" or layername=="PT" or layername=="Target" # only verify the "important" layers
+
+        l.each_index {|i|
+
+          element,start,stop = l[i]
+
+          newstart = start
+          newstop = stop
+
+          @charidx.each_index {|j|
+            unless j == 0
+              pstartidx, pstopidx = @charidx[j-1]
+            end
+            startidx, stopidx = @charidx[j]
+
+            if (start > startidx and start <= stopidx) or
+               (j != 0 and start > pstopidx and start < startidx)
+              newstart = startidx
+            end
+
+            if (stop >= startidx and stop < stopidx)
+              newstop = stopidx
+            elsif (j != 0 and stop > pstopidx and stop < startidx)
+              newstop = pstopidx
+            end
+
+          }
+          if start != newstart or stop != newstop
+            change = true
+            @layers[layername][i] = [element,newstart,newstop]
+            STDERR.puts "Heuristics has changed element "+element+" from ["+[start,stop].join(",")+"] to ["+[newstart,newstop].join(",")+"] in file "+@file_obj.get_filename+"."
+            markable_as_string(layername,element).each {|string|
+              STDERR.puts "New markable: "+string
+            }
+            STDERR.puts "Sentence: "+@pos_text.join(" ")
+            puts
+          end
+        }
+      end
+    }
+    return change
+  end
+
+  def print_conll_style
+    print_conll_style_to(STDOUT)
+  end
+
+  # CHANGED KE January 2007:
+  # write new adapted FNTab format
+  # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
+  def print_conll_style_to(out)
+
+    # even though in principle there might be multiple
+    # labels for one span [i.e. in one value of the
+    # {gf,fe,pt} hashes], we only ever record one
+
+    gf = Hash.new
+    add_all_to_hash(gf,"GF")
+    fe = Hash.new
+    add_all_to_hash(fe,"FE")
+    pt = Hash.new
+    add_all_to_hash(pt,"PT")
+    target = Hash.new
+    add_all_to_hash(target,"Target")
+
+    in_target = false
+
+    @pos_text.each_index {|i|
+      # write format:
+      # "word" "pt", "gf", "role", "target", "frame", "stuff" "ne", "sent_id"
+      line = Array.new
+      # word
+      word = @pos_text[i]
+      line << word
+
+      start, stop = @charidx[i]
+      # "pt", "gf", "role",
+      [pt,gf,fe].each {|hash|
+        token = Array.new
+        if hash.key?([start,"start"])
+          markables = hash.delete([start,"start"])
+          markables.each {|element|
+            token << "B-"+element
+          }
+        end
+        if hash.key?([stop,"stop"])
+          markables = hash.delete([stop,"stop"])
+          markables.each {|element|
+            token << "E-"+element
+          }
+        end
+        if token.empty?
+          line << "-"
+        else
+          line << token.sort.join(":")
+        end
+      }
+      # "target"
+      if target.key?([start,"start"])
+        target.delete([start,"start"])
+        in_target = true
+      end
+      if in_target
+        line << @file_obj.get_lu+"."+@file_obj.get_pos
+      else
+        line << "-"
+      end
+      if target.key?([stop,"stop"])
+        target.delete([stop,"stop"])
+        in_target = false
+      end
+      # "frame"
+      line << @file_obj.get_frame
+
+      # "stuff" "ne",
+      line << "-"
+      line << "-"
+
+      # "sent_id"
+      line << @file_obj.get_lu_id+"-"+@sent_id
+
+      out.puts line.join("\t")
+    }
+
+    out.puts
+
+    [gf,fe,pt,target].each {|hash|
+      unless hash.empty?
+        STDERR.puts @file_obj.get_filename
+        raise "**** Error: Hash not empty after creation of Sentence in CoNLL-Format (could not find matching words for some markup element)!"
+      end
+    }
+  end
+
+
+  def print_layers
+    @layers.each {|ln,l|
+      puts "Layer "+ln+":"
+      l.each {|element,start,stop|
+        puts "\t"+element+": "+start.to_s+" -- "+stop.to_s
+      }
+      puts "***"
+    }
+  end
+
+
+  private
+
+
+  def our_length(string) # word length, counting each &...; entity as a single character
+    return string.gsub(/&(.+?);/,"X").length
+  end
+
+  def is_fe(fename)
+    @layers["FE"].each {|name,start,stop|
+      if fename == name
+        return true
+      end
+    }
+    return false
+  end
+
+
+  def markable_as_string(layername,markup_name) # returns an array of all markables with this name
+
+    result = Array.new
+
+    festart = nil
+    festop = nil
+    @layers[layername].each {|name,start,stop|
+      if markup_name == name
+        fe = Array.new
+        infe = false
+        @charidx.each_index {|i|
+          startidx,stopidx = @charidx[i]
+          if startidx == start
+            infe = true
+          end
+          if infe
+            fe << @pos_text[i]
+          end
+          if stopidx == stop
+            result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+", VERIFIED]")
+            break
+          elsif stopidx > stop
+            result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+",ERROR]")
+            break
+          end
+        }
+      end
+    }
+    return result
+  end
+
+  def add_to_hash(hash,key,name)
+    exists = false
+    if hash.key?(key)
+      exists = true
+    else
+      hash[key] = Array.new
+      hash[key] << name
+    end
+    return exists
+  end
+
+  def add_all_to_hash(hash,layername)
+    # use "uniq" to remove wrong double annotations
+    @layers[layername].uniq.each {|element,start,stop|
+      exists = add_to_hash(hash,[start, "start"],element)
+      if exists
+        STDERR.puts "Warning ["+@file_obj.get_filename+"]: In layer "+layername+", two elements start at position "+start.to_s+". Only using first. Layer as read from FrameXML: "+@layers[layername].map {|element,start,stop| element+" ("+start.to_s+","+stop.to_s+")"}.join(" ")
+      else
+        add_to_hash(hash,[stop, "stop"],element)
+      end
+    }
+  end
+
+
+  def analyse_layer(layer_elt,name) # read layer information from file and store in @layers
+    if name.nil?
+      STDERR.puts "Error: layer element with empty name."
+    end
+
+    # thisLayer, retv: array:[name(string), start(integer), end(integer)]
+    thisLayer = Array.new
+    retv = Array.new
+
+    labels_elt = layer_elt.children_and_text.detect { |child| child.name == "labels" }
+    unless labels_elt
+      # no labels found, return empty array
+      return thisLayer
+    end
+
+    labels_elt.children_and_text.each { |label|
+      unless label.name == "label"
+        # some other markup, ignore
+        next
+      end
+
+      attributes = label.attributes()
+      if attributes["itype"]
+        # null instantiation, don't retain
+        next
+      end
+      if not(attributes["start"]) and not(attributes["end"])
+        # no start and end labels
+        next
+      end
+      thisLayer << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
+    }
+
+    # sanity check: verify that
+    # 1. we don't have overlapping labels
+
+    deleteHash = Hash.new # keep track of the labels which are to be deleted
+    # i -> Boolean
+
+    thisLayer.each_index {|i|
+      # efficiency: skip already deleted labels
+      if deleteHash[i]
+        next
+      end
+      this_label, this_from, this_to = thisLayer[i]
+
+      # compare with all remaining labels
+      (i+1..thisLayer.length()-1).to_a.each { |other_i|
+        other_label,other_from,other_to = thisLayer[other_i]
+
+        # overlap? Throw out the later FE
+        if this_from <= other_from and other_from <= this_to
+          $stderr.puts "Warning: Label overlap, deleting #{other_label}"
+          deleteHash[other_i] = true
+        elsif this_from <= other_to and other_to <= this_to
+          $stderr.puts "Warning: Label overlap, deleting #{this_label}"
+          deleteHash[i] = true
+        end
+      }
+      # matched with all other labels. If "keep", return
+
+      if deleteHash[i]
+        # $stderr.puts " deleting entry #{i}"
+      else
+        retv << thisLayer[i]
+      end
+    }
+
+    return retv
+  end
+end
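Finally, a minimal sketch tying FrameXMLFile and FrameXMLSentence together, modeled on the commented-out browse_everything code in FNDatabase.rb above; lu1870.xml is a placeholder file name:

    require 'FrameXML'

    ff = FrameXMLFile.new("lu1870.xml") # one lexical-unit file, FrameNet v1.1
    ff.each_sentence { |s|
      if s.contains_FE_annotation_and_target
        s.verify_annotation   # snap label spans to word boundaries first
        s.print_conll_style   # one FNTab line per word, to STDOUT
      end
    }
    ff.close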