frprep 0.0.1.prealpha
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
data/lib/frprep/FNDatabase.rb
@@ -0,0 +1,144 @@
+# sp 28 06 04
+#
+# this module offers methods to extract lemma corpora from the FrameNet database
+
+require 'FrameXML'
+
+class FNDatabase
+
+  def each_matching_sentence(file_pred, sent_pred)
+    # fundamental access function to FrameXML files
+
+    # yields FrameXMLSentence objects where
+    # the sentence matches sent_pred and its FrameXMLFile
+    # (accessed through FrameXMLSentence.get_file_obj) matches file_pred
+    each_matching_file(file_pred) { |frameNetFile|
+      frameNetFile.each_sentence { |frameNetSent|
+        if sent_pred.call(frameNetSent)
+          frameNetSent.verify_annotation
+          yield frameNetSent
+        end
+      }
+    }
+  end
+
+  def each_matching_file(file_pred)
+    # fundamental access function to FrameXML files
+
+    # yields file (FrameXMLFile) objects which match file_pred
+    each_framexml_file { |frameNetFile|
+      if file_pred.call(frameNetFile)
+        yield frameNetFile
+      end
+      frameNetFile.close
+    }
+  end
+
+  def extract_frame(frame, outfile)
+    each_matching_sentence(Proc.new { |fnfile| fnfile.get_frame == frame },
+                           Proc.new { |fnsent| true }) { |fnsent|
+      if fnsent.contains_FE_annotation_and_target
+        fnsent.print_conll_style_to(outfile)
+      end
+    }
+  end
+
+  def extract_lemma(lemma, outfile)
+    each_matching_sentence(Proc.new { |fnfile| fnfile.get_lu == lemma },
+                           Proc.new { |fnsent| true }) { |fnsent|
+      if fnsent.contains_FE_annotation_and_target
+        fnsent.print_conll_style_to(outfile)
+      end
+    }
+  end
+
+  def extract_everything(outdirectory)
+    unless outdirectory[-1,1] == "/"
+      outdirectory += "/"
+    end
+
+    outfiles = Hash.new
+    each_matching_sentence(Proc.new { |fnfile| true },
+                           Proc.new { |fnsent| true }) { |fnsent|
+      frame = fnsent.get_file_obj.get_frame
+      unless outfiles.key?(frame)
+        outfiles[frame] = File.new(outdirectory+frame+".tab", "w")
+      end
+      if fnsent.contains_FE_annotation_and_target
+        fnsent.print_conll_style_to(outfiles[frame])
+      end
+    }
+    # close output files
+    outfiles.each_value { |file|
+      file.close
+    }
+    # remove zero-size files
+    Dir[outdirectory+"*"].each { |filename|
+      if FileTest.zero?(filename)
+        File.unlink(filename)
+      end
+    }
+  end
+
+
+  def initialize(fn_path)
+    unless fn_path[-1,1] == "/"
+      fn_path += "/"
+    end
+    @fn = fn_path
+  end
+
+  private
+
+  def each_framexml_file
+    # files might be zipped
+    Dir[@fn+"lu*.xml.gz"].each { |gzfile|
+      Kernel.system("cp "+gzfile+" /tmp/")
+      Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
+      gzfile =~ /(.+)\.gz/
+      yield FrameXMLFile.new("/tmp/"+File.basename($1))
+    }
+    # or might not
+    Dir[@fn+"lu*.xml"].each { |filename|
+      yield FrameXMLFile.new(filename)
+    }
+  end
+
+  # I don't really remember what this was good for ;-)
+
+  # def browse_everything(allFiles)
+  #   if allFiles
+  #     Dir[fn+"*.xml.gz"].each {|gzfile|
+  #       Kernel.system("cp "+gzfile+" /tmp/")
+  #       Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
+  #       gzfile =~ /(.+)\.gz/
+  #       # STDERR.puts File.basename($1)
+  #       # STDERR.print "."
+  #       ff = FrameXMLFile.new("/tmp/"+File.basename($1))
+  #       ff.each_sentence {|s|
+  #         if s.contains_FE_annotation_and_target
+  #           s.verify_annotation
+  #           if s.verify_annotation
+  #             puts "****************** Error: Still problems after 2nd verification!"
+  #           end
+  #           s.print_conll_style
+  #         end
+  #       }
+  #     }
+  #   else
+  #     ff = FrameXMLFile.new("/tmp/lu1870.xml")
+  #     ff.each_sentence {|s|
+  #       if s.contains_FE_annotation_and_target
+  #         s.verify_annotation
+  #         if s.verify_annotation
+  #           puts "****************** Error: Still problems after 2nd verification!"
+  #         end
+  #         # s.print_layers
+  #         s.print_conll_style
+  #       end
+  #     }
+  #   end
+  # end
+
+end
+
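
FNDatabase is a small driver over a directory of FrameNet lexical-unit (luXML) files. A minimal usage sketch, not part of the diff above; the require path, corpus directory, and frame name are illustrative assumptions:

    require 'frprep/FNDatabase' # assumes the gem's lib dir (and lib/frprep, for the flat requires) is on the load path

    fndb = FNDatabase.new("/corpora/framenet/luXML")

    # one frame into one FNTab file
    out = File.new("/tmp/Awareness.tab", "w")
    fndb.extract_frame("Awareness", out)
    out.close

    # or one .tab file per frame; zero-size files are removed afterwards
    fndb.extract_everything("/tmp/fn_corpora")
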
data/lib/frprep/FixSynSemMapping.rb
@@ -0,0 +1,196 @@
+###
+# FixSynSemMapping:
+# Given a SalsaTigerRegXML sentence with semantic role annotation,
+# simplify the mapping of semantic roles to syntactic constituents
+#
+# The following is lifted from the LREC06 paper on Shalmaneser:
+# During preprocessing, the span of semantic roles in the training corpora is
+# projected onto the output of the syntactic parser by assigning each
+# role to the set of maximal constituents covering its word span.
+# If the word span of a role does not coincide
+# with parse tree constituents, e.g. due to misparses,
+# the role is ``spread out'' across several constituents. This leads to
+# idiosyncratic paths between predicate and semantic role in the parse
+# tree.
+#
+# [The following span standardization algorithm is used to make the
+# syntax-semantics mapping more uniform:]
+# Given a role r that has been assigned, let N be the set of
+# terminal nodes of the syntactic structure that are covered by r.
+#
+# Iteratively compute the maximal projection of N in the syntactic
+# structure:
+# 1) If n is a node such that all of n's children are in N,
+#    then remove n's children from N and add n instead.
+# 2) If n is a node with 3 or more children, and all of n's
+#    children except one are in N, then remove n's children from N
+#    and add n instead.
+# 3) If n is an NP with 2 children, and one of them, another NP,
+#    is in N, and the other, a relative clause, is not, then remove
+#    n's children from N and add n instead.
+#
+# If none of the rules is applicable to N anymore, assign r to the
+# nodes in N.
+#
+# Rule 1 implements normal maximal projection. Rule 2 ``repairs'' parser
+# errors where all children of a node but one have been assigned the
+# same role. Rule 3 addresses a problem of the FrameNet data, where
+# relative clauses have been omitted from roles assigned to NPs.
+
+# KE Feb 08: rule 3 currently out of commission!
+
+require "frprep/SalsaTigerRegXML"
+
+module FixSynSemMapping
+  ##
+  # fix it
+  #
+  # relevant settings in the experiment file:
+  #
+  # fe_syn_repair:
+  #   If there is a node that would be a max. constituent for the
+  #   words covered by the given FE, except that it has one child
+  #   whose words are not in the FE, use the node as max constituent anyway.
+  #   This is to repair cases where the parser has made an attachment choice
+  #   that differs from the one in the gold annotation.
+  #
+  # fe_rel_repair:
+  #   If there is an NP such that all of its children except one have been
+  #   assigned the same FE, and that missing child is a relative clause
+  #   depending on one of the other children, then take the complete NP as
+  #   that FE.
+  def FixSynSemMapping.fixit(sent,               # SalsaTigerSentence object
+                             exp,                # experiment file object
+                             interpreter_class)  # SynInterpreter class
+
+
+    unless exp.get("fe_syn_repair") or exp.get("fe_rel_repair")
+      return
+    end
+
+    if sent.nil?
+      return
+    end
+
+    # "repair" FEs:
+    sent.each_frame { |frame|
+
+      frame.each_child { |fe_or_target|
+
+        # repair only if the FE currently
+        # points to more than one syn node
+        if fe_or_target.children.length() < 2
+          next
+        end
+
+        if exp.get("fe_rel_repair")
+          lastfe = fe_or_target.children.last()
+          if lastfe and interpreter_class.simplified_pt(lastfe) =~ /^(WDT)|(WP\$?)|(WRB)/
+
+            # remove syn nodes that the FE points to
+            old_fe_syn = fe_or_target.children()
+            old_fe_syn.each { |child|
+              fe_or_target.remove_child(child)
+            }
+
+            # set it to point only to the last previous node, the relative pronoun
+            fe_or_target.add_child(lastfe)
+          end
+        end
+
+        if exp.get("fe_syn_repair")
+          # remove syn nodes that the FE points to
+          old_fe_syn = fe_or_target.children()
+          old_fe_syn.each { |child|
+            fe_or_target.remove_child(child)
+          }
+
+          # and recompute
+          new_fe_syn = interpreter_class.max_constituents(old_fe_syn.map { |t|
+                                                            t.yield_nodes
+                                                          }.flatten.uniq,
+                                                          sent,
+                                                          exp.get("fe_syn_repair"))
+
+          # make the FE point to the new nodes
+          new_fe_syn.each { |syn_node|
+            fe_or_target.add_child(syn_node)
+          }
+        end
+      } # each FE
+    } # each frame
+  end # def fixit
+end # module
+
+
+##########
+# old code
+
+# if exp.get("fe_rel_repair")
+#   # repair relative clauses:
+#   # then make a procedure to pass on to max constituents
+#   # that will recognize the relevant cases
+
+#   accept_anyway_proc = Proc.new { |node, children_in, children_out|
+
+#     # node: SynNode
+#     # children_in, children_out: array:SynNode. children_in are the children
+#     # that are already covered by the FE, children_out the ones that aren't
+
+#     # if node is an NP,
+#     # and only one of its children is out,
+#     # and one node in children_in is an NP, and the missing child is an SBAR
+#     # with a child that is a relative pronoun, then consider the child in children_out as covered
+#     if interpreter_class.category(node) == "noun" and
+#        children_out.length() == 1 and
+#        children_in.select { |n| interpreter_class.category(n) == "noun" } and
+#        interpreter_class.category(children_out.first) == "sent" and
+#        (ch = children_out.first.children) and
+#        ch.select { |n| interpreter_class.relative_pronoun?(n) }
+#       true
+#     else
+#       false
+#     end
+#   }
+
+# else
+#   accept_anyway_proc = nil
+# end
+
+
+# # "repair" FEs:
+# sent.each_frame { |frame|
+
+#   frame.each_child { |fe_or_target|
+
+#     # repair only if the FE currently
+#     # points to more than one syn node, or
+#     # if it is a noun with a non-covered sentence sister
+#     if fe_or_target.children.length() > 1 or
+#        (exp.get("fe_rel_repair") and (curr_marked = fe_or_target.children.first()) and
+#         interpreter_class.category(curr_marked) == "noun" and
+#         (p = curr_marked.parent) and
+#         p.children.select { |n| n != curr_marked and interpreter_class.category(n) == "sent" } )
+
+#       # remember nodes covered by the FE
+#       old_fe_syn = fe_or_target.children()
+
+#       # remove syn nodes that the FE points to
+#       old_fe_syn.each { |child|
+#         fe_or_target.remove_child(child)
+#       }
+
+#       # and recompute
+#       new_fe_syn = interpreter_class.max_constituents(old_fe_syn.map { |t| t.yield_nodes }.flatten.uniq,
+#                                                       sent,
+#                                                       exp.get("fe_syn_repair"),
+#                                                       accept_anyway_proc)
+
+#       # make the FE point to the new nodes
+#       new_fe_syn.each { |syn_node|
+#         fe_or_target.add_child(syn_node)
+#       }
+
+#     end # if FE points to more than one syn node
+#   } # each FE
+# } # each frame
data/lib/frprep/FrPrepConfigData.rb
@@ -0,0 +1,66 @@
+# FrPrepConfigData
+# Katrin Erk July 05
+#
+# Preprocessing for Fred and Rosy:
+# access to a configuration and experiment description file
+
+require "frprep/ConfigData"
+
+##############################
+# Class FrPrepConfigData
+#
+# inherits from ConfigData,
+# sets variable names appropriate to the preprocessing task
+
+class FrPrepConfigData < ConfigData
+  def initialize(filename)
+
+    # initialize config data object
+    super(filename, # config file
+          { "prep_experiment_ID" => "string", # experiment identifier
+
+            "frprep_directory" => "string", # dir for frprep internal data
+
+            # information about the dataset
+            "language" => "string", # en, de
+            "origin" => "string", # FrameNet, Salsa, or nothing
+            "format" => "string", # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
+            "encoding" => "string", # utf8, iso, hex, or nothing
+
+
+            # directories
+            "directory_input" => "string", # dir with input data
+            "directory_preprocessed" => "string", # dir with output Salsa/Tiger XML data
+            "directory_parserout" => "string", # dir with parser output for the parser named below
+
+            # syntactic processing
+            "pos_tagger" => "string", # name of POS tagger
+            "lemmatizer" => "string", # name of lemmatizer
+            "parser" => "string", # name of parser
+            "pos_tagger_path" => "string", # path to POS tagger
+            "lemmatizer_path" => "string", # path to lemmatizer
+            "parser_path" => "string", # path to parser
+            "parser_max_sent_num" => "integer", # max number of sentences per parser input file
+            "parser_max_sent_len" => "integer", # max sentence length the parser handles
+
+            "do_parse" => "bool", # use parser?
+            "do_lemmatize" => "bool", # use lemmatizer?
+            "do_postag" => "bool", # use POS tagger?
+
+            # output format: if tabformat_output == true,
+            # output in Tab format rather than Salsa/Tiger XML
+            # (this will not work if do_parse == true)
+            "tabformat_output" => "bool",
+
+            # syntactic repairs, dependent on existing semantic role annotation
+            "fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
+            "fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
+          },
+          [] # variables
+          )
+
+  end
+end
+
+
+
|
+# sp 18 06 2004
+#
+# access to FrameNet XML files, sentences, and annotation.
+#
+# sp 10 11 04: only data from the first layer with name XY is
+# used for output. Other data is saved in layer XY.2nd, but is
+# currently not processed.
+#
+# sp 22 05 04: also, if two labels exist which cover the same span
+# (ie there is a double annotation within the same layer), ignore
+# all but the first label.
+#
+# ke 13 07 05:
+# - changed to RegXML.rb
+# - fixed two problems in analyse_layer:
+#   - Deleting problematic labels:
+#     For some reason, thisLayer[i+1..-1].each_index {|other_i|
+#     included the index 0 in any case, resulting in the 1st
+#     label being deleted in any case.
+#   - Deleting problematic labels, checking for label overlap:
+#     The old formulation worked only if labels occurred in the array
+#     in the order they occurred in the sentence, but that was not the case.
+# - Change in deleting problematic labels:
+#   No longer delete duplicate labels, since e.g. in the PT level there
+#   may be more than one NP label, and we want to keep those.
+#
+# KE January 2007:
+# write new adapted FNTab format
+# ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
+
+require 'Ampersand'
+require 'ISO-8859-1'
+require 'RegXML'
+
+class FrameXMLFile # only verified to work for FrameNet v1.1
+
+  def initialize(filename)
+    @filename = filename
+    file = File.new(filename)
+    counter = 0
+    while true
+      counter += 1
+      line = file.gets
+      if line =~ /<lexunit/
+        break
+      end
+      if counter > 3
+        STDERR.puts "Error: File "+filename+" does not conform to FrameNet v1.1 standard (lexunit in 3rd line)"
+        Kernel.exit
+      end
+    end
+    # found lexunit
+    string = line
+    while (line = file.gets())
+      string << line
+    end
+    @lexunit = RegXML.new(string)
+    attributes = @lexunit.attributes()
+    @id = attributes["ID"]
+    attributes["name"] =~ /^([^.]+)\.([^.]+)$/
+    @lu = $1
+    @pos = $2 ? $2.upcase : nil
+    if @lu.nil?
+      raise "[framexml] no lemma in header of file #{@filename}"
+    elsif @pos.nil?
+      raise "[framexml] no pos in header of file #{@filename}"
+    end
+    @frame = attributes["frame"]
+  end
+
+  def get_lu
+    return @lu.gsub(" ","_")
+  end
+
+  def get_lu_id
+    return @id
+  end
+
+  def get_filename
+    return @filename
+  end
+
+  def get_pos
+    return @pos
+  end
+
+  def get_frame
+    return @frame
+  end
+
+  def close
+  end
+
+  def each_sentence
+    @lexunit.children_and_text().each { |subcorpus|
+      subcorpus.children_and_text().each { |annotationSet|
+        if annotationSet.name == "annotationSet"
+          # sentence found
+          yield FrameXMLSentence.new(annotationSet, self)
+        end
+      }
+    }
+  end
+end
+
+class FrameXMLSentence
+  def initialize(annotationSet, file_obj)
+    @file_obj = file_obj
+
+    # layers: hash layer_name -> array:[name, start, stop]
+    # name: name of the element, string
+    # start: start character, integer
+    # stop: end character, integer
+    @layers = Hash.new
+
+    annotationSet.children_and_text().each { |sentence_or_layer_elt|
+
+      case sentence_or_layer_elt.name
+      when "sentence"
+        # sentence: has ID, its child is <text>[text]</text>
+        @sent_id = sentence_or_layer_elt.attributes["ID"]
+        text_elt = sentence_or_layer_elt.children_and_text().detect { |child|
+          child.name == "text"
+        }
+        if text_elt
+          # found the text element. its only child should be the text
+          @orig_text = text_elt.children_and_text().detect { |child|
+            child.text?
+          }
+          if @orig_text
+            # take text out of RegXML object
+            @orig_text = @orig_text.to_s()
+          end
+        end
+
+      when "layers"
+        # contains annotation layers
+        sentence_or_layer_elt.children_and_text().each { |layer|
+          unless layer.name == "layer"
+            # additional material, ignore
+            next
+          end
+
+          name = layer.attributes["name"]
+          unless name
+            raise "layer without a name"
+          end
+          unless @layers.key?(name)
+            @layers[name] = analyse_layer(layer, name)
+          end
+        }
+      end
+    }
+
+    @pos_text = UtfIso.to_iso_8859_1(@orig_text).split(" ") # text with special characters replaced by iso8859 characters
+    @text = Ampersand.utf8_to_hex(@orig_text).split(" ") # text with special characters replaced by &...; sequences
+
+    # text and pos_text have the same number of elements!
+    @start_is = Hash.new # map char indices (start of words) onto word indices
+    @stop_is = Hash.new # map char indices (end of words) onto word indices
+    @charidx = Array.new # maps word indices on [start,stop]
+
+    @double_space = Array.new
+    pos = 0
+    while (match = @orig_text.index(/(\s\s+)/, pos))
+      @double_space << match
+      pos = match+1
+    end
+
+
+    # fill start, stop and charidx arrays
+    char_i = 0
+    @pos_text.each_index { |word_i|
+      @start_is[char_i] = word_i
+      startchar = char_i
+      # puts "Remembering "+char_i.to_s+" as start index of word "+word_i.to_s
+      char_i += our_length(@pos_text[word_i])
+      @stop_is[char_i-1] = word_i
+
+      stopchar = char_i-1
+
+      # puts "Remembering "+(char_i-1).to_s+" as stop index of word "+word_i.to_s
+
+      @charidx << [startchar, stopchar]
+
+      # separators
+      if @double_space.include?(char_i) then
+        char_i += 2
+      else
+        char_i += 1
+      end
+    }
+  end
+
+  def get_file_obj
+    return @file_obj
+  end
+
+  def get_sent_id
+    return @sent_id
+  end
+
+  def print_text
+    puts "("+@sent_id+")\t"+@text.join(" ")
+  end
+
+  def contains_FE_annotation_and_target
+    target_info = @layers["Target"][0]
+    unless target_info[0] == "Target"
+      STDERR.puts "Error in sentence from "+@file_obj.get_filename+": No target" # strictly speaking, no target at pos 0 in @layers["Target"]
+      STDERR.puts "Sentence: "+@text.join(" ")
+      return false
+    else
+      return (@layers.key?("FE") and target_info[2] != 0)
+    end
+  end
+
+  # we only verify the interesting layers (FE, GF, PT, Target);
+  # if there is weird stuff going on on e.g. the Noun or Adj layer, we don't care.
+
+  def verify_annotation # returns true if some change has taken place
+    change = false
+    @layers.each_pair { |layername, l|
+
+      if layername=="FE" or layername=="GF" or layername=="PT" or layername=="Target" # only verify the "important" layers
+
+        l.each_index { |i|
+
+          element, start, stop = l[i]
+
+          newstart = start
+          newstop = stop
+
+          @charidx.each_index { |j|
+            unless j == 0
+              pstartidx, pstopidx = @charidx[j-1]
+            end
+            startidx, stopidx = @charidx[j]
+
+            if (start > startidx and start <= stopidx) or
+               (j != 0 and start > pstopidx and start < startidx)
+              newstart = startidx
+            end
+
+            if (stop >= startidx and stop < stopidx)
+              newstop = stopidx
+            elsif (j != 0 and stop > pstopidx and stop < startidx)
+              newstop = pstopidx
+            end
+
+          }
+          if start != newstart or stop != newstop
+            change = true
+            @layers[layername][i] = [element, newstart, newstop]
+            STDERR.puts "Heuristics has changed element "+element+" from ["+[start,stop].join(",")+"] to ["+[newstart,newstop].join(",")+"] in file "+@file_obj.get_filename+"."
+            markable_as_string(layername, element).each { |string|
+              STDERR.puts "New markable: "+string
+            }
+            STDERR.puts "Sentence: "+@pos_text.join(" ")
+            puts
+          end
+        }
+      end
+    }
+    return change
+  end
+
+  def print_conll_style
+    print_conll_style_to(STDOUT)
+  end
+
+  # CHANGED KE January 2007:
+  # write new adapted FNTab format
+  # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
+  def print_conll_style_to(out)
+
+    # even though in principle there might be multiple
+    # labels for one span [i.e. in one value of the
+    # {gf,fe,pt} hashes], we only ever record one
+
+    gf = Hash.new
+    add_all_to_hash(gf, "GF")
+    fe = Hash.new
+    add_all_to_hash(fe, "FE")
+    pt = Hash.new
+    add_all_to_hash(pt, "PT")
+    target = Hash.new
+    add_all_to_hash(target, "Target")
+
+    in_target = false
+
+    @pos_text.each_index { |i|
+      # write format:
+      # "word" "pt", "gf", "role", "target", "frame", "stuff" "ne", "sent_id"
+      line = Array.new
+      # word
+      word = @pos_text[i]
+      line << word
+
+      start, stop = @charidx[i]
+      # "pt", "gf", "role",
+      [pt, gf, fe].each { |hash|
+        token = Array.new
+        if hash.key?([start,"start"])
+          markables = hash.delete([start,"start"])
+          markables.each { |element|
+            token << "B-"+element
+          }
+        end
+        if hash.key?([stop,"stop"])
+          markables = hash.delete([stop,"stop"])
+          markables.each { |element|
+            token << "E-"+element
+          }
+        end
+        if token.empty?
+          line << "-"
+        else
+          line << token.sort.join(":")
+        end
+      }
+      # "target"
+      if target.key?([start,"start"])
+        target.delete([start,"start"])
+        in_target = true
+      end
+      if in_target
+        line << @file_obj.get_lu+"."+@file_obj.get_pos
+      else
+        line << "-"
+      end
+      if target.key?([stop,"stop"])
+        target.delete([stop,"stop"])
+        in_target = false
+      end
+      # "frame"
+      line << @file_obj.get_frame
+
+      # "stuff", "ne"
+      line << "-"
+      line << "-"
+
+      # "sent_id"
+      line << @file_obj.get_lu_id+"-"+@sent_id
+
+      out.puts line.join("\t")
+    }
+
+    out.puts
+
+    [gf, fe, pt, target].each { |hash|
+      unless hash.empty?
+        STDERR.puts @file_obj.get_filename
+        raise "**** Error: Hash not empty after creation of Sentence in CoNLL-Format (could not find matching words for some markup element)!"
+      end
+    }
+  end
+
+
+  def print_layers
+    @layers.each { |ln, l|
+      puts "Layer "+ln+":"
+      l.each { |element, start, stop|
+        puts "\t"+element+": "+start.to_s+" -- "+stop.to_s
+      }
+      puts "***"
+    }
+  end
+
+
+  private
+
+
+  def our_length(string) # word length, counting each &...; entity as a single character
+    return string.gsub(/&(.+?);/, "X").length
+  end
+
+  def is_fe(fename)
+    @layers["FE"].each { |name, start, stop|
+      if fename == name
+        return true
+      end
+    }
+    return false
+  end
+
+
+  def markable_as_string(layername, markup_name) # returns an array of all markables with this name
+
+    result = Array.new
+
+    festart = nil
+    festop = nil
+    @layers[layername].each { |name, start, stop|
+      if markup_name == name
+        fe = Array.new
+        infe = false
+        @charidx.each_index { |i|
+          startidx, stopidx = @charidx[i]
+          if startidx == start
+            infe = true
+          end
+          if infe
+            fe << @pos_text[i]
+          end
+          if stopidx == stop
+            result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+", VERIFIED]")
+            break
+          elsif stopidx > stop
+            result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+",ERROR]")
+            break
+          end
+        }
+      end
+    }
+    return result
+  end
+
+  def add_to_hash(hash, key, name)
+    exists = false
+    if hash.key?(key)
+      exists = true
+    else
+      hash[key] = Array.new
+      hash[key] << name
+    end
+    return exists
+  end
+
+  def add_all_to_hash(hash, layername)
+    # use "uniq" to remove wrong double annotations
+    @layers[layername].uniq.each { |element, start, stop|
+      exists = add_to_hash(hash, [start, "start"], element)
+      if exists
+        STDERR.puts "Warning ["+@file_obj.get_filename+"]: In layer "+layername+", two elements start at position "+start.to_s+". Only using first. Layer as read from FrameXML: "+@layers[layername].map { |el, s, st| el+" ("+s.to_s+","+st.to_s+")" }.join(" ")
+      else
+        add_to_hash(hash, [stop, "stop"], element)
+      end
+    }
+  end
+
+
+  def analyse_layer(layer_elt, name) # read layer information from file and store in @layers
+    if name.nil?
+      STDERR.puts "Error: layer element with empty name."
+    end
+
+    # thisLayer, retv: array:[name(string), start(integer), end(integer)]
+    thisLayer = Array.new
+    retv = Array.new
+
+    labels_elt = layer_elt.children_and_text.detect { |child| child.name == "labels" }
+    unless labels_elt
+      # no labels found, return empty array
+      return thisLayer
+    end
+
+    labels_elt.children_and_text.each { |label|
+      unless label.name == "label"
+        # some other markup, ignore
+        next
+      end
+
+      attributes = label.attributes()
+      if attributes["itype"]
+        # null instantiation, don't retain
+        next
+      end
+      if not(attributes["start"]) and not(attributes["end"])
+        # no start and end labels
+        next
+      end
+      thisLayer << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
+    }
+
+    # sanity check: verify that
+    # 1. we don't have overlapping labels
+
+    deleteHash = Hash.new # keep track of the labels which are to be deleted
+    # i -> Boolean
+
+    thisLayer.each_index { |i|
+      # efficiency: skip already deleted labels
+      if deleteHash[i]
+        next
+      end
+      this_label, this_from, this_to = thisLayer[i]
+
+      # compare with all remaining labels
+      (i+1..thisLayer.length()-1).to_a.each { |other_i|
+        other_label, other_from, other_to = thisLayer[other_i]
+
+        # overlap? Throw out the later FE
+        if this_from <= other_from and other_from <= this_to
+          $stderr.puts "Warning: Label overlap, deleting #{other_label}"
+          deleteHash[other_i] = true
+        elsif this_from <= other_to and other_to <= this_to
+          $stderr.puts "Warning: Label overlap, deleting #{this_label}"
+          deleteHash[i] = true
+        end
+      }
+      # matched with all other labels. If "keep", return
+
+      if deleteHash[i]
+        # $stderr.puts " deleting entry #{i}"
+      else
+        retv << thisLayer[i]
+      end
+    }
+
+    return retv
+  end
+end