shalmaneser 1.2.0.rc4 → 1.2.rc5
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/shalmaneser +8 -2
- data/doc/index.md +1 -0
- data/lib/shalmaneser/opt_parser.rb +68 -67
- metadata +49 -119
- data/bin/fred +0 -16
- data/bin/frprep +0 -34
- data/bin/rosy +0 -17
- data/lib/common/AbstractSynInterface.rb +0 -1229
- data/lib/common/Counter.rb +0 -18
- data/lib/common/EnduserMode.rb +0 -27
- data/lib/common/Eval.rb +0 -480
- data/lib/common/FixSynSemMapping.rb +0 -196
- data/lib/common/Graph.rb +0 -345
- data/lib/common/ISO-8859-1.rb +0 -24
- data/lib/common/ML.rb +0 -186
- data/lib/common/Mallet.rb +0 -236
- data/lib/common/Maxent.rb +0 -229
- data/lib/common/Optimise.rb +0 -195
- data/lib/common/Parser.rb +0 -213
- data/lib/common/RegXML.rb +0 -269
- data/lib/common/RosyConventions.rb +0 -171
- data/lib/common/STXmlTerminalOrder.rb +0 -194
- data/lib/common/SalsaTigerRegXML.rb +0 -2347
- data/lib/common/SalsaTigerXMLHelper.rb +0 -99
- data/lib/common/SynInterfaces.rb +0 -282
- data/lib/common/TabFormat.rb +0 -721
- data/lib/common/Tiger.rb +0 -1448
- data/lib/common/Timbl.rb +0 -144
- data/lib/common/Tree.rb +0 -61
- data/lib/common/config_data.rb +0 -470
- data/lib/common/config_format_element.rb +0 -220
- data/lib/common/headz.rb +0 -338
- data/lib/common/option_parser.rb +0 -13
- data/lib/common/prep_config_data.rb +0 -62
- data/lib/common/prep_helper.rb +0 -1330
- data/lib/common/ruby_class_extensions.rb +0 -310
- data/lib/db/db_interface.rb +0 -48
- data/lib/db/db_mysql.rb +0 -145
- data/lib/db/db_sqlite.rb +0 -280
- data/lib/db/db_table.rb +0 -239
- data/lib/db/db_wrapper.rb +0 -176
- data/lib/db/sql_query.rb +0 -243
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/fred/Baseline.rb +0 -150
- data/lib/fred/FileZipped.rb +0 -31
- data/lib/fred/FredBOWContext.rb +0 -877
- data/lib/fred/FredConventions.rb +0 -232
- data/lib/fred/FredDetermineTargets.rb +0 -319
- data/lib/fred/FredEval.rb +0 -312
- data/lib/fred/FredFeatureExtractors.rb +0 -322
- data/lib/fred/FredFeatures.rb +0 -1061
- data/lib/fred/FredFeaturize.rb +0 -602
- data/lib/fred/FredNumTrainingSenses.rb +0 -27
- data/lib/fred/FredParameters.rb +0 -402
- data/lib/fred/FredSplit.rb +0 -84
- data/lib/fred/FredSplitPkg.rb +0 -180
- data/lib/fred/FredTest.rb +0 -606
- data/lib/fred/FredTrain.rb +0 -144
- data/lib/fred/PlotAndREval.rb +0 -480
- data/lib/fred/fred.rb +0 -47
- data/lib/fred/fred_config_data.rb +0 -185
- data/lib/fred/md5.rb +0 -23
- data/lib/fred/opt_parser.rb +0 -250
- data/lib/frprep/Ampersand.rb +0 -39
- data/lib/frprep/CollinsInterface.rb +0 -1165
- data/lib/frprep/Counter.rb +0 -18
- data/lib/frprep/FNCorpusXML.rb +0 -643
- data/lib/frprep/FNDatabase.rb +0 -144
- data/lib/frprep/FrameXML.rb +0 -513
- data/lib/frprep/Graph.rb +0 -345
- data/lib/frprep/MiniparInterface.rb +0 -1388
- data/lib/frprep/RegXML.rb +0 -269
- data/lib/frprep/STXmlTerminalOrder.rb +0 -194
- data/lib/frprep/SleepyInterface.rb +0 -384
- data/lib/frprep/TntInterface.rb +0 -44
- data/lib/frprep/TreetaggerInterface.rb +0 -327
- data/lib/frprep/do_parses.rb +0 -143
- data/lib/frprep/frprep.rb +0 -693
- data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
- data/lib/frprep/interfaces/stanford_interface.rb +0 -353
- data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
- data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
- data/lib/frprep/one_parsed_file.rb +0 -28
- data/lib/frprep/opt_parser.rb +0 -94
- data/lib/frprep/ruby_class_extensions.rb +0 -310
- data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
- data/lib/rosy/ExternalConfigData.rb +0 -58
- data/lib/rosy/FailedParses.rb +0 -130
- data/lib/rosy/FeatureInfo.rb +0 -242
- data/lib/rosy/GfInduce.rb +0 -1115
- data/lib/rosy/GfInduceFeature.rb +0 -148
- data/lib/rosy/InputData.rb +0 -294
- data/lib/rosy/RosyConfusability.rb +0 -338
- data/lib/rosy/RosyEval.rb +0 -465
- data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
- data/lib/rosy/RosyFeaturize.rb +0 -281
- data/lib/rosy/RosyInspect.rb +0 -336
- data/lib/rosy/RosyIterator.rb +0 -478
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
- data/lib/rosy/RosyPruning.rb +0 -165
- data/lib/rosy/RosyServices.rb +0 -744
- data/lib/rosy/RosySplit.rb +0 -232
- data/lib/rosy/RosyTask.rb +0 -19
- data/lib/rosy/RosyTest.rb +0 -829
- data/lib/rosy/RosyTrain.rb +0 -234
- data/lib/rosy/RosyTrainingTestTable.rb +0 -787
- data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
- data/lib/rosy/View.rb +0 -418
- data/lib/rosy/opt_parser.rb +0 -379
- data/lib/rosy/rosy.rb +0 -78
- data/lib/rosy/rosy_config_data.rb +0 -121
- data/lib/shalmaneser/version.rb +0 -3
data/lib/frprep/Counter.rb
DELETED
data/lib/frprep/FNCorpusXML.rb
DELETED
@@ -1,643 +0,0 @@
-# KE Dec 2006
-# Access for FrameNet corpus XML file
-# Mainly taken over from FramesXML
-#
-# changes:
-# - no single frame for the whole corpus
-# - below <sentence> level there is an <annotationSet> level.
-#   One annotationSet may include a single frame,
-#   or a reference to all named entities in a sentence
-#
-# Write out in tab format, one line per word:
-# Format:
-# word (pt gf role target frame stuff)* ne sent_id
-# with
-# word: word
-# whole bracketed group: information about one frame annotation
-# pt: phrase type
-# gf: grammatical function
-# role: frame element
-# target: LU occurrence
-# frame: frame
-# stuff: support, and other things
-# ne: named entity
-# sent_id: sentence ID
-
-require 'frprep/Ampersand'
-require 'common/ISO-8859-1'
-require 'common/RegXML'
-
-#####################
-# mixins to make work with RegXML a little less repetitive
-class RegXML
-  def first_child_matching(child_name)
-    return children_and_text().detect { |c| c.name() == child_name }
-  end
-
-  def each_child_matching(child_name)
-    children_and_text().each { |c|
-      if c.name() == child_name
-        yield c
-      end
-    }
-  end
-end
-
-#####################
-# class to keep data for one frame
-class FNCorpusAset
-  attr_reader :layers, :aset_type, :aset_id, :frame_name, :lu
-
-  #######
-  # Analyze RegXML object, store in object variables:
-  #
-  # @aset_type: "frame" or "NER"
-  # @frame_name: frame name for "frame" type
-  # @lu: LU for "frame" type
-  # @aset_id: ID of the annotation set
-  # @layers: hash: layer type (FE, GF, PT, Target, NER) -> [offset, "start"/"stop"] -> list of labels
-  #          string -> int*string -> array:string
-  #
-  def initialize(aset, #RegXML object
-                 charidx) # array of pairs [start index, stop index] int*int
-
-    @layers = Hash.new()
-    @frame_name = nil
-    @lu = nil
-    @aset_type = nil
-
-    attributes = aset.attributes()
-
-    @aset_id = attributes["ID"]
-
-    if attributes["frameName"]
-      # all of these seem to be frames. store in 'frames' array
-      unless attributes["luName"]
-        $stderr.puts "FNCorpusAset warning: cannot determine LU name"
-        $stder.puts aset.to_s()
-        return
-      end
-      @aset_type = "frame"
-      @frame_name = attributes["frameName"]
-      @lu = attributes["luName"]
-
-      unless (layers = aset.first_child_matching("layers"))
-        $stderr.puts "FNCorpusAset warning: unexpectedly no layers found"
-        $stderr.puts aset.to_s()
-        return
-      end
-
-      layers.each_child_matching("layer") { |l| analyze_layer(l, charidx) }
-
-    else
-      # all we seem to get here are named entity labels.
-      @aset_type = "NER"
-
-      unless (layers = aset.first_child_matching("layers"))
-        $stderr.puts "FNCorpusAset Warning: unexpectedly no layers found"
-        $stderr.puts aset.to_s()
-        return
-      end
-      unless (layer = layers.first_child_matching("layer"))
-        $stderr.puts "FNCorpusAset Warning: unexpectedly no layers found"
-        $stderr.puts aset.to_s()
-        return
-      end
-
-      unless layer.attributes()["name"] == "NER"
-        $stderr.puts "FNCorpusAset Warning: unexpected layer #{layer.attributes()["name"]}, was expecting only an NER layer."
-        $stderr.puts aset.to_s()
-        return
-      end
-
-      analyze_layer(layer, charidx)
-
-    end
-  end
-
-
-  #############
-  # input: <layer> RegXML object
-  # analyze this, put into @layers data structure
-  def analyze_layer(layer, # RegXML object
-                    charidx) # array:int*int pairs start/end index of words
-    layer_name = layer.attributes()["name"]
-    unless layer_name
-      $stderr.puts "FNCorpusAset warning: cannot determine layer name"
-      $stderr.puts layer.to_s
-      return
-    end
-
-    # FN-specific: skip 2nd layer FEs for now
-    if layer_name == "FE" and layer.attributes()["rank"] == "2"
-      return
-    end
-
-    unless @layers[layer_name]
-      @layers[layer_name] = Hash.new()
-    end
-
-    unless (labels = layer.first_child_matching("labels"))
-      # nothing to record for this layer
-      return
-    end
-
-
-    # taking over much of analyse_layer() from class FrameXML
-    thisLayer = Array.new()
-
-    labels.each_child_matching("label") { |label|
-      attributes = label.attributes()
-      if attributes["itype"] =~ /NI/
-        # null instantiation, ignore
-        next
-      end
-
-      if not(attributes["start"]) and not(attributes["end"])
-        # no start and end labels
-        next
-      end
-      thisLayer << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
-    }
-
-    # sanity check: do indices
-    # match word start and end indices?
-    thisLayer = verify_annotation(thisLayer, charidx)
-
-    # sanity check: verify that
-    # we don't have overlapping labels
-
-    deleteHash = Hash.new # keep track of the labels which are to be deleted
-    # i -> Boolean
-
-    thisLayer.each_index {|i|
-      # efficiency: skip already delete labels
-      if deleteHash[i]
-        next
-      end
-      this_label, this_from , this_to = thisLayer[i]
-
-      # compare with all remaining labels
-      (i+1..thisLayer.length()-1).to_a.each { |other_i|
-        other_label,other_from,other_to = thisLayer[other_i]
-
-        # overlap? Throw out the later FE
-        if this_from <= other_from and other_from <= this_to
-          $stderr.puts "Warning: Label overlap, deleting #{other_label}"
-          deleteHash[other_i] = true
-        elsif this_from <= other_to and other_to <= this_to
-          $stderr.puts "Warning: Label overlap, deleting #{this_label}"
-          delete_hash[i] = true
-        end
-      }
-      # matched with all other labels. If "keep", return
-
-      if deleteHash[i]
-        # $stderr.puts " deleting entry #{i}"
-      else
-        [ [this_from, "start"], [this_to, "stop"]].each { |offset, start_or_stop|
-          unless @layers[layer_name].has_key?([offset, start_or_stop])
-            @layers[layer_name][[offset, start_or_stop]] = Array.new()
-          end
-          @layers[layer_name][ [offset, start_or_stop] ] << this_label
-        }
-      end
-    }
-  end
-
-  ##############3
-  # verify found triples label/from_index/to_index
-  # against given start/end indices of words
-  #
-  # returns: triples, possibly changed
-  def verify_annotation(found, # array: label/from/to, string*int*int
-                        charidx) # array: from/to, int*int
-
-    return found.map {|element, start, stop|
-
-      newstart = start
-      newstop = stop
-
-      # compare against word start/stop indices
-      charidx.each_index{|j|
-        unless j== 0
-          pstartidx, pstopidx = charidx[j-1]
-        end
-        startidx, stopidx = charidx[j]
-
-        if (start > startidx and start <= stopidx) or
-            (j != 0 and start > pstopidx and start < startidx)
-          newstart = startidx
-        end
-
-        if (stop >= startidx and stop < stopidx)
-          newstop = stopidx
-        elsif (j != 0 and stop > pstopidx and stop < startidx)
-          newstop = pstopidx
-        end
-      }
-
-      # change?
-      if start != newstart or stop != newstop
-        # report change
-        $stderr.puts "FNCorpusXML warning: Heuristics has changed element "+element
-        $stderr.puts "\tfrom ["+[start,stop].join(",")+"] to ["+[newstart,newstop].join(",")+"]"
-
-        [element, newstart, newstop]
-
-      else
-
-        [element, start, stop]
-      end
-    }
-  end
-end
-
-#####################
-# one FrameNet corpus
-#
-# just the filename is stored,
-# the text is read only on demand
-class FNCorpusXMLFile
-
-  ###
-  def initialize(filename)
-    @filename = filename
-
-  end
-
-  ###
-  # yield each document in this corpus
-  # as a string
-  def each_document_string()
-    # read each <document> element and yield it
-
-    doc_string = ""
-    inside_doc_elem = false
-    f = File.new(@filename)
-
-    # <corpus>
-    # <documents>
-    # <document ...>
-    # </document>
-    # <document ...>
-    # </document>
-    # </documents>
-    # </corpus>
-    f.each { |line|
-      if not(inside_doc_elem) and line =~ /^.*?(<document\s.*)$/
-        # start of <document>
-        inside_doc_elem = true
-        doc_string << $1
-      elsif inside_doc_elem and line =~ /^(.*?<\/document>).*$/
-        # end of <document>
-        doc_string << $1
-        yield doc_string
-        doc_string = ""
-        inside_doc_elem = false
-      elsif inside_doc_elem
-        # within <document>
-        doc_string << line
-      end
-    }
-  end
-
-  ###
-  # yield each sentence
-  # as a FNCorpusXMLSentence object
-  def each_sentence()
-    # read each <document> element and yield it
-
-    sent_string = ""
-    inside_sent_elem = false
-    f = File.new(@filename)
-
-    # <corpus>
-    # <documents>
-    # <document ...>
-    # <paragraphs>
-    # <paragraph>
-    # <sentences>
-    # <sentence ...>
-    f.each { |line|
-      if not(inside_sent_elem) and line =~ /^.*?(<sentence\s.*)$/
-        # start of <sentence>
-        inside_sent_elem = true
-        sent_string << $1
-      elsif inside_sent_elem and line =~ /^(.*?<\/sentence>).*$/
-        # end of <document>
-        sent_string << $1
-        yield FNCorpusXMLSentence.new(sent_string)
-        sent_string = ""
-        inside_sent_elem = false
-      elsif inside_sent_elem
-        # within <sentence>
-        sent_string << line.chomp()
-      end
-    }
-  end
-
-  ###
-  # print whole FN file in tab format
-  def print_conll_style(file = $stdout)
-    each_sentence() { |s_obj|
-      s_obj.print_conll_style(file)
-    }
-  end
-end
-
-#######################################
-# Keep one sentence from FN corpus XML
-# as a RegXML object,
-# offer printout in tabular format
-class FNCorpusXMLSentence
-
-  #########
-  def initialize(sent_string)
-    @sent = RegXML.new(sent_string)
-    @sent_id = @sent.attributes()["ID"]
-  end
-
-  ##############
-  # print to file
-  # in tabular format
-  #
-  # row format:
-  # word (pt gf role target frame stuff)* ne sent_id
-  #
-  # word: word
-  # whole bracketed group: information about one frame annotation
-  # pt: phrase type
-  # gf: grammatical function
-  # role: frame element
-  # target: LU occurrence
-  # frame: frame
-  # stuff: support, and other things
-  # ne: named entity
-  # sent_id: sentence ID
-  def print_conll_style(file = $stdout)
-    pos_text, charidx = read_sentence()
-    asets = read_annotation_sets(charidx)
-
-    # aset -> are we inside the target or not?
-    in_target = Hash.new(false)
-    # aset -> are we in all sorts of other annotations, like Support?
-    in_stuff = Hash.new()
-    # are we inside a named entity?
-    in_ne = nil
-
-    # record every opening and closing label we recognize,
-    # to check later
-    recognized_labels = Hash.new()
-
-    pos_text.each_index {|i|
-      line = Array.new
-      word = pos_text[i]
-
-      # add: word
-      line << word
-
-      start, stop = charidx[i]
-
-      # iterate over the frames we have
-      # add: (pt gf role target frame stuff)
-      asets.each { |aset|
-        unless aset.aset_type == "frame"
-          # don't treat NEs as a frame here
-          next
-        end
-
-        # pt, gf, role
-        ["PT", "GF", "FE"].each { |layer|
-          token = Array.new
-          hash = aset.layers[layer]
-          if hash.has_key?([start,"start"])
-            recognized_labels[[layer, start, "start"]] = true
-
-            markables = hash[[start,"start"]]
-            markables.each {|element|
-              token << "B-"+element
-            }
-          end
-          if hash.has_key?([stop,"stop"])
-            recognized_labels[[layer, stop, "stop"]] = true
-
-            markables = hash[[stop,"stop"]]
-            markables.each {|element|
-              token << "E-"+element
-            }
-          end
-
-          if token.empty?
-            line << "-"
-          else
-            line << token.sort.join(":")
-          end
-        }
-
-        # target
-        target = aset.layers["Target"]
-        if target.has_key?([start,"start"])
-          recognized_labels[["Target", start, "start"]] = true
-          in_target[aset] = true
-        end
-        if in_target[aset]
-          line << aset.lu
-        else
-          line << "-"
-        end
-        if target.has_key?([stop,"stop"])
-          recognized_labels[["Target", stop, "stop"]] = true
-          in_target[aset] = false
-        end
-
-        # frame
-        line << aset.frame_name
-
-        # stuff
-        unless in_stuff.has_key?(aset)
-          in_stuff[aset] = Array.new()
-        end
-        aset.layers.each_key { |layer|
-          if ["PT", "GF", "FE", "Target"].include? layer
-            # already done those
-            next
-          end
-          # all the rest goes in "stuff"
-          if aset.layers[layer].has_key?([start, "start"])
-            aset.layers[layer][[start, "start"]].each { |entry|
-              in_stuff[aset] << layer + "-" + entry
-            }
-            recognized_labels[[layer, start, "start"]] = true
-          end
-        }
-        if in_stuff[aset].empty?
-          line << "-"
-        else
-          line << in_stuff[aset].join(":")
-        end
-        aset.layers.each_key { |layer|
-          if aset.layers[layer].has_key?([stop, "stop"])
-            recognized_labels[[layer, stop, "stop"]] = true
-            aset.layers[layer][[stop, "stop"]].each { |entry|
-              in_stuff[aset].delete(layer + "-" + entry)
-            }
-          end
-        }
-      }
-
-      # ne
-      if (ner = asets.detect { |a| a.aset_type == "NER" })
-        if ner.layers["NER"] and ner.layers["NER"].has_key?([start, "start"])
-          recognized_labels[["NER", start, "start"]] = true
-          in_ne = ner.layers["NER"][[start,"start"]]
-        end
-        if in_ne
-          line << in_ne.join(":")
-        else
-          line << "-"
-        end
-        if ner.layers["NER"] and ner.layers["NER"].has_key?([stop, "stop"])
-          recognized_labels[["NER", stop, "stop"]] = true
-          in_ne = nil
-        end
-      end
-
-      # sent id
-      line << @sent_id
-
-      # sanity check:
-      # row format:
-      # word (pt gf role target frame stuff)* ne sent_id
-      # so number of columns must be 3 + 6x for some x >= 0
-      unless (line.length() - 3)%6 == 0
-        $stderr.puts "Something wrong with the line length."
-        $stderr.puts "I have #{asets.length() - 1} frames plus NEs, "
-        $stderr.puts "but #{line.length()} columns."
-        raise
-      end
-
-
-      file.puts line.join("\t")
-    }
-
-    # sanity check:
-    # now count all labels,
-    # to see if we've printed them all
-    lost_labels = Array.new()
-    asets.each { |aset|
-      aset.layers.each_key { |layer|
-        aset.layers[layer].each_key() { |offset, start_or_stop|
-          unless recognized_labels[[layer, offset, start_or_stop]]
-            lost_labels << [layer, offset, start_or_stop,
-                            aset.layers[layer][[offset, start_or_stop]]]
-          end
-        }
-      }
-    }
-    unless lost_labels.empty?
-      $stderr.puts "Offsets: "
-      pos_text.each_index { |i|
-        $stderr.puts "\t#{pos_text[i]} #{charidx[i][0]} #{charidx[i][1]}"
-      }
-      # $stderr.puts "Recognized:"
-      # recognized_labels.each_key { |k|
-      #   $stderr.puts "\t" + k.to_s()
-      # }
-      lost_labels.each { |layer, offset, start_or_stop, labels|
-        $stderr.puts "FNCorpusXML warning: lost label"
-        $stderr.puts "\tLayer #{layer}"
-        $stderr.puts "\tOffset #{offset}"
-        $stderr.puts "\tStatus #{start_or_stop}"
-        $stderr.puts "\tLabels #{labels.join(" ")}"
-      }
-    end
-
-    file.puts
-  end
-
-  ################
-  private
-
-  ###
-  # read annotation sets:
-  # parse the annotation sets in the @sent object,
-  # return as:
-  # array of FNCorpusAset objects
-  def read_annotation_sets(charidx)
-    unless (annotation_sets = @sent.first_child_matching("annotationSets"))
-      return
-    end
-
-    # return values
-    frames = Array.new()
-
-    annotation_sets.each_child_matching("annotationSet") { |aset|
-      frames << FNCorpusAset.new(aset, charidx)
-    }
-
-    return frames
-  end
-
-  ###
-  # basically taken over from FrameXML.rb
-  # read sentence words,
-  # return as: sentence, indices
-  # - sentence as array of strings, one word per string
-  # - indices: array of pairs [word start char.index, word end char.index] int*int
-  def read_sentence()
-    # all text and pos_text have the same number of elements!
-    charidx = Array.new # maps word indices on [start,stop]
-    pos_text = []
-
-    unless (text_elt = @sent.first_child_matching("text"))
-      # no text found for this sentence
-      return [pos_text, charidx]
-    end
-
-    orig_text = text_elt.children_and_text().detect { |child|
-      child.text?
-    }
-    if orig_text
-      # take text out of RegXMl object
-      orig_text = orig_text.to_s()
-    end
-
-    pos_text = UtfIso.to_iso_8859_1(orig_text).split(" ") # text with special char.s replaced by iso8859 char.s
-
-    double_space = Array.new
-    pos = 0
-    while (match = orig_text.index(/(\s\s+)/,pos))
-      double_space << match
-      pos = match+1
-    end
-
-    # fill charidx array
-    char_i = 0
-    pos_text.each_index {|word_i|
-      startchar = char_i
-      # puts "Remembering "+char_i.to_s+" as start index of word "+word_i.to_s
-      char_i += our_length(pos_text[word_i])
-      stopchar = char_i-1
-
-      # puts "Remembering "+(char_i-1).to_s+" as stop index of word "+word_i.to_s
-
-      charidx << [startchar,stopchar]
-
-      # separators
-      if double_space.include?(char_i) then
-        char_i += 2
-      else
-        char_i += 1
-      end
-    }
-
-    return [pos_text, charidx]
-  end
-
-  ###
-  def our_length(string) # (1) replace &...; with 1 char and " with two chars
-    return string.gsub(/&(.+?);/,"X").length
-  end
-
-end
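
For orientation, here is a minimal usage sketch of the `FNCorpusXMLFile` class removed above. It assumes the pre-1.2.rc5 load path (`frprep/FNCorpusXML`); the corpus and output file names are illustrative placeholders, not paths shipped with the gem.

```ruby
# Hypothetical driver for the deleted FNCorpusXML.rb (paths are placeholders).
# FNCorpusXMLFile#print_conll_style writes one word per line in the tab format
# documented in the file header: word (pt gf role target frame stuff)* ne sent_id
require 'frprep/FNCorpusXML'

corpus = FNCorpusXMLFile.new("fulltext/ANC__110CYL067.xml")

File.open("ANC__110CYL067.tab", "w") do |out|
  corpus.print_conll_style(out)
end
```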