shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,18 +0,0 @@
1
# Counter class - provides unique ids with state.
#
# The counter holds a single integer. #next hands out the current value
# and then advances it, so successive calls yield init_value,
# init_value + 1, ...
class Counter
  # init_value: the first value that #next will hand out
  def initialize(init_value)
    @v = init_value
  end

  # Returns the current value without advancing the counter.
  def get
    @v
  end

  # Returns the current value, then advances the counter by one.
  def next
    current = @v
    @v += 1
    current
  end
end
@@ -1,643 +0,0 @@
1
- # KE Dec 2006
2
- # Access for FrameNet corpus XML file
3
- # Mainly taken over from FramesXML
4
- #
5
- # changes:
6
- # - no single frame for the whole corpus
7
- # - below <sentence> level there is an <annotationSet> level.
8
- # One annotationSet may include a single frame,
9
- # or a reference to all named entities in a sentence
10
- #
11
- # Write out in tab format, one line per word:
12
- # Format:
13
- # word (pt gf role target frame stuff)* ne sent_id
14
- # with
15
- # word: word
16
- # whole bracketed group: information about one frame annotation
17
- # pt: phrase type
18
- # gf: grammatical function
19
- # role: frame element
20
- # target: LU occurrence
21
- # frame: frame
22
- # stuff: support, and other things
23
- # ne: named entity
24
- # sent_id: sentence ID
25
-
26
- require 'frprep/Ampersand'
27
- require 'common/ISO-8859-1'
28
- require 'common/RegXML'
29
-
30
- #####################
31
- # mixins to make work with RegXML a little less repetitive
32
- class RegXML
33
- def first_child_matching(child_name)
34
- return children_and_text().detect { |c| c.name() == child_name }
35
- end
36
-
37
- def each_child_matching(child_name)
38
- children_and_text().each { |c|
39
- if c.name() == child_name
40
- yield c
41
- end
42
- }
43
- end
44
- end
45
-
46
- #####################
47
- # class to keep data for one frame
48
- class FNCorpusAset
49
- attr_reader :layers, :aset_type, :aset_id, :frame_name, :lu
50
-
51
- #######
52
- # Analyze RegXML object, store in object variables:
53
- #
54
- # @aset_type: "frame" or "NER"
55
- # @frame_name: frame name for "frame" type
56
- # @lu: LU for "frame" type
57
- # @aset_id: ID of the annotation set
58
- # @layers: hash: layer type (FE, GF, PT, Target, NER) -> [offset, "start"/"stop"] -> list of labels
59
- # string -> int*string -> array:string
60
- #
61
- def initialize(aset, #RegXML object
62
- charidx) # array of pairs [start index, stop index] int*int
63
-
64
- @layers = Hash.new()
65
- @frame_name = nil
66
- @lu = nil
67
- @aset_type = nil
68
-
69
- attributes = aset.attributes()
70
-
71
- @aset_id = attributes["ID"]
72
-
73
- if attributes["frameName"]
74
- # all of these seem to be frames. store in 'frames' array
75
- unless attributes["luName"]
76
- $stderr.puts "FNCorpusAset warning: cannot determine LU name"
77
- $stder.puts aset.to_s()
78
- return
79
- end
80
- @aset_type = "frame"
81
- @frame_name = attributes["frameName"]
82
- @lu = attributes["luName"]
83
-
84
- unless (layers = aset.first_child_matching("layers"))
85
- $stderr.puts "FNCorpusAset warning: unexpectedly no layers found"
86
- $stderr.puts aset.to_s()
87
- return
88
- end
89
-
90
- layers.each_child_matching("layer") { |l| analyze_layer(l, charidx) }
91
-
92
- else
93
- # all we seem to get here are named entity labels.
94
- @aset_type = "NER"
95
-
96
- unless (layers = aset.first_child_matching("layers"))
97
- $stderr.puts "FNCorpusAset Warning: unexpectedly no layers found"
98
- $stderr.puts aset.to_s()
99
- return
100
- end
101
- unless (layer = layers.first_child_matching("layer"))
102
- $stderr.puts "FNCorpusAset Warning: unexpectedly no layers found"
103
- $stderr.puts aset.to_s()
104
- return
105
- end
106
-
107
- unless layer.attributes()["name"] == "NER"
108
- $stderr.puts "FNCorpusAset Warning: unexpected layer #{layer.attributes()["name"]}, was expecting only an NER layer."
109
- $stderr.puts aset.to_s()
110
- return
111
- end
112
-
113
- analyze_layer(layer, charidx)
114
-
115
- end
116
- end
117
-
118
-
119
- #############
120
- # input: <layer> RegXML object
121
- # analyze this, put into @layers data structure
122
- def analyze_layer(layer, # RegXML object
123
- charidx) # array:int*int pairs start/end index of words
124
- layer_name = layer.attributes()["name"]
125
- unless layer_name
126
- $stderr.puts "FNCorpusAset warning: cannot determine layer name"
127
- $stderr.puts layer.to_s
128
- return
129
- end
130
-
131
- # FN-specific: skip 2nd layer FEs for now
132
- if layer_name == "FE" and layer.attributes()["rank"] == "2"
133
- return
134
- end
135
-
136
- unless @layers[layer_name]
137
- @layers[layer_name] = Hash.new()
138
- end
139
-
140
- unless (labels = layer.first_child_matching("labels"))
141
- # nothing to record for this layer
142
- return
143
- end
144
-
145
-
146
- # taking over much of analyse_layer() from class FrameXML
147
- thisLayer = Array.new()
148
-
149
- labels.each_child_matching("label") { |label|
150
- attributes = label.attributes()
151
- if attributes["itype"] =~ /NI/
152
- # null instantiation, ignore
153
- next
154
- end
155
-
156
- if not(attributes["start"]) and not(attributes["end"])
157
- # no start and end labels
158
- next
159
- end
160
- thisLayer << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
161
- }
162
-
163
- # sanity check: do indices
164
- # match word start and end indices?
165
- thisLayer = verify_annotation(thisLayer, charidx)
166
-
167
- # sanity check: verify that
168
- # we don't have overlapping labels
169
-
170
- deleteHash = Hash.new # keep track of the labels which are to be deleted
171
- # i -> Boolean
172
-
173
- thisLayer.each_index {|i|
174
- # efficiency: skip already delete labels
175
- if deleteHash[i]
176
- next
177
- end
178
- this_label, this_from , this_to = thisLayer[i]
179
-
180
- # compare with all remaining labels
181
- (i+1..thisLayer.length()-1).to_a.each { |other_i|
182
- other_label,other_from,other_to = thisLayer[other_i]
183
-
184
- # overlap? Throw out the later FE
185
- if this_from <= other_from and other_from <= this_to
186
- $stderr.puts "Warning: Label overlap, deleting #{other_label}"
187
- deleteHash[other_i] = true
188
- elsif this_from <= other_to and other_to <= this_to
189
- $stderr.puts "Warning: Label overlap, deleting #{this_label}"
190
- delete_hash[i] = true
191
- end
192
- }
193
- # matched with all other labels. If "keep", return
194
-
195
- if deleteHash[i]
196
- # $stderr.puts " deleting entry #{i}"
197
- else
198
- [ [this_from, "start"], [this_to, "stop"]].each { |offset, start_or_stop|
199
- unless @layers[layer_name].has_key?([offset, start_or_stop])
200
- @layers[layer_name][[offset, start_or_stop]] = Array.new()
201
- end
202
- @layers[layer_name][ [offset, start_or_stop] ] << this_label
203
- }
204
- end
205
- }
206
- end
207
-
208
- ##############3
209
- # verify found triples label/from_index/to_index
210
- # against given start/end indices of words
211
- #
212
- # returns: triples, possibly changed
213
- def verify_annotation(found, # array: label/from/to, string*int*int
214
- charidx) # array: from/to, int*int
215
-
216
- return found.map {|element, start, stop|
217
-
218
- newstart = start
219
- newstop = stop
220
-
221
- # compare against word start/stop indices
222
- charidx.each_index{|j|
223
- unless j== 0
224
- pstartidx, pstopidx = charidx[j-1]
225
- end
226
- startidx, stopidx = charidx[j]
227
-
228
- if (start > startidx and start <= stopidx) or
229
- (j != 0 and start > pstopidx and start < startidx)
230
- newstart = startidx
231
- end
232
-
233
- if (stop >= startidx and stop < stopidx)
234
- newstop = stopidx
235
- elsif (j != 0 and stop > pstopidx and stop < startidx)
236
- newstop = pstopidx
237
- end
238
- }
239
-
240
- # change?
241
- if start != newstart or stop != newstop
242
- # report change
243
- $stderr.puts "FNCorpusXML warning: Heuristics has changed element "+element
244
- $stderr.puts "\tfrom ["+[start,stop].join(",")+"] to ["+[newstart,newstop].join(",")+"]"
245
-
246
- [element, newstart, newstop]
247
-
248
- else
249
-
250
- [element, start, stop]
251
- end
252
- }
253
- end
254
- end
255
-
256
- #####################
257
- # one FrameNet corpus
258
- #
259
- # just the filename is stored,
260
- # the text is read only on demand
261
- class FNCorpusXMLFile
262
-
263
- ###
264
- def initialize(filename)
265
- @filename = filename
266
-
267
- end
268
-
269
- ###
270
- # yield each document in this corpus
271
- # as a string
272
- def each_document_string()
273
- # read each <document> element and yield it
274
-
275
- doc_string = ""
276
- inside_doc_elem = false
277
- f = File.new(@filename)
278
-
279
- # <corpus>
280
- # <documents>
281
- # <document ...>
282
- # </document>
283
- # <document ...>
284
- # </document>
285
- # </documents>
286
- # </corpus>
287
- f.each { |line|
288
- if not(inside_doc_elem) and line =~ /^.*?(<document\s.*)$/
289
- # start of <document>
290
- inside_doc_elem = true
291
- doc_string << $1
292
- elsif inside_doc_elem and line =~ /^(.*?<\/document>).*$/
293
- # end of <document>
294
- doc_string << $1
295
- yield doc_string
296
- doc_string = ""
297
- inside_doc_elem = false
298
- elsif inside_doc_elem
299
- # within <document>
300
- doc_string << line
301
- end
302
- }
303
- end
304
-
305
- ###
306
- # yield each sentence
307
- # as a FNCorpusXMLSentence object
308
- def each_sentence()
309
- # read each <document> element and yield it
310
-
311
- sent_string = ""
312
- inside_sent_elem = false
313
- f = File.new(@filename)
314
-
315
- # <corpus>
316
- # <documents>
317
- # <document ...>
318
- # <paragraphs>
319
- # <paragraph>
320
- # <sentences>
321
- # <sentence ...>
322
- f.each { |line|
323
- if not(inside_sent_elem) and line =~ /^.*?(<sentence\s.*)$/
324
- # start of <sentence>
325
- inside_sent_elem = true
326
- sent_string << $1
327
- elsif inside_sent_elem and line =~ /^(.*?<\/sentence>).*$/
328
- # end of <document>
329
- sent_string << $1
330
- yield FNCorpusXMLSentence.new(sent_string)
331
- sent_string = ""
332
- inside_sent_elem = false
333
- elsif inside_sent_elem
334
- # within <sentence>
335
- sent_string << line.chomp()
336
- end
337
- }
338
- end
339
-
340
- ###
341
- # print whole FN file in tab format
342
- def print_conll_style(file = $stdout)
343
- each_sentence() { |s_obj|
344
- s_obj.print_conll_style(file)
345
- }
346
- end
347
- end
348
-
349
- #######################################
350
- # Keep one sentence from FN corpus XML
351
- # as a RegXML object,
352
- # offer printout in tabular format
353
- class FNCorpusXMLSentence
354
-
355
- #########
356
- def initialize(sent_string)
357
- @sent = RegXML.new(sent_string)
358
- @sent_id = @sent.attributes()["ID"]
359
- end
360
-
361
- ##############
362
- # print to file
363
- # in tabular format
364
- #
365
- # row format:
366
- # word (pt gf role target frame stuff)* ne sent_id
367
- #
368
- # word: word
369
- # whole bracketed group: information about one frame annotation
370
- # pt: phrase type
371
- # gf: grammatical function
372
- # role: frame element
373
- # target: LU occurrence
374
- # frame: frame
375
- # stuff: support, and other things
376
- # ne: named entity
377
- # sent_id: sentence ID
378
- def print_conll_style(file = $stdout)
379
- pos_text, charidx = read_sentence()
380
- asets = read_annotation_sets(charidx)
381
-
382
- # aset -> are we inside the target or not?
383
- in_target = Hash.new(false)
384
- # aset -> are we in all sorts of other annotations, like Support?
385
- in_stuff = Hash.new()
386
- # are we inside a named entity?
387
- in_ne = nil
388
-
389
- # record every opening and closing label we recognize,
390
- # to check later
391
- recognized_labels = Hash.new()
392
-
393
- pos_text.each_index {|i|
394
- line = Array.new
395
- word = pos_text[i]
396
-
397
- # add: word
398
- line << word
399
-
400
- start, stop = charidx[i]
401
-
402
- # iterate over the frames we have
403
- # add: (pt gf role target frame stuff)
404
- asets.each { |aset|
405
- unless aset.aset_type == "frame"
406
- # don't treat NEs as a frame here
407
- next
408
- end
409
-
410
- # pt, gf, role
411
- ["PT", "GF", "FE"].each { |layer|
412
- token = Array.new
413
- hash = aset.layers[layer]
414
- if hash.has_key?([start,"start"])
415
- recognized_labels[[layer, start, "start"]] = true
416
-
417
- markables = hash[[start,"start"]]
418
- markables.each {|element|
419
- token << "B-"+element
420
- }
421
- end
422
- if hash.has_key?([stop,"stop"])
423
- recognized_labels[[layer, stop, "stop"]] = true
424
-
425
- markables = hash[[stop,"stop"]]
426
- markables.each {|element|
427
- token << "E-"+element
428
- }
429
- end
430
-
431
- if token.empty?
432
- line << "-"
433
- else
434
- line << token.sort.join(":")
435
- end
436
- }
437
-
438
- # target
439
- target = aset.layers["Target"]
440
- if target.has_key?([start,"start"])
441
- recognized_labels[["Target", start, "start"]] = true
442
- in_target[aset] = true
443
- end
444
- if in_target[aset]
445
- line << aset.lu
446
- else
447
- line << "-"
448
- end
449
- if target.has_key?([stop,"stop"])
450
- recognized_labels[["Target", stop, "stop"]] = true
451
- in_target[aset] = false
452
- end
453
-
454
- # frame
455
- line << aset.frame_name
456
-
457
- # stuff
458
- unless in_stuff.has_key?(aset)
459
- in_stuff[aset] = Array.new()
460
- end
461
- aset.layers.each_key { |layer|
462
- if ["PT", "GF", "FE", "Target"].include? layer
463
- # already done those
464
- next
465
- end
466
- # all the rest goes in "stuff"
467
- if aset.layers[layer].has_key?([start, "start"])
468
- aset.layers[layer][[start, "start"]].each { |entry|
469
- in_stuff[aset] << layer + "-" + entry
470
- }
471
- recognized_labels[[layer, start, "start"]] = true
472
- end
473
- }
474
- if in_stuff[aset].empty?
475
- line << "-"
476
- else
477
- line << in_stuff[aset].join(":")
478
- end
479
- aset.layers.each_key { |layer|
480
- if aset.layers[layer].has_key?([stop, "stop"])
481
- recognized_labels[[layer, stop, "stop"]] = true
482
- aset.layers[layer][[stop, "stop"]].each { |entry|
483
- in_stuff[aset].delete(layer + "-" + entry)
484
- }
485
- end
486
- }
487
- }
488
-
489
- # ne
490
- if (ner = asets.detect { |a| a.aset_type == "NER" })
491
- if ner.layers["NER"] and ner.layers["NER"].has_key?([start, "start"])
492
- recognized_labels[["NER", start, "start"]] = true
493
- in_ne = ner.layers["NER"][[start,"start"]]
494
- end
495
- if in_ne
496
- line << in_ne.join(":")
497
- else
498
- line << "-"
499
- end
500
- if ner.layers["NER"] and ner.layers["NER"].has_key?([stop, "stop"])
501
- recognized_labels[["NER", stop, "stop"]] = true
502
- in_ne = nil
503
- end
504
- end
505
-
506
- # sent id
507
- line << @sent_id
508
-
509
- # sanity check:
510
- # row format:
511
- # word (pt gf role target frame stuff)* ne sent_id
512
- # so number of columns must be 3 + 6x for some x >= 0
513
- unless (line.length() - 3)%6 == 0
514
- $stderr.puts "Something wrong with the line length."
515
- $stderr.puts "I have #{asets.length() - 1} frames plus NEs, "
516
- $stderr.puts "but #{line.length()} columns."
517
- raise
518
- end
519
-
520
-
521
- file.puts line.join("\t")
522
- }
523
-
524
- # sanity check:
525
- # now count all labels,
526
- # to see if we've printed them all
527
- lost_labels = Array.new()
528
- asets.each { |aset|
529
- aset.layers.each_key { |layer|
530
- aset.layers[layer].each_key() { |offset, start_or_stop|
531
- unless recognized_labels[[layer, offset, start_or_stop]]
532
- lost_labels << [layer, offset, start_or_stop,
533
- aset.layers[layer][[offset, start_or_stop]]]
534
- end
535
- }
536
- }
537
- }
538
- unless lost_labels.empty?
539
- $stderr.puts "Offsets: "
540
- pos_text.each_index { |i|
541
- $stderr.puts "\t#{pos_text[i]} #{charidx[i][0]} #{charidx[i][1]}"
542
- }
543
- # $stderr.puts "Recognized:"
544
- # recognized_labels.each_key { |k|
545
- # $stderr.puts "\t" + k.to_s()
546
- # }
547
- lost_labels.each { |layer, offset, start_or_stop, labels|
548
- $stderr.puts "FNCorpusXML warning: lost label"
549
- $stderr.puts "\tLayer #{layer}"
550
- $stderr.puts "\tOffset #{offset}"
551
- $stderr.puts "\tStatus #{start_or_stop}"
552
- $stderr.puts "\tLabels #{labels.join(" ")}"
553
- }
554
- end
555
-
556
- file.puts
557
- end
558
-
559
- ################
560
- private
561
-
562
- ###
563
- # read annotation sets:
564
- # parse the annotation sets in the @sent object,
565
- # return as:
566
- # array of FNCorpusAset objects
567
- def read_annotation_sets(charidx)
568
- unless (annotation_sets = @sent.first_child_matching("annotationSets"))
569
- return
570
- end
571
-
572
- # return values
573
- frames = Array.new()
574
-
575
- annotation_sets.each_child_matching("annotationSet") { |aset|
576
- frames << FNCorpusAset.new(aset, charidx)
577
- }
578
-
579
- return frames
580
- end
581
-
582
- ###
583
- # basically taken over from FrameXML.rb
584
- # read sentence words,
585
- # return as: sentence, indices
586
- # - sentence as array of strings, one word per string
587
- # - indices: array of pairs [word start char.index, word end char.index] int*int
588
- def read_sentence()
589
- # all text and pos_text have the same number of elements!
590
- charidx = Array.new # maps word indices on [start,stop]
591
- pos_text = []
592
-
593
- unless (text_elt = @sent.first_child_matching("text"))
594
- # no text found for this sentence
595
- return [pos_text, charidx]
596
- end
597
-
598
- orig_text = text_elt.children_and_text().detect { |child|
599
- child.text?
600
- }
601
- if orig_text
602
- # take text out of RegXMl object
603
- orig_text = orig_text.to_s()
604
- end
605
-
606
- pos_text = UtfIso.to_iso_8859_1(orig_text).split(" ") # text with special char.s replaced by iso8859 char.s
607
-
608
- double_space = Array.new
609
- pos = 0
610
- while (match = orig_text.index(/(\s\s+)/,pos))
611
- double_space << match
612
- pos = match+1
613
- end
614
-
615
- # fill charidx array
616
- char_i = 0
617
- pos_text.each_index {|word_i|
618
- startchar = char_i
619
- # puts "Remembering "+char_i.to_s+" as start index of word "+word_i.to_s
620
- char_i += our_length(pos_text[word_i])
621
- stopchar = char_i-1
622
-
623
- # puts "Remembering "+(char_i-1).to_s+" as stop index of word "+word_i.to_s
624
-
625
- charidx << [startchar,stopchar]
626
-
627
- # separators
628
- if double_space.include?(char_i) then
629
- char_i += 2
630
- else
631
- char_i += 1
632
- end
633
- }
634
-
635
- return [pos_text, charidx]
636
- end
637
-
638
- ###
639
- def our_length(string) # (1) replace &...; with 1 char and " with two chars
640
- return string.gsub(/&(.+?);/,"X").length
641
- end
642
-
643
- end