shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115)
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,144 +0,0 @@
1
- # sp 28 06 04
2
- #
3
- # this module offers methods to extract gemma corpora from the FrameNet database#
4
-
5
- require 'frprep/FrameXML'
6
-
7
- class FNDatabase
8
-
9
- def each_matching_sentence(file_pred,sent_pred)
10
- # fundamental access function to FrameXML files
11
-
12
- # returns file objects where
13
- # FrameXMLSentence matches sent_pred
14
- # (FrameXMLFile is accessed through FrameXMLSentence.get_file_object and matches file_pred)
15
- each_matching_file(file_pred) {|frameNetFile|
16
- frameNetFile.each_sentence {|frameNetSent|
17
- if sent_pred.call(frameNetSent)
18
- frameNetSent.verify_annotation
19
- yield frameNetSent
20
- end
21
- }
22
- }
23
- end
24
-
25
- def each_matching_file(file_pred)
26
- # fundamental access function to FrameXML files
27
-
28
- # returns file (FrameXMLFile) objects which match file_pred
29
- each_framexml_file{|frameNetFile|
30
- if file_pred.call(frameNetFile)
31
- yield frameNetFile
32
- end
33
- frameNetFile.close
34
- }
35
- end
36
-
37
- def extract_frame(frame,outfile)
38
- each_matching_sentence(Proc.new{|fnfile| fnfile.get_frame == frame},
39
- Proc.new{|fnsent| true}) {|fnsent|
40
- if fnsent.contains_FE_annotation_and_target
41
- fnsent.print_conll_style_to(outfile)
42
- end
43
- }
44
- end
45
-
46
- def extract_lemma(lemma,outfile)
47
- each_matching_sentence(Proc.new{|fnfile| fnfile.get_lu == lemma},
48
- Proc.new{|fnsent| true}) {|fnsent|
49
- if fnsent.contains_FE_annotation_and_target
50
- fnsent.print_conll_style_to(outfile)
51
- end
52
- }
53
- end
54
-
55
- def extract_everything(outdirectory)
56
- unless outdirectory[-1,1] == "/"
57
- outdirectory += "/"
58
- end
59
-
60
- outfiles = Hash.new
61
- each_matching_sentence(Proc.new{|fnfile| true},
62
- Proc.new{|fnsent| true}) {|fnsent|
63
- frame = fnsent.get_file_obj.get_frame
64
- unless outfiles.key?(frame)
65
- outfiles[frame] = File.new(outdirectory+frame+".tab","w")
66
- end
67
- if fnsent.contains_FE_annotation_and_target
68
- fnsent.print_conll_style_to(outfiles[frame])
69
- end
70
- }
71
- # close output files
72
- outfiles.each_value {|file|
73
- file.close
74
- }
75
- # remove zero-size files
76
- Dir[outdirectory+"*"].each {|filename|
77
- if FileTest.zero?(filename)
78
- File.unlink(filename)
79
- end
80
- }
81
- end
82
-
83
-
84
- def initialize(fn_path)
85
- unless fn_path[-1,1] == "/"
86
- fn_path += "/"
87
- end
88
- @fn = fn_path
89
- end
90
-
91
- private
92
-
93
- def each_framexml_file
94
- # files might be zipped
95
- Dir[@fn+"lu*.xml.gz"].each {|gzfile|
96
- Kernel.system("cp "+gzfile+" /tmp/")
97
- Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
98
- gzfile =~ /(.+)\.gz/
99
- yield FrameXMLFile.new("/tmp/"+File.basename($1))
100
- }
101
- # or might not
102
- Dir[@fn+"/lu*.xml"].each {|filename|
103
- yield FrameXMLFile.new(filename)
104
- }
105
- end
106
-
107
- # I don't really remember what this was good for ;-)
108
-
109
- # def browse_everything(allFiles)
110
- # if allFiles
111
- # Dir[fn+"*.xml.gz"].each {|gzfile|
112
- # Kernel.system("cp "+gzfile+" /tmp/")
113
- # Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
114
- # gzfile =~ /(.+)\.gz/
115
- # # STDERR.puts File.basename($1)
116
- # # STDERR.print "."
117
- # ff = FrameXMLFile.new("/tmp/"+File.basename($1))
118
- # ff.each_sentence {|s|
119
- # if s.contains_FE_annotation_and_target
120
- # s.verify_annotation
121
- # if s.verify_annotation
122
- # puts "****************** Error: Still problems after 2nd verification!"
123
- # end
124
- # s.print_conll_style
125
- # end
126
- # }
127
- # }
128
- # else
129
- # ff = FrameXMLFile.new("/tmp/lu1870.xml")
130
- # ff.each_sentence {|s|
131
- # if s.contains_FE_annotation_and_target
132
- # s.verify_annotation
133
- # if s.verify_annotation
134
- # puts "****************** Error: Still problems after 2nd verification!"
135
- # end
136
- # # s.print_layers
137
- # s.print_conll_style
138
- # end
139
- # }
140
- # end
141
- # end
142
-
143
- end
144
-
@@ -1,513 +0,0 @@
1
- # sp 18 06 2004
2
- #
3
- # access to FrameNet XML files, sentences, and annotation.
4
- #
5
- # sp 10 11 04: only data from the first layer with name XY is
6
- # used for output. Other data is saved in layer XY.2nd, but is
7
- # currently not processed.
8
- #
9
- # sp 22 05 04: also, if two labels exist which cover the same span
10
- # (ie there is a double annotation within the same layer), ignore
11
- # all but the first label.
12
- #
13
- # ke 13 07 05:
14
- # - changed to RegXMl.rb
15
- # - fixed two problems in analyse_layer:
16
- # - Deleting problematic labels:
17
- # For some reason, thisLayer[i+1..-1].each_index {|other_i|
18
- # included the index 0 in any case, resulting in the 1st
19
- # label being deleted in any case.
20
- # - Deleting problematic labels, checking for label overlap:
21
- # The old formulation worked only if labels occurred in the array
22
- # in the order they occurred in the sentence, but that was not the case.
23
- # - Change in deleting problematic labels:
24
- # No longer delete duplicate labels, since e.g. in the PT level there
25
- # may be more than one NP label, and we want to keep those
26
- #
27
- # KE January 2007:
28
- # write new adapted FNTab format
29
- # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
30
-
31
- require 'frprep/Ampersand'
32
- require 'common/ISO-8859-1'
33
- require 'common/RegXML'
34
-
35
- class FrameXMLFile # only verified to work for FrameNet v1.1
36
-
37
- def initialize(filename)
38
- @filename = filename
39
- file = File.new(filename)
40
- counter = 0
41
- while true
42
- counter +=1
43
- line = file.gets
44
- if line =~ /<lexunit/
45
- break
46
- end
47
- if counter > 3
48
- STDERR.puts "Error: File "+filename+" does not conform to FrameNet v1.1 standard (lexunit in 3rd line)"
49
- Kernel.exit
50
- end
51
- end
52
- # found lexunit
53
- string = line
54
- while (line = file.gets())
55
- string << line
56
- end
57
- @lexunit = RegXML.new(string)
58
- attributes = @lexunit.attributes()
59
- @id = attributes["ID"]
60
- attributes["name"] =~ /^([^.]+).([^.]+)$/
61
- @lu = $1
62
- @pos = $2.upcase
63
- if @lu.nil?
64
- raise "[framexml] no lemma in header of file #{@filename}"
65
- elsif @pos.nil?
66
- raise "[framexml] no pos in header of file #{@filename}"
67
- end
68
- @frame = attributes["frame"]
69
- end
70
-
71
- def get_lu
72
- return @lu.gsub(" ","_")
73
- end
74
-
75
- def get_lu_id
76
- return @id
77
- end
78
-
79
- def get_filename
80
- return @filename
81
- end
82
-
83
- def get_pos
84
- return @pos
85
- end
86
-
87
- def get_frame
88
- return @frame
89
- end
90
-
91
- def close
92
- end
93
-
94
- def each_sentence
95
- @lexunit.children_and_text().each { |subcorpus|
96
- subcorpus.children_and_text().each { |annotationSet|
97
- if annotationSet.name == "annotationSet"
98
- # sentence found
99
- yield FrameXMLSentence.new(annotationSet,self)
100
- end
101
- }
102
- }
103
- end
104
- end
105
-
106
- class FrameXMLSentence
107
- def initialize(annotationSet,file_obj)
108
- @file_obj = file_obj
109
-
110
- # layers: hash layer_name -> array:[name, start, stop]
111
- # name: name of the element, string
112
- # start: start character, integer
113
- # stop: end character, integer
114
- @layers = Hash.new
115
-
116
- annotationSet.children_and_text().each { |sentence_or_layer_elt|
117
-
118
- case sentence_or_layer_elt.name
119
- when "sentence"
120
- # sentence: has ID, its child is <text>[text]</text>
121
- @sent_id = sentence_or_layer_elt.attributes["ID"]
122
- text_elt = sentence_or_layer_elt.children_and_text().detect { |child|
123
- child.name == "text"
124
- }
125
- if text_elt
126
- # found the text element. its only child should be the text
127
- @orig_text = text_elt.children_and_text().detect { |child|
128
- child.text?
129
- }
130
- if @orig_text
131
- # take text out of RegXMl object
132
- @orig_text = @orig_text.to_s()
133
- end
134
- end
135
-
136
- when "layers"
137
- # contains annotation layers
138
- sentence_or_layer_elt.children_and_text().each { |layer|
139
- unless layer.name == "layer"
140
- # additional material, ignore
141
- next
142
- end
143
-
144
- name = layer.attributes["name"]
145
- unless name
146
- raise "layer without a name"
147
- end
148
- unless @layers.key?(name)
149
- @layers[name] = analyse_layer(layer, name)
150
- end
151
- }
152
- end
153
- }
154
-
155
- @pos_text = UtfIso.to_iso_8859_1(@orig_text).split(" ") # text with special characters replaced by iso8859 characters
156
- @text = Ampersand.utf8_to_hex(@orig_text).split(" ") # text with special characters replaced by &...; sequences
157
-
158
- # all text and pos_text have the same number of elements!
159
- @start_is = Hash.new # map char indices (start of words) onto word indices
160
- @stop_is = Hash.new # map char indices (end of words) onto word indices
161
- @charidx = Array.new # maps word indices on [start,stop]
162
-
163
- @double_space = Array.new
164
- pos = 0
165
- while (match = @orig_text.index(/(\s\s+)/,pos))
166
- @double_space << match
167
- pos = match+1
168
- end
169
-
170
-
171
- # fill start, stop and charidx arrays
172
- char_i = 0
173
- @pos_text.each_index {|word_i|
174
- @start_is[char_i] = word_i
175
- startchar = char_i
176
- # puts "Remembering "+char_i.to_s+" as start index of word "+word_i.to_s
177
- char_i += our_length(@pos_text[word_i])
178
- @stop_is[char_i-1] = word_i
179
-
180
- stopchar = char_i-1
181
-
182
- # puts "Remembering "+(char_i-1).to_s+" as stop index of word "+word_i.to_s
183
-
184
- @charidx << [startchar,stopchar]
185
-
186
- # separators
187
- if @double_space.include?(char_i) then
188
- char_i += 2
189
- else
190
- char_i += 1
191
- end
192
- }
193
- end
194
-
195
- def get_file_obj
196
- return @file_obj
197
- end
198
-
199
- def get_sent_id
200
- return @sent_id
201
- end
202
-
203
- def print_text
204
- puts "("+@id+ ")\t"+@text
205
- end
206
-
207
- def contains_FE_annotation_and_target
208
- target_info = @layers["Target"][0]
209
- unless target_info[0] == "Target"
210
- STDERR.puts "Error in sentence from "+filename+": No target" # strictly speaking, no target at pos 0 in @layers["Target"]
211
- STDERR.puts "Sentence: "+@text
212
- return false
213
- else
214
- return (@layers.key?("FE") and target_info[2] != 0)
215
- end
216
- end
217
-
218
- # we only verify the interesting layers (FE,GF,Target)
219
- # if there is weird stuff going on on e.g. the Noun or Adj layer, we don't care.
220
-
221
- def verify_annotation # returns true if some change has taken place
222
- change = false
223
- @layers.each_pair {|layername,l|
224
-
225
- if layername=="FE" or layername=="GF" or layername=="PT" or layername=="Target" # only verify the "important" layers
226
-
227
- l.each_index {|i|
228
-
229
- element,start,stop = l[i]
230
-
231
- newstart = start
232
- newstop = stop
233
-
234
- @charidx.each_index{|j|
235
- unless j== 0
236
- pstartidx, pstopidx = @charidx[j-1]
237
- end
238
- startidx, stopidx = @charidx[j]
239
-
240
- if (start > startidx and start <= stopidx) or
241
- (j != 0 and start > pstopidx and start < startidx)
242
- newstart = startidx
243
- end
244
-
245
- if (stop >= startidx and stop < stopidx)
246
- newstop = stopidx
247
- elsif (j != 0 and stop > pstopidx and stop < startidx)
248
- newstop = pstopidx
249
- end
250
-
251
- }
252
- if start != newstart or stop != newstop
253
- change = true
254
- @layers[layername][i] = [element,newstart,newstop]
255
- STDERR.puts "Heuristics has changed element "+element+" from ["+[start,stop].join(",")+"] to ["+[newstart,newstop].join(",")+"] in file "+@file_obj.get_filename+"."
256
- markable_as_string(layername,element).each {|string|
257
- STDERR.puts "New markable: "+string
258
- }
259
- STDERR.puts "Sentence: "+@pos_text.join(" ")
260
- puts
261
- end
262
- }
263
- end
264
- }
265
- return change
266
- end
267
-
268
- def print_conll_style
269
- print_conll_style_to(STDOUT)
270
- end
271
-
272
- # CHANGED KE January 2007:
273
- # write new adapted FNTab format
274
- # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
275
- def print_conll_style_to(out)
276
-
277
- # even though in principle there might be multiple
278
- # labels for one span [i.e. in one value of the
279
- # {gf,fe,pt} hashes], we only ever record one
280
-
281
- gf = Hash.new
282
- add_all_to_hash(gf,"GF")
283
- fe = Hash.new
284
- add_all_to_hash(fe,"FE")
285
- pt = Hash.new
286
- add_all_to_hash(pt,"PT")
287
- target = Hash.new
288
- add_all_to_hash(target,"Target")
289
-
290
- in_target = false
291
-
292
- @pos_text.each_index {|i|
293
- # write format:
294
- # "word" "pt", "gf", "role", "target", "frame", "stuff" "ne", "sent_id"
295
- line = Array.new
296
- # word
297
- word = @pos_text[i]
298
- line << word
299
-
300
- start, stop = @charidx[i]
301
- # "pt", "gf", "role",
302
- [pt,gf,fe].each {|hash|
303
- token = Array.new
304
- if hash.key?([start,"start"])
305
- markables = hash.delete([start,"start"])
306
- markables.each {|element|
307
- token << "B-"+element
308
- }
309
- end
310
- if hash.key?([stop,"stop"])
311
- markables = hash.delete([stop,"stop"])
312
- markables.each {|element|
313
- token << "E-"+element
314
- }
315
- end
316
- if token.empty?
317
- line << "-"
318
- else
319
- line << token.sort.join(":")
320
- end
321
- }
322
- # "target"
323
- if target.key?([start,"start"])
324
- target.delete([start,"start"])
325
- in_target = true
326
- end
327
- if in_target
328
- line << @file_obj.get_lu+"."+@file_obj.get_pos
329
- else
330
- line << "-"
331
- end
332
- if target.key?([stop,"stop"])
333
- target.delete([stop,"stop"])
334
- in_target = false
335
- end
336
- # "frame"
337
- line << @file_obj.get_frame
338
-
339
- # "stuff" "ne",
340
- line << "-"
341
- line << "-"
342
-
343
- # "sent_id"
344
- line << @file_obj.get_lu_id+"-"+@sent_id
345
-
346
- out.puts line.join("\t")
347
- }
348
-
349
- out.puts
350
-
351
- [gf,fe,pt,target].each {|hash|
352
- unless hash.empty?
353
- STDERR.puts @file_obj.get_filename
354
- raise "**** Error: Hash not empty after creation of Sentence in CoNLL-Format (could not find matching words for some markup element)!"
355
- end
356
- }
357
- end
358
-
359
-
360
- def print_layers
361
- @layers.each {|ln,l|
362
- puts "Layer "+ln+":"
363
- l.each {|element,start,stop|
364
- puts "\t"+element+": "+start.to_s+" -- "+stop.to_s
365
- }
366
- puts "***"
367
- }
368
- end
369
-
370
-
371
- private
372
-
373
-
374
- def our_length(string) # (1) replace &...; with 1 char and " with two chars
375
- return string.gsub(/&(.+?);/,"X").length
376
- end
377
-
378
- def is_fe(fename)
379
- @layers["FE"].each {|name,start,stop|
380
- if fename == name
381
- return true
382
- end
383
- }
384
- return false
385
- end
386
-
387
-
388
- def markable_as_string(layername,markup_name) # returns an array of all markables with this name
389
-
390
- result = Array.new
391
-
392
- festart = nil
393
- festop = nil
394
- @layers[layername].each {|name,start,stop|
395
- if markup_name == name
396
- fe = Array.new
397
- infe = false
398
- @charidx.each_index {|i|
399
- startidx,stopidx = @charidx[i]
400
- if startidx == start
401
- infe = true
402
- end
403
- if infe
404
- fe << @pos_text[i]
405
- end
406
- if stopidx == stop
407
- result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+", VERIFIED]")
408
- break
409
- elsif stopidx > stop
410
- result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+",ERROR]")
411
- break
412
- end
413
- }
414
- end
415
- }
416
- return result
417
- end
418
-
419
- def add_to_hash(hash,key,name)
420
- exists = false
421
- if hash.key?(key)
422
- exists = true
423
- else
424
- hash[key] = Array.new
425
- hash[key] << name
426
- end
427
- return exists
428
- end
429
-
430
- def add_all_to_hash(hash,layername)
431
- # use "uniq" to remove wrong double annotations
432
- @layers[layername].uniq.each {|element,start,stop|
433
- exists = add_to_hash(hash,[start, "start"],element)
434
- if exists
435
- STDERR.puts "Warning ["+@file_obj.get_filename+"]: In layer "+layername+", two elements start at position "+start.to_s+". Only using first. Layer as read from FrameXML: "+@layers[layername].map {|element,start,stop| element+" ("+start.to_s+","+stop.to_s+")"}.join(" ")
436
- else
437
- add_to_hash(hash,[stop, "stop"],element)
438
- end
439
- }
440
- end
441
-
442
-
443
- def analyse_layer(layer_elt,name) # read layer information from file and store in @layers
444
- if name.nil?
445
- STDERR.puts "Error: layer line "+line+" with empty name."
446
- end
447
-
448
- # thisLayer, retv: array:[name(string), start(integer), end(integer)]
449
- thisLayer = Array.new
450
- retv = Array.new
451
-
452
- labels_elt = layer_elt.children_and_text.detect { |child| child.name == "labels"}
453
- unless labels_elt
454
- # no labels found, return empty array
455
- return thisLayer
456
- end
457
-
458
- labels_elt.children_and_text.each { |label|
459
- unless label.name == "label"
460
- # some other markup, ignore
461
- next
462
- end
463
-
464
- attributes = label.attributes()
465
- if attributes["itype"]
466
- # null instantiation, don't retain
467
- next
468
- end
469
- if not(attributes["start"]) and not(attributes["end"])
470
- # no start and end labels
471
- next
472
- end
473
- thisLayer << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
474
- }
475
-
476
- # sanity check: verify that
477
- # 1. we don't have overlapping labels
478
-
479
- deleteHash = Hash.new # keep track of the labels which are to be deleted
480
- # i -> Boolean
481
-
482
- thisLayer.each_index {|i|
483
- # efficiency: skip already delete labels
484
- if deleteHash[i]
485
- next
486
- end
487
- this_label, this_from , this_to = thisLayer[i]
488
-
489
- # compare with all remaining labels
490
- (i+1..thisLayer.length()-1).to_a.each { |other_i|
491
- other_label,other_from,other_to = thisLayer[other_i]
492
-
493
- # overlap? Throw out the later FE
494
- if this_from <= other_from and other_from <= this_to
495
- $stderr.puts "Warning: Label overlap, deleting #{other_label}"
496
- deleteHash[other_i] = true
497
- elsif this_from <= other_to and other_to <= this_to
498
- $stderr.puts "Warning: Label overlap, deleting #{this_label}"
499
- deleteHash[i] = true
500
- end
501
- }
502
- # matched with all other labels. If "keep", return
503
-
504
- if deleteHash[i]
505
- # $stderr.puts " deleting entry #{i}"
506
- else
507
- retv << thisLayer[i]
508
- end
509
- }
510
-
511
- return retv
512
- end
513
- end