shalmaneser-lib 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,104 @@
1
+ # sp 18 06 2004
2
+ #
3
+ # access to FrameNet XML files, sentences, and annotation.
4
+ #
5
+ # sp 10 11 04: only data from the first layer with name XY is
6
+ # used for output. Other data is saved in layer XY.2nd, but is
7
+ # currently not processed.
8
+ #
9
+ # sp 22 05 04: also, if two labels exist which cover the same span
10
+ # (ie there is a double annotation within the same layer), ignore
11
+ # all but the first label.
12
+ #
13
+ # ke 13 07 05:
14
+ # - changed to RegXML.rb
15
+ # - fixed two problems in analyse_layer:
16
+ # - Deleting problematic labels:
17
+ # For some reason, thisLayer[i+1..-1].each_index {|other_i|
18
+ # included the index 0 in any case, resulting in the 1st
19
+ # label being deleted in any case.
20
+ # - Deleting problematic labels, checking for label overlap:
21
+ # The old formulation worked only if labels occurred in the array
22
+ # in the order they occurred in the sentence, but that was not the case.
23
+ # - Change in deleting problematic labels:
24
+ # No longer delete duplicate labels, since e.g. in the PT level there
25
+ # may be more than one NP label, and we want to keep those
26
+ #
27
+ # KE January 2007:
28
+ # write new adapted FNTab format
29
+ # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
30
+
31
+
32
+ require_relative 'frame_xml_sentence'
33
+ require 'salsa_tiger_xml/reg_xml'
34
+
35
+ class FrameXMLFile # only verified to work for FrameNet v1.1
36
+
37
+ def initialize(filename)
38
+ @filename = filename
39
+ file = File.new(filename)
40
+ counter = 0
41
+ while true
42
+ counter +=1
43
+ line = file.gets
44
+ if line =~ /<lexunit/
45
+ break
46
+ end
47
+ if counter > 3
48
+ STDERR.puts "Error: File "+filename+" does not conform to FrameNet v1.1 standard (lexunit in 3rd line)"
49
+ Kernel.exit
50
+ end
51
+ end
52
+ # found lexunit
53
+ string = line
54
+ while (line = file.gets)
55
+ string << line
56
+ end
57
+ @lexunit = STXML::RegXML.new(string)
58
+ attributes = @lexunit.attributes
59
+ @id = attributes["ID"]
60
+ attributes["name"] =~ /^([^.]+).([^.]+)$/
61
+ @lu = $1
62
+ @pos = $2.upcase
63
+ if @lu.nil?
64
+ raise "[framexml] no lemma in header of file #{@filename}"
65
+ elsif @pos.nil?
66
+ raise "[framexml] no pos in header of file #{@filename}"
67
+ end
68
+ @frame = attributes["frame"]
69
+ end
70
+
71
+ def get_lu
72
+ return @lu.gsub(" ","_")
73
+ end
74
+
75
+ def get_lu_id
76
+ return @id
77
+ end
78
+
79
+ def get_filename
80
+ return @filename
81
+ end
82
+
83
+ def get_pos
84
+ return @pos
85
+ end
86
+
87
+ def get_frame
88
+ return @frame
89
+ end
90
+
91
+ def close
92
+ end
93
+
94
+ def each_sentence
95
+ @lexunit.children_and_text.each { |subcorpus|
96
+ subcorpus.children_and_text.each { |annotationSet|
97
+ if annotationSet.name == "annotationSet"
98
+ # sentence found
99
+ yield FrameXMLSentence.new(annotationSet,self)
100
+ end
101
+ }
102
+ }
103
+ end
104
+ end
@@ -0,0 +1,411 @@
1
+ require 'frappe/Ampersand'
2
+ require 'frappe/utf_iso'
3
+
4
+ class FrameXMLSentence
5
+ def initialize(annotationSet, file_obj)
6
+ @file_obj = file_obj
7
+
8
+ # layers: hash layer_name -> array:[name, start, stop]
9
+ # name: name of the element, string
10
+ # start: start character, integer
11
+ # stop: end character, integer
12
+ @layers = {}
13
+
14
+ annotationSet.children_and_text.each { |sentence_or_layer_elt|
15
+
16
+ case sentence_or_layer_elt.name
17
+ when "sentence"
18
+ # sentence: has ID, its child is <text>[text]</text>
19
+ @sent_id = sentence_or_layer_elt.attributes["ID"]
20
+ text_elt = sentence_or_layer_elt.children_and_text.detect { |child|
21
+ child.name == "text"
22
+ }
23
+ if text_elt
24
+ # found the text element. its only child should be the text
25
+ @orig_text = text_elt.children_and_text.detect { |child|
26
+ child.text?
27
+ }
28
+ if @orig_text
29
+ # take text out of RegXMl object
30
+ @orig_text = @orig_text.to_s
31
+ end
32
+ end
33
+
34
+ when "layers"
35
+ # contains annotation layers
36
+ sentence_or_layer_elt.children_and_text.each { |layer|
37
+ unless layer.name == "layer"
38
+ # additional material, ignore
39
+ next
40
+ end
41
+
42
+ name = layer.attributes["name"]
43
+ unless name
44
+ raise "layer without a name"
45
+ end
46
+ unless @layers.key?(name)
47
+ @layers[name] = analyse_layer(layer, name)
48
+ end
49
+ }
50
+ end
51
+ }
52
+
53
+ @pos_text = ::Shalmaneser::Frappe::UtfIso.to_iso_8859_1(@orig_text).split(" ") # text with special characters replaced by iso8859 characters
54
+ @text = ::Shalmaneser::Frappe::Ampersand.utf8_to_hex(@orig_text).split(" ") # text with special characters replaced by &...; sequences
55
+
56
+ # all text and pos_text have the same number of elements!
57
+ @start_is = {} # map char indices (start of words) onto word indices
58
+ @stop_is = {} # map char indices (end of words) onto word indices
59
+ @charidx = [] # maps word indices on [start,stop]
60
+
61
+ @double_space = []
62
+ pos = 0
63
+ while (match = @orig_text.index(/(\s\s+)/,pos))
64
+ @double_space << match
65
+ pos = match+1
66
+ end
67
+
68
+
69
+ # fill start, stop and charidx arrays
70
+ char_i = 0
71
+ @pos_text.each_index {|word_i|
72
+ @start_is[char_i] = word_i
73
+ startchar = char_i
74
+ # puts "Remembering "+char_i.to_s+" as start index of word "+word_i.to_s
75
+ char_i += our_length(@pos_text[word_i])
76
+ @stop_is[char_i-1] = word_i
77
+
78
+ stopchar = char_i-1
79
+
80
+ # puts "Remembering "+(char_i-1).to_s+" as stop index of word "+word_i.to_s
81
+
82
+ @charidx << [startchar,stopchar]
83
+
84
+ # separators
85
+ if @double_space.include?(char_i) then
86
+ char_i += 2
87
+ else
88
+ char_i += 1
89
+ end
90
+ }
91
+ end
92
+
93
+ def get_file_obj
94
+ return @file_obj
95
+ end
96
+
97
+ def get_sent_id
98
+ return @sent_id
99
+ end
100
+
101
+ def print_text
102
+ puts "("+@id+ ")\t"+@text
103
+ end
104
+
105
+ def contains_FE_annotation_and_target
106
+ target_info = @layers["Target"][0]
107
+ unless target_info[0] == "Target"
108
+ STDERR.puts "Error in sentence from "+filename+": No target" # strictly speaking, no target at pos 0 in @layers["Target"]
109
+ STDERR.puts "Sentence: "+@text
110
+ return false
111
+ else
112
+ return (@layers.key?("FE") and target_info[2] != 0)
113
+ end
114
+ end
115
+
116
+ # we only verify the interesting layers (FE,GF,Target)
117
+ # if there is weird stuff going on on e.g. the Noun or Adj layer, we don't care.
118
+
119
+ def verify_annotation # returns true if some change has taken place
120
+ change = false
121
+ @layers.each_pair {|layername,l|
122
+
123
+ if layername=="FE" or layername=="GF" or layername=="PT" or layername=="Target" # only verify the "important" layers
124
+
125
+ l.each_index {|i|
126
+
127
+ element,start,stop = l[i]
128
+
129
+ newstart = start
130
+ newstop = stop
131
+
132
+ @charidx.each_index{|j|
133
+ unless j== 0
134
+ pstartidx, pstopidx = @charidx[j-1]
135
+ end
136
+ startidx, stopidx = @charidx[j]
137
+
138
+ if (start > startidx and start <= stopidx) or
139
+ (j != 0 and start > pstopidx and start < startidx)
140
+ newstart = startidx
141
+ end
142
+
143
+ if (stop >= startidx and stop < stopidx)
144
+ newstop = stopidx
145
+ elsif (j != 0 and stop > pstopidx and stop < startidx)
146
+ newstop = pstopidx
147
+ end
148
+
149
+ }
150
+ if start != newstart or stop != newstop
151
+ change = true
152
+ @layers[layername][i] = [element,newstart,newstop]
153
+ STDERR.puts "Heuristics has changed element "+element+" from ["+[start,stop].join(",")+"] to ["+[newstart,newstop].join(",")+"] in file "+@file_obj.get_filename+"."
154
+ markable_as_string(layername,element).each {|string|
155
+ STDERR.puts "New markable: "+string
156
+ }
157
+ STDERR.puts "Sentence: "+@pos_text.join(" ")
158
+ puts
159
+ end
160
+ }
161
+ end
162
+ }
163
+ return change
164
+ end
165
+
166
+ def print_conll_style
167
+ print_conll_style_to(STDOUT)
168
+ end
169
+
170
+ # CHANGED KE January 2007:
171
+ # write new adapted FNTab format
172
+ # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
173
+ def print_conll_style_to(out)
174
+
175
+ # even though in principle there might be multiple
176
+ # labels for one span [i.e. in one value of the
177
+ # {gf,fe,pt} hashes], we only ever record one
178
+
179
+ gf = {}
180
+ add_all_to_hash(gf,"GF")
181
+ fe = {}
182
+ add_all_to_hash(fe,"FE")
183
+ pt = {}
184
+ add_all_to_hash(pt,"PT")
185
+ target = {}
186
+ add_all_to_hash(target,"Target")
187
+
188
+ in_target = false
189
+
190
+ @pos_text.each_index {|i|
191
+ # write format:
192
+ # "word" "pt", "gf", "role", "target", "frame", "stuff" "ne", "sent_id"
193
+ line = []
194
+ # word
195
+ word = @pos_text[i]
196
+ line << word
197
+
198
+ start, stop = @charidx[i]
199
+ # "pt", "gf", "role",
200
+ [pt,gf,fe].each {|hash|
201
+ token = []
202
+ if hash.key?([start,"start"])
203
+ markables = hash.delete([start,"start"])
204
+ markables.each {|element|
205
+ token << "B-"+element
206
+ }
207
+ end
208
+ if hash.key?([stop,"stop"])
209
+ markables = hash.delete([stop,"stop"])
210
+ markables.each {|element|
211
+ token << "E-"+element
212
+ }
213
+ end
214
+ if token.empty?
215
+ line << "-"
216
+ else
217
+ line << token.sort.join(":")
218
+ end
219
+ }
220
+ # "target"
221
+ if target.key?([start,"start"])
222
+ target.delete([start,"start"])
223
+ in_target = true
224
+ end
225
+ if in_target
226
+ line << @file_obj.get_lu+"."+@file_obj.get_pos
227
+ else
228
+ line << "-"
229
+ end
230
+ if target.key?([stop,"stop"])
231
+ target.delete([stop,"stop"])
232
+ in_target = false
233
+ end
234
+ # "frame"
235
+ line << @file_obj.get_frame
236
+
237
+ # "stuff" "ne",
238
+ line << "-"
239
+ line << "-"
240
+
241
+ # "sent_id"
242
+ line << @file_obj.get_lu_id+"-"+@sent_id
243
+
244
+ out.puts line.join("\t")
245
+ }
246
+
247
+ out.puts
248
+
249
+ [gf,fe,pt,target].each {|hash|
250
+ unless hash.empty?
251
+ STDERR.puts @file_obj.get_filename
252
+ raise "**** Error: Hash not empty after creation of Sentence in CoNLL-Format (could not find matching words for some markup element)!"
253
+ end
254
+ }
255
+ end
256
+
257
+
258
+ def print_layers
259
+ @layers.each {|ln,l|
260
+ puts "Layer "+ln+":"
261
+ l.each {|element,start,stop|
262
+ puts "\t"+element+": "+start.to_s+" -- "+stop.to_s
263
+ }
264
+ puts "***"
265
+ }
266
+ end
267
+
268
+
269
+ private
270
+
271
+
272
+ def our_length(string) # (1) replace &...; with 1 char and " with two chars
273
+ return string.gsub(/&(.+?);/,"X").length
274
+ end
275
+
276
+ def is_fe(fename)
277
+ @layers["FE"].each {|name,start,stop|
278
+ if fename == name
279
+ return true
280
+ end
281
+ }
282
+ return false
283
+ end
284
+
285
+
286
+ def markable_as_string(layername,markup_name) # returns an array of all markables with this name
287
+
288
+ result = []
289
+
290
+ festart = nil
291
+ festop = nil
292
+ @layers[layername].each {|name,start,stop|
293
+ if markup_name == name
294
+ fe = []
295
+ infe = false
296
+ @charidx.each_index {|i|
297
+ startidx,stopidx = @charidx[i]
298
+ if startidx == start
299
+ infe = true
300
+ end
301
+ if infe
302
+ fe << @pos_text[i]
303
+ end
304
+ if stopidx == stop
305
+ result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+", VERIFIED]")
306
+ break
307
+ elsif stopidx > stop
308
+ result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+",ERROR]")
309
+ break
310
+ end
311
+ }
312
+ end
313
+ }
314
+ return result
315
+ end
316
+
317
+ def add_to_hash(hash,key,name)
318
+ exists = false
319
+ if hash.key?(key)
320
+ exists = true
321
+ else
322
+ hash[key] = []
323
+ hash[key] << name
324
+ end
325
+ return exists
326
+ end
327
+
328
+ def add_all_to_hash(hash,layername)
329
+ # use "uniq" to remove wrong double annotations
330
+ @layers[layername].uniq.each {|element,start,stop|
331
+ exists = add_to_hash(hash,[start, "start"],element)
332
+ if exists
333
+ STDERR.puts "Warning ["+@file_obj.get_filename+"]: In layer "+layername+", two elements start at position "+start.to_s+". Only using first. Layer as read from FrameXML: "+@layers[layername].map {|element,start,stop| element+" ("+start.to_s+","+stop.to_s+")"}.join(" ")
334
+ else
335
+ add_to_hash(hash,[stop, "stop"],element)
336
+ end
337
+ }
338
+ end
339
+
340
+
341
+ def analyse_layer(layer_elt,name) # read layer information from file and store in @layers
342
+ if name.nil?
343
+ STDERR.puts "Error: layer line "+line+" with empty name."
344
+ end
345
+
346
+ # thisLayer, retv: array:[name(string), start(integer), end(integer)]
347
+ thisLayer = []
348
+ retv = []
349
+
350
+ labels_elt = layer_elt.children_and_text.detect { |child| child.name == "labels"}
351
+ unless labels_elt
352
+ # no labels found, return empty array
353
+ return thisLayer
354
+ end
355
+
356
+ labels_elt.children_and_text.each { |label|
357
+ unless label.name == "label"
358
+ # some other markup, ignore
359
+ next
360
+ end
361
+
362
+ attributes = label.attributes
363
+ if attributes["itype"]
364
+ # null instantiation, don't retain
365
+ next
366
+ end
367
+ if not(attributes["start"]) and not(attributes["end"])
368
+ # no start and end labels
369
+ next
370
+ end
371
+ thisLayer << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
372
+ }
373
+
374
+ # sanity check: verify that
375
+ # 1. we don't have overlapping labels
376
+
377
+ deleteHash = {} # keep track of the labels which are to be deleted
378
+ # i -> Boolean
379
+
380
+ thisLayer.each_index {|i|
381
+ # efficiency: skip already delete labels
382
+ if deleteHash[i]
383
+ next
384
+ end
385
+ this_label, this_from , this_to = thisLayer[i]
386
+
387
+ # compare with all remaining labels
388
+ (i+1..thisLayer.length-1).to_a.each { |other_i|
389
+ other_label,other_from,other_to = thisLayer[other_i]
390
+
391
+ # overlap? Throw out the later FE
392
+ if this_from <= other_from and other_from <= this_to
393
+ $stderr.puts "Warning: Label overlap, deleting #{other_label}"
394
+ deleteHash[other_i] = true
395
+ elsif this_from <= other_to and other_to <= this_to
396
+ $stderr.puts "Warning: Label overlap, deleting #{this_label}"
397
+ deleteHash[i] = true
398
+ end
399
+ }
400
+ # matched with all other labels. If "keep", return
401
+
402
+ if deleteHash[i]
403
+ # $stderr.puts " deleting entry #{i}"
404
+ else
405
+ retv << thisLayer[i]
406
+ end
407
+ }
408
+
409
+ return retv
410
+ end
411
+ end