shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
data/lib/framenet_format/frame_xml_file.rb
@@ -0,0 +1,104 @@
+ # sp 18 06 2004
+ #
+ # access to FrameNet XML files, sentences, and annotation.
+ #
+ # sp 10 11 04: only data from the first layer with name XY is
+ # used for output. Other data is saved in layer XY.2nd, but is
+ # currently not processed.
+ #
+ # sp 22 05 04: also, if two labels exist which cover the same span
+ # (ie there is a double annotation within the same layer), ignore
+ # all but the first label.
+ #
+ # ke 13 07 05:
+ # - changed to RegXMl.rb
+ # - fixed two problems in analyse_layer:
+ #   - Deleting problematic labels:
+ #     For some reason, thisLayer[i+1..-1].each_index {|other_i|
+ #     included the index 0 in any case, resulting in the 1st
+ #     label being deleted in any case.
+ #   - Deleting problematic labels, checking for label overlap:
+ #     The old formulation worked only if labels occurred in the array
+ #     in the order they occurred in the sentence, but that was not the case.
+ # - Change in deleting problematic labels:
+ #   No longer delete duplicate labels, since e.g. in the PT level there
+ #   may be more than one NP label, and we want to keep those
+ #
+ # KE January 2007:
+ # write new adapted FNTab format
+ # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
+
+
+ require_relative 'frame_xml_sentence'
+ require 'salsa_tiger_xml/reg_xml'
+
+ class FrameXMLFile # only verified to work for FrameNet v1.1
+
+   def initialize(filename)
+     @filename = filename
+     file = File.new(filename)
+     counter = 0
+     while true
+       counter +=1
+       line = file.gets
+       if line =~ /<lexunit/
+         break
+       end
+       if counter > 3
+         STDERR.puts "Error: File "+filename+" does not conform to FrameNet v1.1 standard (lexunit in 3rd line)"
+         Kernel.exit
+       end
+     end
+     # found lexunit
+     string = line
+     while (line = file.gets)
+       string << line
+     end
+     @lexunit = STXML::RegXML.new(string)
+     attributes = @lexunit.attributes
+     @id = attributes["ID"]
+     attributes["name"] =~ /^([^.]+).([^.]+)$/
+     @lu = $1
+     @pos = $2.upcase
+     if @lu.nil?
+       raise "[framexml] no lemma in header of file #{@filename}"
+     elsif @pos.nil?
+       raise "[framexml] no pos in header of file #{@filename}"
+     end
+     @frame = attributes["frame"]
+   end
+
+   def get_lu
+     return @lu.gsub(" ","_")
+   end
+
+   def get_lu_id
+     return @id
+   end
+
+   def get_filename
+     return @filename
+   end
+
+   def get_pos
+     return @pos
+   end
+
+   def get_frame
+     return @frame
+   end
+
+   def close
+   end
+
+   def each_sentence
+     @lexunit.children_and_text.each { |subcorpus|
+       subcorpus.children_and_text.each { |annotationSet|
+         if annotationSet.name == "annotationSet"
+           # sentence found
+           yield FrameXMLSentence.new(annotationSet,self)
+         end
+       }
+     }
+   end
+ end
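For orientation: FrameXMLFile reads a single FrameNet lexical-unit XML file and yields one FrameXMLSentence per annotationSet. A minimal usage sketch, assuming the gem's lib/ directory is on the load path; the input path is hypothetical and only illustrates the API shown in the diff above:

  require 'framenet_format/frame_xml_file'

  # Hypothetical path to a FrameNet v1.1 lexical-unit XML file.
  fxf = FrameXMLFile.new("lu/abandon.v.xml")
  puts "#{fxf.get_lu}.#{fxf.get_pos} (frame: #{fxf.get_frame})"
  fxf.each_sentence do |sentence|
    # each_sentence yields FrameXMLSentence objects, one per annotationSet
    puts sentence.get_sent_id
  end
  fxf.close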
data/lib/framenet_format/frame_xml_sentence.rb
@@ -0,0 +1,411 @@
+ require 'frappe/Ampersand'
+ require 'frappe/utf_iso'
+
+ class FrameXMLSentence
+   def initialize(annotationSet, file_obj)
+     @file_obj = file_obj
+
+     # layers: hash layer_name -> array:[name, start, stop]
+     #  name: name of the element, string
+     #  start: start character, integer
+     #  stop: end character, integer
+     @layers = {}
+
+     annotationSet.children_and_text.each { |sentence_or_layer_elt|
+
+       case sentence_or_layer_elt.name
+       when "sentence"
+         # sentence: has ID, its child is <text>[text]</text>
+         @sent_id = sentence_or_layer_elt.attributes["ID"]
+         text_elt = sentence_or_layer_elt.children_and_text.detect { |child|
+           child.name == "text"
+         }
+         if text_elt
+           # found the text element. its only child should be the text
+           @orig_text = text_elt.children_and_text.detect { |child|
+             child.text?
+           }
+           if @orig_text
+             # take text out of RegXMl object
+             @orig_text = @orig_text.to_s
+           end
+         end
+
+       when "layers"
+         # contains annotation layers
+         sentence_or_layer_elt.children_and_text.each { |layer|
+           unless layer.name == "layer"
+             # additional material, ignore
+             next
+           end
+
+           name = layer.attributes["name"]
+           unless name
+             raise "layer without a name"
+           end
+           unless @layers.key?(name)
+             @layers[name] = analyse_layer(layer, name)
+           end
+         }
+       end
+     }
+
+     @pos_text = ::Shalmaneser::Frappe::UtfIso.to_iso_8859_1(@orig_text).split(" ") # text with special characters replaced by iso8859 characters
+     @text = ::Shalmaneser::Frappe::Ampersand.utf8_to_hex(@orig_text).split(" ") # text with special characters replaced by &...; sequences
+
+     # all text and pos_text have the same number of elements!
+     @start_is = {} # map char indices (start of words) onto word indices
+     @stop_is = {}  # map char indices (end of words) onto word indices
+     @charidx = []  # maps word indices on [start,stop]
+
+     @double_space = []
+     pos = 0
+     while (match = @orig_text.index(/(\s\s+)/,pos))
+       @double_space << match
+       pos = match+1
+     end
+
+
+     # fill start, stop and charidx arrays
+     char_i = 0
+     @pos_text.each_index {|word_i|
+       @start_is[char_i] = word_i
+       startchar = char_i
+       # puts "Remembering "+char_i.to_s+" as start index of word "+word_i.to_s
+       char_i += our_length(@pos_text[word_i])
+       @stop_is[char_i-1] = word_i
+
+       stopchar = char_i-1
+
+       # puts "Remembering "+(char_i-1).to_s+" as stop index of word "+word_i.to_s
+
+       @charidx << [startchar,stopchar]
+
+       # separators
+       if @double_space.include?(char_i) then
+         char_i += 2
+       else
+         char_i += 1
+       end
+     }
+   end
+
+   def get_file_obj
+     return @file_obj
+   end
+
+   def get_sent_id
+     return @sent_id
+   end
+
+   def print_text
+     puts "("+@id+ ")\t"+@text
+   end
+
+   def contains_FE_annotation_and_target
+     target_info = @layers["Target"][0]
+     unless target_info[0] == "Target"
+       STDERR.puts "Error in sentence from "+filename+": No target" # strictly speaking, no target at pos 0 in @layers["Target"]
+       STDERR.puts "Sentence: "+@text
+       return false
+     else
+       return (@layers.key?("FE") and target_info[2] != 0)
+     end
+   end
+
+   # we only verify the interesting layers (FE,GF,Target)
+   # if there is weird stuff going on on e.g. the Noun or Adj layer, we don't care.
+
+   def verify_annotation # returns true if some change has taken place
+     change = false
+     @layers.each_pair {|layername,l|
+
+       if layername=="FE" or layername=="GF" or layername=="PT" or layername=="Target" # only verify the "important" layers
+
+         l.each_index {|i|
+
+           element,start,stop = l[i]
+
+           newstart = start
+           newstop = stop
+
+           @charidx.each_index{|j|
+             unless j== 0
+               pstartidx, pstopidx = @charidx[j-1]
+             end
+             startidx, stopidx = @charidx[j]
+
+             if (start > startidx and start <= stopidx) or
+                (j != 0 and start > pstopidx and start < startidx)
+               newstart = startidx
+             end
+
+             if (stop >= startidx and stop < stopidx)
+               newstop = stopidx
+             elsif (j != 0 and stop > pstopidx and stop < startidx)
+               newstop = pstopidx
+             end
+
+           }
+           if start != newstart or stop != newstop
+             change = true
+             @layers[layername][i] = [element,newstart,newstop]
+             STDERR.puts "Heuristics has changed element "+element+" from ["+[start,stop].join(",")+"] to ["+[newstart,newstop].join(",")+"] in file "+@file_obj.get_filename+"."
+             markable_as_string(layername,element).each {|string|
+               STDERR.puts "New markable: "+string
+             }
+             STDERR.puts "Sentence: "+@pos_text.join(" ")
+             puts
+           end
+         }
+       end
+     }
+     return change
+   end
+
+   def print_conll_style
+     print_conll_style_to(STDOUT)
+   end
+
+   # CHANGED KE January 2007:
+   # write new adapted FNTab format
+   # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
+   def print_conll_style_to(out)
+
+     # even though in principle there might be multiple
+     # labels for one span [i.e. in one value of the
+     # {gf,fe,pt} hashes], we only ever record one
+
+     gf = {}
+     add_all_to_hash(gf,"GF")
+     fe = {}
+     add_all_to_hash(fe,"FE")
+     pt = {}
+     add_all_to_hash(pt,"PT")
+     target = {}
+     add_all_to_hash(target,"Target")
+
+     in_target = false
+
+     @pos_text.each_index {|i|
+       # write format:
+       # "word" "pt", "gf", "role", "target", "frame", "stuff" "ne", "sent_id"
+       line = []
+       # word
+       word = @pos_text[i]
+       line << word
+
+       start, stop = @charidx[i]
+       # "pt", "gf", "role",
+       [pt,gf,fe].each {|hash|
+         token = []
+         if hash.key?([start,"start"])
+           markables = hash.delete([start,"start"])
+           markables.each {|element|
+             token << "B-"+element
+           }
+         end
+         if hash.key?([stop,"stop"])
+           markables = hash.delete([stop,"stop"])
+           markables.each {|element|
+             token << "E-"+element
+           }
+         end
+         if token.empty?
+           line << "-"
+         else
+           line << token.sort.join(":")
+         end
+       }
+       # "target"
+       if target.key?([start,"start"])
+         target.delete([start,"start"])
+         in_target = true
+       end
+       if in_target
+         line << @file_obj.get_lu+"."+@file_obj.get_pos
+       else
+         line << "-"
+       end
+       if target.key?([stop,"stop"])
+         target.delete([stop,"stop"])
+         in_target = false
+       end
+       # "frame"
+       line << @file_obj.get_frame
+
+       # "stuff" "ne",
+       line << "-"
+       line << "-"
+
+       # "sent_id"
+       line << @file_obj.get_lu_id+"-"+@sent_id
+
+       out.puts line.join("\t")
+     }
+
+     out.puts
+
+     [gf,fe,pt,target].each {|hash|
+       unless hash.empty?
+         STDERR.puts @file_obj.get_filename
+         raise "**** Error: Hash not empty after creation of Sentence in CoNLL-Format (could not find matching words for some markup element)!"
+       end
+     }
+   end
+
+
+   def print_layers
+     @layers.each {|ln,l|
+       puts "Layer "+ln+":"
+       l.each {|element,start,stop|
+         puts "\t"+element+": "+start.to_s+" -- "+stop.to_s
+       }
+       puts "***"
+     }
+   end
+
+
+   private
+
+
+   def our_length(string) # (1) replace &...; with 1 char and " with two chars
+     return string.gsub(/&(.+?);/,"X").length
+   end
+
+   def is_fe(fename)
+     @layers["FE"].each {|name,start,stop|
+       if fename == name
+         return true
+       end
+     }
+     return false
+   end
+
+
+   def markable_as_string(layername,markup_name) # returns an array of all markables with this name
+
+     result = []
+
+     festart = nil
+     festop = nil
+     @layers[layername].each {|name,start,stop|
+       if markup_name == name
+         fe = []
+         infe = false
+         @charidx.each_index {|i|
+           startidx,stopidx = @charidx[i]
+           if startidx == start
+             infe = true
+           end
+           if infe
+             fe << @pos_text[i]
+           end
+           if stopidx == stop
+             result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+", VERIFIED]")
+             break
+           elsif stopidx > stop
+             result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+",ERROR]")
+             break
+           end
+         }
+       end
+     }
+     return result
+   end
+
+   def add_to_hash(hash,key,name)
+     exists = false
+     if hash.key?(key)
+       exists = true
+     else
+       hash[key] = []
+       hash[key] << name
+     end
+     return exists
+   end
+
+   def add_all_to_hash(hash,layername)
+     # use "uniq" to remove wrong double annotations
+     @layers[layername].uniq.each {|element,start,stop|
+       exists = add_to_hash(hash,[start, "start"],element)
+       if exists
+         STDERR.puts "Warning ["+@file_obj.get_filename+"]: In layer "+layername+", two elements start at position "+start.to_s+". Only using first. Layer as read from FrameXML: "+@layers[layername].map {|element,start,stop| element+" ("+start.to_s+","+stop.to_s+")"}.join(" ")
+       else
+         add_to_hash(hash,[stop, "stop"],element)
+       end
+     }
+   end
+
+
+   def analyse_layer(layer_elt,name) # read layer information from file and store in @layers
+     if name.nil?
+       STDERR.puts "Error: layer line "+line+" with empty name."
+     end
+
+     # thisLayer, retv: array:[name(string), start(integer), end(integer)]
+     thisLayer = []
+     retv = []
+
+     labels_elt = layer_elt.children_and_text.detect { |child| child.name == "labels"}
+     unless labels_elt
+       # no labels found, return empty array
+       return thisLayer
+     end
+
+     labels_elt.children_and_text.each { |label|
+       unless label.name == "label"
+         # some other markup, ignore
+         next
+       end
+
+       attributes = label.attributes
+       if attributes["itype"]
+         # null instantiation, don't retain
+         next
+       end
+       if not(attributes["start"]) and not(attributes["end"])
+         # no start and end labels
+         next
+       end
+       thisLayer << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
+     }
+
+     # sanity check: verify that
+     # 1. we don't have overlapping labels
+
+     deleteHash = {} # keep track of the labels which are to be deleted
+                     # i -> Boolean
+
+     thisLayer.each_index {|i|
+       # efficiency: skip already delete labels
+       if deleteHash[i]
+         next
+       end
+       this_label, this_from , this_to = thisLayer[i]
+
+       # compare with all remaining labels
+       (i+1..thisLayer.length-1).to_a.each { |other_i|
+         other_label,other_from,other_to = thisLayer[other_i]
+
+         # overlap? Throw out the later FE
+         if this_from <= other_from and other_from <= this_to
+           $stderr.puts "Warning: Label overlap, deleting #{other_label}"
+           deleteHash[other_i] = true
+         elsif this_from <= other_to and other_to <= this_to
+           $stderr.puts "Warning: Label overlap, deleting #{this_label}"
+           deleteHash[i] = true
+         end
+       }
+       # matched with all other labels. If "keep", return
+
+       if deleteHash[i]
+         # $stderr.puts " deleting entry #{i}"
+       else
+         retv << thisLayer[i]
+       end
+     }
+
+     return retv
+   end
+ end
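Taken together, the two classes above convert a FrameNet lexical-unit XML file into the adapted FNTab format described in the header comment (one tab-separated line per token, blank line between sentences). A sketch of that flow, assuming lib/ is on the load path; the input and output paths are hypothetical:

  require 'framenet_format/frame_xml_file'

  fxf = FrameXMLFile.new("lu/abandon.v.xml")   # hypothetical input file
  File.open("abandon.v.tab", "w") do |out|     # hypothetical output file
    fxf.each_sentence do |sentence|
      next unless sentence.contains_FE_annotation_and_target
      sentence.verify_annotation          # heuristically re-align label spans to word boundaries
      sentence.print_conll_style_to(out)  # write the sentence in FNTab format
    end
  end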