shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,120 @@
1
+ # KE Dec 2006
2
+ # Access for FrameNet corpus XML file
3
+ # Mainly taken over from FramesXML
4
+ #
5
+ # changes:
6
+ # - no single frame for the whole corpus
7
+ # - below <sentence> level there is an <annotationSet> level.
8
+ # One annotationSet may include a single frame,
9
+ # or a reference to all named entities in a sentence
10
+ #
11
+ # Write out in tab format, one line per word:
12
+ # Format:
13
+ # word (pt gf role target frame stuff)* ne sent_id
14
+ # with
15
+ # word: word
16
+ # whole bracketed group: information about one frame annotation
17
+ # pt: phrase type
18
+ # gf: grammatical function
19
+ # role: frame element
20
+ # target: LU occurrence
21
+ # frame: frame
22
+ # stuff: support, and other things
23
+ # ne: named entity
24
+ # sent_id: sentence ID
25
+
26
+ #####################
27
+ # one FrameNet corpus
28
+ #
29
+ # just the filename is stored,
30
+ # the text is read only on demand
31
+
32
+ require_relative 'fn_corpus_xml_sentence'
33
+
34
# Access to one FrameNet corpus XML file.
#
# Only the filename is stored; the file is re-read line by line
# every time one of the iterators below is called.
class FNCorpusXMLFile
  ###
  # filename: path of the FrameNet corpus XML file (string)
  def initialize(filename)
    @filename = filename
  end

  ###
  # yield each document in this corpus
  # as a string
  #
  # Expected layout of the file:
  #
  # <corpus>
  #  <documents>
  #   <document ...>
  #   </document>
  #   <document ...>
  #   </document>
  #  </documents>
  # </corpus>
  #
  # The yielded string starts at the opening <document ...> tag and ends
  # at the closing </document> tag. Lines in between keep their newlines.
  def each_document_string
    each_element_string("document", false) { |doc_string| yield doc_string }
  end

  ###
  # yield each sentence
  # as a FNCorpusXMLSentence object
  #
  # Expected layout of the file:
  #
  # <corpus>
  #  <documents>
  #   <document ...>
  #    <paragraphs>
  #     <paragraph>
  #      <sentences>
  #       <sentence ...>
  #
  # Lines between the opening and the closing tag are joined
  # without their line terminators.
  def each_sentence
    each_element_string("sentence", true) { |sent_string|
      yield FNCorpusXMLSentence.new(sent_string)
    }
  end

  ###
  # print whole FN file in tab format
  #
  # file: stream to write to, handed on to
  #       FNCorpusXMLSentence#print_conll_style
  def print_conll_style(file = $stdout)
    each_sentence { |s_obj| s_obj.print_conll_style(file) }
  end

  ################
  private

  ###
  # Read the file line by line and yield the text of every
  # <tag ...> ... </tag> element as one string.
  #
  # tag: element name; the opening tag must be followed by whitespace,
  #      so <documents> is not mistaken for <document ...>
  # chomp_inner_lines: if true, lines strictly between the opening and
  #      the closing tag are appended without their line terminator
  #
  # Text before the opening tag and after the closing tag is dropped.
  # The file is closed when iteration ends, also if the caller breaks
  # out of the block or the block raises.
  def each_element_string(tag, chomp_inner_lines)
    start_re = /^.*?(<#{tag}\s.*)$/
    end_re = /^(.*?<\/#{tag}>).*$/
    same_line_re = /\A.*?<\/#{tag}>/

    buffer = ""
    inside = false

    File.foreach(@filename) { |line|
      if !inside && line =~ start_re
        # start of element
        opening = Regexp.last_match(1)
        complete = opening[same_line_re]
        if complete
          # element opened and closed on the same line:
          # it is complete, do not wait for a later closing tag
          yield complete
        else
          inside = true
          buffer << opening
        end
      elsif inside && line =~ end_re
        # end of element
        buffer << Regexp.last_match(1)
        yield buffer
        # start a fresh string: the yielded one now belongs to the caller
        buffer = ""
        inside = false
      elsif inside
        # within element
        buffer << (chomp_inner_lines ? line.chomp : line)
      end
    }
  end
end
@@ -0,0 +1,299 @@
1
+ require_relative 'fn_corpus_aset'
2
+ require 'frappe/utf_iso'
3
+ require 'salsa_tiger_xml/reg_xml'
4
+
5
+ #######################################
6
+ # Keep one sentence from FN corpus XML
7
+ # as a RegXML object,
8
+ # offer printout in tabular format
9
+ class FNCorpusXMLSentence
10
+
11
+ #########
12
+ def initialize(sent_string)
13
+ @sent = STXML::RegXML.new(sent_string)
14
+ @sent_id = @sent.attributes["ID"]
15
+ end
16
+
17
+ ##############
18
+ # print to file
19
+ # in tabular format
20
+ #
21
+ # row format:
22
+ # word (pt gf role target frame stuff)* ne sent_id
23
+ #
24
+ # word: word
25
+ # whole bracketed group: information about one frame annotation
26
+ # pt: phrase type
27
+ # gf: grammatical function
28
+ # role: frame element
29
+ # target: LU occurrence
30
+ # frame: frame
31
+ # stuff: support, and other things
32
+ # ne: named entity
33
+ # sent_id: sentence ID
34
+ def print_conll_style(file = $stdout)
35
+ pos_text, charidx = read_sentence
36
+ asets = read_annotation_sets(charidx)
37
+
38
+ # aset -> are we inside the target or not?
39
+ in_target = Hash.new(false)
40
+ # aset -> are we in all sorts of other annotations, like Support?
41
+ in_stuff = {}
42
+ # are we inside a named entity?
43
+ in_ne = nil
44
+
45
+ # record every opening and closing label we recognize,
46
+ # to check later
47
+ recognized_labels = {}
48
+
49
+ pos_text.each_index {|i|
50
+ line = []
51
+ word = pos_text[i]
52
+
53
+ # add: word
54
+ line << word
55
+
56
+ start, stop = charidx[i]
57
+
58
+ # iterate over the frames we have
59
+ # add: (pt gf role target frame stuff)
60
+ asets.each { |aset|
61
+ unless aset.aset_type == "frame"
62
+ # don't treat NEs as a frame here
63
+ next
64
+ end
65
+
66
+ # pt, gf, role
67
+ ["PT", "GF", "FE"].each { |layer|
68
+ token = []
69
+ hash = aset.layers[layer]
70
+ if hash.has_key?([start,"start"])
71
+ recognized_labels[[layer, start, "start"]] = true
72
+
73
+ markables = hash[[start,"start"]]
74
+ markables.each {|element|
75
+ token << "B-"+element
76
+ }
77
+ end
78
+ if hash.has_key?([stop,"stop"])
79
+ recognized_labels[[layer, stop, "stop"]] = true
80
+
81
+ markables = hash[[stop,"stop"]]
82
+ markables.each {|element|
83
+ token << "E-"+element
84
+ }
85
+ end
86
+
87
+ if token.empty?
88
+ line << "-"
89
+ else
90
+ line << token.sort.join(":")
91
+ end
92
+ }
93
+
94
+ # target
95
+ target = aset.layers["Target"]
96
+ if target.has_key?([start,"start"])
97
+ recognized_labels[["Target", start, "start"]] = true
98
+ in_target[aset] = true
99
+ end
100
+ if in_target[aset]
101
+ line << aset.lu
102
+ else
103
+ line << "-"
104
+ end
105
+ if target.has_key?([stop,"stop"])
106
+ recognized_labels[["Target", stop, "stop"]] = true
107
+ in_target[aset] = false
108
+ end
109
+
110
+ # frame
111
+ line << aset.frame_name
112
+
113
+ # stuff
114
+ unless in_stuff.has_key?(aset)
115
+ in_stuff[aset] = []
116
+ end
117
+ aset.layers.each_key { |layer|
118
+ if ["PT", "GF", "FE", "Target"].include? layer
119
+ # already done those
120
+ next
121
+ end
122
+ # all the rest goes in "stuff"
123
+ if aset.layers[layer].has_key?([start, "start"])
124
+ aset.layers[layer][[start, "start"]].each { |entry|
125
+ in_stuff[aset] << layer + "-" + entry
126
+ }
127
+ recognized_labels[[layer, start, "start"]] = true
128
+ end
129
+ }
130
+ if in_stuff[aset].empty?
131
+ line << "-"
132
+ else
133
+ line << in_stuff[aset].join(":")
134
+ end
135
+ aset.layers.each_key { |layer|
136
+ if aset.layers[layer].has_key?([stop, "stop"])
137
+ recognized_labels[[layer, stop, "stop"]] = true
138
+ aset.layers[layer][[stop, "stop"]].each { |entry|
139
+ in_stuff[aset].delete(layer + "-" + entry)
140
+ }
141
+ end
142
+ }
143
+ }
144
+
145
+ # ne
146
+ if (ner = asets.detect { |a| a.aset_type == "NER" })
147
+ if ner.layers["NER"] and ner.layers["NER"].has_key?([start, "start"])
148
+ recognized_labels[["NER", start, "start"]] = true
149
+ in_ne = ner.layers["NER"][[start,"start"]]
150
+ end
151
+ if in_ne
152
+ line << in_ne.join(":")
153
+ else
154
+ line << "-"
155
+ end
156
+ if ner.layers["NER"] and ner.layers["NER"].has_key?([stop, "stop"])
157
+ recognized_labels[["NER", stop, "stop"]] = true
158
+ in_ne = nil
159
+ end
160
+ end
161
+
162
+ # sent id
163
+ line << @sent_id
164
+
165
+ # sanity check:
166
+ # row format:
167
+ # word (pt gf role target frame stuff)* ne sent_id
168
+ # so number of columns must be 3 + 6x for some x >= 0
169
+ unless (line.length - 3)%6 == 0
170
+ $stderr.puts "Something wrong with the line length."
171
+ $stderr.puts "I have #{asets.length - 1} frames plus NEs, "
172
+ $stderr.puts "but #{line.length} columns."
173
+ raise
174
+ end
175
+
176
+
177
+ file.puts line.join("\t")
178
+ }
179
+
180
+ # sanity check:
181
+ # now count all labels,
182
+ # to see if we've printed them all
183
+ lost_labels = []
184
+ asets.each { |aset|
185
+ aset.layers.each_key { |layer|
186
+ aset.layers[layer].each_key { |offset, start_or_stop|
187
+ unless recognized_labels[[layer, offset, start_or_stop]]
188
+ lost_labels << [layer, offset, start_or_stop,
189
+ aset.layers[layer][[offset, start_or_stop]]]
190
+ end
191
+ }
192
+ }
193
+ }
194
+ unless lost_labels.empty?
195
+ $stderr.puts "Offsets: "
196
+ pos_text.each_index { |i|
197
+ $stderr.puts "\t#{pos_text[i]} #{charidx[i][0]} #{charidx[i][1]}"
198
+ }
199
+ # $stderr.puts "Recognized:"
200
+ # recognized_labels.each_key { |k|
201
+ # $stderr.puts "\t" + k.to_s
202
+ # }
203
+ lost_labels.each { |layer, offset, start_or_stop, labels|
204
+ $stderr.puts "FNCorpusXML warning: lost label"
205
+ $stderr.puts "\tLayer #{layer}"
206
+ $stderr.puts "\tOffset #{offset}"
207
+ $stderr.puts "\tStatus #{start_or_stop}"
208
+ $stderr.puts "\tLabels #{labels.join(" ")}"
209
+ }
210
+ end
211
+
212
+ file.puts
213
+ end
214
+
215
+ ################
216
+ private
217
+
218
+ ###
219
+ # read annotation sets:
220
+ # parse the annotation sets in the @sent object,
221
+ # return as:
222
+ # array of FNCorpusAset objects
223
+ def read_annotation_sets(charidx)
224
+ unless (annotation_sets = @sent.first_child_matching("annotationSets"))
225
+ return
226
+ end
227
+
228
+ # return values
229
+ frames = []
230
+
231
+ annotation_sets.each_child_matching("annotationSet") { |aset|
232
+ frames << FNCorpusAset.new(aset, charidx)
233
+ }
234
+
235
+ return frames
236
+ end
237
+
238
+ ###
239
+ # basically taken over from FrameXML.rb
240
+ # read sentence words,
241
+ # return as: sentence, indices
242
+ # - sentence as array of strings, one word per string
243
+ # - indices: array of pairs [word start char.index, word end char.index] int*int
244
+ def read_sentence
245
+ # all text and pos_text have the same number of elements!
246
+ charidx = [] # maps word indices on [start,stop]
247
+ pos_text = []
248
+
249
+ unless (text_elt = @sent.first_child_matching("text"))
250
+ # no text found for this sentence
251
+ return [pos_text, charidx]
252
+ end
253
+
254
+ orig_text = text_elt.children_and_text.detect { |child|
255
+ child.text?
256
+ }
257
+ if orig_text
258
+ # take text out of RegXMl object
259
+ orig_text = orig_text.to_s
260
+ end
261
+
262
+ pos_text = ::Shalmaneser::Frappe::UtfIso.to_iso_8859_1(orig_text).split(" ") # text with special char.s replaced by iso8859 char.s
263
+
264
+ double_space = []
265
+ pos = 0
266
+ while (match = orig_text.index(/(\s\s+)/,pos))
267
+ double_space << match
268
+ pos = match+1
269
+ end
270
+
271
+ # fill charidx array
272
+ char_i = 0
273
+ pos_text.each_index {|word_i|
274
+ startchar = char_i
275
+ # puts "Remembering "+char_i.to_s+" as start index of word "+word_i.to_s
276
+ char_i += our_length(pos_text[word_i])
277
+ stopchar = char_i-1
278
+
279
+ # puts "Remembering "+(char_i-1).to_s+" as stop index of word "+word_i.to_s
280
+
281
+ charidx << [startchar,stopchar]
282
+
283
+ # separators
284
+ if double_space.include?(char_i) then
285
+ char_i += 2
286
+ else
287
+ char_i += 1
288
+ end
289
+ }
290
+
291
+ return [pos_text, charidx]
292
+ end
293
+
294
+ ###
295
+ def our_length(string) # (1) replace &...; with 1 char and " with two chars
296
+ return string.gsub(/&(.+?);/,"X").length
297
+ end
298
+
299
+ end
@@ -0,0 +1,143 @@
1
+ # sp 28 06 04
2
+ #
3
+ # this module offers methods to extract gemma corpora from the FrameNet database#
4
+
5
+ require_relative 'frame_xml_file'
6
+
7
# Access to the lexical unit files (lu*.xml or lu*.xml.gz) in a
# FrameNet directory, with methods that write matching sentences
# out in tabular format.
class FNDatabase
  ###
  # fn_path: directory with the lu*.xml / lu*.xml.gz files (string);
  #   a trailing "/" is added if missing
  def initialize(fn_path)
    fn_path += "/" unless fn_path[-1, 1] == "/"
    @fn = fn_path
  end

  ###
  # fundamental access function to FrameXML files
  #
  # yields each sentence object for which sent_pred.call(sentence) is true,
  # taken from the files for which file_pred.call(file) is true.
  # verify_annotation is called on the sentence before it is yielded.
  #
  # file_pred, sent_pred: objects responding to call (e.g. Proc)
  def each_matching_sentence(file_pred, sent_pred)
    each_matching_file(file_pred) { |frame_net_file|
      frame_net_file.each_sentence { |frame_net_sent|
        if sent_pred.call(frame_net_sent)
          frame_net_sent.verify_annotation
          yield frame_net_sent
        end
      }
    }
  end

  ###
  # fundamental access function to FrameXML files
  #
  # yields the file (FrameXMLFile) objects which match file_pred.
  # Every file object is closed after it has been handled, also when
  # it does not match, when the caller breaks out of the block,
  # or when an exception is raised.
  def each_matching_file(file_pred)
    each_framexml_file { |frame_net_file|
      begin
        yield frame_net_file if file_pred.call(frame_net_file)
      ensure
        frame_net_file.close
      end
    }
  end

  ###
  # write all sentences of the files whose get_frame equals frame
  # to outfile, if they contain FE annotation and a target
  def extract_frame(frame, outfile)
    print_annotated_sentences(proc { |fnfile| fnfile.get_frame == frame }, outfile)
  end

  ###
  # write all sentences of the files whose get_lu equals lemma
  # to outfile, if they contain FE annotation and a target
  def extract_lemma(lemma, outfile)
    print_annotated_sentences(proc { |fnfile| fnfile.get_lu == lemma }, outfile)
  end

  ###
  # write the sentences of all files to outdirectory,
  # one file <frame>.tab per frame
  #
  # outdirectory: existing directory (string); a trailing "/" is added
  #   if missing
  #
  # Only sentences that contain FE annotation and a target are written.
  # Output files that stay empty are removed again. Other files in
  # outdirectory are left alone.
  def extract_everything(outdirectory)
    outdirectory += "/" unless outdirectory[-1, 1] == "/"

    accept_all = proc { |_obj| true }
    outfiles = {}
    begin
      each_matching_sentence(accept_all, accept_all) { |fnsent|
        frame = fnsent.get_file_obj.get_frame
        unless outfiles.key?(frame)
          outfiles[frame] = File.new(outdirectory + frame + ".tab", "w")
        end
        if fnsent.contains_FE_annotation_and_target
          fnsent.print_conll_style_to(outfiles[frame])
        end
      }
    ensure
      # close output files, also if extraction failed half-way
      outfiles.each_value { |file| file.close }
    end

    # remove zero-size files, but only those created above
    outfiles.each_value { |file|
      File.unlink(file.path) if FileTest.zero?(file.path)
    }
  end

  private

  ###
  # write each sentence from the files matching file_pred to outfile,
  # if the sentence contains FE annotation and a target
  def print_annotated_sentences(file_pred, outfile)
    each_matching_sentence(file_pred, proc { |_fnsent| true }) { |fnsent|
      if fnsent.contains_FE_annotation_and_target
        fnsent.print_conll_style_to(outfile)
      end
    }
  end

  ###
  # yield a FrameXMLFile object for each lu*.xml.gz and each lu*.xml
  # file in the FrameNet directory
  #
  # Zipped files are copied to /tmp/ and unzipped there; the unzipped
  # copy is what gets opened. The copies are not removed afterwards.
  def each_framexml_file
    # files might be zipped
    Dir[@fn + "lu*.xml.gz"].each { |gzfile|
      # one argument per word: the path is never parsed by a shell,
      # so spaces and metacharacters in it are harmless
      Kernel.system("cp", gzfile, "/tmp/")
      Kernel.system("gunzip", "-f", "/tmp/" + File.basename(gzfile))
      yield FrameXMLFile.new("/tmp/" + File.basename(gzfile, ".gz"))
    }
    # or might not
    # (@fn already ends in "/")
    Dir[@fn + "lu*.xml"].each { |filename|
      yield FrameXMLFile.new(filename)
    }
  end

  # I don't really remember what this was good for ;-)

  #   def browse_everything(allFiles)
  #     if allFiles
  #       Dir[fn+"*.xml.gz"].each {|gzfile|
  #         Kernel.system("cp "+gzfile+" /tmp/")
  #         Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
  #         gzfile =~ /(.+)\.gz/
  #         # STDERR.puts File.basename($1)
  #         # STDERR.print "."
  #         ff = FrameXMLFile.new("/tmp/"+File.basename($1))
  #         ff.each_sentence {|s|
  #           if s.contains_FE_annotation_and_target
  #             s.verify_annotation
  #             if s.verify_annotation
  #               puts "****************** Error: Still problems after 2nd verification!"
  #             end
  #             s.print_conll_style
  #           end
  #         }
  #       }
  #     else
  #       ff = FrameXMLFile.new("/tmp/lu1870.xml")
  #       ff.each_sentence {|s|
  #         if s.contains_FE_annotation_and_target
  #           s.verify_annotation
  #           if s.verify_annotation
  #             puts "****************** Error: Still problems after 2nd verification!"
  #           end
  #           # s.print_layers
  #           s.print_conll_style
  #         end
  #       }
  #     end
  #   end
end