shalmaneser-lib 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,120 @@
1
+ # KE Dec 2006
2
+ # Access for FrameNet corpus XML file
3
+ # Mainly taken over from FramesXML
4
+ #
5
+ # changes:
6
+ # - no single frame for the whole corpus
7
+ # - below <sentence> level there is an <annotationSet> level.
8
+ # One annotationSet may include a single frame,
9
+ # or a reference to all named entities in a sentence
10
+ #
11
+ # Write out in tab format, one line per word:
12
+ # Format:
13
+ # word (pt gf role target frame stuff)* ne sent_id
14
+ # with
15
+ # word: word
16
+ # whole bracketed group: information about one frame annotation
17
+ # pt: phrase type
18
+ # gf: grammatical function
19
+ # role: frame element
20
+ # target: LU occurrence
21
+ # frame: frame
22
+ # stuff: support, and other things
23
+ # ne: named entity
24
+ # sent_id: sentence ID
25
+
26
+ #####################
27
+ # one FrameNet corpus
28
+ #
29
+ # just the filename is stored,
30
+ # the text is read only on demand
31
+
32
+ require_relative 'fn_corpus_xml_sentence'
33
+
34
class FNCorpusXMLFile

  ###
  # filename: path of the FrameNet corpus XML file.
  # Only the name is stored here; the file is read on demand
  # each time one of the iterators below is called.
  def initialize(filename)
    @filename = filename
  end

  ###
  # yield each <document> element of this corpus as a string
  #
  # The string starts at "<document " and ends with "</document>".
  # The line break of the line holding the opening tag is dropped,
  # line breaks of the lines in between are kept.
  #
  # Returns an Enumerator if no block is given.
  #
  # Expected file layout:
  # <corpus>
  #   <documents>
  #     <document ...>
  #     </document>
  #     <document ...>
  #     </document>
  #   </documents>
  # </corpus>
  def each_document_string(&block)
    return enum_for(:each_document_string) unless block_given?

    each_element_string("document", false, &block)
  end

  ###
  # yield each <sentence> element of this corpus
  # as a FNCorpusXMLSentence object
  #
  # The sentence text handed to FNCorpusXMLSentence has all line
  # breaks removed, i.e. the lines of the element are concatenated.
  #
  # Returns an Enumerator if no block is given.
  #
  # Expected file layout:
  # <corpus>
  #   <documents>
  #     <document ...>
  #       <paragraphs>
  #         <paragraph>
  #           <sentences>
  #             <sentence ...>
  def each_sentence
    return enum_for(:each_sentence) unless block_given?

    each_element_string("sentence", true) do |sent_string|
      yield FNCorpusXMLSentence.new(sent_string)
    end
  end

  ###
  # print whole FN file in tab format
  #
  # file: stream to print to, anything the sentence objects'
  #       print_conll_style accepts (default: $stdout)
  def print_conll_style(file = $stdout)
    each_sentence { |s_obj| s_obj.print_conll_style(file) }
  end

  ################
  private

  ###
  # Line-based scanner shared by each_document_string and each_sentence.
  #
  # Yields the text of every <tag ...> ... </tag> element as a string.
  # The file is opened for the duration of the scan only and is closed
  # again even if the caller leaves the block early.
  #
  # tag:            element name, e.g. "document"
  # strip_newlines: if true, the line break is removed from the lines
  #                 strictly between opening and closing tag
  def each_element_string(tag, strip_newlines)
    opening = /<#{tag}\s/
    # text appended from the line that holds the opening tag:
    # everything from the tag up to (not including) the line break
    opening_rest = /<#{tag}\s.*/
    closing = "</#{tag}>"

    # nil while we are outside of an element
    buffer = nil

    File.foreach(@filename) do |line|
      rest = line

      # A single line may hold the end of one element and the start
      # (or even the whole) of the next one, so keep scanning the
      # remainder of the line until nothing relevant is left.
      loop do
        just_opened = buffer.nil?
        if just_opened
          start = rest =~ opening
          break unless start

          buffer = "".dup
          rest = rest[start..-1]
        end

        stop = rest.index(closing)
        if stop
          # end of the element: keep everything up to the closing tag
          cut = stop + closing.length
          buffer << rest[0, cut]
          yield buffer
          buffer = nil
          rest = rest[cut..-1]
        else
          # element continues on the next line
          buffer << if just_opened
                      rest[opening_rest]
                    elsif strip_newlines
                      rest.chomp
                    else
                      rest
                    end
          break
        end
      end
    end
  end
end
@@ -0,0 +1,299 @@
1
+ require_relative 'fn_corpus_aset'
2
+ require 'frappe/utf_iso'
3
+ require 'salsa_tiger_xml/reg_xml'
4
+
5
+ #######################################
6
+ # Keep one sentence from FN corpus XML
7
+ # as a RegXML object,
8
+ # offer printout in tabular format
9
class FNCorpusXMLSentence
  # layers of a frame annotation set that get a B-/E- column of their own
  BOUNDARY_LAYERS = %w[PT GF FE].freeze
  # layers that are not repeated in the "stuff" column
  DEDICATED_LAYERS = (BOUNDARY_LAYERS + ["Target"]).freeze

  #########
  # sent_string: text of one <sentence> element of a FN corpus XML file;
  # it is parsed into a RegXML object, and the sentence ID is read
  # from its "ID" attribute.
  def initialize(sent_string)
    @sent = STXML::RegXML.new(sent_string)
    @sent_id = @sent.attributes["ID"]
  end

  ##############
  # print to file
  # in tabular format, one row per word, followed by one empty row
  #
  # row format:
  # word (pt gf role target frame stuff)* ne sent_id
  #
  # word: word
  # whole bracketed group: information about one frame annotation
  # pt: phrase type
  # gf: grammatical function
  # role: frame element
  # target: LU occurrence
  # frame: frame
  # stuff: support, and other things
  # ne: named entity ("-" if the word is in no named entity, or if the
  #     sentence has no NER annotation set at all)
  # sent_id: sentence ID
  #
  # pt, gf and role hold the labels that start ("B-label") or end
  # ("E-label") at this word, joined by ":", or "-" if there are none.
  #
  # file: stream to print to (default: $stdout)
  #
  # Labels that could not be attached to any word are reported
  # on $stderr. Raises a RuntimeError if a row ends up with an
  # unexpected number of columns.
  def print_conll_style(file = $stdout)
    pos_text, charidx = read_sentence
    asets = read_annotation_sets(charidx)

    # aset -> are we inside the target or not?
    in_target = Hash.new(false)
    # aset -> "layer-label" entries of all other annotation
    # (like Support) that is currently open
    in_stuff = Hash.new { |hash, aset| hash[aset] = [] }
    # labels of the named entity we are inside of, or nil
    in_ne = nil

    # record every opening and closing label we recognize,
    # to check later
    recognized_labels = {}

    # the named entity annotation does not change from word to word
    ner = asets.detect { |aset| aset.aset_type == "NER" }
    ner_labels = ner && ner.layers["NER"]

    pos_text.each_with_index do |word, i|
      # add: word
      line = [word]

      start, stop = charidx[i]

      # iterate over the frames we have
      # add: (pt gf role target frame stuff)
      asets.each do |aset|
        # don't treat NEs as a frame here
        next unless aset.aset_type == "frame"

        # pt, gf, role
        BOUNDARY_LAYERS.each do |layer|
          # NOTE(review): a missing layer is treated like an empty one;
          # confirm whether FNCorpusAset can actually omit a layer.
          labels = aset.layers[layer] || {}
          token = []

          if labels.key?([start, "start"])
            recognized_labels[[layer, start, "start"]] = true
            labels[[start, "start"]].each { |label| token << "B-#{label}" }
          end
          if labels.key?([stop, "stop"])
            recognized_labels[[layer, stop, "stop"]] = true
            labels[[stop, "stop"]].each { |label| token << "E-#{label}" }
          end

          line << (token.empty? ? "-" : token.sort.join(":"))
        end

        # target: the LU is printed for every word from the start
        # of the target up to and including its last word
        target = aset.layers["Target"] || {}
        if target.key?([start, "start"])
          recognized_labels[["Target", start, "start"]] = true
          in_target[aset] = true
        end
        line << (in_target[aset] ? aset.lu : "-")
        if target.key?([stop, "stop"])
          recognized_labels[["Target", stop, "stop"]] = true
          in_target[aset] = false
        end

        # frame
        line << aset.frame_name

        # stuff: all the remaining layers.
        # Entries are opened before the column is written and closed
        # afterwards, so a label covers its last word, too.
        open_stuff = in_stuff[aset]
        aset.layers.each_key do |layer|
          # already done those
          next if DEDICATED_LAYERS.include?(layer)
          next unless aset.layers[layer].key?([start, "start"])

          aset.layers[layer][[start, "start"]].each do |entry|
            open_stuff << "#{layer}-#{entry}"
          end
          recognized_labels[[layer, start, "start"]] = true
        end
        line << (open_stuff.empty? ? "-" : open_stuff.join(":"))
        aset.layers.each_key do |layer|
          next unless aset.layers[layer].key?([stop, "stop"])

          recognized_labels[[layer, stop, "stop"]] = true
          aset.layers[layer][[stop, "stop"]].each do |entry|
            open_stuff.delete("#{layer}-#{entry}")
          end
        end
      end

      # ne
      # This column is always written, also for sentences without any
      # NER annotation set: the row format requires it.
      if ner_labels && ner_labels.key?([start, "start"])
        recognized_labels[["NER", start, "start"]] = true
        in_ne = ner_labels[[start, "start"]]
      end
      line << (in_ne ? in_ne.join(":") : "-")
      if ner_labels && ner_labels.key?([stop, "stop"])
        recognized_labels[["NER", stop, "stop"]] = true
        in_ne = nil
      end

      # sent id
      line << @sent_id

      # sanity check:
      # row format:
      # word (pt gf role target frame stuff)* ne sent_id
      # so number of columns must be 3 + 6x for some x >= 0
      unless (line.length - 3) % 6 == 0
        frame_count = asets.count { |aset| aset.aset_type == "frame" }
        $stderr.puts "Something wrong with the line length."
        $stderr.puts "I have #{frame_count} frames plus NEs, "
        $stderr.puts "but #{line.length} columns."
        raise "FNCorpusXMLSentence: unexpected number of columns " \
              "(#{line.length}) in sentence #{@sent_id}"
      end

      file.puts line.join("\t")
    end

    # sanity check:
    # now count all labels,
    # to see if we've printed them all
    lost_labels = []
    asets.each do |aset|
      aset.layers.each_key do |layer|
        aset.layers[layer].each_key do |offset, start_or_stop|
          next if recognized_labels[[layer, offset, start_or_stop]]

          lost_labels << [layer, offset, start_or_stop,
                          aset.layers[layer][[offset, start_or_stop]]]
        end
      end
    end
    unless lost_labels.empty?
      $stderr.puts "Offsets: "
      pos_text.each_index do |i|
        $stderr.puts "\t#{pos_text[i]} #{charidx[i][0]} #{charidx[i][1]}"
      end
      lost_labels.each do |layer, offset, start_or_stop, labels|
        $stderr.puts "FNCorpusXML warning: lost label"
        $stderr.puts "\tLayer #{layer}"
        $stderr.puts "\tOffset #{offset}"
        $stderr.puts "\tStatus #{start_or_stop}"
        $stderr.puts "\tLabels #{labels.join(" ")}"
      end
    end

    # empty row: end of sentence
    file.puts
  end

  ################
  private

  ###
  # read annotation sets:
  # parse the annotation sets in the @sent object,
  # return as:
  # array of FNCorpusAset objects
  # (empty array if the sentence has no <annotationSets> element)
  #
  # charidx: word boundaries as returned by read_sentence,
  #          passed on to FNCorpusAset
  def read_annotation_sets(charidx)
    annotation_sets = @sent.first_child_matching("annotationSets")
    return [] unless annotation_sets

    frames = []
    annotation_sets.each_child_matching("annotationSet") do |aset|
      frames << FNCorpusAset.new(aset, charidx)
    end
    frames
  end

  ###
  # basically taken over from FrameXML.rb
  # read sentence words,
  # return as: sentence, indices
  # - sentence as array of strings, one word per string
  # - indices: array of pairs [word start char.index, word end char.index] int*int
  # Both arrays have the same number of elements; both are empty
  # if the sentence has no <text> element or the element has no text.
  def read_sentence
    text_elt = @sent.first_child_matching("text")
    # no text found for this sentence
    return [[], []] unless text_elt

    text_node = text_elt.children_and_text.detect { |child| child.text? }
    return [[], []] unless text_node

    # take text out of RegXML object
    orig_text = text_node.to_s

    # text with special char.s replaced by iso8859 char.s
    pos_text = ::Shalmaneser::Frappe::UtfIso.to_iso_8859_1(orig_text).split(" ")

    # Words are usually separated by exactly one character.
    # Remember the exceptions: start index -> length of each longer
    # whitespace run.
    # NOTE(review): these are raw indices into orig_text, while the
    # word lengths below count an XML entity as a single character;
    # an entity in front of a longer whitespace run makes the two
    # disagree. Kept as it was, confirm against the annotation offsets.
    wide_gaps = {}
    orig_text.scan(/\s{2,}/) do |gap|
      wide_gaps[Regexp.last_match.begin(0)] = gap.length
    end

    # fill charidx array: maps word indices on [start,stop]
    charidx = []
    # split(" ") drops leading whitespace, the offsets must not
    char_i = orig_text[/\A\s*/].length
    pos_text.each do |word|
      startchar = char_i
      char_i += our_length(word)
      charidx << [startchar, char_i - 1]

      # separators
      char_i += wide_gaps.fetch(char_i, 1)
    end

    [pos_text, charidx]
  end

  ###
  # length of a word for the purpose of character offsets:
  # each XML entity (&...;) counts as a single character
  def our_length(string)
    string.gsub(/&(.+?);/, "X").length
  end

end
@@ -0,0 +1,143 @@
1
+ # sp 28 06 04
2
+ #
3
+ # this module offers methods to extract gemma corpora from the FrameNet database
4
+
5
+ require_relative 'frame_xml_file'
6
+
7
class FNDatabase
  # predicate that accepts every file / every sentence
  ACCEPT_ALL = ->(_item) { true }

  # fn_path: directory holding the FrameNet lexical unit files
  # (lu*.xml or lu*.xml.gz), with or without trailing slash
  def initialize(fn_path)
    @fn = fn_path.end_with?("/") ? fn_path : fn_path + "/"
  end

  # fundamental access function to FrameXML files
  #
  # yields the sentences (of the files accepted by file_pred)
  # that are accepted by sent_pred; verify_annotation is called
  # on each sentence before it is yielded.
  #
  # file_pred: object responding to call(file_object)
  # sent_pred: object responding to call(sentence_object)
  def each_matching_sentence(file_pred, sent_pred)
    each_matching_file(file_pred) do |fn_file|
      fn_file.each_sentence do |fn_sent|
        next unless sent_pred.call(fn_sent)

        fn_sent.verify_annotation
        yield fn_sent
      end
    end
  end

  # fundamental access function to FrameXML files
  #
  # yields file (FrameXMLFile) objects which match file_pred.
  # Every file object is closed after use, whether it matched or
  # not, and also when the caller leaves the block early.
  def each_matching_file(file_pred)
    each_framexml_file do |fn_file|
      begin
        yield fn_file if file_pred.call(fn_file)
      ensure
        fn_file.close
      end
    end
  end

  # print all sentences of the given frame that have FE annotation
  # and a target to outfile, in tab format
  def extract_frame(frame, outfile)
    print_matching_sentences(->(fn_file) { fn_file.get_frame == frame }, outfile)
  end

  # print all sentences of the given lemma (LU) that have FE
  # annotation and a target to outfile, in tab format
  def extract_lemma(lemma, outfile)
    print_matching_sentences(->(fn_file) { fn_file.get_lu == lemma }, outfile)
  end

  # write all sentences that have FE annotation and a target
  # to files <outdirectory>/<frame>.tab, one file per frame
  #
  # Files that end up empty are removed again. Only files written
  # by this call are touched, other files in outdirectory are left alone.
  def extract_everything(outdirectory)
    outfiles = {}
    begin
      each_matching_sentence(ACCEPT_ALL, ACCEPT_ALL) do |fn_sent|
        frame = fn_sent.get_file_obj.get_frame
        outfiles[frame] ||= File.new(File.join(outdirectory, frame + ".tab"), "w")
        if fn_sent.contains_FE_annotation_and_target
          fn_sent.print_conll_style_to(outfiles[frame])
        end
      end
    ensure
      # close output files, also if something went wrong on the way
      outfiles.each_value(&:close)
    end

    # remove zero-size files
    outfiles.each_value do |file|
      File.unlink(file.path) if File.zero?(file.path)
    end
  end

  private

  # shared implementation of extract_frame and extract_lemma
  def print_matching_sentences(file_pred, outfile)
    each_matching_sentence(file_pred, ACCEPT_ALL) do |fn_sent|
      fn_sent.print_conll_style_to(outfile) if fn_sent.contains_FE_annotation_and_target
    end
  end

  # yield a FrameXMLFile object for each lu*.xml(.gz) file
  # in the FrameNet directory
  def each_framexml_file
    # files might be zipped:
    # unpack a copy in /tmp/ and read that.
    # The commands are passed as argument lists, so no shell is
    # involved and unusual characters in the path do no harm.
    Dir[@fn + "lu*.xml.gz"].each do |gzfile|
      unpacked = "/tmp/" + File.basename(gzfile, ".gz")
      unless Kernel.system("cp", gzfile, "/tmp/") &&
             Kernel.system("gunzip", "-f", unpacked + ".gz")
        # do not read a stale or missing file
        $stderr.puts "FNDatabase warning: could not unpack #{gzfile}, skipping it."
        next
      end
      yield FrameXMLFile.new(unpacked)
    end
    # or might not
    Dir[@fn + "lu*.xml"].each do |filename|
      yield FrameXMLFile.new(filename)
    end
  end

  # I don't really remember what this was good for ;-)

  # def browse_everything(allFiles)
  #   if allFiles
  #     Dir[fn+"*.xml.gz"].each {|gzfile|
  #       Kernel.system("cp "+gzfile+" /tmp/")
  #       Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
  #       gzfile =~ /(.+)\.gz/
  #       # STDERR.puts File.basename($1)
  #       # STDERR.print "."
  #       ff = FrameXMLFile.new("/tmp/"+File.basename($1))
  #       ff.each_sentence {|s|
  #         if s.contains_FE_annotation_and_target
  #           s.verify_annotation
  #           if s.verify_annotation
  #             puts "****************** Error: Still problems after 2nd verification!"
  #           end
  #           s.print_conll_style
  #         end
  #       }
  #     }
  #   else
  #     ff = FrameXMLFile.new("/tmp/lu1870.xml")
  #     ff.each_sentence {|s|
  #       if s.contains_FE_annotation_and_target
  #         s.verify_annotation
  #         if s.verify_annotation
  #           puts "****************** Error: Still problems after 2nd verification!"
  #         end
  #         # s.print_layers
  #         s.print_conll_style
  #       end
  #     }
  #   end
  # end

end