shalmaneser-lib 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/configuration/config_data.rb +457 -0
- data/lib/configuration/config_format_element.rb +210 -0
- data/lib/configuration/configuration_error.rb +15 -0
- data/lib/configuration/external_config_data.rb +56 -0
- data/lib/configuration/frappe_config_data.rb +134 -0
- data/lib/configuration/fred_config_data.rb +199 -0
- data/lib/configuration/rosy_config_data.rb +126 -0
- data/lib/db/db_interface.rb +50 -0
- data/lib/db/db_mysql.rb +141 -0
- data/lib/db/db_sqlite.rb +280 -0
- data/lib/db/db_table.rb +237 -0
- data/lib/db/db_view.rb +416 -0
- data/lib/db/db_wrapper.rb +175 -0
- data/lib/db/select_table_and_columns.rb +10 -0
- data/lib/db/sql_query.rb +243 -0
- data/lib/definitions.rb +19 -0
- data/lib/eval.rb +482 -0
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/external_systems.rb +251 -0
- data/lib/framenet_format/fn_corpus_aset.rb +209 -0
- data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
- data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
- data/lib/framenet_format/fn_database.rb +143 -0
- data/lib/framenet_format/frame_xml_file.rb +104 -0
- data/lib/framenet_format/frame_xml_sentence.rb +411 -0
- data/lib/logging.rb +25 -0
- data/lib/ml/classifier.rb +189 -0
- data/lib/ml/mallet.rb +236 -0
- data/lib/ml/maxent.rb +229 -0
- data/lib/ml/optimize.rb +195 -0
- data/lib/ml/timbl.rb +140 -0
- data/lib/monkey_patching/array.rb +82 -0
- data/lib/monkey_patching/enumerable_bool.rb +24 -0
- data/lib/monkey_patching/enumerable_distribute.rb +18 -0
- data/lib/monkey_patching/file.rb +131 -0
- data/lib/monkey_patching/subsumed.rb +24 -0
- data/lib/ruby_class_extensions.rb +4 -0
- data/lib/salsa_tiger_xml/corpus.rb +24 -0
- data/lib/salsa_tiger_xml/fe_node.rb +98 -0
- data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
- data/lib/salsa_tiger_xml/frame_node.rb +145 -0
- data/lib/salsa_tiger_xml/graph_node.rb +347 -0
- data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
- data/lib/salsa_tiger_xml/sem_node.rb +58 -0
- data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
- data/lib/salsa_tiger_xml/syn_node.rb +169 -0
- data/lib/salsa_tiger_xml/tree_node.rb +59 -0
- data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
- data/lib/salsa_tiger_xml/usp_node.rb +72 -0
- data/lib/salsa_tiger_xml/xml_node.rb +163 -0
- data/lib/shalmaneser/lib.rb +1 -0
- data/lib/tabular_format/fn_tab_format_file.rb +38 -0
- data/lib/tabular_format/fn_tab_frame.rb +67 -0
- data/lib/tabular_format/fn_tab_sentence.rb +169 -0
- data/lib/tabular_format/tab_format_file.rb +91 -0
- data/lib/tabular_format/tab_format_named_args.rb +184 -0
- data/lib/tabular_format/tab_format_sentence.rb +119 -0
- data/lib/value_restriction.rb +49 -0
- metadata +131 -0
@@ -0,0 +1,120 @@
|
|
1
|
+
# KE Dec 2006
|
2
|
+
# Access for FrameNet corpus XML file
|
3
|
+
# Mainly taken over from FramesXML
|
4
|
+
#
|
5
|
+
# changes:
|
6
|
+
# - no single frame for the whole corpus
|
7
|
+
# - below <sentence> level there is an <annotationSet> level.
|
8
|
+
# One annotationSet may include a single frame,
|
9
|
+
# or a reference to all named entities in a sentence
|
10
|
+
#
|
11
|
+
# Write out in tab format, one line per word:
|
12
|
+
# Format:
|
13
|
+
# word (pt gf role target frame stuff)* ne sent_id
|
14
|
+
# with
|
15
|
+
# word: word
|
16
|
+
# whole bracketed group: information about one frame annotation
|
17
|
+
# pt: phrase type
|
18
|
+
# gf: grammatical function
|
19
|
+
# role: frame element
|
20
|
+
# target: LU occurrence
|
21
|
+
# frame: frame
|
22
|
+
# stuff: support, and other things
|
23
|
+
# ne: named entity
|
24
|
+
# sent_id: sentence ID
|
25
|
+
|
26
|
+
#####################
|
27
|
+
# one FrameNet corpus
|
28
|
+
#
|
29
|
+
# just the filename is stored,
|
30
|
+
# the text is read only on demand
|
31
|
+
|
32
|
+
require_relative 'fn_corpus_xml_sentence'
|
33
|
+
|
34
|
+
class FNCorpusXMLFile
  ###
  # Only the filename is stored; the file is opened anew (and closed
  # again) by each of the iterators below.
  #
  # @param filename [String] path of the FrameNet corpus XML file
  def initialize(filename)
    @filename = filename
  end

  ###
  # yield each document in this corpus
  # as a string
  #
  # Expected file layout:
  #
  #   <corpus>
  #     <documents>
  #       <document ...>
  #       </document>
  #       <document ...>
  #       </document>
  #     </documents>
  #   </corpus>
  #
  # The line holding the opening tag is appended without its newline,
  # all following lines of the element keep theirs.
  #
  # @yieldparam doc_string [String] text from "<document " up to and
  #   including "</document>"
  # @return [Enumerator] if no block is given
  def each_document_string(&block)
    return enum_for(:each_document_string) unless block_given?

    each_element_string("document", false, &block)
  end

  ###
  # yield each sentence
  # as a FNCorpusXMLSentence object
  #
  # Expected file layout:
  #
  #   <corpus>
  #     <documents>
  #       <document ...>
  #         <paragraphs>
  #           <paragraph>
  #             <sentences>
  #               <sentence ...>
  #
  # All lines of a <sentence> element are joined without newlines
  # before being handed to FNCorpusXMLSentence.new.
  #
  # @yieldparam sentence [FNCorpusXMLSentence]
  # @return [Enumerator] if no block is given
  def each_sentence
    return enum_for(:each_sentence) unless block_given?

    each_element_string("sentence", true) do |sent_string|
      yield FNCorpusXMLSentence.new(sent_string)
    end
  end

  ###
  # print whole FN file in tab format
  #
  # @param file [IO] stream that receives the output
  def print_conll_style(file = $stdout)
    each_sentence { |s_obj| s_obj.print_conll_style(file) }
  end

  private

  ###
  # Line-based scanner shared by each_document_string and each_sentence.
  # Yields the text of every <tag ...> ... </tag> element of the file.
  #
  # The file is read with File.foreach, so the handle is closed when the
  # iteration finishes, is aborted with break, or raises.
  #
  # @param tag [String] element name, e.g. "document"
  # @param strip_newlines [Boolean] whether lines inside the element
  #   are chomped before they are appended
  def each_element_string(tag, strip_newlines)
    # "<tag" must be followed by whitespace, so that e.g. <documents>
    # is not mistaken for <document ...>
    open_re = /^.*?(<#{tag}\s.*)$/
    close_re = /^(.*?<\/#{tag}>).*$/

    # nil while we are outside of an element
    buffer = nil

    File.foreach(@filename) do |line|
      if buffer.nil?
        next unless line =~ open_re

        # start of element
        buffer = Regexp.last_match(1)

        # element opened and closed on one and the same line:
        # finish it right away instead of waiting for a later closing tag.
        # Anything following the closing tag on this line is ignored.
        if buffer =~ close_re
          yield Regexp.last_match(1)
          buffer = nil
        end
      elsif line =~ close_re
        # end of element
        buffer << Regexp.last_match(1)
        yield buffer
        buffer = nil
      else
        # within element
        buffer << (strip_newlines ? line.chomp : line)
      end
    end
  end
end
|
@@ -0,0 +1,299 @@
|
|
1
|
+
require_relative 'fn_corpus_aset'
|
2
|
+
require 'frappe/utf_iso'
|
3
|
+
require 'salsa_tiger_xml/reg_xml'
|
4
|
+
|
5
|
+
#######################################
|
6
|
+
# Keep one sentence from FN corpus XML
|
7
|
+
# as a RegXML object,
|
8
|
+
# offer printout in tabular format
|
9
|
+
class FNCorpusXMLSentence
  #########
  # @param sent_string [String] text of one <sentence> element,
  #   handed on to STXML::RegXML for parsing
  def initialize(sent_string)
    @sent = STXML::RegXML.new(sent_string)
    @sent_id = @sent.attributes["ID"]
  end

  ##############
  # print to file
  # in tabular format
  #
  # row format:
  # word (pt gf role target frame stuff)* ne sent_id
  #
  # word: word
  # whole bracketed group: information about one frame annotation
  #   pt: phrase type
  #   gf: grammatical function
  #   role: frame element
  #   target: LU occurrence
  #   frame: frame
  #   stuff: support, and other things
  # ne: named entity
  # sent_id: sentence ID
  #
  # One row is written per word, followed by one empty line.
  # The ne column is always present; it is "-" when the word is outside
  # of a named entity or when the sentence has no NER annotation set.
  # Labels whose offsets do not coincide with a word boundary cannot be
  # printed; they are reported on $stderr.
  #
  # @param file [IO] stream that receives the output
  # @raise [RuntimeError] if a row does not have 3 + 6x columns
  def print_conll_style(file = $stdout)
    pos_text, charidx = read_sentence
    asets = read_annotation_sets(charidx)

    # aset -> are we inside the target or not?
    in_target = Hash.new(false)
    # aset -> are we in all sorts of other annotations, like Support?
    in_stuff = {}
    # are we inside a named entity?
    in_ne = nil

    # record every opening and closing label we recognize,
    # to check later
    recognized_labels = {}

    # the NER annotation set and its layer are the same for all words
    ner = asets.detect { |a| a.aset_type == "NER" }
    ner_layer = ner && ner.layers["NER"]

    pos_text.each_index { |i|
      # add: word
      line = [pos_text[i]]

      start, stop = charidx[i]

      # iterate over the frames we have
      # add: (pt gf role target frame stuff)
      asets.each { |aset|
        # don't treat NEs as a frame here
        next unless aset.aset_type == "frame"

        # pt, gf, role
        ["PT", "GF", "FE"].each { |layer|
          token = []
          hash = aset.layers[layer]
          if hash.has_key?([start, "start"])
            recognized_labels[[layer, start, "start"]] = true
            hash[[start, "start"]].each { |element|
              token << "B-" + element
            }
          end
          if hash.has_key?([stop, "stop"])
            recognized_labels[[layer, stop, "stop"]] = true
            hash[[stop, "stop"]].each { |element|
              token << "E-" + element
            }
          end

          line << (token.empty? ? "-" : token.sort.join(":"))
        }

        # target
        # (the stop label is evaluated after the column has been written,
        # so that the last word of the target is still marked)
        target = aset.layers["Target"]
        if target.has_key?([start, "start"])
          recognized_labels[["Target", start, "start"]] = true
          in_target[aset] = true
        end
        line << (in_target[aset] ? aset.lu : "-")
        if target.has_key?([stop, "stop"])
          recognized_labels[["Target", stop, "stop"]] = true
          in_target[aset] = false
        end

        # frame
        line << aset.frame_name

        # stuff
        in_stuff[aset] ||= []
        aset.layers.each_key { |layer|
          # already done those
          next if ["PT", "GF", "FE", "Target"].include? layer

          # all the rest goes in "stuff"
          if aset.layers[layer].has_key?([start, "start"])
            aset.layers[layer][[start, "start"]].each { |entry|
              in_stuff[aset] << layer + "-" + entry
            }
            recognized_labels[[layer, start, "start"]] = true
          end
        }
        line << (in_stuff[aset].empty? ? "-" : in_stuff[aset].join(":"))
        # labels that stop at this word are removed only now,
        # after the column has been written
        aset.layers.each_key { |layer|
          if aset.layers[layer].has_key?([stop, "stop"])
            recognized_labels[[layer, stop, "stop"]] = true
            aset.layers[layer][[stop, "stop"]].each { |entry|
              in_stuff[aset].delete(layer + "-" + entry)
            }
          end
        }
      }

      # ne
      # This column is written for every row, also when the sentence has
      # no NER annotation set: the row format and the sanity check below
      # both require it.
      if ner_layer and ner_layer.has_key?([start, "start"])
        recognized_labels[["NER", start, "start"]] = true
        in_ne = ner_layer[[start, "start"]]
      end
      line << (in_ne ? in_ne.join(":") : "-")
      if ner_layer and ner_layer.has_key?([stop, "stop"])
        recognized_labels[["NER", stop, "stop"]] = true
        in_ne = nil
      end

      # sent id
      line << @sent_id

      # sanity check:
      # row format:
      # word (pt gf role target frame stuff)* ne sent_id
      # so number of columns must be 3 + 6x for some x >= 0
      unless (line.length - 3) % 6 == 0
        $stderr.puts "Something wrong with the line length."
        $stderr.puts "I have #{asets.length - 1} frames plus NEs, "
        $stderr.puts "but #{line.length} columns."
        raise "FNCorpusXMLSentence: row with #{line.length} columns does not fit the tab format"
      end

      file.puts line.join("\t")
    }

    # sanity check:
    # now count all labels,
    # to see if we've printed them all
    lost_labels = []
    asets.each { |aset|
      aset.layers.each_key { |layer|
        aset.layers[layer].each_key { |offset, start_or_stop|
          unless recognized_labels[[layer, offset, start_or_stop]]
            lost_labels << [layer, offset, start_or_stop,
                            aset.layers[layer][[offset, start_or_stop]]]
          end
        }
      }
    }
    unless lost_labels.empty?
      $stderr.puts "Offsets: "
      pos_text.each_index { |i|
        $stderr.puts "\t#{pos_text[i]} #{charidx[i][0]} #{charidx[i][1]}"
      }
      lost_labels.each { |layer, offset, start_or_stop, labels|
        $stderr.puts "FNCorpusXML warning: lost label"
        $stderr.puts "\tLayer #{layer}"
        $stderr.puts "\tOffset #{offset}"
        $stderr.puts "\tStatus #{start_or_stop}"
        $stderr.puts "\tLabels #{labels.join(" ")}"
      }
    end

    file.puts
  end

  ################
  private

  ###
  # read annotation sets:
  # parse the annotation sets in the @sent object,
  # return as:
  # array of FNCorpusAset objects
  #
  # @param charidx [Array] word offsets as returned by read_sentence,
  #   handed on to FNCorpusAset.new
  # @return [Array] empty if the sentence has no <annotationSets> child
  def read_annotation_sets(charidx)
    annotation_sets = @sent.first_child_matching("annotationSets")
    # return an empty list rather than nil, callers iterate over the result
    return [] unless annotation_sets

    frames = []
    annotation_sets.each_child_matching("annotationSet") { |aset|
      frames << FNCorpusAset.new(aset, charidx)
    }
    frames
  end

  ###
  # basically taken over from FrameXML.rb
  # read sentence words,
  # return as: sentence, indices
  # - sentence as array of strings, one word per string
  # - indices: array of pairs [word start char.index, word end char.index] int*int
  #
  # Both arrays have the same number of elements. They are empty if the
  # sentence has no <text> child or if that child contains no text.
  # Offsets count every whitespace character of the original text,
  # including leading whitespace and runs of more than two characters.
  #
  # @return [Array(Array<String>, Array<Array<Integer>>)]
  def read_sentence
    text_elt = @sent.first_child_matching("text")
    # no text found for this sentence
    return [[], []] unless text_elt

    text_node = text_elt.children_and_text.detect { |child| child.text? }
    # <text> element without any text in it
    return [[], []] unless text_node

    # take text out of RegXML object
    orig_text = text_node.to_s

    # text with special char.s replaced by iso8859 char.s
    pos_text = ::Shalmaneser::Frappe::UtfIso.to_iso_8859_1(orig_text).split(" ")

    # start index of each run of two or more whitespace characters
    # -> length of that run
    long_gaps = {}
    pos = 0
    while (gap = /\s\s+/.match(orig_text, pos))
      long_gaps[gap.begin(0)] = gap[0].length
      pos = gap.end(0)
    end

    # fill charidx array, maps word indices on [start,stop]
    charidx = []
    # split(" ") drops leading whitespace, the offsets must not
    char_i = orig_text[/\A\s*/].length
    pos_text.each { |word|
      startchar = char_i
      char_i += our_length(word)
      charidx << [startchar, char_i - 1]

      # separators: a single character unless a longer run starts here
      char_i += long_gaps.fetch(char_i, 1)
    }

    [pos_text, charidx]
  end

  ###
  # Length of a word for offset computation:
  # each &...; sequence counts as one character.
  #
  # @param string [String]
  # @return [Integer]
  def our_length(string)
    string.gsub(/&(.+?);/, "X").length
  end
end
|
@@ -0,0 +1,143 @@
|
|
1
|
+
# sp 28 06 04
|
2
|
+
#
|
3
|
+
# this module offers methods to extract gemma corpora from the FrameNet database
|
4
|
+
|
5
|
+
require_relative 'frame_xml_file'
|
6
|
+
|
7
|
+
class FNDatabase
  # predicate that accepts every file / every sentence
  ACCEPT_ALL = lambda { |_obj| true }

  ###
  # @param fn_path [String] directory with the FrameNet files
  #   (lu*.xml or lu*.xml.gz); a trailing "/" is added if missing
  def initialize(fn_path)
    fn_path += "/" unless fn_path.end_with?("/")
    @fn = fn_path
  end

  ###
  # fundamental access function to FrameXML files
  #
  # yields the sentences for which sent_pred holds, taken from
  # the files for which file_pred holds. verify_annotation is called on
  # each sentence before it is yielded.
  #
  # @param file_pred [#call] called with a FrameXMLFile object
  # @param sent_pred [#call] called with a sentence object of that file
  def each_matching_sentence(file_pred, sent_pred)
    each_matching_file(file_pred) { |frameNetFile|
      frameNetFile.each_sentence { |frameNetSent|
        if sent_pred.call(frameNetSent)
          frameNetSent.verify_annotation
          yield frameNetSent
        end
      }
    }
  end

  ###
  # fundamental access function to FrameXML files
  #
  # yields file (FrameXMLFile) objects which match file_pred.
  # Every file object is closed after use, also when the block
  # raises or breaks out of the iteration.
  #
  # @param file_pred [#call] called with a FrameXMLFile object
  def each_matching_file(file_pred)
    each_framexml_file { |frameNetFile|
      begin
        yield frameNetFile if file_pred.call(frameNetFile)
      ensure
        frameNetFile.close
      end
    }
  end

  ###
  # write all sentences of the given frame that have
  # FE annotation and a target to outfile, in tab format
  #
  # @param frame [String] compared with FrameXMLFile#get_frame
  # @param outfile [IO]
  def extract_frame(frame, outfile)
    extract_annotated(lambda { |fnfile| fnfile.get_frame == frame }, outfile)
  end

  ###
  # write all sentences of the given lemma that have
  # FE annotation and a target to outfile, in tab format
  #
  # @param lemma [String] compared with FrameXMLFile#get_lu
  # @param outfile [IO]
  def extract_lemma(lemma, outfile)
    extract_annotated(lambda { |fnfile| fnfile.get_lu == lemma }, outfile)
  end

  ###
  # write the sentences of all files to outdirectory,
  # one file <frame>.tab per frame
  #
  # Files that received no sentence are removed again. Only files
  # created by this call are removed; other empty files in
  # outdirectory are left alone.
  #
  # @param outdirectory [String] existing directory;
  #   a trailing "/" is added if missing
  def extract_everything(outdirectory)
    outdirectory += "/" unless outdirectory.end_with?("/")

    # frame name -> output file
    outfiles = {}
    begin
      each_matching_sentence(ACCEPT_ALL, ACCEPT_ALL) { |fnsent|
        frame = fnsent.get_file_obj.get_frame
        outfiles[frame] ||= File.new(outdirectory + frame + ".tab", "w")
        if fnsent.contains_FE_annotation_and_target
          fnsent.print_conll_style_to(outfiles[frame])
        end
      }
    ensure
      # close output files, also if something went wrong on the way
      outfiles.each_value { |file| file.close }
    end

    # remove zero-size files
    outfiles.each_value { |file|
      File.unlink(file.path) if FileTest.zero?(file.path)
    }
  end

  private

  ###
  # shared body of extract_frame and extract_lemma
  def extract_annotated(file_pred, outfile)
    each_matching_sentence(file_pred, ACCEPT_ALL) { |fnsent|
      if fnsent.contains_FE_annotation_and_target
        fnsent.print_conll_style_to(outfile)
      end
    }
  end

  ###
  # yield a FrameXMLFile object for each lu*.xml.gz and
  # each lu*.xml file in the FrameNet directory
  def each_framexml_file
    # files might be zipped:
    # copy them to /tmp/ and unpack them there.
    # The unpacked copies are not removed afterwards.
    Dir[@fn + "lu*.xml.gz"].each { |gzfile|
      tmp_gz = "/tmp/" + File.basename(gzfile)
      # argument lists instead of one command string:
      # no shell involved, so blanks etc. in paths do no harm
      unless Kernel.system("cp", gzfile, "/tmp/") &&
             Kernel.system("gunzip", "-f", tmp_gz)
        $stderr.puts "FNDatabase warning: could not unpack #{gzfile}, skipping it"
        next
      end
      yield FrameXMLFile.new("/tmp/" + File.basename(gzfile, ".gz"))
    }
    # or might not
    # (@fn already ends in "/")
    Dir[@fn + "lu*.xml"].each { |filename|
      yield FrameXMLFile.new(filename)
    }
  end

  # I don't really remember what this was good for ;-)

  #   def browse_everything(allFiles)
  #     if allFiles
  #       Dir[fn+"*.xml.gz"].each {|gzfile|
  #         Kernel.system("cp "+gzfile+" /tmp/")
  #         Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
  #         gzfile =~ /(.+)\.gz/
  #         # STDERR.puts File.basename($1)
  #         # STDERR.print "."
  #         ff = FrameXMLFile.new("/tmp/"+File.basename($1))
  #         ff.each_sentence {|s|
  #           if s.contains_FE_annotation_and_target
  #             s.verify_annotation
  #             if s.verify_annotation
  #               puts "****************** Error: Still problems after 2nd verification!"
  #             end
  #             s.print_conll_style
  #           end
  #         }
  #       }
  #     else
  #       ff = FrameXMLFile.new("/tmp/lu1870.xml")
  #       ff.each_sentence {|s|
  #         if s.contains_FE_annotation_and_target
  #           s.verify_annotation
  #           if s.verify_annotation
  #             puts "****************** Error: Still problems after 2nd verification!"
  #           end
  #           # s.print_layers
  #           s.print_conll_style
  #         end
  #       }
  #     end
  #   end
end
|