shalmaneser-lib 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/configuration/config_data.rb +457 -0
- data/lib/configuration/config_format_element.rb +210 -0
- data/lib/configuration/configuration_error.rb +15 -0
- data/lib/configuration/external_config_data.rb +56 -0
- data/lib/configuration/frappe_config_data.rb +134 -0
- data/lib/configuration/fred_config_data.rb +199 -0
- data/lib/configuration/rosy_config_data.rb +126 -0
- data/lib/db/db_interface.rb +50 -0
- data/lib/db/db_mysql.rb +141 -0
- data/lib/db/db_sqlite.rb +280 -0
- data/lib/db/db_table.rb +237 -0
- data/lib/db/db_view.rb +416 -0
- data/lib/db/db_wrapper.rb +175 -0
- data/lib/db/select_table_and_columns.rb +10 -0
- data/lib/db/sql_query.rb +243 -0
- data/lib/definitions.rb +19 -0
- data/lib/eval.rb +482 -0
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/external_systems.rb +251 -0
- data/lib/framenet_format/fn_corpus_aset.rb +209 -0
- data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
- data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
- data/lib/framenet_format/fn_database.rb +143 -0
- data/lib/framenet_format/frame_xml_file.rb +104 -0
- data/lib/framenet_format/frame_xml_sentence.rb +411 -0
- data/lib/logging.rb +25 -0
- data/lib/ml/classifier.rb +189 -0
- data/lib/ml/mallet.rb +236 -0
- data/lib/ml/maxent.rb +229 -0
- data/lib/ml/optimize.rb +195 -0
- data/lib/ml/timbl.rb +140 -0
- data/lib/monkey_patching/array.rb +82 -0
- data/lib/monkey_patching/enumerable_bool.rb +24 -0
- data/lib/monkey_patching/enumerable_distribute.rb +18 -0
- data/lib/monkey_patching/file.rb +131 -0
- data/lib/monkey_patching/subsumed.rb +24 -0
- data/lib/ruby_class_extensions.rb +4 -0
- data/lib/salsa_tiger_xml/corpus.rb +24 -0
- data/lib/salsa_tiger_xml/fe_node.rb +98 -0
- data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
- data/lib/salsa_tiger_xml/frame_node.rb +145 -0
- data/lib/salsa_tiger_xml/graph_node.rb +347 -0
- data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
- data/lib/salsa_tiger_xml/sem_node.rb +58 -0
- data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
- data/lib/salsa_tiger_xml/syn_node.rb +169 -0
- data/lib/salsa_tiger_xml/tree_node.rb +59 -0
- data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
- data/lib/salsa_tiger_xml/usp_node.rb +72 -0
- data/lib/salsa_tiger_xml/xml_node.rb +163 -0
- data/lib/shalmaneser/lib.rb +1 -0
- data/lib/tabular_format/fn_tab_format_file.rb +38 -0
- data/lib/tabular_format/fn_tab_frame.rb +67 -0
- data/lib/tabular_format/fn_tab_sentence.rb +169 -0
- data/lib/tabular_format/tab_format_file.rb +91 -0
- data/lib/tabular_format/tab_format_named_args.rb +184 -0
- data/lib/tabular_format/tab_format_sentence.rb +119 -0
- data/lib/value_restriction.rb +49 -0
- metadata +131 -0
@@ -0,0 +1,82 @@
|
|
1
|
+
require_relative 'enumerable_bool'
|
2
|
+
require_relative 'enumerable_distribute'
|
3
|
+
require_relative 'subsumed'
|
4
|
+
|
5
|
+
# Extensions for the class Array.
|
6
|
+
# Project-specific convenience methods added to the core Array class.
class Array
  include EnumerableBool
  include EnumerableDistribute
  include Subsumed

  ###
  # interleave N arrays:
  # given arrays [a1... an], [b1,...,bn], ..[z1, ...,zn]
  # return [[a1,b1, .., z1]...,[an,bn, .., zn]]
  #
  # if one array is longer than the other,
  # e.g. [a1...an], [b1,...,bm] with n> m
  # the result is
  # [[a1,b1],...[am, bm], [am+1, nil], ..., [an, nil]]
  # and analogously for m>n
  #
  # Called without arguments, every element is wrapped in a
  # one-element array.
  #
  # @param arrays [Array<Array>] the arrays to interleave with self
  # @return [Array<Array>]
  def interleave(*arrays)
    # Splatting the lengths avoids comparing an Integer with nil
    # when no other array has been passed.
    len = [length, *arrays.map(&:length)].max
    Array.new(len) { |ix| [at(ix)] + arrays.map { |a| a[ix] } }
  end

  # NOTE: this class used to redefine #count(element). The built-in
  # Array#count(obj) already counts the elements that are == obj, and
  # the redefinition broke the zero-argument and block forms of #count,
  # so the override has been dropped in favour of the core method.

  ###
  # count the number of occurrences of
  # elements from list in this array
  #
  # @param list [#include?] the values to look for
  # @return [Integer]
  def counts(list)
    count { |my_element| list.include?(my_element) }
  end

  ###
  # draw a random sample of size N
  # from this array
  #
  # Without an argument a single random element is returned (nil for
  # an empty array), like the core method this one replaces.
  #
  # @param size [Integer, nil] number of elements to draw
  # @return [Array, Object, nil] nil for a negative size, [] for size 0,
  #   a copy of the whole array (original order) if size >= length,
  #   otherwise size elements drawn without replacement
  def sample(size = nil)
    if size.nil?
      return empty? ? nil : at(rand(length))
    end

    return nil if size < 0
    return [] if size == 0
    return clone if size >= length

    # Shuffling positions (rather than ranking element values) keeps
    # duplicate elements independent of each other.
    shuffle.first(size)
  end

  ###
  # map over the array, passing element and index to the block
  #
  # @yield [element, index]
  # @return [Array] the block results
  def map_with_index(&block)
    each_with_index.map { |x, index| block.call(x, index) }
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
################
|
2
|
+
# Boolean and arithmetic folds for any class that provides #each.
module EnumerableBool
  ###
  # And_(x \in X) block(x)
  #
  # Stops at the first element for which the block is falsy.
  #
  # @yield [x] each element in turn
  # @return [Boolean] false as soon as the block fails, true otherwise
  #   (also true when there are no elements)
  def big_and(&block)
    each { |item| return false unless block.call(item) }
    true
  end

  ###
  # Sum_(x \in X) block(x)
  #
  # Without a block the elements themselves are added up.
  #
  # @param init [Object] start value of the sum
  # @yield [x] each element in turn
  # @return [Object] init plus all block results
  def big_sum(init = 0, &block)
    block = proc { |x| x } unless block_given?
    total = init
    each do |item|
      total += block.call(item)
    end
    total
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
################
|
2
|
+
# Given an enumerable, distribute its items into two bins (arrays)
|
3
|
+
# depending on whether the block returns true
|
4
|
+
# Splits the items of any class providing #each into two bins.
module EnumerableDistribute
  # Sort every item into one of two arrays, depending on the block.
  #
  # @yield [x] each element in turn
  # @return [Array(Array, Array)] first the items for which the block
  #   was truthy, then the remaining ones; order is preserved
  def distribute(&block)
    accepted = []
    rejected = []
    each do |item|
      bin = block.call(item) ? accepted : rejected
      bin << item
    end

    [accepted, rejected]
  end
end
|
@@ -0,0 +1,131 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
# Extensions for the class File.
|
4
|
+
# Path helpers added to the core File class.
class File
  ########
  # Build a directory path from 'pieces' and make sure that directory
  # exists, creating it (including missing parents) if it does not.
  #
  # The pieces are joined with "/" where necessary.
  #
  # @param pieces [Array<String>] strings to be pieced together
  # @return [String] the expanded directory path, ending in "/"
  # @raise [RuntimeError] if the directory is missing or inaccessible
  #   afterwards
  def self.new_dir(*pieces)
    dir_path = File.make_path(pieces, true).first

    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
    # verify that the directory really is there and usable
    File.existing_dir(dir_path)

    dir_path
  end

  ########
  # Like new_dir, but the last piece is a file name: the directory part
  # is created if necessary, the file itself is not touched.
  #
  # @param pieces [Array<String>] directory pieces followed by a file name
  # @return [String] the expanded path of the file
  # @raise [RuntimeError] if the directory is missing or inaccessible
  #   afterwards
  def self.new_filename(*pieces)
    dir_path, whole_path = File.make_path(pieces, false)

    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
    # verify that the directory really is there and usable
    File.existing_dir(dir_path)

    whole_path
  end

  #####
  # Build a directory path from 'pieces' and report failure if it
  # does not exist.
  #
  # The pieces are joined with "/" where necessary.
  #
  # @param pieces [Array<String>] strings to be pieced together
  # @return [String] the expanded directory path, ending in "/"
  # @raise [RuntimeError] if the path is not an existing directory or
  #   is not executable (searchable) for the current process
  def self.existing_dir(*pieces)
    dir_path = File.make_path(pieces, true).first

    if !File.exist?(dir_path) || !File.directory?(dir_path)
      raise "Error: Directory #{dir_path} doesn't exist."
    end
    raise "Error: Cannot access directory #{dir_path}." unless File.executable?(dir_path)

    dir_path
  end

  # @note AB: This method is not used anywhere.
  # It is kept here, disabled, for reference only.
  #
  #   ####
  #   # like existing_dir, but last bit is filename
  #   def self.existing_filename(*pieces)
  #     dir_path, whole_path = File.make_path(pieces, false)
  #
  #     unless File.exist?(dir_path) && File.directory?(dir_path)
  #       $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting"
  #       exit(1)
  #     end
  #
  #     unless File.executable?(dir_path)
  #       $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
  #       exit(1)
  #     end
  #
  #     whole_path
  #   end

  ####
  # Piece together the strings in 'pieces' to make a path, appending
  # "/" to every directory piece that does not end in one already.
  #
  # A plain string is treated as a one-piece path. With is_dir false
  # the last piece is taken to be a file name and is appended to the
  # directory part as it is. A nil among the directory pieces is
  # reported on $stderr and skipped.
  #
  # The directory part is run through File.expand_path, so for example
  # an initial ~ is expanded to the setting of $HOME.
  #
  # @param pieces [String, Array]
  # @param is_dir [True, False, Nil] whether the last piece is a directory
  # @return [Array(String, String)] (directory_part, whole_path); both are
  #   the same string when is_dir is true
  # @api private
  def self.make_path(pieces, is_dir = false)
    pieces = [pieces] if pieces.is_a?(String)

    # leave out the last piece when it is a file name
    last_dir_index = is_dir ? -1 : -2

    dir = pieces[0..last_dir_index].each_with_object('') do |piece, joined|
      if piece.nil?
        # whoops, nil entry in name of path!
        $stderr.puts "File.make_path ERROR: nil for piece of path name."
        next
      end

      joined << piece
      joined << "/" unless piece =~ /\/$/
    end

    dir = File.expand_path(dir)

    # expand_path removes the final "/" again
    dir += "/" unless dir =~ %r{/$}

    whole_path = is_dir ? dir : dir + pieces[-1]
    [dir, whole_path]
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
###
|
2
|
+
# extend Array class by subsumption
|
3
|
+
# Multiset containment test for array-like receivers.
module Subsumed
  # Check whether every element of self can be paired with a distinct,
  # equal element of array2 (so duplicates in self need as many
  # duplicates in array2). array2 itself is left unchanged.
  #
  # @note This method is used by [RosyConfusability]
  # @param array2 [Array] the array that should cover self
  # @return [Boolean]
  def subsumed_by?(array2)
    # work on a copy, matched elements get crossed off
    unmatched = array2.clone

    each do |el|
      position = unmatched.index { |candidate| el == candidate }
      return false if position.nil?

      unmatched.delete_at(position)
    end

    true
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module STXML
  # A corpus file parsed into one Nokogiri XML document.
  class Corpus
    # The parsed document.
    attr_reader :doc

    # Parse the given file; the file handle is closed again once
    # parsing is finished.
    #
    # @param filename [String] path of the XML file
    def initialize(filename)
      @doc = File.open(filename) { |file| Nokogiri::XML(file) }
    end

    # Iterate over all <s> elements of the document.
    #
    # @yield [s] each node matching the XPath '//s'
    # @return [Enumerator] if no block is given
    def each_sentence
      if block_given?
        nodes = @doc.xpath('//s')
        nodes.each { |sentence| yield sentence }
      else
        enum_for(:each_sentence)
      end
    end

    # @return the result of the XPath query '//s' on the document
    def sentences
      @doc.xpath('//s')
    end
  end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require_relative 'sem_node'
|
2
|
+
require_relative 'reg_xml'
|
3
|
+
|
4
|
+
module STXML
  #############
  # class FeNode
  #
  # A SemNode subclass for nodes that describe a frame element
  # or a target.
  #
  # additional/changed methods:
  #----------------------------
  #
  # name  returns the name of the frame element, or "target"
  #
  # add_child, remove_child
  class FeNode < SemNode
    ###
    # @param name_or_xml [RegXML, String] either a RegXML object or the
    #   name of the FE as a string ("target" for a target node)
    # @param id_if_name [#to_s] ID to use if we just got the name of the FE
    # @raise [RuntimeError] if name_or_xml is neither a String nor a RegXML
    def initialize(name_or_xml, id_if_name = nil)
      case name_or_xml
      when String
        # only a name: build the XML element for the node ourselves
        is_target = (name_or_xml == "target")
        markup =
          if is_target
            "<target id=\'#{xml_secure_val(id_if_name.to_s)}\'/>"
          else
            "<fe name=\'#{xml_secure_val(name_or_xml)}\' id=\'#{xml_secure_val(id_if_name.to_s)}\'/>"
          end
        super(RegXML.new(markup))
        @i_am_target = is_target
      when RegXML
        super(name_or_xml)
        @i_am_target = (name_or_xml.name == "target")
      else
        raise "Shouldn't be here: #{name_or_xml.class}."
      end

      # child_attr: keep additional attributes of <fenode> elements,
      # if there are any
      # child_attr: hash syn_node_id(string) -> attributes(hash)
      @child_attr = {}
    end

    ###
    # @return 'target' for a target node, otherwise the value of the
    #   "name" attribute
    def name
      return 'target' if @i_am_target

      get_attribute("name")
    end

    ###
    # Add a child, remembering any extra attributes of its <fenode>
    # element.
    #
    # @param syn_node the child node
    # @param xml_obj the <fenode> XML element for this child, if available
    def add_child(syn_node, xml_obj = nil)
      if xml_obj
        # we've been given the fenode XML element:
        # keep all its attributes except the idref (we get that from
        # the child's ID directly)
        extra = xml_obj.attributes
        extra.delete("idref")
        @child_attr[syn_node.id] = extra unless extra.empty?
      end

      super(syn_node, nil, "pointer_insteadof_edge" => true)
    end

    ###
    # Remove a child. The varhash argument is accepted but not used.
    def remove_child(syn_node, varhash={})
      super(syn_node, nil, "pointer_insteadof_edge" => true)
    end

    protected

    # @return [String] one <fenode> element per child, each on a line
    #   of its own, including the extra attributes stored for the child
    def get_xml_ofchildren
      lines = children.map do |child|
        opening = "<fenode idref=\'#{xml_secure_val(child.id)}\'"
        extra = @child_attr[child.id]

        if extra
          attributes = extra.to_a.map do |attr, val|
            " #{attr}=\'#{xml_secure_val(val)}\'"
          end
          opening + attributes.join + "/>\n"
        else
          opening + "/>\n"
        end
      end

      return lines.join
    end
  end
end
|
@@ -0,0 +1,214 @@
|
|
1
|
+
# Alexander Koller 2003
|
2
|
+
# extended Katrin Erk June 2003
|
3
|
+
#
|
4
|
+
# Classes that return a list of sentence DOMs, from various sources
|
5
|
+
#
|
6
|
+
# Each class in this file defines the following methods:
|
7
|
+
#
|
8
|
+
# initialize(...) "..." depends on the class
|
9
|
+
# extractDOMs() return list of all s nodes as DOM objects
|
10
|
+
# each_s() iterate over s nodes; may take less memory
|
11
|
+
|
12
|
+
require "rexml/document"
|
13
|
+
|
14
|
+
module STXML

  # Reads a corpus file piece by piece: the text before the first
  # <s ...> tag (head), the <s>...</s> elements one at a time, and the
  # text after the last </s> tag (tail).
  #
  # The file stays open until #close is called.
  class FilePartsParser
    # <@file> = File object for the corpus
    # <@head> = string up to the first <s> tag
    # <@tail> = string after the last </s> tag (nil until it has been read)
    # <@rest> = string starting with the latest <s> tag (complete this to
    #           a <s>...</s> structure by reading up to next </s> tag)
    # <@readCompletely> = true once nothing is left to read in the file

    attr_reader :head, :tail

    # Open the file and read it up to the first <s ...> tag.
    #
    # If a closing </body> tag is met first, the corpus is empty: the
    # remainder of the file becomes the tail and the parser is finished.
    #
    # @param filename [String] path of the corpus file
    def initialize(filename)
      @file = File.new(filename)
      @readCompletely = false
      # read stuff into @head and initialize @rest
      @head = ''
      @rest = nil
      @tail = nil

      @file.each_line do |line|
        # Non-greedy prefix: stop at the FIRST <s ...> tag of the line,
        # so that several sentences on one line are not swallowed
        # by the head.
        if (found = /(.*?)(<s\s.*)/.match(line))
          @head << found[1]
          @rest = found[2]
          return
        elsif (found = %r{^(.*)(</body[\s>].*)$}.match(line))
          # empty corpus
          @head << found[1]
          @tail = found[2]
          @file.each_line { |remaining| @tail << "\n" + remaining }
          break
        else
          @head << line
        end
      end

      # end of file reached without finding a sentence
      @readCompletely = true
    end

    # Close the underlying file.
    def close
      @file.close
    end

    # Parse all remaining sentences.
    #
    # @note AB: This method isn't used anywhere.
    # @return [Array<REXML::Element>] the root elements of the sentences
    def extractDOMs
      allDOMs = []

      process_s! do |dom|
        allDOMs.push(dom)
        # process_s! expects an element back; hand it a dummy
        REXML::Element.new("x")
      end

      allDOMs
    end

    # Yield the parsed root element of every remaining sentence.
    #
    # @note AB: This method isn't used anywhere.
    # @yield [dom] the REXML root element of a sentence
    def each_s
      process_s! do |dom|
        yield(dom)
        # process_s! expects an element back; hand it a dummy
        REXML::Element.new("x")
      end
    end

    # Parse every remaining sentence, pass its root element to the
    # block and collect the serialised block results.
    #
    # The block is given an element as its argument and is expected to
    # return a (changed) element. Like scan_s, this consumes the
    # remaining sentences of the file.
    #
    # @note This method isn't used anywhere.
    # @yield [elt] the REXML root element of a sentence
    # @return [String, nil] the concatenated serialised results, or nil
    #   if the file had been read completely before the call
    def process_s!
      return if @readCompletely

      ret = ''
      scan_s do |element|
        # Process the <s> ... </s> element.
        # REXML is referenced explicitly: this class does not include
        # the REXML module, so bare Document/Element would not resolve.
        doc = REXML::Document.new(element)
        changedElt = yield(doc.root)

        changedEltAsString = ''
        changedElt.write(changedEltAsString, 0)
        ret << changedEltAsString
      end

      ret
    end

    # KE 12.6.03: scan_s :
    # doesn't parse a sentence before yielding it
    # doesn't allow for any changes
    # but otherwise the same as process_s!
    #
    # Once the end of the file is reached, the unprocessed remainder
    # becomes the tail and the parser is marked as finished.
    #
    # @yield [element] a String with one XML encoded sentence
    # @return [true, nil] nil if the file had been read completely
    #   before the call, true once the end of the file has been reached
    def scan_s
      return if @readCompletely

      begin
        while true
          # Invariant: At this point, @rest always starts with an
          # unseen <s> tag.

          # First, we continue reading until we find the closing </s>.
          # The match is non-greedy so that only ONE sentence is cut
          # off even if @rest holds several of them.
          # No exception should occur in this loop if we're parsing
          # a valid XML document.
          until (found = /\A(.*?<\/s>)(.*)/m.match(@rest))
            @rest << @file.readline
          end

          element = found[1]
          @rest = found[2]

          yield(element) # element not parsed!

          # Read on up to the next <s>, keeping everything from the
          # first <s ...> tag onwards
          until (found = /<s\s.*/m.match(@rest))
            @rest << @file.readline
          end

          @rest = found[0]
        end
      rescue EOFError
        @tail = @rest
        @readCompletely = true
      end
    end

    # KE 5.11.03: get_rest: read all of the file not processed up to this point
    # and return it as a string
    #
    # @return [String, nil] the unprocessed remainder; nil if no <s ...>
    #   tag was ever found in the file
    def get_rest
      begin
        loop { @rest << @file.readline }
      rescue EOFError
        @readCompletely = true
      end

      @rest
    end
  end
end
|
163
|
+
|
164
|
+
# This part seems to be obsolete, delete it!
|
165
|
+
=begin
|
166
|
+
|
167
|
+
class FileParser
|
168
|
+
|
169
|
+
include REXML
|
170
|
+
|
171
|
+
def initialize(filename)
|
172
|
+
@file = File.new(filename)
|
173
|
+
@doc = nil
|
174
|
+
end
|
175
|
+
|
176
|
+
# returns an array of DOMs for the sentences
|
177
|
+
def extractDOMs()
|
178
|
+
ensureParsedDocument()
|
179
|
+
@doc.get_elements("/corpus/body/s")
|
180
|
+
end
|
181
|
+
|
182
|
+
# Iterates over all sentence nodes. This may be more memory
|
183
|
+
# efficient than using extractDOMs(), but isn't in this case.
|
184
|
+
def each_s()
|
185
|
+
extractDOMs().each { |dom| yield(dom) }
|
186
|
+
end
|
187
|
+
|
188
|
+
# Iterates over all sentence nodes. The block passed to this
|
189
|
+
# method should return a DOM object as a value. After the iteration
|
190
|
+
# has been completed, the contents of /corpus/body are then replaced
|
191
|
+
# by the list of these results.
|
192
|
+
# At the moment, this changes the FileParser object. This should
|
193
|
+
# probably change in the future, but I don't want to mess with
|
194
|
+
# cloning now.
|
195
|
+
def process_s!()
|
196
|
+
newBody = Element.new('body')
|
197
|
+
each_s { |dom| newBody.add_element( yield(dom) ) }
|
198
|
+
|
199
|
+
@doc.delete_element("/corpus/body")
|
200
|
+
@doc.elements["corpus"].add_element(newBody)
|
201
|
+
|
202
|
+
return @doc
|
203
|
+
end
|
204
|
+
|
205
|
+
private
|
206
|
+
|
207
|
+
def ensureParsedDocument()
|
208
|
+
if @doc == nil then
|
209
|
+
@doc = Document.new(@file)
|
210
|
+
end
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
214
|
+
=end
|