shalmaneser-lib 1.2.rc5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/configuration/config_data.rb +457 -0
- data/lib/configuration/config_format_element.rb +210 -0
- data/lib/configuration/configuration_error.rb +15 -0
- data/lib/configuration/external_config_data.rb +56 -0
- data/lib/configuration/frappe_config_data.rb +134 -0
- data/lib/configuration/fred_config_data.rb +199 -0
- data/lib/configuration/rosy_config_data.rb +126 -0
- data/lib/db/db_interface.rb +50 -0
- data/lib/db/db_mysql.rb +141 -0
- data/lib/db/db_sqlite.rb +280 -0
- data/lib/db/db_table.rb +237 -0
- data/lib/db/db_view.rb +416 -0
- data/lib/db/db_wrapper.rb +175 -0
- data/lib/db/select_table_and_columns.rb +10 -0
- data/lib/db/sql_query.rb +243 -0
- data/lib/definitions.rb +19 -0
- data/lib/eval.rb +482 -0
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/external_systems.rb +251 -0
- data/lib/framenet_format/fn_corpus_aset.rb +209 -0
- data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
- data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
- data/lib/framenet_format/fn_database.rb +143 -0
- data/lib/framenet_format/frame_xml_file.rb +104 -0
- data/lib/framenet_format/frame_xml_sentence.rb +411 -0
- data/lib/logging.rb +25 -0
- data/lib/ml/classifier.rb +189 -0
- data/lib/ml/mallet.rb +236 -0
- data/lib/ml/maxent.rb +229 -0
- data/lib/ml/optimize.rb +195 -0
- data/lib/ml/timbl.rb +140 -0
- data/lib/monkey_patching/array.rb +82 -0
- data/lib/monkey_patching/enumerable_bool.rb +24 -0
- data/lib/monkey_patching/enumerable_distribute.rb +18 -0
- data/lib/monkey_patching/file.rb +131 -0
- data/lib/monkey_patching/subsumed.rb +24 -0
- data/lib/ruby_class_extensions.rb +4 -0
- data/lib/salsa_tiger_xml/corpus.rb +24 -0
- data/lib/salsa_tiger_xml/fe_node.rb +98 -0
- data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
- data/lib/salsa_tiger_xml/frame_node.rb +145 -0
- data/lib/salsa_tiger_xml/graph_node.rb +347 -0
- data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
- data/lib/salsa_tiger_xml/sem_node.rb +58 -0
- data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
- data/lib/salsa_tiger_xml/syn_node.rb +169 -0
- data/lib/salsa_tiger_xml/tree_node.rb +59 -0
- data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
- data/lib/salsa_tiger_xml/usp_node.rb +72 -0
- data/lib/salsa_tiger_xml/xml_node.rb +163 -0
- data/lib/shalmaneser/lib.rb +1 -0
- data/lib/tabular_format/fn_tab_format_file.rb +38 -0
- data/lib/tabular_format/fn_tab_frame.rb +67 -0
- data/lib/tabular_format/fn_tab_sentence.rb +169 -0
- data/lib/tabular_format/tab_format_file.rb +91 -0
- data/lib/tabular_format/tab_format_named_args.rb +184 -0
- data/lib/tabular_format/tab_format_sentence.rb +119 -0
- data/lib/value_restriction.rb +49 -0
- metadata +131 -0
@@ -0,0 +1,82 @@
|
|
1
|
+
require_relative 'enumerable_bool'
|
2
|
+
require_relative 'enumerable_distribute'
|
3
|
+
require_relative 'subsumed'
|
4
|
+
|
5
|
+
# Extensions for the class Array.
|
6
|
+
class Array
  include EnumerableBool
  include EnumerableDistribute
  include Subsumed

  ###
  # Interleave this array with N other arrays.
  #
  # Given the receiver [a1, ..., an] and arguments [b1, ..., bn], ...,
  # [z1, ..., zn], returns [[a1, b1, ..., z1], ..., [an, bn, ..., zn]].
  #
  # If the arrays differ in length, the shorter ones are padded with nil,
  # e.g. [a1..an] and [b1..bm] with n > m yield
  # [[a1, b1], ..., [am, bm], [am+1, nil], ..., [an, nil]].
  #
  # @param arrays [Array<Array>] the arrays to interleave with the receiver
  # @return [Array<Array>] one tuple per index position
  def interleave(*arrays)
    # Splat the lengths into one list so that a call without any
    # arguments does not try to compare an Integer with nil.
    len = [length, *arrays.map(&:length)].max
    Array.new(len) { |ix| [at(ix)] + arrays.map { |a| a[ix] } }
  end

  ###
  # Count the number of occurrences of an element in this array.
  #
  # Array#count is a core method that also supports a zero-argument form
  # and a block form. Only the one-argument form is handled here; every
  # other call shape is passed on to the inherited implementation so that
  # code relying on the core behaviour keeps working.
  #
  # @param args [Array] either empty or a single element to look for
  # @return [Integer]
  def count(*args, &block)
    return super unless args.length == 1 && block.nil?

    element = args.first
    num = 0
    each { |my_element| num += 1 if my_element == element }
    num
  end

  ###
  # Count how many elements of this array occur in list.
  #
  # @param list [#include?] the collection of elements to look for
  # @return [Integer]
  def counts(list)
    num = 0
    each { |my_element| num += 1 if list.include?(my_element) }
    num
  end

  ###
  # Draw a random sample from this array.
  #
  # Without an argument a single random element is returned (nil for an
  # empty array), which is what the core Array#sample does.
  #
  # With a size:
  # * a negative size returns nil,
  # * a size of zero returns [],
  # * a size greater than or equal to the array length returns a copy
  #   of the whole array in its original order,
  # * otherwise size elements are drawn without replacement.
  #
  # The draw is position based, so equal elements are treated as distinct
  # entries of the array.
  #
  # @param size [Integer, nil] the number of elements to draw
  # @param random [#rand] the random number generator to use
  # @return [Object, Array, nil]
  def sample(size = nil, random: Random)
    return (empty? ? nil : self[random.rand(length)]) if size.nil?
    return nil if size < 0
    return [] if size == 0
    return clone if size >= length

    shuffle(random: random).first(size)
  end

  ###
  # Like map, but the block receives the element and its index.
  #
  # @return [Array] the block results in array order
  def map_with_index(&block)
    each_with_index.map { |x, index| block.call(x, index) }
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
################
|
2
|
+
module EnumerableBool
  ###
  # Conjunction over all elements: And_(x \in X) block(x).
  #
  # Stops at the first element for which the block returns a falsy value.
  # The including object only needs to provide #each.
  #
  # @return [Boolean] false as soon as one block result is falsy, else true
  def big_and(&block)
    each { |item| return false unless block.call(item) }

    true
  end

  ###
  # Summation over all elements: Sum_(x \in X) block(x).
  #
  # Without a block the elements themselves are added up.
  #
  # @param init [Object] the start value of the sum
  # @return [Object] init plus all (mapped) elements, combined with +
  def big_sum(init = 0, &block)
    mapper = block_given? ? block : proc { |x| x }
    total = init
    each do |item|
      total = total + mapper.call(item)
    end

    total
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
################
|
2
|
+
# Given an enumerable, distribute its items into two bins (arrays)
|
3
|
+
# depending on whether the block returns true
|
4
|
+
module EnumerableDistribute
  # Split the elements into two arrays according to the block result.
  #
  # The including object only needs to provide #each.
  #
  # @return [Array(Array, Array)] first the elements for which the block
  #   returned a truthy value, then all remaining elements
  def distribute(&block)
    accepted = []
    rejected = []
    each do |item|
      target = block.call(item) ? accepted : rejected
      target << item
    end

    [accepted, rejected]
  end
end
|
@@ -0,0 +1,131 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
# Extensions for the class File.
|
4
|
+
class File
  ########
  # Build a directory path from the given pieces and make sure the
  # directory exists, creating it (including parents) if necessary.
  #
  # @param pieces [Array<String>] strings to be pieced together
  # @return [String] the expanded directory path with a trailing "/"
  # @raise [RuntimeError] if the directory is missing or not accessible
  #   after the creation attempt
  def self.new_dir(*pieces)
    dir_path = File.make_path(pieces, true).first

    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
    # check that all went well in creating the directory
    File.existing_dir(dir_path)

    dir_path
  end

  ########
  # Same as new_dir, but the last piece is a filename: only the directory
  # part is created, and the complete path is returned.
  #
  # @param pieces [Array<String>] directory pieces followed by a filename
  # @return [String] the expanded path including the filename
  # @raise [RuntimeError] if the directory is missing or not accessible
  #   after the creation attempt
  def self.new_filename(*pieces)
    paths = File.make_path(pieces, false)
    dir_path = paths[0]
    whole_path = paths[1]

    FileUtils.mkdir_p dir_path unless File.exist?(dir_path)
    # check that all went well in creating the directory
    File.existing_dir(dir_path)

    whole_path
  end

  #####
  # Build a directory path from the given pieces and report a failure
  # if it does not exist, is not a directory or cannot be entered.
  #
  # @param pieces [Array<String>] strings to be pieced together
  # @return [String] the expanded directory path with a trailing "/"
  # @raise [RuntimeError] if the check fails
  def self.existing_dir(*pieces) # strings
    dir_path = File.make_path(pieces, true).first

    if !File.exist?(dir_path) || !File.directory?(dir_path)
      raise "Error: Directory #{dir_path} doesn't exist."
    end
    raise "Error: Cannot access directory #{dir_path}." unless File.executable? dir_path

    dir_path
  end

  # @note AB: This method is not used anywhere.
=begin
  ####
  # like existing_dir, but last bit is filename
  def self.existing_filename(*pieces)
    dir_path, whole_path = File.make_path(pieces, false)

    unless File.exist?(dir_path) && File.directory?(dir_path)
      $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting"
      exit(1)
    end

    unless File.executable?(dir_path)
      $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
      exit(1)
    end

    whole_path
  end
=end

  ####
  # Piece together the strings in 'pieces' to make a path, appending "/"
  # to every directory piece that does not already end in one.
  #
  # If 'pieces' is a String it is taken as a one-piece path.
  #
  # If is_dir is truthy all pieces are directory pieces; otherwise the
  # last piece is a filename and is appended to the directory part as is.
  #
  # The directory part is expanded with File.expand_path (for example, an
  # initial ~ is expanded to the setting of $HOME) and always ends in "/".
  # A nil directory piece is reported on $stderr and skipped.
  #
  # @param pieces [String, Array]
  # @param is_dir [True, False, Nil]
  # @return [Array(String, String)] the pair (directory_part, whole_path)
  # @api private
  def self.make_path(pieces, is_dir = false)
    pieces = [pieces] if pieces.is_a?(String)

    # all pieces for a directory, all but the filename otherwise
    dir_pieces = is_dir ? pieces[0..-1] : pieces[0..-2]

    joined = +''
    dir_pieces.each do |piece|
      if piece.nil?
        # whoops, nil entry in name of path!
        $stderr.puts "File.make_path ERROR: nil for piece of path name."
        next
      end

      joined << piece
      joined << "/" unless piece =~ /\/$/
    end

    expanded = File.expand_path(joined)

    # expand_path removes the final "/" again
    expanded += "/" unless expanded =~ %r{/$}

    return [expanded, expanded] if is_dir

    [expanded, expanded + pieces[-1]]
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
###
|
2
|
+
# extend Array class by subsumption
|
3
|
+
module Subsumed
  # Check whether every element of the receiver can be matched with a
  # distinct, equal element of array2, i.e. whether the receiver is
  # contained in array2 when both are read as multisets.
  #
  # array2 itself is not modified; the matching works on a copy.
  #
  # @note This method is used by [RosyConfusability]
  # @param array2 [Array] the array that may subsume the receiver
  # @return [Boolean]
  def subsumed_by?(array2)
    remaining = array2.clone

    each do |el|
      # position of the first not yet consumed element equal to el
      position = remaining.index { |candidate| el == candidate }
      return false if position.nil?

      # consume the match so that it cannot be used a second time
      remaining.delete_at(position)
    end

    true
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module STXML
  # A corpus file parsed into a Nokogiri XML document.
  class Corpus
    # The document as returned by Nokogiri::XML.
    attr_reader :doc

    # Parse the given file. The file handle is closed again as soon as
    # the document has been built.
    #
    # @param filename [String] path of the XML file to read
    def initialize(filename)
      @doc = File.open(filename) { |io| Nokogiri::XML(io) }
    end

    # Iterate over all <s> elements of the document.
    #
    # @yield [sentence] each node matched by the XPath expression //s
    # @return [Enumerator] if no block is given
    def each_sentence
      return enum_for(:each_sentence) unless block_given?

      @doc.xpath('//s').each { |sentence| yield sentence }
    end

    # @return the result of the XPath query //s on the document
    def sentences
      @doc.xpath('//s')
    end
  end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require_relative 'sem_node'
|
2
|
+
require_relative 'reg_xml'
|
3
|
+
|
4
|
+
module STXML
  #############
  # class FeNode
  #
  # Inherits from SemNode and adds methods specific to nodes that
  # describe a frame element or a target.
  #
  # additional/changed methods:
  #----------------------------
  #
  # name     returns the name of the frame element, or "target"
  #
  # add_child, remove_child
  class FeNode < SemNode
    ###
    # @param name_or_xml [RegXML, String] either a RegXML object or the
    #   name of the FE as a string ("target" for a target node)
    # @param id_if_name [#to_s] ID to use if we just got the name of the FE
    # @raise [RuntimeError] if name_or_xml is neither a String nor a RegXML
    def initialize(name_or_xml, id_if_name = nil)
      case name_or_xml
      when String
        is_target = name_or_xml == "target" ? true : false
        xml_source =
          if is_target
            "<target id=\'#{xml_secure_val(id_if_name.to_s)}\'/>"
          else
            "<fe name=\'#{xml_secure_val(name_or_xml)}\' id=\'#{xml_secure_val(id_if_name.to_s)}\'/>"
          end
        super(RegXML.new(xml_source))
        @i_am_target = is_target
      when RegXML
        super(name_or_xml)

        @i_am_target = name_or_xml.name == "target" ? true : false
      else
        raise "Shouldn't be here: #{name_or_xml.class}."
      end

      # child_attr: keep additional attributes of <fenode> elements,
      # if there are any
      # child_attr: hash syn_node_id(string) -> attributes(hash)
      @child_attr = {}
    end

    ###
    # @return "target" for a target node, otherwise the value of the
    #   "name" attribute
    def name
      return 'target' if @i_am_target

      get_attribute("name")
    end

    ###
    # Add a syntactic node as a child of this node.
    #
    # If the <fenode> XML element is passed along, its attributes other
    # than "idref" are remembered under the ID of the child; the idref
    # itself is taken from the child's ID when the XML is written.
    #
    # @param syn_node the child node; must respond to #id
    # @param xml_obj the <fenode> element, if available; must respond
    #   to #attributes
    def add_child(syn_node, xml_obj = nil)
      if xml_obj
        extra_attributes = xml_obj.attributes
        extra_attributes.delete("idref")
        @child_attr[syn_node.id] = extra_attributes unless extra_attributes.empty?
      end

      super(syn_node, nil, "pointer_insteadof_edge" => true)
    end

    ###
    # Remove a child node. The varhash argument is accepted but not
    # used; the call is always passed on with "pointer_insteadof_edge".
    def remove_child(syn_node, varhash={})
      super(syn_node, nil, "pointer_insteadof_edge" => true)
    end

    protected

    # Build one <fenode> element per child, including the additional
    # attributes remembered by add_child.
    #
    # @return [String] the concatenated XML of all children
    def get_xml_ofchildren
      fragments = children.map do |child|
        if @child_attr[child.id]
          opening = "<fenode idref=\'#{xml_secure_val(child.id)}\'"
          attribute_text = @child_attr[child.id].to_a.map do |attr, val|
            " #{attr}=\'#{xml_secure_val(val)}\'"
          end.join
          opening + attribute_text + "/>\n"
        else
          "<fenode idref=\'#{xml_secure_val(child.id)}\'/>\n"
        end
      end

      return fragments.join
    end
  end
end
|
@@ -0,0 +1,214 @@
|
|
1
|
+
# Alexander Koller 2003
|
2
|
+
# extended Katrin Erk June 2003
|
3
|
+
#
|
4
|
+
# Classes that return a list of sentence DOMs, from various sources
|
5
|
+
#
|
6
|
+
# Each class in this file defines the following methods:
|
7
|
+
#
|
8
|
+
# initialize(...) "..." depends on the class
|
9
|
+
# extractDOMs() return list of all s nodes as DOM objects
|
10
|
+
# each_s() iterate over s nodes; may take less memory
|
11
|
+
|
12
|
+
require "rexml/document"
|
13
|
+
|
14
|
+
module STXML
  # Reads a corpus file piece by piece: the text before the first <s ...>
  # tag (head), the individual <s> ... </s> elements, and whatever follows
  # the last sentence (tail).
  class FilePartsParser
    # <@file> = File object for the corpus
    # <@head> = string up to the first <s> tag
    # <@tail> = string after the last </s> tag; nil until the end of the
    #           file has been reached
    # <@rest> = string starting with the latest <s> tag (complete this to
    #           a <s>...</s> structure by reading up to next </s> tag)
    # <@readCompletely> = boolean specifying whether the file has been
    #           read to its end

    attr_reader :head, :tail

    # Open the file and read up to the first <s ...> tag.
    #
    # If a closing </body> tag shows up first, the corpus is empty: the
    # remainder of the file becomes the tail and the parser is marked as
    # completely read. The same mark is set if the file ends before
    # either tag was seen.
    #
    # @param filename [String] path of the corpus file
    def initialize(filename)
      @file = File.new(filename)
      @readCompletely = false
      # read stuff into @head and initialize @rest
      @head = +''
      @rest = +''
      begin
        loop do
          line = @file.readline
          # Lazy prefix: cut at the FIRST <s ...> tag of the line, so that
          # several sentences on one line do not end up in the head.
          if (match = /(.*?)(<s\s.*)/.match(line))
            @head << match[1]
            @rest = match[2]
            break
          elsif (match = /^(.*)(<\/body[\s>].*)$/.match(line))
            # empty corpus
            @head << match[1]
            @tail = match[2]
            # The pattern dropped the line's own newline; restore it once
            # and append the remaining lines unchanged.
            remainder = @file.read
            @tail << "\n" << remainder unless remainder.empty?
            @readCompletely = true
            break
          else
            @head << line
          end
        end
      rescue EOFError
        @readCompletely = true
      end
    end

    # Close the underlying file.
    def close
      @file.close
    end

    # Parse all remaining sentences and collect their root elements.
    #
    # @note AB: This method isn't used anywhere.
    # @return [Array<REXML::Element>]
    def extractDOMs
      allDOMs = []

      process_s! do |dom|
        allDOMs.push(dom)
        # process_s! expects an element back; hand it a dummy
        REXML::Element.new("x")
      end

      allDOMs
    end

    # Parse all remaining sentences and yield their root elements.
    #
    # @note AB: This method isn't used anywhere.
    # @yield [REXML::Element] the root element of each sentence
    def each_s
      process_s! do |dom|
        yield(dom)
        # process_s! expects an element back; hand it a dummy
        REXML::Element.new("x")
      end
    end

    # Parses every remaining sentence, passes its root element to the
    # block and concatenates the serialised block results.
    #
    # The block is given an element as its argument and is expected to
    # return a (changed) element. Like scan_s, this consumes the rest of
    # the file.
    #
    # @note This method isn't used anywhere.
    # @return [String, nil] the serialised results, or nil if the file
    #   had already been read completely
    def process_s!
      return if @readCompletely

      ret = +''
      scan_s do |element|
        # Process the <s> ... </s> element
        doc = REXML::Document.new(element)
        changed_element = yield(doc.root)

        changed_as_string = +''
        changed_element.write(changed_as_string, 0)
        ret << changed_as_string
      end

      ret
    end

    # KE 12.6.03: scan_s :
    # doesn't parse a sentence before yielding it
    # doesn't allow for any changes
    # but otherwise the same as process_s!
    #
    # Once the end of the file is reached, the unprocessed remainder is
    # stored as the tail.
    #
    # @yield [String] a string with one XML encoded sentence
    def scan_s
      return if @readCompletely

      begin
        loop do
          # Invariant: At this point, @rest always starts with an
          # unseen <s> tag.

          # First, we continue reading until we find the closing </s>.
          # The match is lazy so that exactly one sentence is cut off,
          # even if several sentences share a line.
          until (match = /\A(.*?<\/s>)(.*)/m.match(@rest))
            @rest << @file.readline
          end

          element = match[1]
          @rest = match[2]

          yield(element) # change HERE: element not parsed!

          # Read on up to the next <s>; the text in between is dropped.
          until (match = /<s\s.*/m.match(@rest))
            @rest << @file.readline
          end

          @rest = match[0]
        end
      rescue EOFError
        @tail = @rest
        @readCompletely = true
      end
    end

    # KE 5.11.03: get_rest: read all of the file not processed up to this point
    # and return it as a string
    #
    # @return [String]
    def get_rest
      @rest << @file.read
      @readCompletely = true

      @rest
    end
  end
end
|
163
|
+
|
164
|
+
# This part seems to be obsolete, delete it!
|
165
|
+
=begin
|
166
|
+
|
167
|
+
class FileParser
|
168
|
+
|
169
|
+
include REXML
|
170
|
+
|
171
|
+
def initialize(filename)
|
172
|
+
@file = File.new(filename)
|
173
|
+
@doc = nil
|
174
|
+
end
|
175
|
+
|
176
|
+
# returns an array of DOMs for the sentences
|
177
|
+
def extractDOMs()
|
178
|
+
ensureParsedDocument()
|
179
|
+
@doc.get_elements("/corpus/body/s")
|
180
|
+
end
|
181
|
+
|
182
|
+
# Iterates over all sentence nodes. This may be more memory
|
183
|
+
# efficient than using extractDOMs(), but isn't in this case.
|
184
|
+
def each_s()
|
185
|
+
extractDOMs().each { |dom| yield(dom) }
|
186
|
+
end
|
187
|
+
|
188
|
+
# Iterates over all sentence nodes. The block passed to this
|
189
|
+
# method should return a DOM object as a value. After the iteration
|
190
|
+
# has been completed, the contents of /corpus/body are then replaced
|
191
|
+
# by the list of these results.
|
192
|
+
# At the moment, this changes the FileParser object. This should
|
193
|
+
# probably change in the future, but I don't want to mess with
|
194
|
+
# cloning now.
|
195
|
+
def process_s!()
|
196
|
+
newBody = Element.new('body')
|
197
|
+
each_s { |dom| newBody.add_element( yield(dom) ) }
|
198
|
+
|
199
|
+
@doc.delete_element("/corpus/body")
|
200
|
+
@doc.elements["corpus"].add_element(newBody)
|
201
|
+
|
202
|
+
return @doc
|
203
|
+
end
|
204
|
+
|
205
|
+
private
|
206
|
+
|
207
|
+
def ensureParsedDocument()
|
208
|
+
if @doc == nil then
|
209
|
+
@doc = Document.new(@file)
|
210
|
+
end
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
214
|
+
=end
|