shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,82 @@
1
+ require_relative 'enumerable_bool'
2
+ require_relative 'enumerable_distribute'
3
+ require_relative 'subsumed'
4
+
5
+ # Extensions for the class Array.
6
class Array
  include EnumerableBool
  include EnumerableDistribute
  include Subsumed

  ###
  # Interleave this array with N other arrays:
  # given arrays [a1, ..., an], [b1, ..., bn], ..., [z1, ..., zn]
  # return [[a1, b1, ..., z1], ..., [an, bn, ..., zn]].
  #
  # If the arrays differ in length, the shorter ones are padded with nil,
  # e.g. [a1, ..., an] and [b1, ..., bm] with n > m give
  # [[a1, b1], ..., [am, bm], [am+1, nil], ..., [an, nil]]
  # and analogously for m > n.
  #
  # Calling it without arguments wraps every element in a one-element array.
  #
  # @param arrays [Array<Array>] the arrays to interleave with this one
  # @return [Array<Array>] one row per index position
  def interleave(*arrays)
    # Splat the other lengths so that an empty +arrays+ list does not put
    # a nil into the comparison.
    len = [length, *arrays.map(&:length)].max
    Array.new(len) { |ix| [at(ix)] + arrays.map { |a| a[ix] } }
  end

  # NOTE: an override of #count(element) used to live here. It shadowed the
  # core Array#count and thereby broke its zero-argument and block forms.
  # Core Array#count(element) already counts elements that are == to its
  # argument, so no override is needed.

  ###
  # Count the elements of this array that are also contained in +list+.
  #
  # @param list [#include?] collection of elements to look for
  # @return [Integer]
  def counts(list)
    count { |my_element| list.include?(my_element) }
  end

  ###
  # Draw a random sample from this array.
  #
  # Without a size this behaves exactly like the core Array#sample and
  # returns a single random element. With a size it keeps the historical
  # contract of this extension:
  # * size < 0        -> nil
  # * size == 0       -> []
  # * size >= length  -> a copy of the whole array in its original order
  # * otherwise       -> +size+ elements drawn without replacement
  #
  # @param size [Integer, nil] number of elements to draw
  # @param random [#rand] random number generator, as for the core method
  # @return [Object, Array, nil]
  def sample(size = nil, random: Random)
    return super(random: random) if size.nil?
    return nil if size < 0
    return [] if size.zero?
    return clone if size >= length

    # The core implementation samples by position, so duplicate elements
    # are treated as distinct candidates.
    super(size, random: random)
  end

  ###
  # Map over the elements together with their index.
  #
  # @yieldparam x [Object] the element
  # @yieldparam index [Integer] its position
  # @return [Array] the block results
  def map_with_index(&block)
    each_with_index.map { |x, index| block.call(x, index) }
  end
end
@@ -0,0 +1,24 @@
1
+ ################
2
module EnumerableBool
  ###
  # Big conjunction: And_(x \in X) block(x).
  # Stops at the first element for which the block is falsy.
  #
  # @return [Boolean] true for an empty collection
  def big_and(&block)
    each { |x| return false unless block.call(x) }
    true
  end

  ###
  # Big sum: Sum_(x \in X) block(x), starting from +init+.
  # Without a block the elements themselves are summed up.
  #
  # @param init [Object] start value, combined with the block results via +
  # @return [Object] the accumulated value
  def big_sum(init = 0, &block)
    transform = block || proc { |x| x }
    total = init
    each do |x|
      total += transform.call(x)
    end
    total
  end
end
@@ -0,0 +1,18 @@
1
+ ################
2
+ # Given an enumerable, distribute its items into two bins (arrays)
3
+ # depending on whether the block returns true
4
module EnumerableDistribute
  # Split the items into two arrays according to the block's verdict.
  #
  # @yieldparam x [Object] an item of the collection
  # @return [Array(Array, Array)] first the items for which the block was
  #   truthy, then all the others; both keep the iteration order
  def distribute(&block)
    accepted = []
    rejected = []
    each do |x|
      bin = block.call(x) ? accepted : rejected
      bin << x
    end

    [accepted, rejected]
  end
end
@@ -0,0 +1,131 @@
1
+ require 'fileutils'
2
+
3
+ # Extensions for the class File.
4
class File
  ########
  # Make sure that a directory exists, creating it (and its parents)
  # if necessary.
  #
  # The strings in +pieces+ are joined into one path, with "/" appended
  # to every piece that does not already end in it.
  #
  # @param pieces [Array<String>] path components
  # @return [String] the expanded directory path, ending in "/"
  # @raise [RuntimeError] if the directory is missing or inaccessible afterwards
  def self.new_dir(*pieces)
    dir_path = File.make_path(pieces, true).first

    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
    # verify that creating the directory actually worked
    File.existing_dir(dir_path)

    dir_path
  end

  ########
  # Same as new_dir, but the last piece is a file name: only the
  # directory part is created.
  #
  # @param pieces [Array<String>] path components, the last one a file name
  # @return [String] the expanded path of the file
  # @raise [RuntimeError] if the directory is missing or inaccessible afterwards
  def self.new_filename(*pieces)
    dir_path, whole_path = File.make_path(pieces, false)

    FileUtils.mkdir_p dir_path unless File.exist?(dir_path)
    # verify that creating the directory actually worked
    File.existing_dir(dir_path)

    whole_path
  end

  #####
  # Check that a directory exists and report failure if it does not.
  #
  # The strings in +pieces+ are joined into one path, with "/" appended
  # to every piece that does not already end in it.
  #
  # @param pieces [Array<String>] path components
  # @return [String] the expanded directory path, ending in "/"
  # @raise [RuntimeError] if the path is not a directory or is not accessible
  def self.existing_dir(*pieces)
    dir_path = File.make_path(pieces, true).first

    # File.directory? is false for paths that do not exist at all
    raise "Error: Directory #{dir_path} doesn't exist." unless File.directory?(dir_path)
    raise "Error: Cannot access directory #{dir_path}." unless File.executable?(dir_path)

    dir_path
  end

  # @note AB: This method is not used anywhere.
  #
  # ####
  # # like existing_dir, but last bit is filename
  # def self.existing_filename(*pieces)
  #   dir_path, whole_path = File.make_path(pieces, false)
  #
  #   unless File.exist?(dir_path) && File.directory?(dir_path)
  #     $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting"
  #     exit(1)
  #   end
  #
  #   unless File.executable?(dir_path)
  #     $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
  #     exit(1)
  #   end
  #
  #   whole_path
  # end

  ####
  # Join the strings in +pieces+ into a path, appending "/" to all but
  # the last piece where it is missing.
  #
  # A plain string is treated as a one-piece path. If +is_dir+ is true,
  # the last piece is a directory as well and also gets a trailing "/".
  # A nil piece is reported on stderr and skipped.
  #
  # The directory part is expanded with File.expand_path, so for example
  # an initial ~ becomes the setting of $HOME.
  #
  # @param pieces [String, Array]
  # @param is_dir [True, False, Nil]
  # @return [Array(String, String)] pair (directory_part, whole_path)
  # @api private
  def self.make_path(pieces, is_dir = false)
    pieces = [pieces] if pieces.is_a?(String)

    # when the last piece is a file name it is not part of the directory
    dir_pieces = is_dir ? pieces[0..-1] : pieces[0..-2]

    dir = dir_pieces.each_with_object('') do |piece, joined|
      if piece.nil?
        # whoops, nil entry in name of path!
        $stderr.puts "File.make_path ERROR: nil for piece of path name."
        next
      end

      joined << piece
      joined << "/" unless piece =~ /\/$/
    end

    dir = File.expand_path(dir)
    # expand_path strips the final "/" again
    dir += "/" unless dir =~ %r{/$}

    return [dir, dir] if is_dir

    [dir, dir + pieces[-1]]
  end
end
@@ -0,0 +1,24 @@
1
+ ###
2
+ # extend Array class by subsumption
3
module Subsumed
  # Multiset inclusion: every element of the receiver must be matched
  # by its own, distinct element of +array2+ (compared with ==).
  # +array2+ itself is left untouched.
  #
  # @note This method is used by [RosyConfusability]
  # @param array2 [Array] the array that is checked for subsuming this one
  # @return [Boolean]
  def subsumed_by?(array2)
    unmatched = array2.clone

    each do |el|
      position = unmatched.index { |candidate| el == candidate }
      return false if position.nil?

      # each element of array2 may be used for one match only
      unmatched.delete_at(position)
    end

    true
  end
end
@@ -0,0 +1,4 @@
1
+ require_relative 'monkey_patching/file'
2
+ require_relative 'monkey_patching/array'
3
+ require_relative 'monkey_patching/enumerable_bool'
4
+ require_relative 'monkey_patching/enumerable_distribute'
@@ -0,0 +1,24 @@
1
+ require 'nokogiri'
2
+
3
module STXML
  # An XML corpus file parsed with Nokogiri, with access to its <s> elements.
  class Corpus
    # The parsed document as returned by Nokogiri::XML.
    attr_reader :doc

    # @param filename [String] path of the XML file to parse
    def initialize(filename)
      @doc = File.open(filename) { |io| Nokogiri::XML(io) }
    end

    # Iterate over all <s> elements of the document.
    #
    # @yieldparam s an element matched by the XPath query '//s'
    # @return [Enumerator] if no block is given
    def each_sentence
      if block_given?
        @doc.xpath('//s').each { |sentence| yield sentence }
      else
        enum_for(:each_sentence)
      end
    end

    # @return the result of the XPath query '//s' on the document
    def sentences
      @doc.xpath('//s')
    end
  end
end
@@ -0,0 +1,98 @@
1
+ require_relative 'sem_node'
2
+ require_relative 'reg_xml'
3
+
4
module STXML
  #############
  # class FeNode
  #
  # Inherits from SemNode and adds behaviour specific to nodes
  # that describe a frame element or a target.
  #
  # additional/changed methods:
  #----------------------------
  #
  # name          returns the name of the frame element, or "target"
  #
  # add_child, remove_child
  class FeNode < SemNode
    ###
    # @param name_or_xml [RegXML, String] either a RegXML object or the
    #   name of the FE as a string ("target" for a target node)
    # @param id_if_name [#to_s] ID to use if we just got the name of the FE
    # @raise [RuntimeError] if name_or_xml is neither a String nor a RegXML
    def initialize(name_or_xml, id_if_name = nil)
      case name_or_xml
      when String
        is_target = (name_or_xml == "target")
        markup =
          if is_target
            "<target id='#{xml_secure_val(id_if_name.to_s)}'/>"
          else
            "<fe name='#{xml_secure_val(name_or_xml)}' id='#{xml_secure_val(id_if_name.to_s)}'/>"
          end
        super(RegXML.new(markup))
        @i_am_target = is_target
      when RegXML
        super(name_or_xml)
        @i_am_target = (name_or_xml.name == "target")
      else
        raise "Shouldn't be here: #{name_or_xml.class}."
      end

      # child_attr: keeps additional attributes of <fenode> elements,
      # if there are any
      # child_attr: hash syn_node_id(string) -> attributes(hash)
      @child_attr = {}
    end

    ###
    # @return the value of the "name" attribute, or 'target' for a target node
    def name
      return 'target' if @i_am_target

      get_attribute("name")
    end

    ###
    # Attach a syntactic node to this node.
    #
    # @param syn_node the node to attach
    # @param xml_obj the <fenode> XML element, if available; its attributes
    #   other than "idref" are remembered and written out again later
    def add_child(syn_node, xml_obj = nil)
      if xml_obj
        # The idref is dropped because it is taken from the child's ID
        # directly; everything else on the <fenode> element is kept.
        extra = xml_obj.attributes
        extra.delete("idref")
        @child_attr[syn_node.id] = extra unless extra.empty?
      end

      super(syn_node, nil, "pointer_insteadof_edge" => true)
    end

    ###
    # Detach a syntactic node from this node.
    # NOTE(review): the entry for syn_node in @child_attr is not removed
    # here -- confirm whether stale attributes are intended to survive.
    #
    # @param syn_node the node to detach
    # @param varhash unused
    def remove_child(syn_node, varhash={})
      super(syn_node, nil, "pointer_insteadof_edge" => true)
    end

    protected

    # @return [String] one <fenode> element per child, each followed by a
    #   newline, including the extra attributes remembered by add_child
    def get_xml_ofchildren
      rendered = children.map do |child|
        opening = "<fenode idref='#{xml_secure_val(child.id)}'"
        extra = @child_attr[child.id]
        extra_text =
          if extra
            extra.to_a.map { |attr, val| " #{attr}='#{xml_secure_val(val)}'" }.join
          else
            ''
          end

        "#{opening}#{extra_text}/>\n"
      end

      return rendered.join
    end
  end
end
@@ -0,0 +1,214 @@
1
+ # Alexander Koller 2003
2
+ # extended Katrin Erk June 2003
3
+ #
4
+ # Classes that return a list of sentence DOMs, from various sources
5
+ #
6
+ # Each class in this file defines the following methods:
7
+ #
8
+ # initialize(...) "..." depends on the class
9
+ # extractDOMs() return list of all s nodes as DOM objects
10
+ # each_s() iterate over s nodes; may take less memory
11
+
12
+ require "rexml/document"
13
+
14
module STXML
  # Reads a corpus file sentence by sentence, without loading the whole
  # file into memory.
  #
  # The file is split into three parts: the head (everything before the
  # first <s ...> tag), the sentences, and the tail (everything after the
  # last </s> tag). Text between two sentences is discarded.
  class FilePartsParser
    # <@file> = File object for the corpus
    # <@head> = string up to the first <s> tag
    # <@tail> = string after the last </s> tag
    # <@rest> = string starting with the latest <s> tag (complete this to
    #           a <s>...</s> structure by reading up to next </s> tag)
    # <@readCompletely> = boolean specifying whether there's still something
    #                     left to read in the file

    # Within one line: text before the FIRST "<s " tag, and the rest.
    # The lazy quantifier matters when several sentences share a line.
    FIRST_S_IN_LINE = /(.*?)(<s\s.*)/
    # Within one line: text before the closing body tag, and the rest.
    BODY_END_IN_LINE = /^(.*)(<\/body[\s>].*)$/
    # In the buffer: the first complete sentence, and whatever follows it.
    COMPLETE_S = /\A(.*?<\/s>)(.*)/m
    # In the buffer: everything from the first "<s " tag onwards.
    NEXT_S = /<s\s.*/m
    private_constant :FIRST_S_IN_LINE, :BODY_END_IN_LINE, :COMPLETE_S, :NEXT_S

    attr_reader :head, :tail

    # Open the corpus and read it up to the first sentence.
    # The file stays open; call #close when done.
    #
    # @param filename [String] path of the corpus file
    def initialize(filename)
      @file = File.new(filename)
      @readCompletely = false
      # read stuff into @head and initialize @rest
      @head = ''
      begin
        loop do
          line = @file.readline
          if (found = FIRST_S_IN_LINE.match(line))
            @head << found[1]
            @rest = found[2]
            break
          elsif (found = BODY_END_IN_LINE.match(line))
            # empty corpus: everything from </body> on is the tail
            @head << found[1]
            @tail = found[2]
            # readline raises EOFError at the end of the file, which
            # leaves both loops through the rescue clause below
            loop do
              trailing = @file.readline
              @tail << "\n" << trailing
            end
          else
            @head << line
          end
        end
      rescue EOFError
        @readCompletely = true
      end
    end

    # Close the underlying corpus file.
    def close
      @file.close
    end

    # @note AB: This method isn't used anywhere.
    # @return [Array<REXML::Element>, nil] all remaining sentences as
    #   parsed elements; nil if the file has been read completely already
    def extractDOMs
      allDOMs = []

      process_s! do |dom|
        allDOMs.push(dom)
        # process_s! expects an element back; hand it a cheap dummy
        REXML::Element.new("x")
      end

      allDOMs
    end

    # @note AB: This method isn't used anywhere.
    # @yieldparam dom [REXML::Element] a parsed sentence
    def each_s
      process_s! do |dom|
        yield(dom)
        # process_s! expects an element back; hand it a cheap dummy
        REXML::Element.new("x")
      end
    end

    # Parse each remaining sentence, pass it to the block and collect
    # the serialized form of the elements the block returns.
    #
    # @note This method isn't used anywhere.
    # @yieldparam elt [REXML::Element] a parsed sentence
    # @yieldreturn [REXML::Element] the (possibly changed) sentence
    # @return [String, nil] the serialized sentences; nil if the file
    #   has been read completely already
    def process_s!
      return if @readCompletely

      ret = ''
      scan_s do |element|
        # Process the <s> ... </s> element
        changed_elt = yield(REXML::Document.new(element).root)

        serialized = ''
        changed_elt.write(serialized, 0)
        ret << serialized
      end

      ret
    end

    # KE 12.6.03: scan_s :
    #     doesn't parse a sentence before yielding it
    #     doesn't allow for any changes
    # but otherwise the same as process_s!
    #
    # When the end of the file is reached, whatever followed the last
    # sentence becomes the tail. The return value carries no information.
    #
    # @yieldparam element [String] one XML-encoded sentence, <s ...> to </s>
    def scan_s
      return if @readCompletely

      begin
        while true
          # Invariant: At this point, @rest always starts with an
          # unseen <s> tag.

          # First, we continue reading until we find the closing </s>
          # No exception should occur in this loop if we're parsing
          # a valid XML document.
          until (found = COMPLETE_S.match(@rest))
            @rest << @file.readline
          end

          @rest = found[2]
          yield(found[1]) # element is NOT parsed

          # Read on up to the next <s>
          until (found = NEXT_S.match(@rest))
            @rest << @file.readline
          end

          @rest = found[0]
        end
      rescue EOFError
        @tail = @rest
        @readCompletely = true
      end
    end

    # KE 5.11.03: get_rest: read all of the file not processed up to this point
    # and return it as a string
    #
    # @return [String, nil] the unprocessed remainder; nil if no sentence
    #   was found when the file was opened
    def get_rest
      begin
        loop do
          @rest << @file.readline
        end
      rescue EOFError
        @readCompletely = true
      end

      @rest
    end
  end
end
163
+
164
+ # This part seems to be obsolete, delete it!
165
+ =begin
166
+
167
+ class FileParser
168
+
169
+ include REXML
170
+
171
+ def initialize(filename)
172
+ @file = File.new(filename)
173
+ @doc = nil
174
+ end
175
+
176
+ # returns an array of DOMs for the sentences
177
+ def extractDOMs()
178
+ ensureParsedDocument()
179
+ @doc.get_elements("/corpus/body/s")
180
+ end
181
+
182
+ # Iterates over all sentence nodes. This may be more memory
183
+ # efficient than using extractDOMs(), but isn't in this case.
184
+ def each_s()
185
+ extractDOMs().each { |dom| yield(dom) }
186
+ end
187
+
188
+ # Iterates over all sentence nodes. The block passed to this
189
+ # method should return a DOM object as a value. After the iteration
190
+ # has been completed, the contents of /corpus/body are then replaced
191
+ # by the list of these results.
192
+ # At the moment, this changes the FileParser object. This should
193
+ # probably change in the future, but I don't want to mess with
194
+ # cloning now.
195
+ def process_s!()
196
+ newBody = Element.new('body')
197
+ each_s { |dom| newBody.add_element( yield(dom) ) }
198
+
199
+ @doc.delete_element("/corpus/body")
200
+ @doc.elements["corpus"].add_element(newBody)
201
+
202
+ return @doc
203
+ end
204
+
205
+ private
206
+
207
+ def ensureParsedDocument()
208
+ if @doc == nil then
209
+ @doc = Document.new(@file)
210
+ end
211
+ end
212
+ end
213
+
214
+ =end