shalmaneser-lib 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,82 @@
1
+ require_relative 'enumerable_bool'
2
+ require_relative 'enumerable_distribute'
3
+ require_relative 'subsumed'
4
+
5
+ # Extensions for the class Array.
6
+ class Array
7
+ include EnumerableBool
8
+ include EnumerableDistribute
9
+ include Subsumed
10
+
11
+ ###
12
+ # interleave N arrays:
13
+ # given arrays [a1... an], [b1,...,bn], ..[z1, ...,zn]
14
+ # return [[a1,b1, .., z1]...,[an,bn, .., zn]]
15
+ #
16
+ # if one array is longer than the other,
17
+ # e.g. [a1...an], [b1,...,bm] with n> m
18
+ # the result is
19
+ # [[a1,b1],...[am, bm], [am+1, nil], ..., [an, nil]]
20
+ # and analogously for m>n
21
+ def interleave(*arrays)
22
+ len = [length, arrays.map(&:length).max].max
23
+ (0..len-1).to_a.map do |ix|
24
+ [at(ix)] + arrays.map { |a| a[ix] }
25
+ end
26
+ end
27
+
28
+ ###
29
+ # count the number of occurrences of element in this array
30
+ def count(element)
31
+ num = 0
32
+ each { |my_element|
33
+ if my_element == element
34
+ num += 1
35
+ end
36
+ }
37
+
38
+ num
39
+ end
40
+
41
+ ###
42
+ # count the number of occurrences of
43
+ # elements from list in this array
44
+ def counts(list)
45
+ num = 0
46
+ each { |my_element|
47
+ if list.include? my_element
48
+ num += 1
49
+ end
50
+ }
51
+ return num
52
+ end
53
+
54
+ ###
55
+ # draw a random sample of size N
56
+ # from this array
57
+ def sample(size)
58
+ if size < 0
59
+ return nil
60
+ elsif size == 0
61
+ return []
62
+ elsif size >= length
63
+ return self.clone
64
+ end
65
+
66
+ rank = {}
67
+ each { |my_element|
68
+ rank[my_element] = rand
69
+ }
70
+ return self.sort { |a, b| rank[a] <=> rank[b] }[0..size-1]
71
+ end
72
+
73
+ def map_with_index(&block)
74
+ retv = []
75
+
76
+ each_with_index { |x, index|
77
+ retv << block.call(x, index)
78
+ }
79
+
80
+ return retv
81
+ end
82
+ end
@@ -0,0 +1,24 @@
1
+ ################
2
module EnumerableBool
  ###
  # And_(x \in X) block(x)
  #
  # Stops at the first element for which the block is falsy.
  #
  # @return [Boolean] false as soon as the block returns a falsy value,
  #   true otherwise (in particular for an empty collection)
  def big_and(&block)
    each { |item| return false unless block.call(item) }

    true
  end

  ###
  # Sum_(x \in X) block(x)
  #
  # Adds up the block results with +; without a block the elements
  # themselves are added.
  #
  # @param init [Object] the value the sum starts from
  # @return [Object] init plus all block results (or elements)
  def big_sum(init = 0, &block)
    total = init
    if block_given?
      each { |item| total += block.call(item) }
    else
      each { |item| total += item }
    end

    total
  end
end
@@ -0,0 +1,18 @@
1
+ ################
2
+ # Given an enumerable, distribute its items into two bins (arrays)
3
+ # depending on whether the block returns true
4
module EnumerableDistribute
  # Split the items into those the block accepts and those it rejects,
  # keeping the original order within each bin.
  #
  # @yieldparam x [Object] the current item
  # @return [Array(Array, Array)] [items with truthy block result,
  #   items with falsy block result]
  def distribute(&block)
    accepted = []
    rejected = []
    each { |item| (block.call(item) ? accepted : rejected) << item }

    [accepted, rejected]
  end
end
@@ -0,0 +1,131 @@
1
+ require 'fileutils'
2
+
3
+ # Extensions for the class File.
4
class File
  ########
  # Check whether a given directory exists, and if it doesn't,
  # make sure it is created (including missing parent directories).
  #
  # The strings in 'pieces' are pieced together to make the path,
  # appending "/" to all strings if necessary.
  #
  # @param pieces [Array<String>] strings, to be pieced together
  # @return [String] the expanded directory path, ending in "/"
  # @raise [RuntimeError] if the directory does not exist or cannot be
  #   accessed after the creation attempt
  def self.new_dir(*pieces)
    dir_path, _whole_path = File.make_path(pieces, true)

    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
    # check that all went well in creating the directory
    File.existing_dir(dir_path)

    dir_path
  end

  ########
  # Same as new_dir, but the last piece is a filename: the directory part
  # is created if necessary, the file itself is not touched.
  #
  # @param pieces [Array<String>] directory pieces followed by a filename
  # @return [String] the expanded path of the file
  # @raise [ArgumentError] if the filename piece is missing
  # @raise [RuntimeError] if the directory does not exist or cannot be
  #   accessed after the creation attempt
  def self.new_filename(*pieces)
    dir_path, whole_path = File.make_path(pieces, false)

    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
    # check that all went well in creating the directory
    File.existing_dir(dir_path)

    whole_path
  end

  #####
  # Check whether a given directory exists,
  # and report failure if it does not exist.
  #
  # The strings in 'pieces' are pieced together to make the path,
  # appending "/" to all strings if necessary.
  #
  # @param pieces [Array<String>] strings, to be pieced together
  # @return [String] the expanded directory path, ending in "/"
  # @raise [RuntimeError] if the path is not an existing directory, or if
  #   the directory is not accessible (no execute permission)
  def self.existing_dir(*pieces)
    dir_path, _whole_path = File.make_path(pieces, true)

    # File.directory? is false for missing paths as well as for non-directories.
    raise "Error: Directory #{dir_path} doesn't exist." unless File.directory?(dir_path)
    raise "Error: Cannot access directory #{dir_path}." unless File.executable?(dir_path)

    dir_path
  end

  # @note AB: This method is not used anywhere.
  #
  # ####
  # # like existing_dir, but last bit is filename
  # def self.existing_filename(*pieces)
  #   dir_path, whole_path = File.make_path(pieces, false)
  #
  #   unless File.exist?(dir_path) && File.directory?(dir_path)
  #     $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting"
  #     exit(1)
  #   end
  #
  #   unless File.executable?(dir_path)
  #     $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
  #     exit(1)
  #   end
  #
  #   whole_path
  # end

  ####
  # Piece together the strings in 'pieces' to make a path,
  # appending "/" to all but the last string if necessary.
  #
  # If 'pieces' is already a string, it is taken as a one-piece path.
  #
  # If is_dir is true, "/" is also appended to the last piece, i.e. all
  # pieces are directory names; otherwise the last piece is a filename.
  #
  # The directory part is expanded with File.expand_path: for example,
  # an initial ~ is expanded to the setting of $HOME.
  #
  # nil entries among the directory pieces are reported on $stderr and
  # skipped.
  #
  # @param pieces [String, Array]
  # @param is_dir [True, False, Nil]
  # @return [Array(String, String)] pair of strings
  #   (directory_part, whole_path); both are the same if is_dir is true
  # @raise [ArgumentError] if is_dir is false and the filename piece is
  #   missing or nil
  # @api private
  def self.make_path(pieces, is_dir = false)
    pieces = [pieces] if pieces.is_a?(String)

    if is_dir
      dir_pieces = pieces
      filename = nil
    else
      # everything but the last piece is a directory name
      dir_pieces = pieces[0..-2]
      filename = pieces[-1]
      # Fail with a clear message here instead of with an obscure
      # String + nil error once the path is put together.
      raise ArgumentError, 'File.make_path: no file name given as last piece of path.' if filename.nil?
    end

    dir = ''
    dir_pieces.each do |piece|
      if piece.nil?
        # whoops, nil entry in name of path!
        $stderr.puts "File.make_path ERROR: nil for piece of path name."
        next
      end

      dir << piece
      dir << "/" unless piece =~ %r{/$}
    end

    dir = File.expand_path(dir)

    # expand_path removes the final "/" again
    dir += "/" unless dir =~ %r{/$}

    is_dir ? [dir, dir] : [dir, dir + filename]
  end
end
@@ -0,0 +1,24 @@
1
+ ###
2
+ # extend Array class by subsumption
3
module Subsumed
  # Multiset inclusion test: true if every element of the receiver can be
  # matched (with ==) to a distinct element of array2, so an element that
  # occurs twice in the receiver must occur at least twice in array2.
  # array2 itself is not modified.
  #
  # @note This method is used by [RosyConfusability]
  # @param array2 [Array] the array that should subsume the receiver
  # @return [Boolean]
  def subsumed_by?(array2)
    # work on a copy, since matched elements are crossed off
    remaining = array2.clone

    each do |el|
      pos = remaining.index { |candidate| el == candidate }
      return false if pos.nil?

      remaining.delete_at(pos)
    end

    true
  end
end
@@ -0,0 +1,4 @@
1
+ require_relative 'monkey_patching/file'
2
+ require_relative 'monkey_patching/array'
3
+ require_relative 'monkey_patching/enumerable_bool'
4
+ require_relative 'monkey_patching/enumerable_distribute'
@@ -0,0 +1,24 @@
1
+ require 'nokogiri'
2
+
3
module STXML
  # A corpus file parsed into a Nokogiri XML document, with access to
  # its <s> (sentence) elements.
  class Corpus
    # @return the parsed document, as returned by Nokogiri::XML
    attr_reader :doc

    # Parse the whole file; the file handle is closed again afterwards.
    #
    # @param filename [String] path of the XML file to read
    def initialize(filename)
      @doc = File.open(filename) { |file| Nokogiri::XML(file) }
    end

    # Iterate over all <s> elements of the document, wherever they occur.
    #
    # @yieldparam s an <s> element of the document
    # @return [Enumerator] if no block is given
    def each_sentence
      return enum_for(:each_sentence) unless block_given?

      sentences.each { |sentence| yield sentence }
    end

    # @return the result of the XPath query '//s' on the document
    def sentences
      @doc.xpath('//s')
    end
  end
end
@@ -0,0 +1,98 @@
1
+ require_relative 'sem_node'
2
+ require_relative 'reg_xml'
3
+
4
module STXML
  #############
  # class FeNode
  #
  # inherits from SemNode,
  # adds to it methods specific to nodes
  # that describe a frame element or target
  #
  # additional/changed methods:
  #----------------------------
  #
  # name  returns the name of the frame element, or "target"
  #
  # add_child, remove_child
  class FeNode < SemNode
    ###
    # @param name_or_xml [RegXML, String] either a RegXML object or the
    #   name of the FE as a string ("target" for the target)
    # @param id_if_name [#to_s] ID to use if we just got the name of the FE
    # @raise [RuntimeError] if name_or_xml is neither a String nor a RegXML
    def initialize(name_or_xml, id_if_name = nil)
      case name_or_xml
      when String
        # only a name: build the XML element for the node ourselves
        target = (name_or_xml == "target")
        markup =
          if target
            "<target id='#{xml_secure_val(id_if_name.to_s)}'/>"
          else
            "<fe name='#{xml_secure_val(name_or_xml)}' id='#{xml_secure_val(id_if_name.to_s)}'/>"
          end
        super(RegXML.new(markup))
        @i_am_target = target
      when RegXML
        super(name_or_xml)
        @i_am_target = (name_or_xml.name == "target")
      else
        raise "Shouldn't be here: #{name_or_xml.class}."
      end

      # child_attr: keep additional attributes of <fenode> elements,
      # if there are any
      # child_attr: hash syn_node_id(string) -> attributes(hash)
      @child_attr = {}
    end

    ###
    # @return 'target' for a target node, else the value of the "name"
    #   attribute
    def name
      return 'target' if @i_am_target

      get_attribute("name")
    end

    ###
    # Add a syntactic node as child, as a pointer rather than an edge.
    #
    # @param syn_node the child node; must respond to id
    # @param xml_obj the <fenode> XML element for this child, if available;
    #   its attributes other than idref are remembered and written out
    #   again by get_xml_ofchildren
    def add_child(syn_node, xml_obj = nil)
      if xml_obj
        # we've been given the fenode XML element:
        # keep its attributes, minus the idref (we get that from the
        # child's ID directly)
        extra = xml_obj.attributes
        extra.delete("idref")
        @child_attr[syn_node.id] = extra unless extra.empty?
      end

      super(syn_node, nil, "pointer_insteadof_edge" => true)
    end

    ###
    # Remove a child that was added as a pointer.
    #
    # @param syn_node the child node to remove
    # @param varhash not used by this method
    def remove_child(syn_node, varhash={})
      super(syn_node, nil, "pointer_insteadof_edge" => true)
    end

    protected

    # @return [String] one <fenode idref='...'/> line per child, including
    #   the additional attributes remembered for that child, if any
    def get_xml_ofchildren
      lines = children.map do |child|
        line = "<fenode idref='#{xml_secure_val(child.id)}'"
        extra = @child_attr[child.id]
        if extra
          line += extra.to_a.map { |attr, val| " #{attr}='#{xml_secure_val(val)}'" }.join
        end
        line + "/>\n"
      end

      lines.join
    end
  end
end
@@ -0,0 +1,214 @@
1
+ # Alexander Koller 2003
2
+ # extended Katrin Erk June 2003
3
+ #
4
+ # Classes that return a list of sentence DOMs, from various sources
5
+ #
6
+ # Each class in this file defines the following methods:
7
+ #
8
+ # initialize(...) "..." depends on the class
9
+ # extractDOMs() return list of all s nodes as DOM objects
10
+ # each_s() iterate over s nodes; may take less memory
11
+
12
+ require "rexml/document"
13
+
14
module STXML

  # Reads a corpus file piecewise: everything before the first <s> tag
  # (head), the individual <s>...</s> elements, and whatever follows the
  # last </s> tag (tail). The file is read line by line, on demand.
  class FilePartsParser
    # <@file> = File object for the corpus
    # <@head> = string up to the first <s> tag
    # <@tail> = string after the last </s> tag
    # <@rest> = string starting with the latest <s> tag (complete this to
    #           a <s>...</s> structure by reading up to next </s> tag)
    # <@readCompletely> = boolean that is true once there is nothing
    #           left to read in the file

    # Text up to and including the first </s>, and the text after it.
    # Non-greedy, so that only ONE sentence is split off even if several
    # sentences end on the same line.
    FIRST_SENTENCE = %r{^(.*?</s>)(.*)}m
    # Text from the first <s ...> start tag onwards.
    NEXT_SENTENCE_START = /<s\s.*/m

    attr_reader :head, :tail

    # Open the file and read up to the first <s> tag (or, for a corpus
    # without sentences, to the end of the file).
    #
    # @param filename [String] path of the corpus file
    def initialize(filename)
      @file = File.new(filename)
      @readCompletely = false
      # read stuff into @head and initialize @rest
      @head = ''
      begin
        loop do
          line = @file.readline
          if (found = line.match(/(.*)(<s\s.*)/))
            @head << found[1]
            @rest = found[2]
            break
          elsif (found = line.match(/^(.*)(<\/body[\s>].*)$/))
            # empty corpus: everything from </body> on is the tail
            @head << found[1]
            @tail = found[2]
            # gets returns nil at the end of the file, so the loop ends
            # normally and the two statements after it are reached
            while (tail_line = @file.gets)
              @tail << "\n" + tail_line
            end
            @readCompletely = true
            break
          else
            @head << line
          end
        end
      rescue EOFError
        # no <s> tag and no </body> tag in the whole file
        @readCompletely = true
      end
    end

    # Close the underlying file.
    def close
      @file.close
    end

    # Parse all sentences that have not been read yet.
    #
    # @note AB: This method isn't used anywhere.
    # @return [Array<REXML::Element>] the root elements of the sentences
    def extractDOMs
      allDOMs = []

      process_s! do |dom|
        allDOMs.push(dom)
        # process_s! expects an element back; hand it a dummy
        REXML::Element.new("x")
      end

      allDOMs
    end

    # Iterate over all sentences that have not been read yet.
    #
    # @note AB: This method isn't used anywhere.
    # @yieldparam dom [REXML::Element] the root element of a sentence
    def each_s
      process_s! do |dom|
        yield(dom)
        # process_s! expects an element back; hand it a dummy
        REXML::Element.new("x")
      end
    end

    # Parse each remaining sentence with REXML, hand its root element to
    # the block, and collect the elements returned by the block.
    # Like scan_s, this reads on in the file, so each sentence is
    # processed only once per FilePartsParser object.
    #
    # @note Within this class, this method is only used by extractDOMs
    #   and each_s.
    # @yieldparam elt [REXML::Element] the root element of a sentence
    # @yieldreturn [REXML::Element] the (changed) element to write out
    # @return [String, nil] the concatenated string representations of the
    #   elements returned by the block; nil if the file had been read
    #   completely before
    def process_s!
      return if @readCompletely

      ret = ''
      scan_s do |element|
        # Process the <s> ... </s> element
        elt = REXML::Document.new(element).root
        changed_elt = yield(elt)

        changed_elt_as_string = ''
        changed_elt.write(changed_elt_as_string, 0)
        ret << changed_elt_as_string
      end

      ret
    end

    # KE 12.6.03: scan_s :
    # doesn't parse a sentence before yielding it
    # doesn't allow for any changes
    # but otherwise the same as process_s!
    #
    # When the end of the file is reached, everything after the last
    # sentence is stored in @tail.
    #
    # @yieldparam element [String] A String with one xml encoded sentence.
    def scan_s
      return if @readCompletely

      begin
        # Deliberately not Kernel#loop: that would swallow a StopIteration
        # raised inside the caller's block.
        while true
          # Invariant: At this point, @rest always starts with an
          # unseen <s> tag.

          # First, we continue reading until we find the closing </s>
          # No exception should occur in this loop if we're parsing
          # a valid XML document.
          until (found = FIRST_SENTENCE.match(@rest))
            @rest << @file.readline
          end

          @rest = found[2]

          yield(found[1]) # the element is not parsed!

          # Read on up to the next <s>
          until (found = NEXT_SENTENCE_START.match(@rest))
            @rest << @file.readline
          end

          @rest = found[0]
        end
      rescue EOFError
        @tail = @rest
        @readCompletely = true
      end
    end

    # KE 5.11.03: get_rest: read all of the file not processed up to this point
    # and return it as a string
    #
    # @return [String, nil] the unprocessed text; nil if no <s> tag was
    #   found when the file was opened
    def get_rest
      begin
        loop do
          @rest << @file.readline
        end
      rescue EOFError
        @readCompletely = true
      end

      @rest
    end
  end
end
163
+
164
+ # This part seems to be obsolete, delete it!
165
+ =begin
166
+
167
+ class FileParser
168
+
169
+ include REXML
170
+
171
+ def initialize(filename)
172
+ @file = File.new(filename)
173
+ @doc = nil
174
+ end
175
+
176
+ # returns an array of DOMs for the sentences
177
+ def extractDOMs()
178
+ ensureParsedDocument()
179
+ @doc.get_elements("/corpus/body/s")
180
+ end
181
+
182
+ # Iterates over all sentence nodes. This may be more memory
183
+ # efficient than using extractDOMs(), but isn't in this case.
184
+ def each_s()
185
+ extractDOMs().each { |dom| yield(dom) }
186
+ end
187
+
188
+ # Iterates over all sentence nodes. The block passed to this
189
+ # method should return a DOM object as a value. After the iteration
190
+ # has been completed, the contents of /corpus/body are then replaced
191
+ # by the list of these results.
192
+ # At the moment, this changes the FileParser object. This should
193
+ # probably change in the future, but I don't want to mess with
194
+ # cloning now.
195
+ def process_s!()
196
+ newBody = Element.new('body')
197
+ each_s { |dom| newBody.add_element( yield(dom) ) }
198
+
199
+ @doc.delete_element("/corpus/body")
200
+ @doc.elements["corpus"].add_element(newBody)
201
+
202
+ return @doc
203
+ end
204
+
205
+ private
206
+
207
+ def ensureParsedDocument()
208
+ if @doc == nil then
209
+ @doc = Document.new(@file)
210
+ end
211
+ end
212
+ end
213
+
214
+ =end