shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,84 @@
1
module STXML
  # sp jul 05 05
  #
  # Static helper methods for SalsaTigerRegXML:
  #
  # - provide header and footer for Salsa/Tiger XML files
  # - escape and unescape HTML entities
  #
  # changed KE nov 05:
  # many methods moved to FrappeHelper
  class SalsaTigerXMLHelper
    # Replacement table used by escape and unescape, as
    # [unescaped, escaped] pairs.
    #
    # Order matters: "&" must be the first pair so that escaping does not
    # re-escape the ampersands introduced by the later pairs, and (because
    # unescape walks the table backwards) "&amp;" is resolved last.
    #
    # The mapping is deliberately lossy: both a double quote and a pair of
    # backticks are written as "&apos;&apos;", and unescape turns
    # "&apos;&apos;" back into a pair of backticks (the last table entry is
    # the first one applied when unescaping).
    #
    # Historical alternatives that mapped quotes to "&quot;" were disabled
    # by earlier maintainers (ines, 09/03/09: "might cause problems for
    # unescape") and are intentionally not part of the table.
    @replacements = [
      ["&", "&amp;"], # must be first for escaping, last for unescaping
      ["<", "&lt;"],
      [">", "&gt;"],
      ["\"", "&apos;&apos;"],
      ["\'", "&apos;"],
      ["\`\`", "&apos;&apos;"]
    ].freeze

    ###
    # get header of SalsaTigerXML files (as string)
    #
    # Returns the XML declaration, the opening <corpus> element, a <head>
    # section with empty frame/wordtag/flag/annotation declarations, and
    # the opening <body> tag. The string ends with a newline.
    def self.get_header
      <<ENDOFHEADER
<?xml version="1.0" encoding="UTF-8"?>
<corpus corpusname="corpus" target="">
<head>
<meta>
<format>NeGra format, version 3</format>
</meta>
<frames xmlns="http://www.clt-st.de/framenet/frame-database">
</frames>
<wordtags xmlns="http://www.clt-st.de/salsa/wordtags">
</wordtags>
<flags>
</flags>
<annotation>
<edgelabel>
</edgelabel>
<secedgelabel>
</secedgelabel>
</annotation>
</head>
<body>
ENDOFHEADER
    end

    ###
    # get footer of SALSATigerXML files (as string)
    #
    # Returns the closing </body> and </corpus> tags matching get_header.
    def self.get_footer
      <<ENDOFFOOTER
</body>
</corpus>
ENDOFFOOTER
    end

    ###
    # Escape a string for representation in XML.
    #
    # string: the text to escape. A mutable string is modified in place
    #         and returned (as before); a frozen string is left untouched
    #         and an escaped copy is returned instead of raising.
    #
    # returns: the escaped string
    def self.escape(string)
      target = string.frozen? ? string.dup : string
      @replacements.each do |unescaped, escaped|
        target.gsub!(unescaped, escaped)
      end

      target
    end

    ###
    # Undo escape: replace the entities from the replacement table by the
    # characters they stand for.
    #
    # string: the text to unescape. A mutable string is modified in place
    #         and returned (as before); a frozen string is left untouched
    #         and an unescaped copy is returned instead of raising.
    #
    # returns: the unescaped string
    def self.unescape(string)
      target = string.frozen? ? string.dup : string
      # reverse replacements to replace &amp; last
      @replacements.reverse_each do |unescaped, escaped|
        target.gsub!(escaped, unescaped)
      end

      target
    end
  end
end
@@ -0,0 +1,161 @@
1
+ require_relative 'xml_node'
2
+ require_relative 'string_terminals_in_right_order'
3
+
4
module STXML
  #############
  # class SalsaTigerXmlNode
  #
  # A node of a Salsa/Tiger XML structure, built from a RegXML object.
  # On top of what XMLNode offers, it provides:
  #
  # is_terminal?         true if the node's name is "t"
  #
  # is_nonterminal?      true if the node's name is "nt"
  #
  # is_splitword?        true if the node's name is "part"
  #
  # is_syntactic?        true for terminal, nonterminal, splitword
  #
  # is_frame?            true if the node's name is "frame"
  #
  # is_target?           true if the node's name is "target"
  #
  # is_fe?               true if the node's name is "fe"
  #
  # is_outside_sentence? always false here -- this node is not a placeholder
  #                      for a node outside the current sentence
  #                      (but see descendant class TSSynNode)
  #
  # yield_nodes          the descendants that are leaves of the tree, where
  #                      splitword children do not count as children;
  #                      a node without non-splitword children yields itself.
  #                      NOTE: this overwrites the Graph.yield_nodes method
  #                      since splitwords need special treatment
  #
  # yield_nodes_ordered  the yield nodes, terminals and splitwords sorted
  #                      left to right by their node IDs, followed by any
  #                      remaining nodes
  #
  # terminals_sorted     same as yield_nodes_ordered
  #
  # sid                  the sentence ID, i.e. the part of the node ID before
  #                      the first underscore
  #
  # to_s                 the yield of this node as a string of words,
  #                      ordered left to right
  #
  class SalsaTigerXmlNode < XMLNode
    include StringTerminalsInRightOrder

    ###
    # Determine the ID of a RegXML element.
    # Elements that merely point to another node carry it in "idref",
    # all other elements (including "part") carry it in "id".
    #
    # returns: a string, the ID, or nil if none was found
    def self.xmlel_id(xml_obj) # RegXML object
      referring_elements = %w[edge fenode uspitem splitword other_edge]
      attribute_name = referring_elements.include?(xml_obj.name) ? "idref" : "id"
      xml_obj.attributes[attribute_name]
    end

    ###
    # xml: RegXML object, either a text node or an XML element
    def initialize(xml)
      # text: hand the object itself on, without attributes or ID
      return super(xml, nil, nil, true) if xml.text?

      # xml element: hand on name, attributes and extracted ID
      super(xml.name, xml.attributes, SalsaTigerXmlNode.xmlel_id(xml), false)
    end

    ###
    # Tiger XML terminal node?
    def is_terminal?
      get_f("name") == "t"
    end

    ###
    # Tiger XML nonterminal node?
    def is_nonterminal?
      get_f("name") == "nt"
    end

    ###
    # part of a splitword?
    def is_splitword?
      get_f("name") == "part"
    end

    ###
    # any kind of syntactic node?
    def is_syntactic?
      return true if is_terminal?
      return true if is_nonterminal?

      is_splitword?
    end

    ###
    # Salsa/Tiger XML frame?
    def is_frame?
      get_f("name") == "frame"
    end

    ###
    # Salsa/Tiger XML frame target?
    def is_target?
      get_f("name") == "target"
    end

    ###
    # Salsa/Tiger XML frame element?
    def is_fe?
      get_f("name") == "fe"
    end

    ###
    # The node ID starts out with the sentence ID: return everything
    # before the first underscore, or nil if the ID has no underscore.
    def sid
      id =~ /^(.*?)_/ ? Regexp.last_match(1) : nil
    end

    ###
    def is_outside_sentence?
      false
    end

    ###
    # Leaves below this node; splitwords do not count as children.
    def yield_nodes
      proper_children = children.reject { |child| child.is_splitword? }
      return [self] if proper_children.empty?

      proper_children.each_with_object([]) do |child, leaves|
        if child.children.reject(&:is_splitword?).empty?
          # child has nothing but splitwords below it: it is a leaf
          leaves << child
        else
          leaves.concat child.yield_nodes
        end
      end
    end

    ###
    # legacy name
    #
    # sort_terminals_and_splitwords_left_to_right cannot deal with
    # nonterminals, so they are split off first and appended, unsorted,
    # after the sorted terminals and splitwords.
    def yield_nodes_ordered
      sortable, remainder = yield_nodes.distribute do |node|
        node.is_terminal? || node.is_splitword?
      end

      ordered = sort_terminals_and_splitwords_left_to_right(sortable)
      ordered.concat(remainder)
    end

    ###
    # name parallel to the method of SalsaTigerSentence
    def terminals_sorted
      yield_nodes_ordered
    end

    ###
    def to_s
      string_for_node(self)
    end
  end
end
@@ -0,0 +1,58 @@
1
+ require_relative 'salsa_tiger_xml_node'
2
+
3
module STXML
  #############
  # class SemNode
  #
  # Shared superclass of FrameNode and FeNode; holds what both have in common:
  #
  # is_usp?      true if the "usp" attribute of the frame/FE is "yes",
  #              i.e. it is involved in underspecification; else false
  #
  # flags        array of all frame/FE flags set for this node;
  #              entries are the flag names handed to add_flag
  #
  # add_flag     add a frame/FE flag
  # remove_flag  remove a frame/FE flag
  class SemNode < SalsaTigerXmlNode
    attr_reader :flags

    # xml: RegXML object or text
    def initialize(xml)
      super
      # no flags set initially; filled via add_flag
      @flags = []
    end

    ###
    def is_usp?
      get_attribute("usp") == "yes"
    end

    ###
    # name: string, flag name
    # returns the flag list
    def add_flag(name)
      @flags.push(name)
    end

    ###
    # name: string, flag name
    # returns the removed name, or nil if it was not set
    def remove_flag(name)
      @flags.delete(name)
    end

    #############
    protected

    # embedded XML of the superclass, followed by one <flag> element per flag
    def get_xml_embedded
      super() + get_xml_offlags
    end

    # one "<flag name='...'/>" line per flag, concatenated
    def get_xml_offlags
      @flags.reduce("") do |xml, flagname|
        xml + "<flag name=\'#{xml_secure_val(flagname)}\'/>\n"
      end
    end
  end
end
@@ -0,0 +1,192 @@
1
module STXML
  #########
  # module StringTerminalsInRightOrder
  #
  # returns the yield of a node, or a list of nodes, as a string
  # of words, each followed by a single space
  #
  # Words are put into the right order, left to right,
  # under the assumption that their node IDs reflect that order
  #
  # Terminal nodes are assumed to have IDs ending in a number,
  # numbered from left to right
  #
  # Splitword nodes are assumed to have IDs ending in N_sM
  # for numbers N and M, where N orders terminals left to right
  # and M orders the splitword parts left to right
  #
  # If the yield of the node/the list of nodes contains all splitwords of a terminal,
  # the whole terminal is taken instead
  #
  # Nodes handed to this module must respond to: id, word, parent,
  # each_child, yield_nodes, is_terminal? and is_splitword?
  #
  # methods:
  #
  # string_for_node   returns the string for the yield of a node
  #                   node: a node object
  #
  # string_for_nodes  returns the string for the yield of a list of nodes
  #                   nodes: a list of node objects
  module StringTerminalsInRightOrder
    # node: a node object
    # returns: the words of its yield, left to right, as one string
    def string_for_node(node)
      string_for_nodes([node])
    end

    # nodes: a list of node objects
    # returns: the words of their combined yield, left to right, as one string
    def string_for_nodes(nodes)
      terminals = right_level_terminals_for_nodes(nodes)
      ordered = sort_terminals_and_splitwords_left_to_right(terminals)
      node_array_to_string(ordered)
    end

    #####
    private

    # right_level_terminals_for_nodes:
    # - compute the yield for each element of 'nodes'
    # - then consider all splitwords in the yield:
    #   if all splitwords of a terminal are in the yield,
    #   then use the terminal rather than its splitwords
    # - anything that is neither a terminal nor a splitword is dropped
    #
    # returns: array of terminal and splitword nodes without duplicates
    def right_level_terminals_for_nodes(nodes)
      yielded = nodes.map { |n| n.yield_nodes }.flatten
      result = []

      yielded.each do |n|
        if n.is_splitword?
          parent = n.parent

          if parent.nil?
            # splitword without a parent: nothing to replace it with
            result << n
          elsif result.include?(parent) || yielded.include?(parent)
            # the terminal itself is (or will be) in the result:
            # this splitword is already covered
          else
            # take the terminal only if all of its parts were yielded
            all_in = true
            parent.each_child do |nsibling|
              unless yielded.include?(nsibling)
                all_in = false
                break
              end
            end

            result << (all_in ? parent : n)
          end
        elsif n.is_terminal?
          result << n
        end
      end

      result.uniq
    end

    # sort_terminals_and_splitwords_left_to_right:
    # take an array of nodes that consists of terminals and splitwords
    # and sort them using the following comparison:
    # - when comparing two terminals, use the
    #   last numbers in their respective IDs
    # - when comparing two splitwords, their IDs end in _N_sM
    #   for numbers N and M.
    #   If they coincide in N, compare them by M,
    #   else compare them by N
    # - when comparing a terminal and a splitword,
    #   compare the terminal's last number to the splitword's N
    #
    # returns: a new, sorted array
    def sort_terminals_and_splitwords_left_to_right(nodes)
      nodes.sort do |a, b|
        if a.is_splitword? && b.is_splitword?
          compare_splitwords(a, b)
        elsif a.is_terminal? && b.is_terminal?
          compare_terminals(a, b)
        else
          compare_mixed(a, b)
        end
      end
    end

    # node_array_to_string:
    # 'nodes' is an array of node objects, each of which offer a "word" method
    # string their words together, each word followed by " "
    # (so a non-empty result ends in a space; an empty array gives "")
    def node_array_to_string(nodes)
      nodes.map { |n| n.word + " " }.join
    end

    # - when comparing two terminals, use the
    #   last numbers in their respective IDs
    def compare_terminals(a, b)
      last_i(a) <=> last_i(b)
    end

    # - when comparing two splitwords, their IDs end in _N_sM
    #   for numbers N and M.
    #   If they coincide in N, compare them by M,
    #   else compare them by N
    def compare_splitwords(a, b)
      terminal_a = splitword_terminal_i(a)
      terminal_b = splitword_terminal_i(b)

      if terminal_a == terminal_b
        # parts of same terminal: compare parts
        last_i(a) <=> last_i(b)
      else
        # not parts of same terminal: compare terminals
        terminal_a <=> terminal_b
      end
    end

    # - when comparing a terminal and a splitword,
    #   compare the terminal's last number to the splitword's N
    # - anything else cannot be ordered: a warning is printed and the two
    #   nodes are treated as equal. (Returning the result of the print,
    #   i.e. nil, would make Array#sort raise an ArgumentError.)
    def compare_mixed(a, b)
      if a.is_splitword? && b.is_terminal?
        splitword_terminal_i(a) <=> last_i(b)
      elsif a.is_terminal? && b.is_splitword?
        last_i(a) <=> splitword_terminal_i(b)
      else
        # not one terminal, one splitword?
        $stderr.print "SalsaTigerSentence, compare_mixed: confused by "
        $stderr.print a.id, ", ", b.id, "\n"
        0
      end
    end

    # return last number of the ID of a node
    #
    # Terminates the program with status 1 if the ID contains no such number.
    def last_i(n)
      n.id =~ /(\d+)$/ # match final string of digits
      if $1.nil? # shouldn't happen _in principle_
        # but we might get weird node IDs for splitwords;
        # so we act gracefully and catch the case where there
        # is one final letter behind the digits
        n.id =~ /(\d+)\w$/
      end
      if $1.nil? # this shouldn't ever happen
        $stderr.print "SalsaTigerSentence, last_i: Couldn't extract digits from: "
        $stderr.print n.id, "\n"
        exit 1
      end

      $1.to_i # and return it as number
    end

    # assume the ID of the node includes N_sM
    # return N
    #
    # Terminates the program with status 1 if the ID has no such part.
    def splitword_terminal_i(n)
      n.id =~ /(\d+)_s\d*/ # match string of digits before splitword ID
      if $1.nil? # this shouldn't ever happen
        $stderr.print "SalsaTigerSentence, splitword_terminal_i: Couldn't extract digits from: "
        $stderr.print n.id, "\n"
        exit 1
      end

      $1.to_i # and return it as number
    end
  end
end