shalmaneser-lib 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,84 @@
1
module STXML
  # sp jul 05 05
  #
  # Static helper methods for SalsaTigerRegXML:
  #
  # - provide header and footer for Salsa/Tiger XML files
  # - escape and unescape HTML entities
  #
  # changed KE nov 05:
  # many methods moved to FrappeHelper
  class SalsaTigerXMLHelper
    # Replacement table used by escape and unescape.
    # Each entry is a pair [unescaped text, escaped text].
    #
    # Order matters: "&" must be handled first when escaping (so the "&" of
    # entities produced by later entries is not escaped again) and last when
    # unescaping (so "&amp;lt;" becomes "&lt;" and not "<").
    #
    # Double quotes are deliberately written as two "&apos;" rather than
    # "&quot;". Because the two-backtick entry has the same escaped form and is
    # tried first by unescape, the round trip is lossy:
    # unescape(escape("\"")) yields two backticks.
    @replacements = [
      ["&", "&amp;"], # must be first for escaping, last for unescaping
      ["<", "&lt;"],
      [">", "&gt;"],
      ["\"", "&apos;&apos;"],
      ["'", "&apos;"],
      ["``", "&apos;&apos;"]
    ].freeze

    ###
    # get header of SalsaTigerXML files (as string)
    #
    # returns: the XML declaration and the opening part of the corpus
    # document, up to and including the opening <body> tag
    def self.get_header
      <<ENDOFHEADER
<?xml version="1.0" encoding="UTF-8"?>
<corpus corpusname="corpus" target="">
<head>
<meta>
<format>NeGra format, version 3</format>
</meta>
<frames xmlns="http://www.clt-st.de/framenet/frame-database">
</frames>
<wordtags xmlns="http://www.clt-st.de/salsa/wordtags">
</wordtags>
<flags>
</flags>
<annotation>
<edgelabel>
</edgelabel>
<secedgelabel>
</secedgelabel>
</annotation>
</head>
<body>
ENDOFHEADER
    end

    ###
    # get footer of SALSATigerXML files (as string)
    #
    # returns: the closing </body> and </corpus> tags
    def self.get_footer
      <<ENDOFFOOTER
</body>
</corpus>
ENDOFFOOTER
    end

    ###
    # Escape a string for representation in XML.
    #
    # string: the text to escape. A mutable string is modified in place (as
    #         before); a frozen string is left untouched and an escaped copy
    #         is returned instead of raising.
    #
    # returns: the escaped string
    def self.escape(string)
      string = string.dup if string.frozen?
      @replacements.each do |unescaped, escaped|
        string.gsub!(unescaped, escaped)
      end

      string
    end

    ###
    # Reverse of escape (up to the quote ambiguity described at @replacements).
    #
    # string: the text to unescape. A mutable string is modified in place (as
    #         before); a frozen string is left untouched and an unescaped copy
    #         is returned instead of raising.
    #
    # returns: the unescaped string
    def self.unescape(string)
      string = string.dup if string.frozen?
      # reverse replacements to replace &amp; last
      @replacements.reverse_each do |unescaped, escaped|
        string.gsub!(escaped, unescaped)
      end

      string
    end
  end
end
@@ -0,0 +1,161 @@
1
+ require_relative 'xml_node'
2
+ require_relative 'string_terminals_in_right_order'
3
+
4
module STXML
  #############
  # class SalsaTigerXmlNode
  #
  # An XMLNode built from a RegXML element or text, extended with
  # Salsa/Tiger XML specific predicates and yield computation.
  #
  # additional methods:
  #
  # is_terminal?          true if this is a Tiger XML terminal node ("t")
  #
  # is_nonterminal?       true if this is a Tiger XML nonterminal node ("nt")
  #
  # is_splitword?         true if this is a splitword part ("part")
  #
  # is_syntactic?         true for terminal, nonterminal, splitword
  #
  # is_frame?             true if this is a Salsa/Tiger XML frame
  #
  # is_target?            true if this is a Salsa/Tiger XML frame target
  #
  # is_fe?                true if this is a Salsa/Tiger XML frame element
  #
  # is_outside_sentence?  returns false -- this node is not a placeholder for
  #                       a node that is outside the current sentence
  #                       (but see descendant class TSSynNode)
  #
  # yield_nodes           returns the list of descendants that are leaves of the tree
  #                       NOTE: this overwrites the Graph.yield_nodes method
  #                       since we have to treat splitwords in a special way;
  #                       a node without non-splitword children yields itself
  #
  # yield_nodes_ordered   returns those descendants ordered by precedence
  #                       in the sentence, i.e. their node IDs.
  #
  # sid                   returns the sentence ID of this node
  #
  # to_s                  returns the yield of this node as a string of
  #                       space-separated words, ordered left to right
  #
  class SalsaTigerXmlNode < XMLNode
    include StringTerminalsInRightOrder

    ###
    # Extract the ID from a RegXML element.
    # Referencing elements carry it in "idref"; everything else
    # (including "part") carries it in "id".
    #
    # returns: a string, the ID, or nil if none was found
    def self.xmlel_id(xml_obj) # RegXML object
      id_attribute =
        case xml_obj.name
        when "edge", "fenode", "uspitem", "splitword", "other_edge"
          # element points to another node
          "idref"
        else
          # "part" and anything else: the element has its own ID
          "id"
        end

      xml_obj.attributes[id_attribute]
    end

    ###
    # xml: RegXML object or text
    def initialize(xml)
      # plain text node: no attributes, no ID
      return super(xml, nil, nil, true) if xml.text?

      # xml element
      super(xml.name, xml.attributes, SalsaTigerXmlNode.xmlel_id(xml), false)
    end

    ###
    def is_terminal?
      get_f("name") == "t"
    end

    ###
    def is_nonterminal?
      get_f("name") == "nt"
    end

    ###
    def is_splitword?
      get_f("name") == "part"
    end

    ###
    def is_syntactic?
      is_terminal? || is_nonterminal? || is_splitword?
    end

    ###
    def is_frame?
      get_f("name") == "frame"
    end

    ###
    def is_target?
      get_f("name") == "target"
    end

    ###
    def is_fe?
      get_f("name") == "fe"
    end

    ###
    # The node ID starts out with the sentence ID: return everything
    # before the first underscore, or nil if the ID has none.
    def sid
      prefix = /^(.*?)_/.match(id)
      prefix && prefix[1]
    end

    ###
    def is_outside_sentence?
      false
    end

    ###
    # Leaves below this node; splitwords do not count as children.
    def yield_nodes
      proper_children = children.reject(&:is_splitword?)
      return [self] if proper_children.empty?

      proper_children.each_with_object([]) do |child, leaves|
        if child.children.reject(&:is_splitword?).empty?
          leaves << child
        else
          leaves.concat(child.yield_nodes)
        end
      end
    end

    ###
    # legacy name
    #
    # sort_terminals_and_splitwords_... cannot deal with nonterminals,
    # so they are set aside and attached to the end of the chain
    def yield_nodes_ordered
      sortable, others = yield_nodes.distribute do |node|
        node.is_terminal? || node.is_splitword?
      end

      sort_terminals_and_splitwords_left_to_right(sortable).concat(others)
    end

    ###
    # name parallel to the method of SalsaTigerSentence
    def terminals_sorted
      yield_nodes_ordered
    end

    ###
    def to_s
      string_for_node(self)
    end
  end
end
@@ -0,0 +1,58 @@
1
+ require_relative 'salsa_tiger_xml_node'
2
+
3
module STXML
  #############
  # class SemNode
  #
  # common superclass for FrameNode and FeNode,
  # with methods that are the same for both:
  #
  # is_usp?       returns true if the frame/FE is involved in underspecification
  #               (attribute "usp" is "yes"), else false
  #
  # flags         returns an array of all the frame/FE flags for this node.
  #               members of the array are the flag names that were added
  #               with add_flag and not removed since
  #
  # add_flag      add a frame/FE flag
  # remove_flag   remove a frame/FE flag
  class SemNode < SalsaTigerXmlNode
    attr_reader :flags

    # xml: RegXML object or text
    def initialize(xml)
      super
      # flag names attached to this node, in the order they were added
      @flags = []
    end

    ###
    def is_usp?
      get_attribute("usp") == "yes"
    end

    ###
    # name: string, the flag name
    def add_flag(name)
      @flags.push(name)
    end

    ###
    # name: string, the flag name
    def remove_flag(name)
      @flags.delete(name)
    end

    #############
    protected

    # XML for the embedded content: whatever the superclass produces,
    # followed by one <flag> element per flag
    def get_xml_embedded
      super() + get_xml_offlags
    end

    # one "<flag name='...'/>" line per flag, concatenated
    def get_xml_offlags
      @flags.inject("") do |xml, flagname|
        xml + "<flag name='#{xml_secure_val(flagname)}'/>\n"
      end
    end
  end
end
@@ -0,0 +1,192 @@
1
module STXML
  #########
  # module StringTerminalsInRightOrder
  #
  # returns the yield of a node, or a list of nodes, as a string
  # of words, each word followed by one " "
  #
  # Words are put into the right order, left to right,
  # under the assumption that their node IDs reflect that order
  #
  # Terminal nodes are assumed to have IDs ending in a number,
  # numbered from left to right
  #
  # Splitword nodes are assumed to have IDs ending in N_sM
  # for numbers N and M, where N orders terminals left to right
  # and M orders the splitword parts left to right
  #
  # If the yield of the node/the list of nodes contains all splitwords of a terminal,
  # the whole terminal is taken instead
  #
  # Node objects are expected to offer: id, word, parent, each_child,
  # yield_nodes, is_terminal? and is_splitword?
  #
  # methods:
  #
  # string_for_node    returns the string for the yield of a node
  #   node: a node object
  #
  # string_for_nodes   returns the string for the yield of a list of nodes
  #   nodes: a list of node objects
  module StringTerminalsInRightOrder
    def string_for_node(node)
      string_for_nodes([node])
    end

    def string_for_nodes(nodes)
      terminals = right_level_terminals_for_nodes(nodes)
      node_array_to_string(sort_terminals_and_splitwords_left_to_right(terminals))
    end

    #####
    private

    # right_level_terminals_for_nodes:
    # - compute the yield for each element of 'nodes'
    # - then consider all splitwords in the yield:
    #   if all splitwords of a terminal are in the yield,
    #   then use the terminal rather than its splitwords
    # - anything that is neither a splitword nor a terminal is ignored
    #
    # returns: array of terminals and splitwords, without duplicates
    def right_level_terminals_for_nodes(nodes)
      yielded = nodes.map(&:yield_nodes).flatten
      selected = []

      yielded.each do |n|
        if n.is_splitword?
          terminal = n.parent

          if terminal.nil?
            # splitword without a parent
            selected << n
          elsif selected.include?(terminal) || yielded.include?(terminal)
            # the splitword's terminal is already taken care of:
            # nothing to add for this part
          elsif all_children_in?(terminal, yielded)
            # all parts of the terminal are in the yield:
            # take the terminal rather than the individual splitwords
            selected << terminal
          else
            # some sibling of n is not in the yield
            selected << n
          end
        elsif n.is_terminal?
          selected << n
        end
      end

      selected.uniq
    end

    # true if every child of 'node' (as enumerated by each_child)
    # is a member of 'candidates'
    def all_children_in?(node, candidates)
      node.each_child { |child| return false unless candidates.include?(child) }
      true
    end

    # sort_terminals_and_splitwords_left_to_right:
    # take an array of nodes that consists of terminals and splitwords
    # and sort them using the following comparison:
    # - when comparing two terminals, use the
    #   last numbers in their respective IDs
    # - when comparing two splitwords, their IDs end in _N_sM
    #   for numbers N and M.
    #   If they coincide in N, compare them by M,
    #   else compare them by N
    # - when comparing a terminal and a splitword,
    #   compare the terminal's last number to the splitword's N
    #
    # returns: a new, sorted array
    def sort_terminals_and_splitwords_left_to_right(nodes)
      nodes.sort do |a, b|
        if a.is_splitword? && b.is_splitword?
          compare_splitwords(a, b)
        elsif a.is_terminal? && b.is_terminal?
          compare_terminals(a, b)
        else
          compare_mixed(a, b)
        end
      end
    end

    # node_array_to_string:
    # 'nodes' is an array of node objects, each of which offer a "word" method
    # string their words together, each followed by " "
    # (so a non-empty result ends in a space)
    def node_array_to_string(nodes)
      nodes.each_with_object("".dup) { |node, text| text << node.word << " " }
    end

    # - when comparing two terminals, use the
    #   last numbers in their respective IDs
    def compare_terminals(a, b)
      last_i(a) <=> last_i(b)
    end

    # - when comparing two splitwords, their IDs end in _N_sM
    #   for numbers N and M.
    #   If they coincide in N (parts of the same terminal), compare them by M,
    #   else compare them by N
    def compare_splitwords(a, b)
      by_terminal = splitword_terminal_i(a) <=> splitword_terminal_i(b)
      return by_terminal unless by_terminal.zero?

      last_i(a) <=> last_i(b)
    end

    # - when comparing a terminal and a splitword,
    #   compare the terminal's last number to the splitword's N
    # - for any other combination a warning is printed and the two nodes are
    #   treated as equal: a sort block has to return a number, returning
    #   nil would make Array#sort raise
    def compare_mixed(a, b)
      if a.is_splitword? && b.is_terminal?
        splitword_terminal_i(a) <=> last_i(b)
      elsif a.is_terminal? && b.is_splitword?
        last_i(a) <=> splitword_terminal_i(b)
      else
        # not one terminal, one splitword: no meaningful order
        $stderr.print "SalsaTigerSentence, compare_mixed: confused by "
        $stderr.print a.id, ", ", b.id, "\n"
        0
      end
    end

    # return last number of the ID of a node
    #
    # Terminates the program with status 1 if the ID contains no such number.
    def last_i(n)
      node_id = n.id.to_s
      # final string of digits; this should always match _in principle_,
      # but we might get weird node IDs for splitwords, so we act gracefully
      # and also accept one final letter behind the digits
      digits = node_id[/(\d+)$/, 1] || node_id[/(\d+)\w$/, 1]

      if digits.nil? # this shouldn't ever happen
        $stderr.print "SalsaTigerSentence, last_i: Couldn't extract digits from: "
        $stderr.print n.id, "\n"
        exit 1
      end

      digits.to_i
    end

    # assume the ID of the node includes N_sM
    # return N
    #
    # Terminates the program with status 1 if the ID has no such part.
    def splitword_terminal_i(n)
      # string of digits before the splitword ID
      digits = n.id.to_s[/(\d+)_s\d*/, 1]

      if digits.nil? # this shouldn't ever happen
        $stderr.print "SalsaTigerSentence, splitword_terminal_i: Couldn't extract digits from: "
        $stderr.print n.id, "\n"
        exit 1
      end

      digits.to_i
    end
  end
end