shalmaneser-lib 1.2.rc5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/configuration/config_data.rb +457 -0
- data/lib/configuration/config_format_element.rb +210 -0
- data/lib/configuration/configuration_error.rb +15 -0
- data/lib/configuration/external_config_data.rb +56 -0
- data/lib/configuration/frappe_config_data.rb +134 -0
- data/lib/configuration/fred_config_data.rb +199 -0
- data/lib/configuration/rosy_config_data.rb +126 -0
- data/lib/db/db_interface.rb +50 -0
- data/lib/db/db_mysql.rb +141 -0
- data/lib/db/db_sqlite.rb +280 -0
- data/lib/db/db_table.rb +237 -0
- data/lib/db/db_view.rb +416 -0
- data/lib/db/db_wrapper.rb +175 -0
- data/lib/db/select_table_and_columns.rb +10 -0
- data/lib/db/sql_query.rb +243 -0
- data/lib/definitions.rb +19 -0
- data/lib/eval.rb +482 -0
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/external_systems.rb +251 -0
- data/lib/framenet_format/fn_corpus_aset.rb +209 -0
- data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
- data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
- data/lib/framenet_format/fn_database.rb +143 -0
- data/lib/framenet_format/frame_xml_file.rb +104 -0
- data/lib/framenet_format/frame_xml_sentence.rb +411 -0
- data/lib/logging.rb +25 -0
- data/lib/ml/classifier.rb +189 -0
- data/lib/ml/mallet.rb +236 -0
- data/lib/ml/maxent.rb +229 -0
- data/lib/ml/optimize.rb +195 -0
- data/lib/ml/timbl.rb +140 -0
- data/lib/monkey_patching/array.rb +82 -0
- data/lib/monkey_patching/enumerable_bool.rb +24 -0
- data/lib/monkey_patching/enumerable_distribute.rb +18 -0
- data/lib/monkey_patching/file.rb +131 -0
- data/lib/monkey_patching/subsumed.rb +24 -0
- data/lib/ruby_class_extensions.rb +4 -0
- data/lib/salsa_tiger_xml/corpus.rb +24 -0
- data/lib/salsa_tiger_xml/fe_node.rb +98 -0
- data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
- data/lib/salsa_tiger_xml/frame_node.rb +145 -0
- data/lib/salsa_tiger_xml/graph_node.rb +347 -0
- data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
- data/lib/salsa_tiger_xml/sem_node.rb +58 -0
- data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
- data/lib/salsa_tiger_xml/syn_node.rb +169 -0
- data/lib/salsa_tiger_xml/tree_node.rb +59 -0
- data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
- data/lib/salsa_tiger_xml/usp_node.rb +72 -0
- data/lib/salsa_tiger_xml/xml_node.rb +163 -0
- data/lib/shalmaneser/lib.rb +1 -0
- data/lib/tabular_format/fn_tab_format_file.rb +38 -0
- data/lib/tabular_format/fn_tab_frame.rb +67 -0
- data/lib/tabular_format/fn_tab_sentence.rb +169 -0
- data/lib/tabular_format/tab_format_file.rb +91 -0
- data/lib/tabular_format/tab_format_named_args.rb +184 -0
- data/lib/tabular_format/tab_format_sentence.rb +119 -0
- data/lib/value_restriction.rb +49 -0
- metadata +131 -0
@@ -0,0 +1,169 @@
|
|
1
|
+
require_relative 'salsa_tiger_xml_node'
|
2
|
+
|
3
|
+
module STXML
|
4
|
+
#############
|
5
|
+
# class SynNode
|
6
|
+
#
|
7
|
+
# inherits from SalsaTigerXmlNode,
|
8
|
+
# adds to it methods specific to nodes
|
9
|
+
# that describe the syntactic structure
|
10
|
+
#
|
11
|
+
# additional/changed methods:
|
12
|
+
#
|
13
|
+
# part_of_speech part_of_speech information as a string,
|
14
|
+
# nil for anything but terminal nodes
|
15
|
+
#
|
16
|
+
# word word information for this node as a string,
|
17
|
+
# nil for anything but terminal nodes
|
18
|
+
#
|
19
|
+
# category category information for this node as a string,
|
20
|
+
# nil for anything but nonterminal nodes
|
21
|
+
#
|
22
|
+
# is_punct? true if this is a terminal node and it is a punctuation sign
|
23
|
+
#
|
24
|
+
# get_sem return (a copy of the list of) the semantic nodes attached to this syntactic node.
|
25
|
+
# Idea: this is basically the inverse of the edge pointing from
|
26
|
+
# the FeNode to this SynNode, so you can fetch a node's semantics directly
|
27
|
+
#
|
28
|
+
# add_sem add non-tree edge from this syntactic node to a FeNode
|
29
|
+
class SynNode < SalsaTigerXmlNode
  ###
  # Set up a syntactic node with no semantic nodes and no extra
  # (non-tree) links attached yet.
  #
  # @param xml XML object for this node, handed on unchanged to the superclass
  def initialize(xml)
    super(xml)

    @other_links = []
    @sem = []
  end

  ###
  # Record a labeled non-tree link from this node to another node.
  #
  # @param other_node [SynNode] target of the link
  # @param link_label [String] edge label
  # @param attributes [Hash] string -> string: further attribute-value pairs for the edge
  def add_link(other_node, link_label, attributes = {})
    @other_links << [link_label, other_node, attributes]
  end

  ###
  # Links recorded via add_link, as [label, node, attributes] triples.
  #
  # @param label [String, nil] if given, only links carrying this label are returned
  # @return [Array] the matching triples; without a label, the internal list itself
  def get_linked(label = nil)
    return @other_links unless label

    @other_links.select { |link| link.first == label }
  end

  ###
  # @return [String, nil] the stripped "pos" attribute, nil if it is not set
  def part_of_speech
    pos = get_attribute("pos")
    pos ? pos.strip : nil
  end

  ###
  # @return [String, nil] the stripped "cat" attribute, nil if it is not set
  def category
    cat = get_attribute("cat")
    cat ? cat.strip : nil
  end

  ###
  # @return [String, nil] the stripped "word" attribute, nil if it is not set
  def word
    form = get_attribute("word")
    form ? form.strip : nil
  end

  ###
  # Is this node a punctuation sign?
  # Nonterminals never are. For all other nodes the part of speech is
  # consulted first, then the word itself.
  #
  # @return [true, false]
  def is_punct?
    # only terminals can be punctuation signs
    return false if is_nonterminal?

    # part of speech first: this works at least for TIGER corpus annotation
    pos = part_of_speech
    return true if ['$.', '$,', '$('].include?(pos)
    return true if pos =~ /^PUNC/

    # no luck with part of speech: compare the word against the known
    # punctuation signs (filtered out for determining maximal constituents);
    # anything not in the list is not a punctuation sign by our tests
    [".", ";", ",", ":", "?", "!", "(", ")", "[", "]", "{", "}", "-", "''", "``", "\"", "'"].include?(word)
  end

  ###
  # Terminals are rendered as their word, everything else as the
  # superclass renders it.
  def to_s
    is_terminal? ? word : super()
  end

  ###
  # @return [Array] a copy of the list of semantic nodes added via add_sem
  def get_sem
    @sem.clone
  end

  ###
  # Attach a semantic node to this syntactic node.
  #
  # @param fe_node [FeNode] must be exactly of class FeNode
  # @raise [RuntimeError] if fe_node is of any other class
  def add_sem(fe_node)
    if fe_node.class != FeNode
      raise "Unexpected class of semantic node: was expecting an FeNode"
    end

    @sem << fe_node
  end

  #############
  protected

  # Serialize the outgoing edges of this node: one <edge> element per
  # child that is not a splitword, followed by one <other_edge> element
  # per link recorded via add_link. Missing labels are written as "-".
  #
  # @return [String] the concatenated XML, one element per line
  def get_xml_ofchildren
    xml = ""

    each_child_with_edgelabel do |label, child|
      # terminal or nonterminal child.
      # splitwords are handled separately in the "sem" part of the sentence
      unless child.is_splitword?
        shown_label = label ? xml_secure_val(label) : "-"
        xml << "<edge label='#{shown_label}' idref='#{xml_secure_val(child.id)}'/>\n"
      end
    end

    @other_links.each do |label, node, attributes|
      shown_label = label ? xml_secure_val(label) : "-"
      xml << "<other_edge label='#{shown_label}'"
      xml << " idref='#{xml_secure_val(node.id)}'"
      if attributes
        pairs = attributes.to_a.map { |attr, val| "#{xml_secure_val(attr)}='#{xml_secure_val(val)}'" }
        xml << " " + pairs.join(" ")
      end
      xml << "/>\n"
    end

    xml
  end
end
|
169
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require_relative 'graph_node'
|
2
|
+
|
3
|
+
module STXML
|
4
|
+
class TreeNode < GraphNode
  # @param id node ID, handed on unchanged to GraphNode
  def initialize(id)
    super(id)
  end

  # The ancestor-related methods are redone here,
  # since in a tree each node has only one parent.
  # NOTE(review): parents, parent_labels, parents_with_edgelabel,
  # each_parent_with_edgelabel and remove_parent are not defined in this
  # class; presumably they are inherited from GraphNode.

  # @return the first entry of +parents+, or nil if +parents+ is nil
  def parent
    all_parents = parents
    all_parents.nil? ? nil : all_parents.first
  end

  # @return the first entry of +parent_labels+, or nil if +parent_labels+ is nil
  def parent_label
    all_labels = parent_labels
    all_labels.nil? ? nil : all_labels.first
  end

  # @return the first entry of +parents_with_edgelabel+,
  #   or nil if +parents_with_edgelabel+ is nil
  def parent_with_edgelabel
    all_pairs = parents_with_edgelabel
    all_pairs.nil? ? nil : all_pairs.first
  end

  # Adding a parent to a tree node means replacing the one it has:
  # this simply delegates to set_parent.
  def add_parent(parent, edgelabel, varhash = {})
    set_parent(parent, edgelabel, varhash)
  end

  # Make +parent+ the only parent of this node.
  #
  # @param parent the new parent node
  # @param edgelabel label of the edge between parent and this node
  # @param varhash [Hash] options; if "pointer_insteadof_edge" is set,
  #   this node is not registered as a child of the new parent
  def set_parent(parent, edgelabel, varhash = {})
    # remove old parent
    each_parent_with_edgelabel do |old_label, old_parent|
      remove_parent(old_parent, old_label, varhash)
    end

    # set new parent
    @parents << [edgelabel, parent]

    # and vice versa: add self as child to parent,
    # unless only a pointer was requested or the parent knows us already
    return if varhash["pointer_insteadof_edge"]
    return if parent.children_with_edgelabel.include?([edgelabel, self])

    parent.add_child(self, edgelabel)
  end
end
|
59
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require_relative 'syn_node'
|
2
|
+
require_relative 'reg_xml'
|
3
|
+
|
4
|
+
module STXML
|
5
|
+
#############
|
6
|
+
# class TSSynNode
|
7
|
+
#
|
8
|
+
# inherits from SynNode
|
9
|
+
#
|
10
|
+
# describes a syntactic node that isn't really there:
|
11
|
+
# a reference to a node in another sentence
|
12
|
+
#
|
13
|
+
# contains that node's ID, but an empty RegXML object,
|
14
|
+
# its string is "<unknown>", and you cannot add
|
15
|
+
# a child to it
|
16
|
+
#
|
17
|
+
# new or changed methods:
|
18
|
+
#-----------------------
|
19
|
+
#
|
20
|
+
# is_outside_sentence? returns true
|
21
|
+
#
|
22
|
+
# word returns "<unknown>"
|
23
|
+
#
|
24
|
+
# add_child raises an error
|
25
|
+
class TSSynNode < SynNode
  ###
  # Build the stand-in node: an otherwise empty OTHER_SENTENCE element
  # that carries nothing but the ID of the referenced node.
  #
  # @param id_string [String] ID of the node this object refers to
  def initialize(id_string)
    placeholder = "<OTHER_SENTENCE id='" + id_string + "'/>"
    super(RegXML.new(placeholder))
  end

  ###
  # @return [true] always
  def is_outside_sentence?
    true
  end

  ###
  # word of this node: <unknown>
  #
  # @return [String] always "<unknown>"
  def word
    "<unknown>"
  end

  # Children cannot be added to this kind of node.
  #
  # @raise [RuntimeError] always
  def add_child(arg1, arg2)
    raise RuntimeError, "Not implemented for this class"
  end
end
|
47
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require_relative 'salsa_tiger_xml_node'
|
2
|
+
|
3
|
+
module STXML
|
4
|
+
#############
|
5
|
+
# class UspNode
|
6
|
+
#
|
7
|
+
# inherits from SalsaTigerXmlNode,
|
8
|
+
# adds to it methods specific to nodes
|
9
|
+
# that describe a frame underspecification or frame element underspecification
|
10
|
+
#
|
11
|
+
# additional/changed methods:
|
12
|
+
#----------------------------
|
13
|
+
#
|
14
|
+
# new initializes the object
|
15
|
+
# rexml_object: underlying XML object for this node
|
16
|
+
# frame_or_fe: string, either "frame" for frame underspecification
|
17
|
+
# or "fe" for frame element underspecification
|
18
|
+
#
|
19
|
+
# add_child, remove_child add, remove underspecification entry
|
20
|
+
class UspNode < SalsaTigerXmlNode
  # "frame" or "fe": the kind of underspecification this node describes
  attr_reader :i_am

  ###
  # @param xml_obj RegXML object for this node
  # @param frame_or_fe [String] "frame" for frame underspecification,
  #   "fe" for frame element underspecification
  # @raise [RuntimeError] if frame_or_fe is neither "frame" nor "fe"
  def initialize(xml_obj, frame_or_fe)
    super(xml_obj)

    kind = ["frame", "fe"].find { |known| known == frame_or_fe }
    raise "new: neither frame nor fe??" unless kind

    @i_am = kind
  end

  ###
  # Add an underspecification entry. The node is attached with a nil
  # edge label and with "pointer_insteadof_edge" set, and is marked
  # with the attribute usp="yes". The varhash argument is not used.
  #
  # @raise [RuntimeError] if node is nil (or false)
  def add_child(node, varhash={})
    raise "Got nil for a node." unless node

    super(node, nil, "pointer_insteadof_edge" => true)

    # set usp. attribute on child
    node.set_attribute("usp", "yes")
  end

  ###
  # Remove an underspecification entry and delete the "usp" attribute
  # of the node. The varhash argument is not used.
  #
  # Deleting the attribute will be wrong if the child is involved in
  # more than one instance of underspecification, hence the warning
  # that is written to stderr on every call.
  def remove_child(node, varhash={})
    super(node, nil, "pointer_insteadof_edge" => true)

    $stderr.puts "Warning: unsafe removal of attribute 'usp'"
    node.del_attribute("usp")
  end

  #############
  protected

  # @return [String] one <uspitem> element per child, each on its own line
  def get_xml_ofchildren
    items = children.map do |child|
      "<uspitem idref='#{xml_secure_val(child.id)}'/>\n"
    end
    items.join
  end
end
|
72
|
+
end
|
@@ -0,0 +1,163 @@
|
|
1
|
+
require_relative 'tree_node'
|
2
|
+
|
3
|
+
module STXML
|
4
|
+
#############
|
5
|
+
# class XMLNode
|
6
|
+
#
|
7
|
+
# node with entries pointing to its children
|
8
|
+
# as well as its parent.
|
9
|
+
# all edges may be labeled.
|
10
|
+
# each node has a unique ID.
|
11
|
+
#
|
12
|
+
# indexes a string with XML data representing the same node,
|
13
|
+
# but does not look into it, just keeps it
|
14
|
+
#
|
15
|
+
# methods:
|
16
|
+
# This class inherits from TreeNode and GraphNode.
|
17
|
+
# See tree_node.rb and graph_node.rb for the methods they offer.
|
18
|
+
#
|
19
|
+
# new initializes the object
|
20
|
+
#
|
21
|
+
# get returns the XML object representing
|
22
|
+
# the same node as this node object
|
23
|
+
#
|
24
|
+
class XMLNode < TreeNode
  # @param name [String] element name; or, for text, the whole text
  # @param attribute [Hash, nil] attr_name(string) -> attr_value(string)
  # @param id [String, nil] node ID; if nil, an ID is made from the system time
  # @param i_am_text [false, true] set to anything but false or nil to represent
  #   not an xml element but text
  # @raise [RuntimeError] if a text node is given a non-empty attribute hash
  def initialize(name, attribute, id, i_am_text = false)
    # I wasn't given any ID: take system time for an ID.
    # to_f keeps the fractions of seconds, so that several nodes made
    # in the same second still get different IDs.
    # NOTE(review): two nodes created within the resolution of the float
    # timestamp would still share an ID.
    id = Time.new.to_f.to_s if id.nil?

    super(id)

    # remember values for this element
    set_f("name", name)
    set_f("attributes", attribute)
    set_f("i_am_text", i_am_text)

    # sanity check on the +attribute+ argument; an empty hash counts
    # as "no attributes"
    if i_am_text && attribute && !attribute.empty?
      raise "A text element cannot have attributes"
    end

    @kith = []
  end

  ###
  # Add a child, with an added sanity check:
  # if this is text rather than an xml element, it cannot have children.
  #
  # @raise [RuntimeError] if this node represents text
  def add_child(child, edgelabel, varhash={})
    raise "A text element cannot have children" if get_f("i_am_text")

    super(child, edgelabel, varhash)
  end

  ###
  # Remember additional XML material; it is written out after the
  # children when this node is serialized by #get.
  #
  # @param xml RegXML object
  def add_kith(xml)
    @kith << xml
  end

  ###
  # set attribute
  # @param name attribute name
  # @param value [String] must be exactly of class String
  # @raise [RuntimeError] if value is not a String
  def set_attribute(name, value)
    unless value.class == String
      raise "I can only set attribute values to strings. Got: #{value.class}."
    end

    # create the attribute hash on first use
    set_f("attributes", {}) if get_f("attributes").nil?
    get_f("attributes")[name] = value
  end

  ###
  # @return the value stored for the attribute, nil if there is none
  def get_attribute(name)
    attrs = get_f("attributes")
    attrs ? attrs[name] : nil
  end

  ###
  # delete attribute
  # @return the deleted value, nil if there was nothing to delete
  def del_attribute(name)
    attrs = get_f("attributes")
    attrs.delete(name) if attrs
  end

  ###
  # return XML as string:
  # If this is a text, just return the text
  # which is stored in "name"
  # If this is an XML element,
  # make a tag from its name and attributes,
  # then add tags for all its children,
  # then add an end tag.
  #
  # @return [String]
  def get
    # text rather than XML element
    return get_f("name") if get_f("i_am_text")

    # XML element, not text
    string = "<" + get_f("name")
    attrs = get_f("attributes")
    if attrs
      string << attrs.to_a.map { |name, value|
        " " + name + "='" + xml_secure_val(value) + "'"
      }.join
    end
    string << ">\n"
    string << get_xml_embedded
    string << "</#{get_f("name")}>\n"
    string
  end

  #############
  protected

  # @return [String] everything embedded in this element:
  #   first the children, then the kith
  def get_xml_embedded
    get_xml_ofchildren + get_xml_ofkith
  end

  # @return [String] the XML of all children, concatenated
  def get_xml_ofchildren
    children.map { |child| child.get }.join
  end

  # @return [String] the string form of each kith object, one per line
  def get_xml_ofkith
    @kith.map { |thing| thing.to_s + "\n" }.join
  end

  ###
  # Report on stderr that some XML material is being ignored.
  #
  # @param where [String] description of the place where it was found
  # @param xml_node the ignored material; printed via to_s
  def warn_child_ignored(where, xml_node)
    $stderr.puts "WARNING: additional material found in #{where}, will be ignored:"
    $stderr.puts "\t" + xml_node.to_s
  end

  ###
  # Make a string safe for use inside a single-quoted XML attribute value.
  #
  # Attribute values are always written between single quotes, so a raw
  # apostrophe would end the value early: every apostrophe becomes the
  # entity &apos;. A double quote is turned into two apostrophes, written
  # as two &apos; entities for the same reason.
  # No other characters are changed; in particular "&" is left alone, so
  # entities already present in the value are not escaped a second time.
  #
  # @param value [String] value of an attribute
  # @return [String] a new string; the argument is not modified
  def xml_secure_val(value)
    value.gsub("'", "&apos;").gsub('"', "&apos;&apos;")
  end
end
|
163
|
+
end
|