shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,251 @@
1
+ # ExternalSystems.rb
2
+ #
3
+ # ke oct/nov 2005
4
+ #
5
+ # Store all known interfaces to
6
+ # systems that do syntactic analysis
7
+ #
8
+ # Given the name of a system and the service that the
9
+ # system performs, return the appropriate interface
10
+ #
11
+ # There are two types of interfaces to syntactic analysis systems:
12
+ # - interfaces:
13
+ # offer methods for syntactic analysis,
14
+ # and the transformation to Salsa/Tiger XML and SalsaTigerSentence objects
15
+ # - interpreters:
16
+ # interpret the resulting Salsa/Tiger XML (represented as
17
+ # SalsaTigerSentence and SynNode objects), e.g.
18
+ # generalize over part of speech;
19
+ # describe the path between a pair of nodes both as a path
20
+ # and (potentially) as a grammatical function of one of the nodes;
21
+ # determine whether a node describes a verb, and in which voice;
22
+ # determine the head of a constituent
23
+ #
24
+ # Abstract classes for both interfaces and interpreters
25
+ # are in AbstractSynInterface.rb
26
+
27
+ require "ruby_class_extensions"
28
+ require 'logging'
29
+
30
+ # The list of available interface packages
31
+ # is at the end of this file.
32
+ # Please enter additional interfaces there.
33
+
34
+ # @todo AB: [2015-12-16 Wed 01:03]
35
+ # After decoupling in OptParser and ConfigData classes move this
36
+ # to Frappe.
37
module Shalmaneser
  # Registry of the known interfaces to, and interpreters for, external
  # syntactic analysis systems.
  #
  # Interface classes are expected to respond to +service+ and +system+;
  # interpreter classes are expected to respond to +systems+ and
  # +optional_systems+, each a hash service(string) -> system name(string).
  # Both registries are filled through add_interface / add_interpreter.
  class ExternalSystems
    # Experiment-file flag -> name of the service it switches on. The service
    # name is also the experiment-file key holding the chosen system's name.
    #
    # WARNING: adapt this when you introduce new services!
    SERVICE_FLAGS = [
      %w[do_postag pos_tagger],
      %w[do_lemmatize lemmatizer],
      %w[do_parse parser]
    ].freeze

    # Class-level instance variable (not a @@class variable):
    # list of all known interface classes, add to it using add_interface().
    @interfaces = []

    # Class-level instance variable (not a @@class variable):
    # list of all known interpreter classes, add to it using add_interpreter().
    @interpreters = []

    # Register an interface class.
    #
    # @param class_name the interface class to register
    # @return [Array] the list of registered interface classes
    def self.add_interface(class_name)
      LOGGER.debug "Initializing interface <#{class_name}>."
      @interfaces << class_name
    end

    # Register an interpreter class.
    #
    # @param class_name the interpreter class to register
    # @return [Array] the list of registered interpreter classes
    def self.add_interpreter(class_name)
      LOGGER.debug "Initializing interpreter <#{class_name}>."
      @interpreters << class_name
    end

    # Given an experiment file, use some_system_missing? to determine whether
    # the system can be run with the requested syntactic processing. If not,
    # print what is missing and what is available to stderr and exit with
    # status 1.
    #
    # @param [FrappeConfigData] exp Experiment description.
    # @return [nil] when nothing is missing
    def self.check_interfaces_abort_if_missing(exp)
      missing = some_system_missing?(exp)
      return unless missing

      interwhat, services = missing

      $stderr.puts
      $stderr.puts "ERROR: I am missing an #{interwhat} for "
      services.each_pair do |service, system_name|
        $stderr.puts "\tservice #{service}, system #{system_name}"
      end
      $stderr.puts
      $stderr.puts "I have the following interfaces:"
      @interfaces.each do |interface_class|
        $stderr.puts "\tservice #{interface_class.service}, system #{interface_class.system}"
      end
      $stderr.puts "I have the following interpreters:"
      @interpreters.each do |interpreter_class|
        line = "\t" + describe_systems(interpreter_class.systems)
        unless interpreter_class.optional_systems.empty?
          line += ", optional: " + describe_systems(interpreter_class.optional_systems)
        end
        $stderr.puts line
      end
      $stderr.puts
      $stderr.puts "Please adapt your experiment file."
      exit 1
    end

    # Given the name of a system and the service that it performs,
    # find the matching interface class.
    #
    # @param service [String] service, e.g. parser
    # @param system [String] name of system, e.g. collins
    # @return the first registered interface class matching both
    # @raise [RuntimeError] if no registered interface matches
    def self.get_interface(service, system)
      # @todo AB: Actually it's bad logic, but no idea for now how to handle it.
      # (If several interfaces match, the first registered one wins.)
      find_interface(service, system) ||
        raise("I've been requested an interface for #{service} and #{system}, "\
              'but I cannot find any. Please correct your experiment files.')
    end

    # Helper for get_interpreter: look up the interpreter for the services
    # requested in the given experiment file.
    #
    # @param exp experiment description responding to +get+
    # @return the matching interpreter class, or nil
    def self.get_interpreter_according_to_exp(exp)
      get_interpreter(requested_services(exp))
    end

    # Given the names and services of a set of systems,
    # find the matching interpreter class.
    #
    # An interpreter class has both obligatory systems (they need to be
    # present for this class to apply) and optional systems (they may or may
    # not be present for the class to apply, but no other system performing
    # the same service may).
    #
    # @param systems [Hash] service(string) -> system name(string)
    # @return the first matching interpreter class, or nil if none matches
    def self.get_interpreter(systems)
      @interpreters.find do |interpreter_class|
        obligatory = interpreter_class.systems
        optional = interpreter_class.optional_systems

        # all obligatory entries of interpreter_class are in systems
        obligatory.all? { |service, system| systems[service] == system } &&
          # all optional entries are either in systems,
          # or the service isn't in systems at all
          optional.all? { |service, system| systems[service].nil? || systems[service] == system } &&
          # all entries in systems are in either the obligatory
          # or the optional set of interpreter_class
          systems.all? { |service, system| obligatory[service] == system || optional[service] == system }
      end
    end

    # NOTE: the original file placed a bare `private` here. That keyword does
    # not affect methods defined with `def self.`, so the two methods below
    # have always been publicly callable; they are left public so existing
    # callers keep working.

    # Knows about possible services that can be set in the experiment file,
    # and where the names of the matching systems will be found in the
    # experiment file data structure (see SERVICE_FLAGS).
    #
    # @param exp experiment description responding to +get+
    # @return [Hash] service => system name, for every service whose flag
    #   is set in the experiment file
    def self.requested_services(exp)
      SERVICE_FLAGS.each_with_object({}) do |(flag, service), services|
        services[service] = exp.get(service) if exp.get(flag)
      end
    end

    # Check that interfaces and an interpreter exist for all services
    # requested in the given experiment file.
    #
    # @param [FrappeConfigData] exp FrappeConfigData object to check all the systems.
    # @return [nil] if nothing is missing
    # @return [Array] otherwise a pair [what, info] where what is either
    #   'interface' or 'interpreter' and info is a hash mapping services to
    #   system names: the services that could not be provided. A missing
    #   interpreter takes precedence over missing interfaces.
    def self.some_system_missing?(exp)
      requested = requested_services(exp)

      # Use the non-raising lookup: get_interface raises on a miss, which
      # would prevent the caller from ever printing its diagnostic.
      without_interface = requested.reject do |service, system_name|
        find_interface(service, system_name)
      end

      missing_systems = without_interface.empty? ? nil : ["interface", without_interface]
      missing_systems = ["interpreter", requested] unless get_interpreter(requested)
      missing_systems
    end

    # Non-raising interface lookup.
    #
    # @return the first registered interface class for service/system, or nil
    def self.find_interface(service, system)
      @interfaces.find do |interface_class|
        interface_class.system == system && interface_class.service == service
      end
    end
    private_class_method :find_interface

    # Render service/system pairs as "service X, system Y; service ...".
    def self.describe_systems(systems)
      systems.map { |service, system_name| "service #{service}, system #{system_name}" }.join("; ")
    end
    private_class_method :describe_systems
  end
end
238
+
239
+ # @todo AB: We should require programmatically all files in
240
+ # <frappe/interpreters> and <frappe/interfaces>.
241
+ require 'frappe/interfaces/collins_interface'
242
+ require 'frappe/interpreters/collins_treetagger_interpreter'
243
+ require 'frappe/interpreters/collins_tnt_interpreter'
244
+ require 'frappe/interfaces/berkeley_interface'
245
+ require 'frappe/interpreters/berkeley_interpreter'
246
+ require 'frappe/interfaces/stanford_interface'
247
+ require 'frappe/interpreters/stanford_interpreter'
248
+ require 'frappe/interfaces/treetagger_interface'
249
+ require 'frappe/interfaces/treetagger_pos_interface'
250
+ require 'frappe/interpreters/treetagger_interpreter'
251
+ require 'frappe/interpreters/empty_interpreter'
@@ -0,0 +1,209 @@
1
#####################
# Class to keep the data of one annotation set read from corpus XML.
#
# After construction the following readers are populated:
#
# aset_type:  "frame" or "NER" (nil if a frame set lacks its LU name)
# frame_name: frame name for "frame" type
# lu:         LU for "frame" type
# aset_id:    ID of the annotation set
# layers:     hash: layer type (FE, GF, PT, Target, NER)
#             -> [offset, "start"/"stop"] -> list of labels
#             string -> int*string -> array:string
class FNCorpusAset
  attr_reader :layers, :aset_type, :aset_id, :frame_name, :lu

  #######
  # Analyze the annotation set and store the result in object variables.
  #
  # @param aset RegXML object; must respond to attributes,
  #   first_child_matching, each_child_matching and to_s
  # @param charidx [Array] pairs [start index, stop index] (int*int) of words
  def initialize(aset, charidx)
    @layers = {}
    @frame_name = nil
    @lu = nil
    @aset_type = nil

    attributes = aset.attributes
    @aset_id = attributes["ID"]

    if attributes["frameName"]
      read_frame_aset(aset, attributes, charidx)
    else
      read_ner_aset(aset, charidx)
    end
  end

  #############
  # Analyze one <layer> and put its labels into the @layers data structure.
  #
  # Skipped without recording anything: layers without a name, FE layers of
  # rank "2", labels whose itype matches /NI/ (null instantiation) and labels
  # carrying neither a start nor an end attribute.
  #
  # @param layer RegXML object for the <layer> element
  # @param charidx [Array] int*int pairs, start/end index of words
  def analyze_layer(layer, charidx)
    layer_name = layer.attributes["name"]
    unless layer_name
      $stderr.puts "FNCorpusAset warning: cannot determine layer name"
      $stderr.puts layer.to_s
      return
    end

    # FN-specific: skip 2nd layer FEs for now
    return if layer_name == "FE" && layer.attributes["rank"] == "2"

    @layers[layer_name] ||= {}

    labels = layer.first_child_matching("labels")
    # nothing to record for this layer
    return unless labels

    # taking over much of analyse_layer from class FrameXML
    spans = []
    labels.each_child_matching("label") do |label|
      attributes = label.attributes
      # null instantiation, ignore
      next if attributes["itype"] =~ /NI/
      # no start and end labels
      next if !attributes["start"] && !attributes["end"]

      # A label with only one of the two attributes is kept; the missing
      # index becomes 0 because nil.to_i == 0.
      spans << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
    end

    # sanity check: do indices match word start and end indices?
    spans = verify_annotation(spans, charidx)

    # sanity check: verify that we don't have overlapping labels
    record_without_overlaps(layer_name, spans)
  end

  ##############
  # Verify found triples label/from_index/to_index against the given
  # start/end indices of words, snapping indices that fall inside a word or
  # between two words onto a word boundary. Every change is reported on
  # stderr.
  #
  # @param found [Array] triples label/from/to, string*int*int
  # @param charidx [Array] pairs from/to, int*int
  # @return [Array] the triples, possibly changed
  def verify_annotation(found, charidx)
    found.map do |element, start, stop|
      newstart = start
      newstop = stop

      # compare against word start/stop indices
      charidx.each_index do |j|
        startidx, stopidx = charidx[j]
        # stop index of the preceding word; nil for the first word
        prev_stop = j.zero? ? nil : charidx[j - 1][1]

        # start lies inside this word, or in the gap before it
        if (start > startidx && start <= stopidx) ||
           (prev_stop && start > prev_stop && start < startidx)
          newstart = startidx
        end

        if stop >= startidx && stop < stopidx
          # stop lies inside this word
          newstop = stopidx
        elsif prev_stop && stop > prev_stop && stop < startidx
          # stop lies in the gap before this word
          newstop = prev_stop
        end
      end

      if start != newstart || stop != newstop
        # report change; interpolation keeps this from failing on a nil label
        $stderr.puts "FNCorpusXML warning: Heuristics has changed element #{element}"
        $stderr.puts "\tfrom [#{start},#{stop}] to [#{newstart},#{newstop}]"
        [element, newstart, newstop]
      else
        [element, start, stop]
      end
    end
  end

  private

  # Handle an annotation set that carries a frameName attribute.
  def read_frame_aset(aset, attributes, charidx)
    # all of these seem to be frames
    unless attributes["luName"]
      $stderr.puts "FNCorpusAset warning: cannot determine LU name"
      $stderr.puts aset.to_s
      return
    end

    @aset_type = "frame"
    @frame_name = attributes["frameName"]
    @lu = attributes["luName"]

    layers = aset.first_child_matching("layers")
    unless layers
      $stderr.puts "FNCorpusAset warning: unexpectedly no layers found"
      $stderr.puts aset.to_s
      return
    end

    layers.each_child_matching("layer") { |l| analyze_layer(l, charidx) }
  end

  # Handle an annotation set without frameName: all we seem to get here are
  # named entity labels, in a single layer named "NER".
  def read_ner_aset(aset, charidx)
    @aset_type = "NER"

    layer = (layers = aset.first_child_matching("layers")) &&
            layers.first_child_matching("layer")
    unless layer
      $stderr.puts "FNCorpusAset Warning: unexpectedly no layers found"
      $stderr.puts aset.to_s
      return
    end

    unless layer.attributes["name"] == "NER"
      $stderr.puts "FNCorpusAset Warning: unexpected layer #{layer.attributes["name"]}, was expecting only an NER layer."
      $stderr.puts aset.to_s
      return
    end

    analyze_layer(layer, charidx)
  end

  # Record the label/from/to triples of one layer in @layers, dropping
  # labels that overlap with another label of the same layer.
  def record_without_overlaps(layer_name, spans)
    deleted = {} # index into spans -> true for labels which are to be deleted

    spans.each_index do |i|
      # efficiency: skip already deleted labels
      next if deleted[i]

      this_label, this_from, this_to = spans[i]

      # compare with all remaining labels
      ((i + 1)...spans.length).each do |other_i|
        other_label, other_from, other_to = spans[other_i]

        if this_from <= other_from && other_from <= this_to
          # the later label starts inside this one: throw out the later one
          $stderr.puts "Warning: Label overlap, deleting #{other_label}"
          deleted[other_i] = true
        elsif this_from <= other_to && other_to <= this_to
          # the later label ends inside this one: throw out this one
          $stderr.puts "Warning: Label overlap, deleting #{this_label}"
          deleted[i] = true
        end
      end

      # matched with all other labels; keep only if still undeleted
      next if deleted[i]

      [[this_from, "start"], [this_to, "stop"]].each do |key|
        (@layers[layer_name][key] ||= []) << this_label
      end
    end
  end
end