shalmaneser-lib 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
@@ -0,0 +1,251 @@
1
+ # ExternalSystems.rb
2
+ #
3
+ # ke oct/nov 2005
4
+ #
5
+ # Store all known interfaces to
6
+ # systems that do syntactic analysis
7
+ #
8
+ # Given the name of a system and the service that the
9
+ # system performs, return the appropriate interface
10
+ #
11
+ # There are two types of interfaces to syntactic analysis systems:
12
+ # - interfaces:
13
+ # offer methods for syntactic analysis,
14
+ # and the transformation to Salsa/Tiger XML and SalsaTigerSentence objects
15
+ # - interpreters:
16
+ # interpret the resulting Salsa/Tiger XML (represented as
17
+ # SalsaTigerSentence and SynNode objects), e.g.
18
+ # generalize over part of speech;
19
+ # describe the path between a pair of nodes both as a path
20
+ # and (potentially) as a grammatical function of one of the nodes;
21
+ # determine whether a node describes a verb, and in which voice;
22
+ # determine the head of a constituent
23
+ #
24
+ # Abstract classes for both interfaces and interpreters
25
+ # are in AbstractSynInterface.rb
26
+
27
+ require "ruby_class_extensions"
28
+ require 'logging'
29
+
30
+ # The list of available interface packages
31
+ # is at the end of this file.
32
+ # Please enter additional interfaces there.
33
+
34
+ # @todo AB: [2015-12-16 Wed 01:03]
35
+ # After decoupling in OptParser and ConfigData classes move this
36
+ # to Frappe.
37
module Shalmaneser
  # Registry of the known interface and interpreter classes for external
  # syntactic-analysis systems, together with the lookups built on top of it.
  #
  # Interface classes are expected to respond to +system+ and +service+;
  # interpreter classes to +systems+ and +optional_systems+ (hash-like
  # mappings service => system name). Experiment objects are queried through
  # +exp.get(key)+ only.
  class ExternalSystems
    # Class-level instance variable (not a @@class variable):
    # list of all known interface classes; add to it using add_interface().
    @interfaces = []

    # Class-level instance variable:
    # list of all known interpreter classes; add to it using add_interpreter().
    @interpreters = []

    # Experiment-file flag => name of the service it switches on. The service
    # name is also the experiment-file key holding the system name.
    #
    # WARNING: adapt this when you introduce new services!
    REQUESTABLE_SERVICES = {
      "do_postag" => "pos_tagger",
      "do_lemmatize" => "lemmatizer",
      "do_parse" => "parser"
    }.freeze

    ###
    # Register an interface class.
    #
    # @param class_name the interface class to register
    # @return [Array] the updated list of interfaces
    def self.add_interface(class_name)
      LOGGER.debug "Initializing interface <#{class_name}>."
      @interfaces << class_name
    end

    ###
    # Register an interpreter class.
    #
    # @param class_name the interpreter class to register
    # @return [Array] the updated list of interpreters
    def self.add_interpreter(class_name)
      LOGGER.debug "Initializing interpreter <#{class_name}>."
      @interpreters << class_name
    end

    ###
    # check_interfaces_abort_if_missing:
    #
    # Given an experiment file, use some_system_missing? to determine whether
    # the requested syntactic processing can be provided. If not, print what
    # is missing and what is available to $stderr and exit with status 1.
    #
    # @param [FrappeConfigData] exp Experiment description.
    # @return [nil] when nothing is missing
    def self.check_interfaces_abort_if_missing(exp)
      missing = some_system_missing?(exp)
      return unless missing

      interwhat, services = missing

      $stderr.puts
      $stderr.puts "ERROR: I am missing an #{interwhat} for "
      services.each_pair do |service, system_name|
        $stderr.puts "\tservice #{service}, system #{system_name}"
      end
      $stderr.puts
      $stderr.puts "I have the following interfaces:"
      @interfaces.each do |interface_class|
        $stderr.puts "\tservice #{interface_class.service}, system #{interface_class.system}"
      end
      $stderr.puts "I have the following interpreters:"
      @interpreters.each do |interpreter_class|
        line = "\t" + describe_systems(interpreter_class.systems)
        unless interpreter_class.optional_systems.empty?
          line += ", optional: " + describe_systems(interpreter_class.optional_systems)
        end
        $stderr.puts line
      end
      $stderr.puts
      $stderr.puts "Please adapt your experiment file."
      exit 1
    end

    ###
    # Given the name of a system and the service that it performs,
    # find the matching interface class.
    #
    # @param service [String] service, e.g. parser
    # @param system [String] name of system, e.g. collins
    # @return the first registered interface class matching both
    # @raise [RuntimeError] if no registered interface matches
    def self.get_interface(service, system)
      # @todo AB: Actually it's bad logic, but no idea for now how to handle it.
      # (Several interfaces may match; the first registered one wins.)
      found = find_interface(service, system)
      return found if found

      raise "I've been requested an interface for #{service} and #{system}, "\
            'but I cannot find any. Please correct your experiment files.'
    end

    ###
    # Helper for get_interpreter: look up the interpreter for the services
    # requested in the given experiment description.
    #
    # @return the matching interpreter class, or nil
    def self.get_interpreter_according_to_exp(exp)
      get_interpreter(requested_services(exp))
    end

    ###
    # Given the names and services of a set of systems,
    # find the matching interpreter class.
    #
    # An interpreter class has both obligatory systems (they need to be
    # present for this class to apply) and optional systems (they may or may
    # not be present for the class to apply, but no other system performing
    # the same service may).
    #
    # @param systems [Hash] service(string) -> system name(string)
    # @return the first matching interpreter class, or nil if none matches
    def self.get_interpreter(systems)
      @interpreters.find do |interpreter_class|
        obligatory = interpreter_class.systems
        optional = interpreter_class.optional_systems

        # all obligatory entries of interpreter_class are in systems
        obligatory.to_a.all? { |service, system| systems[service] == system } &&
          # all optional entries are either in systems,
          # or the service isn't in systems at all
          optional.to_a.all? { |service, system| systems[service].nil? || systems[service] == system } &&
          # all entries in systems are in either the obligatory
          # or the optional set of interpreter_class
          systems.to_a.all? { |service, system| obligatory[service] == system || optional[service] == system }
      end
    end

    ################
    # NOTE: the methods below are helpers, but they stay public: a bare
    # `private` never applied to `def self.` methods, so they have always
    # been callable from outside and existing callers may rely on that.

    ###
    # Knows about possible services that can be set in the experiment file,
    # and where the names of the matching systems will be found in the
    # experiment file data structure (see REQUESTABLE_SERVICES).
    #
    # @return [Hash] <service> => system_name, one entry for every service
    #   whose flag is set in the experiment file
    def self.requested_services(exp)
      REQUESTABLE_SERVICES.each_with_object({}) do |(flag, service), services|
        # yes, perform this service
        services[service] = exp.get(service) if exp.get(flag)
      end
    end

    ###
    # some_system_missing?
    #
    # Returns nil if there are interfaces and an interpreter for all services
    # requested in the given experiment file. Otherwise returns a pair
    # [interface or interpreter, info] where the 1st element is either
    # 'interface' or 'interpreter', and the 2nd element is a hash mapping
    # services to system names: the services that could not be provided.
    #
    # Missing interfaces are reported before a missing interpreter.
    #
    # @param [FrappeConfigData] exp FrappeConfigData object to check all the systems.
    # @return [Array, nil]
    def self.some_system_missing?(exp)
      services = requested_services(exp)

      # check interfaces (non-raising lookup, so that a missing one is
      # reported instead of escaping as an exception)
      without_interface = services.reject do |service, system_name|
        find_interface(service, system_name)
      end
      return ["interface", without_interface] unless without_interface.empty?

      # check interpreter
      return ["interpreter", services] unless get_interpreter(services)

      # everything okay
      nil
    end

    # Non-raising lookup: first registered interface class for the given
    # service/system pair, or nil.
    def self.find_interface(service, system)
      @interfaces.find do |interface_class|
        interface_class.system == system && interface_class.service == service
      end
    end

    # Render a service => system mapping as
    # "service X, system Y; service ..." for the error report.
    def self.describe_systems(systems)
      systems.to_a.map { |service, system_name|
        "service #{service}, system #{system_name}"
      }.join("; ")
    end

    private_class_method :find_interface, :describe_systems
  end
end
238
+
239
+ # @todo AB: We should require programmatically all files in
240
+ # <frappe/interpreters> and <frappe/interfaces>.
241
+ require 'frappe/interfaces/collins_interface'
242
+ require 'frappe/interpreters/collins_treetagger_interpreter'
243
+ require 'frappe/interpreters/collins_tnt_interpreter'
244
+ require 'frappe/interfaces/berkeley_interface'
245
+ require 'frappe/interpreters/berkeley_interpreter'
246
+ require 'frappe/interfaces/stanford_interface'
247
+ require 'frappe/interpreters/stanford_interpreter'
248
+ require 'frappe/interfaces/treetagger_interface'
249
+ require 'frappe/interfaces/treetagger_pos_interface'
250
+ require 'frappe/interpreters/treetagger_interpreter'
251
+ require 'frappe/interpreters/empty_interpreter'
@@ -0,0 +1,209 @@
1
+ #####################
2
+ # class to keep data for one frame
3
class FNCorpusAset
  attr_reader :layers, :aset_type, :aset_id, :frame_name, :lu

  #######
  # Analyze RegXML object, store in object variables:
  #
  # @aset_type: "frame" or "NER" (nil if a frame aset has no LU name)
  # @frame_name: frame name for "frame" type
  # @lu: LU for "frame" type
  # @aset_id: ID of the annotation set
  # @layers: hash: layer type (FE, GF, PT, Target, NER) -> [offset, "start"/"stop"] -> list of labels
  #          string -> int*string -> array:string
  #
  # Problems with the input are reported on $stderr and leave the object
  # partially filled; nothing is raised for them.
  #
  # @param aset RegXML object for one annotation set
  # @param charidx [Array] pairs [start index, stop index] (int*int) of the words
  def initialize(aset, charidx)
    @layers = {}
    @frame_name = nil
    @lu = nil
    @aset_type = nil

    attributes = aset.attributes
    @aset_id = attributes["ID"]

    if attributes["frameName"]
      # all of these seem to be frames
      init_frame_aset(aset, attributes, charidx)
    else
      # all we seem to get here are named entity labels
      init_ner_aset(aset, charidx)
    end
  end

  #############
  # Analyze one <layer> RegXML object and record its labels in @layers.
  #
  # Skipped entirely: layers without a name (with a warning) and FE layers
  # of rank "2". Within a layer, labels whose itype matches /NI/ (null
  # instantiations) and labels with neither start nor end are ignored.
  # Label boundaries are first adjusted by verify_annotation; of two
  # overlapping labels one is dropped with a warning.
  #
  # @param layer RegXML object
  # @param charidx [Array] int*int pairs, start/end index of words
  def analyze_layer(layer, charidx)
    layer_name = layer.attributes["name"]
    unless layer_name
      $stderr.puts "FNCorpusAset warning: cannot determine layer name"
      $stderr.puts layer.to_s
      return
    end

    # FN-specific: skip 2nd layer FEs for now
    return if layer_name == "FE" && layer.attributes["rank"] == "2"

    @layers[layer_name] ||= {}

    labels = layer.first_child_matching("labels")
    # nothing to record for this layer
    return unless labels

    # taking over much of analyse_layer from class FrameXML
    spans = []
    labels.each_child_matching("label") do |label|
      attributes = label.attributes
      # null instantiation, ignore
      next if attributes["itype"] =~ /NI/
      # no start and end labels
      # NOTE(review): a label with only one of start/end is kept and the
      # missing index becomes 0 via nil.to_i -- confirm this is intended.
      next if !attributes["start"] && !attributes["end"]

      spans << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
    end

    # sanity check: do indices match word start and end indices?
    spans = verify_annotation(spans, charidx)

    # sanity check: verify that we don't have overlapping labels
    deleted = {} # index into spans -> true for labels which are to be dropped

    spans.each_with_index do |(this_label, this_from, this_to), i|
      # efficiency: skip already deleted labels
      next if deleted[i]

      # compare with all remaining labels
      ((i + 1)...spans.length).each do |other_i|
        other_label, other_from, other_to = spans[other_i]

        if other_from.between?(this_from, this_to)
          # the later label starts inside this one: throw out the later label
          $stderr.puts "Warning: Label overlap, deleting #{other_label}"
          deleted[other_i] = true
        elsif other_to.between?(this_from, this_to)
          # the later label only ends inside this one: throw out this label
          $stderr.puts "Warning: Label overlap, deleting #{this_label}"
          deleted[i] = true
        end
      end

      # matched with all other labels; record only if still kept
      next if deleted[i]

      [[this_from, "start"], [this_to, "stop"]].each do |key|
        (@layers[layer_name][key] ||= []) << this_label
      end
    end
  end

  ##############
  # Verify found triples label/from_index/to_index
  # against given start/end indices of words.
  #
  # A start that falls inside a word, or in the gap before it, is moved to
  # that word's start. A stop inside a word is moved to that word's stop; a
  # stop in the gap before a word is moved to the previous word's stop.
  # Every change is reported on $stderr.
  #
  # @param found [Array] label/from/to triples, string*int*int
  # @param charidx [Array] from/to pairs, int*int
  # @return [Array] the triples, possibly changed
  def verify_annotation(found, charidx)
    found.map do |element, start, stop|
      newstart = start
      newstop = stop

      # compare against word start/stop indices; later words override
      # adjustments made for earlier ones
      charidx.each_with_index do |(startidx, stopidx), j|
        first_word = j.zero?
        pstopidx = charidx[j - 1][1] unless first_word

        if (start > startidx && start <= stopidx) ||
           (!first_word && start > pstopidx && start < startidx)
          newstart = startidx
        end

        if stop >= startidx && stop < stopidx
          newstop = stopidx
        elsif !first_word && stop > pstopidx && stop < startidx
          newstop = pstopidx
        end
      end

      if start != newstart || stop != newstop
        # report change (interpolation: a label without a name must not crash here)
        $stderr.puts "FNCorpusXML warning: Heuristics has changed element #{element}"
        $stderr.puts "\tfrom [#{[start, stop].join(',')}] to [#{[newstart, newstop].join(',')}]"
      end

      [element, newstart, newstop]
    end
  end

  private

  # Fill in a "frame" annotation set: frame name, LU and all of its layers.
  def init_frame_aset(aset, attributes, charidx)
    unless attributes["luName"]
      $stderr.puts "FNCorpusAset warning: cannot determine LU name"
      $stderr.puts aset.to_s
      return
    end

    @aset_type = "frame"
    @frame_name = attributes["frameName"]
    @lu = attributes["luName"]

    layers = aset.first_child_matching("layers")
    unless layers
      $stderr.puts "FNCorpusAset warning: unexpectedly no layers found"
      $stderr.puts aset.to_s
      return
    end

    layers.each_child_matching("layer") { |l| analyze_layer(l, charidx) }
  end

  # Fill in an "NER" annotation set: only its first layer is analyzed, and
  # only if that layer is named "NER".
  def init_ner_aset(aset, charidx)
    @aset_type = "NER"

    layers = aset.first_child_matching("layers")
    layer = layers && layers.first_child_matching("layer")
    unless layer
      $stderr.puts "FNCorpusAset Warning: unexpectedly no layers found"
      $stderr.puts aset.to_s
      return
    end

    unless layer.attributes["name"] == "NER"
      $stderr.puts "FNCorpusAset Warning: unexpected layer #{layer.attributes["name"]}, was expecting only an NER layer."
      $stderr.puts aset.to_s
      return
    end

    analyze_layer(layer, charidx)
  end
end