shalmaneser 0.0.1.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +284 -0
@@ -0,0 +1,275 @@
1
+ # SynInterfaces.rb
2
+ #
3
+ # ke oct/nov 2005
4
+ #
5
+ # Store all known interfaces to
6
+ # systems that do syntactic analysis
7
+ #
8
+ # Given the name of a system and the service that the
9
+ # system performs, return the appropriate interface
10
+ #
11
+ # There are two types of interfaces to syntactic analysis systems:
12
+ # - interfaces:
13
+ # offer methods for syntactic analysis,
14
+ # and the transformation to Salsa/Tiger XML and SalsaTigerSentence objects
15
+ # - interpreters:
16
+ # interpret the resulting Salsa/Tiger XML (represented as
17
+ # SalsaTigerSentence and SynNode objects), e.g.
18
+ # generalize over part of speech;
19
+ # describe the path between a pair of nodes both as a path
20
+ # and (potentially) as a grammatical function of one of the nodes;
21
+ # determine whether a node describes a verb, and in which voice;
22
+ # determine the head of a constituent
23
+ #
24
+ # Abstract classes for both interfaces and interpreters
25
+ # are in AbstractSynInterface.rb
26
+
27
+ require "frprep/ruby_class_extensions"
28
# Mix EnumerableBool into Array. SynInterfaces.get_interpreter calls
# big_and on arrays, which is expected to come from this module
# (defined in frprep/ruby_class_extensions -- not visible here).
Array.send(:include, EnumerableBool)
31
+
32
+ # The list of available interface packages
33
+ # is at the end of this file.
34
+ # Please enter additional interfaces there.
35
+
36
class SynInterfaces

  ###
  # class variable:
  # list of all known interface classes
  # add to it using add_interface()
  @@interfaces = []

  ###
  # class variable:
  # list of all known interpreter classes
  # add to it using add_interpreter()
  @@interpreters = []

  ###
  # add_interface:
  # register an interface class so that get_interface() can find it.
  #
  # class_name: the interface class itself (it must respond to
  #             system() and service())
  def self.add_interface(class_name)
    $stderr.puts "Initializing interface #{class_name}" if $DEBUG
    @@interfaces << class_name
  end

  ###
  # add_interpreter:
  # register an interpreter class so that get_interpreter() can find it.
  #
  # class_name: the interpreter class itself (it must respond to
  #             systems() and optional_systems())
  def self.add_interpreter(class_name)
    $stderr.puts "Initializing interpreter #{class_name}" if $DEBUG
    @@interpreters << class_name
  end

  # AB: fake method to preview the interfaces table.
  # Prints both registries to stderr.
  def self.explore
    $stderr.puts "Exploring..."
    $stderr.puts @@interfaces
    $stderr.puts @@interpreters
  end

  ###
  # check_interfaces_abort_if_missing:
  #
  # Given an experiment file, use some_system_missing? to
  # determine whether the system can be run with the requested
  # syntactic processing. If that is not possible, print what is
  # missing and what is available to stderr and exit with status 1.
  #
  # exp: FrPrepConfigData object
  #
  # returns: nil if nothing is missing
  def self.check_interfaces_abort_if_missing(exp)
    missing = some_system_missing?(exp)
    return unless missing

    interwhat, services = missing

    $stderr.puts
    $stderr.puts "ERROR: I am missing an #{interwhat} for "
    services.each_pair do |service, system_name|
      $stderr.puts "\tservice #{service}, system #{system_name}"
    end
    $stderr.puts
    $stderr.puts "I have the following interfaces:"
    @@interfaces.each do |interface_class|
      $stderr.puts "\tservice #{interface_class.service}, system #{interface_class.system}"
    end
    $stderr.puts "I have the following interpreters:"
    @@interpreters.each do |interpreter_class|
      line = "\t" + describe_systems(interpreter_class.systems)
      unless interpreter_class.optional_systems.empty?
        line << ", optional: " << describe_systems(interpreter_class.optional_systems)
      end
      $stderr.puts line
    end
    $stderr.puts
    $stderr.puts "Please adapt your experiment file."
    exit 1
  end

  ###
  # some_system_missing?
  # returns nil if I have interfaces and interpreters
  # for all services requested in the given experiment file
  # else:
  # returns pair [interface or interpreter, info]
  # where the 1st element is either 'interface' or 'interpreter',
  # and the 2nd element is a hash mapping services to system names:
  # the services that could not be provided
  #
  # exp: FrPrepConfigData object
  def self.some_system_missing?(exp)
    services = requested_services(exp)

    # check interfaces: report the first service without an interface
    services.each_pair do |service, system_name|
      unless get_interface(service, system_name)
        return ["interface", { service => system_name }]
      end
    end

    # check interpreter: reuse the services hash computed above
    # instead of querying the experiment file a second time
    return ["interpreter", services] unless get_interpreter(services)

    # everything okay
    nil
  end

  ###
  # given the name of a system and the service that it
  # performs, find the matching interface class
  #
  # service: string: service, e.g. parser
  # system: string: name of system, e.g. collins
  #
  # returns: SynInterface class, or nil if none is registered
  def self.get_interface(service, system)
    @@interfaces.find do |interface_class|
      interface_class.system == system and
        interface_class.service == service
    end
  end

  ###
  # helper for get_interpreter:
  # look up the interpreter for the services requested in exp
  def self.get_interpreter_according_to_exp(exp)
    get_interpreter(requested_services(exp))
  end

  ###
  # given the names and services of a set of systems,
  # find the matching interpreter class
  #
  # an interpreter class has both obligatory systems
  # (they need to be present for this class to apply)
  # and optional systems (they may or may not be present
  # for the class to apply, but no other system performing
  # the same service may)
  #
  # systems:
  # hash: service(string) -> system name(string)
  #
  # returns: SynInterpreter class, or nil if none matches
  def self.get_interpreter(systems)
    @@interpreters.find do |interpreter_class|
      # all obligatory entries of interpreter_class are in systems
      interpreter_class.systems.to_a.big_and { |service, system|
        systems[service] == system
      } and
        # all optional entries of interpreter_class are either in
        # systems, or the service isn't in systems at all
        interpreter_class.optional_systems.to_a.big_and { |service, system|
          systems[service].nil? or systems[service] == system
        } and
        # all entries in systems are in either the obligatory
        # or the optional set for interpreter_class
        systems.to_a.big_and { |service, system|
          interpreter_class.systems[service] == system or
            interpreter_class.optional_systems[service] == system
        }
    end
  end

  ###
  # knows about possible services that can be set in
  # the experiment file, and where the names of
  # the matching systems will be found in the experiment file data structure
  #
  # WARNING: adapt this when you introduce new services!
  #
  # NOTE: a bare `protected` keyword does not apply to methods defined
  # on the class object itself, so this method is publicly callable.
  # It is deliberately left public so that existing callers keep working.
  #
  # returns: a hash
  # <service> => system_name
  #
  # such that for each service/system name pair:
  # the service with the given name has been requested in
  # the experiment file (its "do_..." flag is set), and system_name is
  # the entry the experiment file holds under the service's name
  def self.requested_services(exp)
    retv = {}

    [
      ["do_postag", "pos_tagger"],
      ["do_lemmatize", "lemmatizer"],
      ["do_parse", "parser"]
    ].each do |flag, service|
      # flag set: yes, perform this service
      retv[service] = exp.get(service) if exp.get(flag)
    end

    retv
  end

  ###
  # format a service -> system name hash as
  # "service X, system Y; service Z, system W"
  def self.describe_systems(systems)
    systems.to_a.map { |service, system_name|
      "service #{service}, system #{system_name}"
    }.join("; ")
  end
  private_class_method :describe_systems
end
246
+
247
+
248
+ require "frprep/CollinsInterface"
249
+ require "frprep/BerkeleyInterface"
250
+ require "frprep/SleepyInterface"
251
+ require "frprep/MiniparInterface"
252
+ require "frprep/TntInterface"
253
+ require "frprep/TreetaggerInterface"
254
+
255
+
256
# Interpreter that names no obligatory and no optional systems.
class EmptyInterpreter < SynInterpreter
  # announce_me() is inherited from SynInterpreter; presumably it
  # registers this class with SynInterfaces -- confirm in
  # AbstractSynInterface.rb.
  EmptyInterpreter.announce_me()

  ###
  # systems interpreted by this class:
  # returns a hash service(string) -> system name (string),
  # e.g.
  # { "parser" => "collins", "lemmatizer" => "treetagger" }
  #
  # For this class the hash is empty.
  def EmptyInterpreter.systems()
    return {}
  end

  ###
  # names of additional systems that may be interpreted by this class
  # returns a hash service(string) -> system name(string)
  # same format as systems()
  #
  # Defined on EmptyInterpreter itself: defining it on SynInterpreter
  # from inside this class body would replace the parent's method for
  # every interpreter class.
  def EmptyInterpreter.optional_systems()
    return {}
  end
end
@@ -0,0 +1,720 @@
1
+ # TabFormat.rb
2
+ # Katrin Erk, Jan 2004
3
+ #
4
+ # classes to be used with tabular format text files.
5
+ # originally CoNLL2.rb
6
+ # Original: Katrin Erk, Jan 2004 for CoNLL '04 data
7
+ # Rewrite: Sebastian Pado, Mar 2004 for Gemmas FrameNet data (no NEs etc.)
8
+
9
+ # Extensions SP Jun/Jul 04
10
+ # renamed GemmaCorpus to FNTabFormat
11
+
12
+ # partial rewrite SP 250804: made things cleaner & leaner: no RawFormat, for example
13
+
14
+ # sp 04/05: add a "frame" column to FNTabFormat
15
+ #
16
+ # Substantial changes KE 12/06:
17
+ # variable number of columns to accommodate more than one frame per sentence
18
+
19
+ #################################################
20
+ # class for reading a file
21
+ # containing data in tabular
22
+
23
+ require "tempfile"
24
+
25
+ require "frprep/ISO-8859-1"
26
+ require "frprep/ruby_class_extensions"
27
+
28
+ #######################
29
+ # This function takes a variable number of arguments and
30
+ # returns them as an array
31
+ # Idea: make formulation of tab format entries easier to read,
32
+ # enclose variable arguments in a repeat() call,
33
+ # which immediately gets transformed into a list
34
# Collects its arguments into an array and hands that array back.
# Used to mark the repeated column group in a tab format description,
# e.g. ["word", repeat("frame", "target")].
def repeat(*args)
  args
end
37
+
38
+ #######################
39
class TabFormatFile

  #######
  # initialize:
  # open files for reading.
  #
  # fp is a flat list alternating filename and format:
  #   [filename1, format1, filename2, format2, ...]
  # where format is a list of strings that will be used
  # to address columns of the file, the 1st string for the 1st column
  #
  # format may contain _one_ entry that is an array (or a call to repeat())
  # e.g.:
  # ["word", "pos", "lemma", repeat("frame", "target", "gf", "pt")]
  #
  # raises a RuntimeError if one of the files cannot be opened
  def initialize(fp)
    @files = []
    @patterns = []
    @no_of_read_lines = 0
    # set once each_sentence has run into EOF
    @read_completely = false

    fp.each_with_index do |entry, ix|
      if ix.modulo(2) == 0
        # even position: filename
        begin
          @files << File.new(entry)
        rescue
          raise "Sorry, could not read input file #{entry}\n"
        end
      else
        # odd position: column names, appended to the overall pattern
        @patterns += entry
      end
    end

    @my_sentence_class = TabFormatSentence
  end

  ########
  # each_sentence:
  # yield each sentence of the files in turn.
  # sentences are expected to be separated
  # by a line containing nothing but whitespace.
  # the last sentence may or may not be followed by
  # an empty line.
  # Lines are read from all files in lockstep and joined by tabs;
  # reading stops at the first EOF on any of the files, so all files
  # are expected to have the same number of lines.
  # If an empty line in one file meets a non-empty line in another,
  # an error is printed and the process exits with status 1.
  # Each sentence is yielded as an object of the sentence class
  # chosen in initialize.
  #
  # Once EOF has been reached the file handles are closed, and
  # further calls yield nothing.
  def each_sentence
    # nothing (more) to read; without any files the loop below
    # would never hit EOF
    return if @read_completely or @files.empty?

    sentence = @my_sentence_class.new(@patterns)
    begin
      while true
        # one line from each file; raises EOFError at end of file
        linearray = @files.map { |f| f.readline().chomp() }
        @no_of_read_lines += 1

        if linearray.any? { |x| x.strip == "" }
          if linearray.any? { |x| x.strip != "" }
            STDERR.puts "Error: Mismatching empty lines!"
            exit(1)
          end
          # sentence finished. yield it and start a new one
          yield sentence unless sentence.empty?
          sentence = @my_sentence_class.new(@patterns)
        else
          # sentence not yet finished.
          # add this line to it
          sentence.add_line(linearray.join("\t"))
        end
      end
    rescue EOFError
      # maybe we haven't yielded the last sentence yet.
      yield sentence unless sentence.empty?
      # release the file handles: nothing more will be read from them
      @files.each { |f| f.close unless f.closed? }
      @read_completely = true
    end
  end

end
131
+
132
+ #################################################
133
+ # class for keeping one line,
134
+ # parsed.
135
+ # The line is kept as follows:
136
+ # - normal features: in a hash @f mapping feature names to values
137
+ # - features of the repeated group: in an array @r of
138
+ # TabFormatNamedArgs objects, one per group
139
+ #
140
+ # each feature of the line is available by name
141
+ # via the method "get".
142
+ # Additional features (from other input files) can be
143
+ # added to the TabFormatNamedArgs object via the method
144
+ # add_feature
145
+ #
146
+ # methods:
147
+ #
148
+ # new: initialize.
149
+ # values: array of strings
150
+ # features: how to access the strings by name
151
+ # 'features' is an array of strings
152
+ # later the i-th feature will be used to access
153
+ # the i-th value,
154
+ # except for repeated groups
155
+ #
156
+ # get: returns one feature by its name
157
+ # name: a string
158
+ #
159
+ # add_feature: add another feature to this object,
160
+ # which can be accessed via "get"
161
+ # name: name for the new feature, should be distinct
162
+ # from the ones already used in new()
163
+ # feature: a string, the value of the feature
164
+ ##
165
+
166
class TabFormatNamedArgs
  ############
  # values:   array of strings, the columns of one line
  # features: array of column names; at most one entry may itself be
  #           an array, naming the columns of a repeated group
  # group:    number of the group that get() falls back to for names
  #           not found among the plain features (nil: no fallback)
  #
  # Plain features end up in the hash @f; each repetition of the group
  # becomes its own TabFormatNamedArgs object in the array @r.
  def initialize(values, features, group = nil)
    @f = {}
    @r = []
    @group = group

    # flatten the feature list: the group entry (if any) is remembered
    # separately and stands as "GROUP" in the name list
    @group_feature_names = nil
    @feature_names = features.map do |entry|
      next entry unless entry.instance_of? Array
      @group_feature_names = entry
      "GROUP"
    end

    if @feature_names.count("GROUP") > 1
      $stderr.puts "More than one group in feature set:" + features.join(" ")
      raise "Cannot handle this."
    end

    # where the group sits in the feature list
    # (no group: one past the last feature)
    group_pos = @feature_names.index("GROUP") || @feature_names.length
    # number of plain features following the group
    trailing = [0, (@feature_names.length - 1) - group_pos].max
    # first value index that belongs to the trailing features
    first_trailing_value = values.length - trailing

    # plain features in front of the group
    group_pos.times do |i|
      @f[features[i]] = values[i]
    end

    # the group repetitions: consecutive chunks of values, each as
    # wide as the group's name list, numbered from 0
    if @group_feature_names
      width = @group_feature_names.length
      group_pos.step(first_trailing_value - 1, width) do |chunk_start|
        @r << TabFormatNamedArgs.new(values.slice(chunk_start, width),
                                     @group_feature_names,
                                     @r.length)
      end
    end

    # plain features behind the group
    (first_trailing_value...values.length).each_with_index do |value_ix, offset|
      @f[features[group_pos + 1 + offset]] = values[value_ix]
    end
  end

  ############
  # Render a feature -> value hash as one tab-separated line.
  # Columns appear in the order of 'features' (same shape as for new());
  # a feature without a (truthy) value in the hash is written as "-".
  # A group entry in 'features' contributes no columns at all.
  # Hash keys that do not occur in 'features' are reported on stderr
  # and otherwise ignored. Returns "" if features is nil.
  def self.format_str(hash, features)
    return "" if features.nil?

    # sanity check: complain about keys the feature list doesn't know
    hash.keys.each do |key|
      next if features.include? key
      $stderr.puts "Error: unknown feature #{key} in format_str: ignoring."
    end

    plain = features.reject { |f| f.instance_of? Array }
    plain.map { |feature| hash[feature] || "-" }.join("\t")
  end

  #############
  # Store an extra plain feature under 'name'.
  # Raises if a plain feature of that name already exists.
  def add_feature(name, feature)
    raise "Trying to add a feature twice: " + name if @f.key?(name)

    @f[name] = feature
  end

  #############
  # Value of the feature called 'name': looked up among the plain
  # features first, then in the group selected at construction time.
  def get(name)
    get_nongroup(name) || get_from_group(name, @group)
  end

  #############
  # Set (or overwrite) the plain feature 'name'.
  def set(name, feature)
    @f[name] = feature
  end

  #############
  # Number of group repetitions found in the line.
  def num_groups
    @r.length
  end

  #############
  # The line as a tab-separated string, columns in the order of the
  # feature list given to new(); the group repetitions are written
  # out where the group entry stood.
  def to_s
    columns = @feature_names.map do |feature|
      if "GROUP" == feature
        @r.map { |group_obj| group_obj.to_s }.join("\t")
      else
        @f[feature]
      end
    end
    columns.join("\t")
  end

  protected

  # plain (non-group) feature lookup
  # return: feature value, nil if unknown
  def get_nongroup(feature)
    @f[feature]
  end

  # feature lookup inside group number group_no
  # return: feature value, nil if there is no such group
  def get_from_group(name, group_no)
    return nil if !group_no || group_no >= @r.length

    @r[group_no].get_nongroup(name)
  end
end
322
+
323
+
324
+ #################################################
325
+ # class for keeping and yielding one sentence
326
+ # in tabular format
327
class TabFormatSentence
  ############
  # initialize:
  # A sentence is kept as an array of raw lines (@lines), one line per
  # word, and starts out empty. 'pattern' is the list of column names
  # later used to parse each line.
  def initialize(pattern)
    @lines = []
    @pattern = pattern

    # no group selected here; kept so that subclasses can set it
    @group_no = nil
  end

  #####
  # length: number of lines (words) stored so far
  def length
    @lines.length
  end

  ################
  # add_line:
  # append the raw line for one more word of the sentence
  def add_line(line)
    @lines << line
  end

  ###################
  # empty?:
  # true if no line has been stored, else false
  def empty?
    @lines.empty?
  end

  ######################
  # empty!:
  # throw away all stored lines
  def empty!
    @lines.clear
  end

  #####################
  # each_line:
  # yields each stored line, unparsed
  def each_line
    @lines.each { |raw| yield raw }
  end

  ######################
  # each_line_parsed:
  # yields each line as a TabFormatNamedArgs object, built from the
  # tab-separated columns of the line and the sentence's pattern.
  # The object additionally carries the feature "lineno": the
  # position of the line within the sentence, counting from 0.
  def each_line_parsed
    @lines.each_with_index do |raw, lineno|
      parsed = TabFormatNamedArgs.new(raw.split("\t"), @pattern, @group_no)
      parsed.add_feature("lineno", lineno)
      yield parsed
    end
  end

  ###
  # read_one_line:
  # the raw line at the given position (nil if there is none)
  def read_one_line(number)
    @lines[number]
  end

  ###
  # read_one_line_parsed:
  # like read_one_line, but the line is returned parsed,
  # as in each_line_parsed. nil if there is no such line.
  def read_one_line_parsed(number)
    raw = @lines[number]
    return nil if raw.nil?

    parsed = TabFormatNamedArgs.new(raw.split("\t"), @pattern, @group_no)
    parsed.add_feature("lineno", number)
    parsed
  end

  # formerly: set line no of first line of present sentence
  # now always raises
  def set_starting_line(n)
    raise "Deprecated"
  end

  # formerly: returned line no of first line of present sentence
  # now always raises
  def get_starting_line
    raise "Deprecated"
  end
end
440
+
441
+ ########################################################
442
+ # TabFormat files containing everything that's in the FN lexunit files
443
+ #
444
+ # one target per sentence
445
+
446
class FNTabFormatFile < TabFormatFile

  ###
  # filename:     the main tab file, read with the columns of fntab_format()
  # tag_suffix:   if given, a second file <corpus><tag_suffix> is read
  #               alongside, providing the column "pos"
  # lemma_suffix: if given, a file <corpus><lemma_suffix> is read
  #               alongside, providing the column "lemma"
  #
  # <corpus> is the path of 'filename' with a trailing ".tab" removed.
  # The superclass raises an exception if one of these files
  # cannot be opened.
  def initialize(filename, tag_suffix = nil, lemma_suffix = nil)
    corpusname = "#{File.dirname(filename)}/#{File.basename(filename, ".tab")}"

    # flat list alternating filename and column names
    filename_label_pairs = [filename, FNTabFormatFile.fntab_format()]
    # the lemma file comes before the tag file
    filename_label_pairs.push(corpusname + lemma_suffix, ["lemma"]) if lemma_suffix
    filename_label_pairs.push(corpusname + tag_suffix, ["pos"]) if tag_suffix

    super(filename_label_pairs)

    # sentences of this file are handed out as FNTabSentence objects
    @my_sentence_class = FNTabSentence
  end

  ###
  # column layout of an FN tab file: the word, then the group of
  # frame columns (see frametab_format), then "ne" and "sent_id"
  def self.fntab_format
    ["word", frametab_format(), "ne", "sent_id"]
  end

  ###
  # names of the columns that make up one frame group
  def self.frametab_format
    ["pt", "gf", "role", "target", "frame", "stuff"]
  end

  ##########
  # given a hash mapping features to values,
  # format according to fntab_format
  def self.format_str(hash)
    TabFormatNamedArgs.format_str(hash, fntab_format())
  end
end
485
+
486
+ ############################################
487
class FNTabSentence < TabFormatSentence

  ####
  # Returns the value of the feature feature_name of the parsed line l.
  #
  # Overwrite this to get a feature from
  # a group rather than from the main feature list.
  def get_this(l, feature_name)
    return l.get(feature_name)
  end

  ####
  # Inspects the first parsed line only: raises a RuntimeError if that
  # line has no "sent_id" feature, otherwise returns immediately.
  def sanity_check()
    each_line_parsed {|l|
      if l.get("sent_id").nil?
        raise "Error: corpus file does not conform to FN format."
      else
        return
      end
    }
  end

  ####
  # returns the sentence ID, a string, as set by FrameNet.
  # The ID is read from the "sent_id" feature of the first line.
  def get_sent_id()
    sanity_check
    each_line_parsed {|l|
      return l.get("sent_id")
    }
  end

  ####
  # iterator, yields each frame of the sentence as a FNTabFrame
  # object. They contain the complete sentence, but provide
  # access to exactly one frame of that sentence.
  def each_frame()
    # how many frames? assume that each line has the same
    # number of frames, so asking the first line is enough
    num_frames = read_one_line_parsed(0).num_groups()
    0.upto(num_frames - 1) { |frame_no|
      frame_obj = FNTabFrame.new(@pattern, frame_no)
      # every frame object receives all (unparsed) lines of this sentence
      each_line { |l| frame_obj.add_line(l) }
      yield frame_obj
    }
  end

  ####
  # computes a mapping from word indices to labels on these words
  #
  # returns a hash: index_list(array:integer) -> label(string)
  # An entry il->label means that all the lines whose line
  # numbers are listed in il are labeled with label.
  #
  # Line numbers correspond to words of the sentence. Counting starts at 0.
  #
  # By default, "markables" looks for role labels, i.e. labels in the
  # column "role", but it can also look in another column.
  # To change the default, give the column name as a parameter.
  #
  # Expected markup per line in that column:
  #   "-" or "--"    no change (line is inside the open markable, if any)
  #   "B-<label>"    a markable with this label starts here
  #   "E-<label>"    the markable with this label ends here
  #   "B-<l>:E-<l>"  one-word markable
  # Anything else only produces a warning on $stderr; parsing continues.
  def markables(use_this_column="role")
    sanity_check()

    idlist_to_annotation_list = Hash.new

    # No entry is added for the target word; an earlier variant did
    #   idlist_to_annotation_list[get_target_indices()] = "target"

    # determine span of each frame element
    # if we find overlapping FEs, we write a warning to STDERR,
    # ignore the 2nd label and attempt to "close" the 1st label

    ids = Array.new   # line numbers collected for the currently open markable
    label = nil       # label of the currently open markable, nil if none

    each_line_parsed { |l|

      this_id = get_this(l, "lineno")

      this_col = get_this(l, use_this_column)
      unless this_col
        $stderr.puts "nil entry #{use_this_column} in line #{this_id} of sent #{get_sent_id()}. Skipping."
        next
      end
      this_fe_ann = this_col.split(":")

      case this_fe_ann.length
      when 1 # nothing at all, or a single begin or end
        markup = this_fe_ann.first
        if markup == "-" or markup == "--" # no change
          if label
            ids << this_id
          end
        elsif markup =~ /^B-(\S+)$/
          new_label = $1
          if label # are we within a markable right now?
            # puts is given ONE string so the warning stays on one line
            $stderr.puts "[TabFormat] Warning: Markable #{new_label} starts while within markable #{label}"
            $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
          else
            label = new_label
            ids << this_id
          end
        elsif markup =~ /^E-(\S+)$/
          end_label = $1
          if label == end_label # we close the markable we've opened before
            ids << this_id
            # store information
            idlist_to_annotation_list[ids] = label
            # reset memory; a fresh array is needed because the old one
            # is now used as a hash key
            label = nil
            ids = Array.new
          else
            $stderr.puts "[TabFormat] Warning: Markable #{end_label} closes while within markable #{label}"
            $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
          end
        else
          $stderr.puts "[TabFormat] Warning: cannot analyse markup "+markup
          $stderr.puts "Debug data: Sentence id #{get_sent_id()}"
        end
      when 2 # this should be a one-word markable
        b_markup = this_fe_ann[0]
        e_markup = this_fe_ann[1]
        if label
          $stderr.puts "[TabFormat] Warning: Finding new markable at word #{this_id} while within markable #{label}"
          $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
        else
          if b_markup =~ /^B-(\S+)$/
            b_label = $1
            if e_markup =~ /^E-(\S+)$/
              e_label = $1
              if b_label == e_label
                idlist_to_annotation_list[[this_id]] = b_label
              else
                $stderr.puts "[TabFormat] Warning: Starting markable "+b_label+", closing markable "+e_label
                $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
              end
            else
              $stderr.puts "[TabFormat] Warning: Unknown end markup "+e_markup
              $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
            end
          else
            $stderr.puts "[TabFormat] Warning: Unknown start markup "+b_markup
            $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
          end
        end
      else
        # also reached for an empty column value, since "".split(":") is []
        $stderr.puts "Warning: cannot analyse markup with more than two colon-separated parts like "+this_fe_ann.join(":")
        $stderr.puts "Debug data: Sentence id #{get_sent_id()}"
      end
    }

    # a markable that was opened but never closed is reported and dropped
    unless label.nil?
      $stderr.puts "[TabFormat] Warning: Markable #{label} did not end in sentence."
      $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
    end

    return idlist_to_annotation_list
  end

  #######
  # returns the sentence as a string: the "word" features of all
  # lines, joined by single spaces
  def to_s
    sanity_check
    array = Array.new
    each_line_parsed {|l|
      array << l.get("word")
    }
    return array.join(" ")
  end

end
654
+
655
class FNTabFrame < FNTabSentence

  ############
  # Constructor: same as the parent class, plus a frame number
  # that selects which frame group of the sentence this object
  # gives access to.
  def initialize(pattern, frameno)
    super(pattern)
    # @group_no is set to the frame number so that each
    # TabFormatNamedArgs object initialized in each_line_parsed()
    # or read_one_line_parsed() gets the right group number, and
    # all calls to TabFormatNamedArgs.get() access the right group.
    @group_no = frameno
  end

  ####
  # The frame introduced by the target word(s) of this frame
  # group, a string. Read from the "frame" feature of the first line.
  def get_frame()
    sanity_check
    each_line_parsed do |line|
      return line.get("frame")
    end
  end

  ####
  # The indices of the target of the frame, as an array of the
  # "lineno" values of all lines whose "target" feature is not "-".
  # Line numbers start counting at 0.
  #
  # A target may span more than one word.
  def get_target_indices()
    sanity_check
    target_lines = []
    each_line_parsed do |line|
      next if line.get("target") == "-"
      target_lines << line.get("lineno")
    end
    target_lines
  end

  ####
  # The target, a string.
  # For multiword targets the complete target is found at all of
  # its indices, so the first "target" value other than "-" is returned.
  def get_target()
    each_line_parsed do |line|
      candidate = line.get("target")
      return candidate unless candidate == "-"
    end
  end

  ####
  # The target POS according to FrameNet: the word characters
  # after the dot in the target string, or nil if the target
  # does not have that shape.
  def get_target_fn_pos()
    get_target() =~ /^[^\.]+\.(\w+)$/
    $1
  end

end