frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,275 @@
1
+ # SynInterfaces.rb
2
+ #
3
+ # ke oct/nov 2005
4
+ #
5
+ # Store all known interfaces to
6
+ # systems that do syntactic analysis
7
+ #
8
+ # Given the name of a system and the service that the
9
+ # system performs, return the appropriate interface
10
+ #
11
+ # There are two types of interfaces to syntactic analysis systems:
12
+ # - interfaces:
13
+ # offer methods for syntactic analysis,
14
+ # and the transformation to Salsa/Tiger XML and SalsaTigerSentence objects
15
+ # - interpreters:
16
+ # interpret the resulting Salsa/Tiger XML (represented as
17
+ # SalsaTigerSentence and SynNode objects), e.g.
18
+ # generalize over part of speech;
19
+ # describe the path between a pair of nodes both as a path
20
+ # and (potentially) as a grammatical function of one of the nodes;
21
+ # determine whether a node describes a verb, and in which voice;
22
+ # determine the head of a constituent
23
+ #
24
+ # Abstract classes for both interfaces and interpreters
25
+ # are in AbstractSynInterface.rb
26
+
27
+ require "frprep/ruby_class_extensions"
28
# Reopen Array to mix in EnumerableBool.
# SynInterfaces.get_interpreter calls big_and on arrays of
# [service, system] pairs; presumably EnumerableBool (loaded from
# frprep/ruby_class_extensions) is what provides it -- TODO confirm.
class Array
  include EnumerableBool
end
31
+
32
+ # The list of available interface packages
33
+ # is at the end of this file.
34
+ # Please enter additional interfaces there.
35
+
36
# Registry of the known interface and interpreter classes for
# syntactic analysis systems.
#
# Interface classes are expected to answer +service+ and +system+;
# interpreter classes are expected to answer +systems+ and
# +optional_systems+ (hashes: service name -> system name).
class SynInterfaces

  # class variable: every interface class registered so far
  # (extended through add_interface)
  @@interfaces = []

  # class variable: every interpreter class registered so far
  # (extended through add_interpreter)
  @@interpreters = []

  ###
  # add_interface:
  # register one interface class; announces it on stderr when $DEBUG is set
  def self.add_interface(class_name)
    $stderr.puts "Initializing interface #{class_name}" if $DEBUG
    @@interfaces.push(class_name)
  end

  ###
  # add_interpreter:
  # register one interpreter class; announces it on stderr when $DEBUG is set
  def self.add_interpreter(class_name)
    $stderr.puts "Initializing interpreter #{class_name}" if $DEBUG
    @@interpreters.push(class_name)
  end

  ###
  # explore:
  # debugging aid (AB): dump both registries to stderr
  def self.explore
    $stderr.puts("Exploring...")
    $stderr.puts(@@interfaces)
    $stderr.puts(@@interpreters)
  end

  ###
  # check_interfaces_abort_if_missing:
  #
  # Asks some_system_missing? whether the syntactic processing requested
  # in the experiment file can be provided. If it cannot, lists what is
  # missing and what is available on stderr and terminates with exit 1.
  #
  # exp: FrPrepConfigData object
  def self.check_interfaces_abort_if_missing(exp)
    missing = SynInterfaces.some_system_missing?(exp)
    return unless missing

    interwhat, services = missing

    # renders a service->system mapping as "service x, system y; ..."
    pair_list = lambda do |pairs|
      pairs.to_a.map { |service, system_name|
        "service #{service}, system #{system_name}"
      }.join("; ")
    end

    $stderr.puts
    $stderr.puts "ERROR: I am missing an #{interwhat} for "
    services.each_pair do |service, system_name|
      $stderr.puts "\tservice #{service}, system #{system_name}"
    end
    $stderr.puts
    $stderr.puts "I have the following interfaces:"
    @@interfaces.each do |interface_class|
      $stderr.puts "\tservice #{interface_class.service}, system #{interface_class.system}"
    end
    $stderr.puts "I have the following interpreters:"
    @@interpreters.each do |interpreter_class|
      $stderr.print "\t"
      $stderr.print pair_list.call(interpreter_class.systems)
      optional = interpreter_class.optional_systems
      unless optional.empty?
        $stderr.print ", optional: "
        $stderr.print pair_list.call(optional)
      end
      $stderr.puts
    end
    $stderr.puts
    $stderr.puts "Please adapt your experiment file."
    exit 1
  end

  ###
  # some_system_missing?
  #
  # returns nil if there is an interface for every requested service
  # and an interpreter for the requested combination of systems.
  # Otherwise returns a pair [what, info]:
  #   what: the string 'interface' or 'interpreter'
  #   info: hash service -> system name of what could not be provided
  #         (for a missing interface: only the first service found missing)
  #
  # exp: FrPrepConfigData object
  def self.some_system_missing?(exp)
    services = SynInterfaces.requested_services(exp)

    # interfaces: report the first service nobody can perform
    unserved = services.find do |service, system_name|
      !SynInterfaces.get_interface(service, system_name)
    end
    return ["interface", { unserved.first => unserved.last }] if unserved

    # interpreter for the whole combination
    unless SynInterfaces.get_interpreter_according_to_exp(exp)
      return ["interpreter", services]
    end

    # everything okay
    nil
  end

  ###
  # get_interface:
  # find the registered interface class for a service/system pair
  #
  # service: string: service, e.g. parser
  # system:  string: name of system, e.g. collins
  #
  # returns: the matching interface class, or nil if none is registered
  def self.get_interface(service, system)
    @@interfaces.find do |interface_class|
      interface_class.system == system && interface_class.service == service
    end
  end

  ###
  # get_interpreter_according_to_exp:
  # get_interpreter for the services requested in an experiment file
  def self.get_interpreter_according_to_exp(exp)
    SynInterfaces.get_interpreter(SynInterfaces.requested_services(exp))
  end

  ###
  # get_interpreter:
  # find the registered interpreter class for a set of systems
  #
  # An interpreter class applies if
  # - all of its obligatory systems are among the given systems,
  # - for each of its optional systems, the service is either absent
  #   from the given systems or performed by exactly that system,
  # - every given service/system pair is one of its obligatory or
  #   optional systems.
  #
  # systems: hash: service(string) -> system name(string)
  #
  # returns: the first matching interpreter class, or nil
  def self.get_interpreter(systems)
    @@interpreters.each do |interpreter_class|
      obligatory_present =
        interpreter_class.systems.to_a.big_and { |service, system|
          systems[service] == system
        }
      next unless obligatory_present

      optional_compatible =
        interpreter_class.optional_systems.to_a.big_and { |service, system|
          systems[service].nil? || systems[service] == system
        }
      next unless optional_compatible

      all_requested_known =
        systems.to_a.big_and { |service, system|
          interpreter_class.systems[service] == system ||
            interpreter_class.optional_systems[service] == system
        }
      return interpreter_class if all_requested_known
    end

    # no suitable interpreter class found
    nil
  end

  ###
  # requested_services:
  # knows which services can be switched on in the experiment file
  # (flag entry) and under which entry the name of the system
  # performing each service is stored.
  #
  # WARNING: adapt this when you introduce new services!
  #
  # returns: hash <service> => system_name, with one entry for each
  # service whose flag is set in the experiment file
  def self.requested_services(exp)
    [
      ["do_postag", "pos_tagger"],
      ["do_lemmatize", "lemmatizer"],
      ["do_parse", "parser"]
    ].each_with_object({}) do |(flag, service), requested|
      # flag set: yes, perform this service
      requested[service] = exp.get(service) if exp.get(flag)
    end
  end
end
246
+
247
+
248
+ require "frprep/CollinsInterface"
249
+ require "frprep/BerkeleyInterface"
250
+ require "frprep/SleepyInterface"
251
+ require "frprep/MiniparInterface"
252
+ require "frprep/TntInterface"
253
+ require "frprep/TreetaggerInterface"
254
+
255
+
256
# Interpreter class that declares neither obligatory nor optional systems.
class EmptyInterpreter < SynInterpreter
  EmptyInterpreter.announce_me()

  ###
  # systems interpreted by this class:
  # returns a hash service(string) -> system name (string),
  # e.g.
  # { "parser" => "collins", "lemmatizer" => "treetagger" }
  #
  # Here: the empty hash.
  def EmptyInterpreter.systems()
    return {}
  end

  ###
  # names of additional systems that may be interpreted by this class
  # returns a hash service(string) -> system name(string),
  # same format as systems()
  #
  # Here: the empty hash.
  #
  # This used to be written as "def SynInterpreter.optional_systems",
  # which redefined the method on the parent class instead of on
  # EmptyInterpreter.
  # NOTE(review): assumes SynInterpreter supplies its own default for
  # optional_systems -- confirm in AbstractSynInterface.rb.
  def EmptyInterpreter.optional_systems()
    return {}
  end
end
@@ -0,0 +1,720 @@
1
+ # TabFormat.rb
2
+ # Katrin Erk, Jan 2004
3
+ #
4
+ # classes to be used with tabular format text files.
5
+ # originally CoNLL2.rb
6
+ # Original: Katrin Erk, Jan 2004 for CoNLL '04 data
7
+ # Rewrite: Sebastian Pado, Mar 2004 for Gemmas FrameNet data (no NEs etc.)
8
+
9
+ # Extensions SP Jun/Jul 04
10
+ # renamed GemmaCorpus to FNTabFormat
11
+
12
+ # partial rewrite SP 250804: made things cleaner & leaner: no RawFormat, for example
13
+
14
+ # sp 04/05: add a "frame" column to FNTabFormat
15
+ #
16
+ # Substantial changes KE 12/06:
17
+ # variable number of columns to accommodate more than one frame per sentence
18
+
19
+ #################################################
20
+ # class for reading a file
21
+ # containing data in tabular format
22
+
23
+ require "tempfile"
24
+
25
+ require "frprep/ISO-8859-1"
26
+ require "frprep/ruby_class_extensions"
27
+
28
+ #######################
29
+ # This function takes a variable number of arguments and
30
+ # returns them as an array
31
+ # Idea: make formulation of tab format entries easier to read,
32
+ # enclose variable arguments in a repeat() call,
33
+ # which immediately gets transformed into a list
34
# Collects its arguments into an array and hands that array back.
# Used to mark the repeated column group inside a tab format
# description, e.g. ["word", repeat("frame", "target")].
def repeat(*args)
  args
end
37
+
38
+ #######################
39
# Reads one or more parallel tab format files sentence by sentence.
class TabFormatFile

  #######
  # initialize:
  # open files for reading.
  #
  # fp is a flat list alternating filename, format, filename, format, ...
  # where format is a list of strings that will be used
  # to address columns of the file, the 1st string for the 1st column.
  # The formats of all files are concatenated into one pattern.
  #
  # format may contain _one_ entry that is an array (or a call to repeat())
  # e.g.:
  #   ["word", "pos", "lemma", repeat("frame", "target", "gf", "pt")]
  #
  # raises a RuntimeError if one of the files cannot be opened
  def initialize(fp)
    @files = []
    @patterns = []
    @no_of_read_lines = 0
    @read_completely = false

    fp.each_index do |ix|
      if ix.even?
        # filename
        begin
          @files << File.new(fp[ix])
        rescue
          # do not leak the handles that were opened before the failure
          @files.each(&:close)
          raise "Sorry, could not read input file #{fp[ix]}\n"
        end
      else
        # pattern
        @patterns += fp[ix]
      end
    end

    @my_sentence_class = TabFormatSentence
  end

  ########
  # each_sentence:
  # yield each sentence of the files in turn.
  # Sentences are expected to be separated by a line containing
  # nothing but whitespace; the last sentence may or may not be
  # followed by such a line.
  # Corresponding lines of all files are joined by a tab.
  # Reading ends as soon as one of the files reaches EOF; all files are
  # expected to have the same number of lines. If a line is blank in
  # some files but not in others, an error is printed and the program
  # exits with status 1.
  # Each sentence is yielded as an object of the sentence class
  # (TabFormatSentence unless a subclass has set another one).
  #
  # Once the input has been read completely, the files are closed and
  # further calls yield nothing.
  # Without a block, an enumerator over the sentences is returned.
  def each_sentence
    return enum_for(:each_sentence) unless block_given?
    return if @read_completely

    if @files.empty?
      # nothing to read; without this guard the loop below would never
      # see an EOFError and never terminate
      @read_completely = true
      return
    end

    sentence = @my_sentence_class.new(@patterns)
    begin
      loop do
        linearray = @files.map { |f| f.readline.chomp }
        @no_of_read_lines += 1

        blank, filled = linearray.partition { |x| x.strip.empty? }
        if blank.empty?
          # sentence not yet finished: add this line to it
          sentence.add_line(linearray.join("\t"))
        elsif filled.empty?
          # sentence finished. yield it and start a new one
          yield sentence unless sentence.empty?
          sentence = @my_sentence_class.new(@patterns)
        else
          STDERR.puts "Error: Mismatching empty lines!"
          exit(1)
        end
      end
    rescue EOFError
      # maybe we haven't yielded the last sentence yet.
      yield sentence unless sentence.empty?
      @read_completely = true
      # nothing more will be read from these handles
      @files.each { |f| f.close unless f.closed? }
    end
  end

end
131
+
132
+ #################################################
133
+ # class for keeping one line,
134
+ # parsed.
135
+ # The line is kept as follows:
136
+ # - normal features: in a hash @f mapping feature names to values
137
+ # - features of the repeated group: in an array @r of
138
+ # TabFormatNamedArgs objects, one per group
139
+ #
140
+ # each feature of the line is available by name
141
+ # via the method "get".
142
+ # Additional features (from other input files) can be
143
+ # added to the TabFormatNamedArgs object via the method
144
+ # add_feature
145
+ #
146
+ # methods:
147
+ #
148
+ # new: initialize.
149
+ # values: array of strings
150
+ # features: how to access the strings by name
151
+ # 'features' is an array of strings
152
+ # later the i-th feature will be used to access
153
+ # the i-th value,
154
+ # except for repeated groups
155
+ #
156
+ # get: returns one feature by its name
157
+ # name: a string
158
+ #
159
+ # add_feature: add another feature to this object,
160
+ # which can be accessed via "get"
161
+ # name: name for the new feature, should be distinct
162
+ # from the ones already used in new()
163
+ # feature: a string, the value of the feature
164
+ ##
165
+
166
# One parsed line of a tab format file: plain features are kept in the
# hash @f (feature name -> value), the entries of the repeated group in
# the array @r (one TabFormatNamedArgs object per repetition).
class TabFormatNamedArgs
  ############
  # new:
  # values:   array of strings, the columns of the line
  # features: array of feature names; the i-th name addresses the i-th
  #           value. At most one entry may itself be an array: the
  #           feature names of a group of columns that is repeated zero
  #           or more times. Features listed after the group are
  #           aligned with the end of the line.
  # group:    number of the group that get() falls back to for
  #           features not found among the plain ones (nil: none)
  #
  # raises a RuntimeError if features contains more than one group
  def initialize(values, features, group = nil)
    @f = {}
    @r = []
    @group = group

    # record the feature names, give special attention to a group
    # if we have one
    @group_feature_names = nil
    @feature_names = features.map do |feature|
      if feature.instance_of? Array
        # found a group
        @group_feature_names = feature
        "GROUP"
      else
        feature
      end
    end

    if @feature_names.count("GROUP") > 1
      $stderr.puts "More than one group in feature set:" + features.join(" ")
      raise "Cannot handle this."
    end

    # group_index: position of group in overall feature list
    # (no group: one past the last feature)
    group_index = @feature_names.index("GROUP") || @feature_names.length
    num_features_after_group = [0, (@feature_names.length - 1) - group_index].max
    # first value that belongs to a feature after the group;
    # can be smaller than group_index (even negative) on a line that
    # has fewer columns than the pattern demands
    index_after_groups = values.length - num_features_after_group

    # features before group: put feature/value pairs in @f hash
    0.upto(group_index - 1) { |i| @f[features[i]] = values[i] }

    # group: store each repetition of the group in the @r array
    if @group_feature_names
      group_size = @group_feature_names.length
      group_no = 0
      group_index.step(index_after_groups - 1, group_size) do |group_start|
        @r << TabFormatNamedArgs.new(values.slice(group_start, group_size),
                                     @group_feature_names,
                                     group_no)
        group_no += 1
      end
    end

    # features after group: put feature/value pairs in @f hash
    feature_index = group_index + 1
    index_after_groups.upto(values.length - 1) do |i|
      # On a short line, positions below group_index are already taken by
      # the features before the group (and negative positions would wrap
      # around to the end of the line): leave such features unset.
      @f[features[feature_index]] = values[i] if i >= group_index
      feature_index += 1
    end
  end

  ############
  # format_str:
  # return feature/value pairs as a tab format line,
  # order of features as given in the 'features' list.
  # Features not set in the hash: their entry will be "-"
  #
  # If the feature list includes a group,
  # assume zero entries for that group.
  # Keys of the hash that are not in the feature list are reported
  # on stderr and ignored.
  #
  # hash:     feature -> value
  # features: feature list, as for new(); nil yields ""
  def TabFormatNamedArgs.format_str(hash, features)
    return "" if features.nil?

    # sanity check: does the hash contain keys that are not in the feature list?
    hash.keys.reject { |f| features.include? f }.each do |bad_feature|
      $stderr.puts "Error: unknown feature #{bad_feature} in format_str: ignoring."
    end

    features.reject { |f|
      # remove the group feature, if it's there
      f.instance_of? Array
    }.map { |feature| hash[feature] || "-" }.join("\t")
  end

  #############
  # add_feature:
  # add another feature to this object, which can be accessed via get()
  # name:    name for the new feature
  # feature: the value of the feature
  #
  # raises a RuntimeError if a feature of that name is already set
  def add_feature(name, feature)
    raise "Trying to add a feature twice: #{name}" if @f.has_key? name

    @f[name] = feature
  end

  #############
  # get:
  # get feature value, identified by feature name:
  # first among the plain features, then in the group this object
  # was created for (if any)
  # return: feature value, or nil
  def get(name)
    get_nongroup(name) || get_from_group(name, @group)
  end

  #############
  # set:
  # set or overwrite a plain feature
  def set(name, feature)
    @f[name] = feature
  end

  #############
  # num_groups:
  # number of repetitions of the group found in this line
  def num_groups
    @r.length
  end

  #############
  # to_s:
  # return line as string, entries connected by tab,
  # in the order that the entries were in originally.
  # A group contributes one entry per feature and repetition; a line
  # with zero repetitions contributes no column at all for the group
  # (consistent with format_str), so that the result can be parsed again.
  def to_s
    @feature_names.flat_map { |feature|
      case feature
      when "GROUP"
        @r.map { |group_obj| group_obj.to_s }
      else
        [@f[feature]]
      end
    }.join("\t")
  end

  protected

  # get feature, non-group
  # return: feature value, or nil
  def get_nongroup(feature)
    @f[feature]
  end

  # get feature from one of the groups
  # return: feature value, or nil if there is no group with that number
  def get_from_group(name, group_no)
    return nil if !group_no || group_no >= @r.length

    @r[group_no].get_nongroup(name)
  end
end
322
+
323
+
324
+ #################################################
325
+ # class for keeping and yielding one sentence
326
+ # in tabular format
327
# Keeps one sentence in tabular format, one word (plus the additional
# information for that word) per line.
class TabFormatSentence
  ############
  # initialize:
  # start with an empty array of lines.
  # pattern: the feature list used to parse the lines
  #          (handed to TabFormatNamedArgs)
  def initialize(pattern)
    @lines = []
    @pattern = pattern

    # this is just for inheritance; FNTabFormatSentence will need this
    @group_no = nil
  end

  #####
  # length: number of lines (words) in the sentence
  def length
    @lines.length
  end

  #################
  # add_line:
  # append the information for one word of the sentence
  def add_line(line)
    @lines << line
  end

  ###################
  # empty?:
  # true if no lines are stored in this object at the moment, else false
  def empty?
    @lines.empty?
  end

  ######################
  # empty!:
  # discard all lines stored in this object
  def empty!
    @lines.clear
  end

  #####################
  # each_line:
  # yields each line of the sentence as it was added (a string)
  def each_line
    @lines.each { |raw_line| yield raw_line }
  end

  ######################
  # each_line_parsed:
  # yields each line of the sentence as a TabFormatNamedArgs object:
  # the line is split at tabs and its columns are named according to
  # the pattern given to new(). Each object additionally carries the
  # feature "lineno", the position of the line in the sentence,
  # counting from 0.
  def each_line_parsed
    @lines.each_with_index do |raw_line, lineno|
      parsed = TabFormatNamedArgs.new(raw_line.split("\t"), @pattern, @group_no)
      parsed.add_feature("lineno", lineno)
      yield parsed
    end
  end

  ###
  # read_one_line:
  # return the line of the sentence with the given number (or nil)
  def read_one_line(number)
    @lines[number]
  end

  ###
  # read_one_line_parsed:
  # like read_one_line, but the line is returned parsed
  # as in each_line_parsed; nil if there is no such line
  def read_one_line_parsed(number)
    raw_line = @lines[number]
    return nil if raw_line.nil?

    parsed = TabFormatNamedArgs.new(raw_line.split("\t"), @pattern, @group_no)
    parsed.add_feature("lineno", number)
    parsed
  end

  # formerly: set line no of first line of present sentence.
  # Always raises "Deprecated".
  def set_starting_line(n)
    raise "Deprecated"
  end

  # formerly: returned line no of first line of present sentence.
  # Always raises "Deprecated".
  def get_starting_line
    raise "Deprecated"
  end
end
440
+
441
+ ########################################################
442
+ # TabFormat files containing everything that's in the FN lexunit files
443
+ #
444
+ # one target per sentence
445
+
446
# Tab format file in the FrameNet column layout (see fntab_format),
# optionally read in parallel with a lemma file and a POS tag file.
class FNTabFormatFile < TabFormatFile

  # filename:     the tab file in fntab_format
  # tag_suffix:   if given, <filename minus ".tab"> + tag_suffix is
  #               read in parallel as a file with one column "pos"
  # lemma_suffix: if given, <filename minus ".tab"> + lemma_suffix is
  #               read in parallel as a file with one column "lemma"
  #
  # The lemma column comes before the pos column.
  # Opening the files is left to TabFormatFile, which raises an
  # exception if a file (tab, lemma or tag) cannot be read.
  def initialize(filename, tag_suffix = nil, lemma_suffix = nil)
    corpusname = File.dirname(filename) + "/" + File.basename(filename, ".tab")

    filename_label_pairs = [filename, FNTabFormatFile.fntab_format]
    filename_label_pairs += [corpusname + lemma_suffix, ["lemma"]] if lemma_suffix
    filename_label_pairs += [corpusname + tag_suffix, ["pos"]] if tag_suffix
    super(filename_label_pairs)

    @my_sentence_class = FNTabSentence
  end

  # column layout of a FrameNet tab file: the word, then the
  # frame columns (frametab_format) as a group, then "ne" and "sent_id"
  def FNTabFormatFile.fntab_format
    ["word", FNTabFormatFile.frametab_format, "ne", "sent_id"]
  end

  # names of the columns that make up the frame group
  def FNTabFormatFile.frametab_format
    ["pt", "gf", "role", "target", "frame", "stuff"]
  end

  ##########
  # given a hash mapping features to values,
  # format according to fntab_format
  def FNTabFormatFile.format_str(hash)
    TabFormatNamedArgs.format_str(hash, FNTabFormatFile.fntab_format)
  end
end
485
+
486
+ ############################################
487
+ class FNTabSentence < TabFormatSentence
488
+
489
+ ####
490
+ # overwrite this to get a feature from
491
+ # a group rather than from the main feature list
492
# Hook for subclasses: look up feature_name in the parsed line l.
# This version simply asks the line itself (l.get); overwrite it to
# take a feature from a group rather than from the main feature list.
def get_this(l, feature_name)
  l.get(feature_name)
end
495
+
496
+ ####
497
# Checks the first line of the sentence only: raises a RuntimeError
# if that line has no "sent_id" entry, returns otherwise.
# A sentence without lines passes the check.
def sanity_check
  each_line_parsed do |first_line|
    return unless first_line.get("sent_id").nil?

    raise "Error: corpus file does not conform to FN format."
  end
end
506
+
507
+ ####
508
+ # returns the sentence ID, a string, as set by FrameNet
509
# returns the sentence ID: the "sent_id" entry of the first line
# of the sentence. Runs sanity_check first.
def get_sent_id
  sanity_check
  each_line_parsed { |first_line| return first_line.get("sent_id") }
end
515
+
516
+ ####
517
+ # iterator, yields each frame of the sentence as a FNTabFrame
518
+ # object. They contain the complete sentence, but provide
519
+ # access to exactly one frame of that sentence.
520
# iterator, yields each frame of the sentence as a FNTabFrame
# object. Every FNTabFrame is created with the sentence's pattern and
# the number of the frame, and is given all lines of the sentence.
#
# The number of frames is read off the first line (its number of
# groups); all lines are assumed to have the same number of frames.
# A sentence without lines has no frames: nothing is yielded.
def each_frame
  first_line = read_one_line_parsed(0)
  # read_one_line_parsed returns nil when there is no line 0
  return if first_line.nil?

  first_line.num_groups.times do |frame_no|
    frame_obj = FNTabFrame.new(@pattern, frame_no)
    each_line { |l| frame_obj.add_line(l) }
    yield frame_obj
  end
end
530
+
531
+ ####
532
+ # computes a mapping from word indices to labels on these words
533
+ #
534
+ # returns a hash: index_list(array:integer) -> label(string)
535
+ # An entry il->label means that all the lines whose line
536
+ # numbers are listed in il are labeled with label.
537
+ #
538
+ # Line numbers correspond to words of the sentence. Counting starts at 0.
539
+ #
540
+ # By default, "markables" looks for role labels, i.e. labels in the
541
+ # column "role", but it can also look in another column.
542
+ # To change the default, give the column name as a parameter.
543
+ def markables(use_this_column="role")
544
+ # returns hash of {index list} -> {markup label}
545
+
546
+ sanity_check()
547
+
548
+ idlist_to_annotation_list = Hash.new
549
+
550
+ # add entry for the target word
551
+ # idlist_to_annotation_list[get_target_indices()] = "target"
552
+
553
+ # determine span of each frame element
554
+ # if we find overlapping FEs, we write a warning to STDERR
555
+ # ignore the 2nd label and attempt to "close" the 1st label
556
+
557
+ ids = Array.new
558
+ label = nil
559
+
560
+ each_line_parsed { |l|
561
+
562
+ this_id = get_this(l, "lineno")
563
+
564
+ # start of FE?
565
+ this_col = get_this(l, use_this_column)
566
+ unless this_col
567
+ $stderr.puts "nil entry #{use_this_column} in line #{this_id} of sent #{get_sent_id()}. Skipping."
568
+ next
569
+ end
570
+ this_fe_ann = this_col.split(":")
571
+
572
+ case this_fe_ann.length
573
+ when 1 # nothing at all, or a single begin or end
574
+ markup = this_fe_ann.first
575
+ if markup == "-" or markup == "--" # no change
576
+ if label
577
+ ids << this_id
578
+ end
579
+ elsif markup =~ /^B-(\S+)$/
580
+ if label # are we within a markable right now?
581
+ $stderr.puts "[TabFormat] Warning: Markable "+$1.to_s+" starts while within markable ", label.to_s
582
+ $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
583
+ else
584
+ label = $1
585
+ ids << this_id
586
+ end
587
+ elsif markup =~ /^E-(\S+)$/
588
+ if label == $1 # we close the markable we've opened before
589
+ ids << this_id
590
+ # store information
591
+ idlist_to_annotation_list[ids] = label
592
+ # reset memory
593
+ label = nil
594
+ ids = Array.new
595
+ else
596
+ $stderr.puts "[TabFormat] Warning: Markable "+$1.to_s+" closes while within markable "+ label.to_s
597
+ $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
598
+ end
599
+ else
600
+ $stderr.puts "[TabFormat] Warning: cannot analyse markup "+markup
601
+ $stderr.puts "Debug data: Sentence id #{get_sent_id()}"
602
+ end
603
+ when 2 # this should be a one-word markable
604
+ b_markup = this_fe_ann[0]
605
+ e_markup = this_fe_ann[1]
606
+ if label
607
+ $stderr.puts "[TabFormat] Warning: Finding new markable at word #{this_id} while within markable ", label
608
+ $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
609
+ else
610
+ if b_markup =~ /^B-(\S+)$/
611
+ b_label = $1
612
+ if e_markup =~ /^E-(\S+)$/
613
+ e_label = $1
614
+ if b_label == e_label
615
+ idlist_to_annotation_list[[this_id]] = b_label
616
+ else
617
+ $stderr.puts "[TabFormat] Warning: Starting markable "+b_label+", closing markable "+e_label
618
+ $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
619
+ end
620
+ else
621
+ $stderr.puts "[TabFormat] Warning: Unknown end markup "+e_markup
622
+ $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
623
+ end
624
+ else
625
+ $stderr.puts "[TabFormat] Warning: Unknown start markup "+b_markup
626
+ $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
627
+ end
628
+ end
629
+ else
630
+ $stderr.puts "Warning: cannot analyse markup with more than two colon-separated parts like "+this_fee_ann.join(":")
631
+ $stderr.puts "Debug data: Sentence id #{get_sent_id()}"
632
+ end
633
+ }
634
+
635
+ unless label.nil?
636
+ $stderr.puts "[TabFormat] Warning: Markable ", label, " did not end in sentence."
637
+ $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
638
+ end
639
+
640
+ return idlist_to_annotation_list
641
+ end
642
+
643
+ #######
644
+ def to_s
645
+ sanity_check
646
+ array = Array.new
647
+ each_line_parsed {|l|
648
+ array << l.get("word")
649
+ }
650
+ return array.join(" ")
651
+ end
652
+
653
+ end
654
+
655
class FNTabFrame < FNTabSentence

  ############
  # initialize:
  # as parent, except that we also get a frame number
  # such that we can access the features of ``our'' frame
  #
  # pattern: handed on unchanged to the parent constructor
  # frameno: number of the frame group this object gives access to
  def initialize(pattern, frameno)
    # by setting @group_no to frameno,
    # we are initializing each TabFormatNamedArgs object
    # in each_line_parsed() or read_one_line_parsed()
    # with the right group number,
    # such that all calls to TabFormatNamedArgs.get()
    # will access the right group.
    super(pattern)
    @group_no = frameno
  end


  # returns the frame introduced by the target word(s)
  # of this frame group, a string.
  # The value is read from the first parsed line.
  #
  # raises: RuntimeError (via sanity_check) if the sentence is not in FN format
  def get_frame()
    sanity_check()
    each_line_parsed { |l|
      return l.get("frame")
    }
  end

  ####
  # returns an array of integers: the indices of the target of
  # the frame
  # These are the line numbers, which start counting at 0
  #
  # a target may span more than one word
  #
  # raises: RuntimeError (via sanity_check) if the sentence is not in FN format
  def get_target_indices()
    sanity_check
    idx = Array.new
    each_line_parsed { |l|
      # every line whose "target" entry is not "-" belongs to the target
      idx << l.get("lineno") unless l.get("target") == "-"
    }
    return idx
  end

  ####
  # returns a string: the target
  # in the case of multiword targets,
  # we find the complete target at all
  # indices, i.e. we can just take the first one we find
  #
  # returns nil if no line of this frame group carries a target
  def get_target()
    each_line_parsed { |l|
      t = l.get("target")
      return t unless t == "-"
    }
    # no target found: answer nil explicitly instead of leaking
    # the return value of the iterator to the caller
    return nil
  end

  ####
  # get the target POS, according to FrameNet:
  # the word characters after the dot in a target of the
  # form <text without dots>.<pos>
  #
  # returns nil if there is no target or if it does not have that form
  def get_target_fn_pos()
    target = get_target()
    return nil if target.nil?

    target =~ /^[^\.]+\.(\w+)$/
    # after a failed match $1 is nil
    return $1
  end

end