shalmaneser 0.0.1.alpha → 1.2.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +2 -2
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +49 -0
  6. data/bin/fred +18 -0
  7. data/bin/frprep +34 -0
  8. data/bin/rosy +17 -0
  9. data/lib/common/AbstractSynInterface.rb +35 -33
  10. data/lib/common/Mallet.rb +236 -0
  11. data/lib/common/Maxent.rb +26 -12
  12. data/lib/common/Parser.rb +5 -5
  13. data/lib/common/SynInterfaces.rb +13 -6
  14. data/lib/common/TabFormat.rb +7 -6
  15. data/lib/common/Tiger.rb +4 -4
  16. data/lib/common/Timbl.rb +144 -0
  17. data/lib/common/{FrprepHelper.rb → frprep_helper.rb} +14 -8
  18. data/lib/common/headz.rb +1 -1
  19. data/lib/common/ruby_class_extensions.rb +3 -3
  20. data/lib/fred/FredBOWContext.rb +14 -2
  21. data/lib/fred/FredDetermineTargets.rb +4 -9
  22. data/lib/fred/FredEval.rb +1 -1
  23. data/lib/fred/FredFeatureExtractors.rb +4 -3
  24. data/lib/fred/FredFeaturize.rb +1 -1
  25. data/lib/frprep/CollinsInterface.rb +6 -6
  26. data/lib/frprep/MiniparInterface.rb +5 -5
  27. data/lib/frprep/SleepyInterface.rb +7 -7
  28. data/lib/frprep/TntInterface.rb +1 -1
  29. data/lib/frprep/TreetaggerInterface.rb +29 -5
  30. data/lib/frprep/do_parses.rb +1 -0
  31. data/lib/frprep/frprep.rb +36 -32
  32. data/lib/{common/BerkeleyInterface.rb → frprep/interfaces/berkeley_interface.rb} +69 -95
  33. data/lib/frprep/interfaces/stanford_interface.rb +353 -0
  34. data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
  35. data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
  36. data/lib/frprep/opt_parser.rb +2 -2
  37. data/lib/rosy/AbstractFeatureAndExternal.rb +5 -3
  38. data/lib/rosy/RosyIterator.rb +11 -10
  39. data/lib/rosy/rosy.rb +1 -0
  40. data/lib/shalmaneser/version.rb +1 -1
  41. data/test/functional/sample_experiment_files/fred_test.salsa.erb +1 -1
  42. data/test/functional/sample_experiment_files/fred_train.salsa.erb +1 -1
  43. data/test/functional/sample_experiment_files/prp_test.salsa.erb +2 -2
  44. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +2 -2
  45. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +2 -2
  46. data/test/functional/sample_experiment_files/prp_train.salsa.erb +2 -2
  47. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +2 -2
  48. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +2 -2
  49. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +1 -1
  50. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +7 -7
  51. data/test/functional/test_frprep.rb +3 -3
  52. data/test/functional/test_rosy.rb +20 -0
  53. metadata +215 -224
  54. data/CHANGELOG.rdoc +0 -0
  55. data/LICENSE.rdoc +0 -0
  56. data/README.rdoc +0 -0
  57. data/lib/common/CollinsInterface.rb +0 -1165
  58. data/lib/common/MiniparInterface.rb +0 -1388
  59. data/lib/common/SleepyInterface.rb +0 -384
  60. data/lib/common/TntInterface.rb +0 -44
  61. data/lib/common/TreetaggerInterface.rb +0 -303
  62. data/lib/frprep/AbstractSynInterface.rb +0 -1227
  63. data/lib/frprep/BerkeleyInterface.rb +0 -375
  64. data/lib/frprep/ConfigData.rb +0 -694
  65. data/lib/frprep/FixSynSemMapping.rb +0 -196
  66. data/lib/frprep/FrPrepConfigData.rb +0 -66
  67. data/lib/frprep/FrprepHelper.rb +0 -1324
  68. data/lib/frprep/ISO-8859-1.rb +0 -24
  69. data/lib/frprep/Parser.rb +0 -213
  70. data/lib/frprep/SalsaTigerRegXML.rb +0 -2347
  71. data/lib/frprep/SalsaTigerXMLHelper.rb +0 -99
  72. data/lib/frprep/SynInterfaces.rb +0 -275
  73. data/lib/frprep/TabFormat.rb +0 -720
  74. data/lib/frprep/Tiger.rb +0 -1448
  75. data/lib/frprep/Tree.rb +0 -61
  76. data/lib/frprep/headz.rb +0 -338
@@ -1,99 +0,0 @@
1
- # sp jul 05 05
2
- #
3
- # Static helper methods for SalsaTigerRegXML:
4
-
5
- # - provide header and footer for Salsa/Tiger XML files
6
- # - escape and unescape HTML entities
7
- #
8
- # changed KE nov 05:
9
- # many methods moved to FrprepHelper
10
-
11
- require "frprep/SalsaTigerRegXML"
12
- require "frprep/headz"
13
- require "frprep/Parser"
14
- require "tempfile"
15
-
16
# Static helpers for Salsa/Tiger XML files:
# - fixed header and footer text for a corpus file
# - escaping and unescaping of special characters
class SalsaTigerXMLHelper
  # Ordered pairs [unescaped, escaped].
  # "&" has to be handled first when escaping and last when unescaping,
  # which is why unescape walks this list backwards.
  # Note that both "\"" and "``" are escaped to "&apos;&apos;", so the
  # mapping is not a round trip: unescape turns "&apos;&apos;" into "``".
  @@replacements = [
    ["&", "&amp;"],
    ["<", "&lt;"],
    [">", "&gt;"],
    ["\"", "&apos;&apos;"],
    ["\'", "&apos;"],
    ["\`\`", "&apos;&apos;"]
  ]

  ###
  # Header of a SalsaTigerXML file, everything up to and including
  # the opening <body> tag.
  #
  # returns: the header as a string
  def self.get_header
    <<ENDOFHEADER
<?xml version="1.0" encoding="UTF-8"?>
<corpus corpusname="corpus" target="">
<head>
<meta>
<format>
NeGra format, version 3</format>
</meta>
<frames xmlns="http://www.clt-st.de/framenet/frame-database">
</frames>
<wordtags xmlns="http://www.clt-st.de/salsa/wordtags">
</wordtags>
<flags>
</flags>
<annotation>
<edgelabel>
</edgelabel>
<secedgelabel>
</secedgelabel>
</annotation>
</head>
<body>
ENDOFHEADER
  end

  ###
  # Footer of a SalsaTigerXML file: closes <body> and <corpus>.
  #
  # returns: the footer as a string
  def self.get_footer
    <<ENDOFFOOTER
</body>
</corpus>
ENDOFFOOTER
  end

  ###
  # Replace special characters by their escaped form.
  # The given string is modified in place and also returned.
  def self.escape(string)
    @@replacements.each do |raw, entity|
      string.gsub!(raw, entity)
    end
    string
  end

  ###
  # Replace escaped forms by the characters they stand for,
  # going through the replacement list in reverse order.
  # The given string is modified in place and also returned.
  def self.unescape(string)
    @@replacements.reverse_each do |raw, entity|
      string.gsub!(entity, raw)
    end
    string
  end
end
@@ -1,275 +0,0 @@
1
- # SynInterfaces.rb
2
- #
3
- # ke oct/nov 2005
4
- #
5
- # Store all known interfaces to
6
- # systems that do syntactic analysis
7
- #
8
- # Given the name of a system and the service that the
9
- # system performs, return the appropriate interface
10
- #
11
- # There are two types of interfaces to syntactic analysis systems:
12
- # - interfaces:
13
- # offer methods for syntactic analysis,
14
- # and the transformation to Salsa/Tiger XML and SalsaTigerSentence objects
15
- # - interpreters:
16
- # interpret the resulting Salsa/Tiger XML (represented as
17
- # SalsaTigerSentence and SynNode objects), e.g.
18
- # generalize over part of speech;
19
- # describe the path between a pair of nodes both as a path
20
- # and (potentially) as a grammatical function of one of the nodes;
21
- # determine whether a node describes a verb, and in which voice;
22
- # determine the head of a constituent
23
- #
24
- # Abstract classes for both interfaces and interpreters
25
- # are in AbstractSynInterface.rb
26
-
27
- require "frprep/ruby_class_extensions"
28
- class Array
29
- include EnumerableBool
30
- end
31
-
32
- # The list of available interface packages
33
- # is at the end of this file.
34
- # Please enter additional interfaces there.
35
-
36
# Registry of the known interfaces to syntactic analysis systems and of the
# interpreters for their output. Classes are registered with add_interface /
# add_interpreter and looked up by service (e.g. "parser") and system name
# (e.g. "collins").
class SynInterfaces
  # all registered interface classes, in registration order
  @@interfaces = []

  # all registered interpreter classes, in registration order
  @@interpreters = []

  ###
  # register an interface class
  def self.add_interface(class_name)
    $stderr.puts "Initializing interface #{class_name}" if $DEBUG
    @@interfaces << class_name
  end

  ###
  # register an interpreter class
  def self.add_interpreter(class_name)
    $stderr.puts "Initializing interpreter #{class_name}" if $DEBUG
    @@interpreters << class_name
  end

  ###
  # debugging aid: print both registries to stderr
  def self.explore
    $stderr.puts "Exploring..."
    $stderr.puts @@interfaces
    $stderr.puts @@interpreters
  end

  ###
  # check_interfaces_abort_if_missing:
  #
  # Uses some_system_missing? on the given experiment object. If something
  # is missing, lists what was requested and what is registered on stderr
  # and terminates the program with exit status 1.
  #
  # exp: experiment file object (FrPrepConfigData)
  def self.check_interfaces_abort_if_missing(exp)
    missing = SynInterfaces.some_system_missing?(exp)
    return unless missing

    interwhat, services = missing

    $stderr.puts
    $stderr.puts "ERROR: I am missing an #{interwhat} for "
    services.each_pair do |service, system_name|
      $stderr.puts "\tservice #{service}, system #{system_name}"
    end
    $stderr.puts
    $stderr.puts "I have the following interfaces:"
    @@interfaces.each do |interface_class|
      $stderr.puts "\tservice #{interface_class.service}, system #{interface_class.system}"
    end
    $stderr.puts "I have the following interpreters:"
    @@interpreters.each do |interpreter_class|
      $stderr.print "\t"
      $stderr.print interpreter_class.systems.to_a.map { |service, system_name|
        "service #{service}, system #{system_name}"
      }.join("; ")
      unless interpreter_class.optional_systems.empty?
        $stderr.print ", optional: "
        $stderr.print interpreter_class.optional_systems.to_a.map { |service, system_name|
          "service #{service}, system #{system_name}"
        }.join("; ")
      end
      $stderr.puts
    end
    $stderr.puts
    $stderr.puts "Please adapt your experiment file."
    exit 1
  end

  ###
  # some_system_missing?
  #
  # returns nil if an interface exists for every requested service and an
  # interpreter exists for the requested combination.
  # Otherwise returns a pair [kind, info]:
  #   kind: "interface" or "interpreter"
  #   info: hash service -> system name of what could not be provided
  #         (for "interface": only the first service that failed)
  #
  # exp: experiment file object (FrPrepConfigData)
  def self.some_system_missing?(exp)
    services = SynInterfaces.requested_services(exp)

    # first requested service without a registered interface, if any
    lacking = services.find do |service, system_name|
      !SynInterfaces.get_interface(service, system_name)
    end
    return ["interface", { lacking[0] => lacking[1] }] if lacking

    unless SynInterfaces.get_interpreter_according_to_exp(exp)
      return ["interpreter", services]
    end

    nil
  end

  ###
  # find the interface class for a service/system pair
  #
  # service: string, e.g. parser
  # system:  string, e.g. collins
  #
  # returns: the first matching registered class, or nil
  def self.get_interface(service, system)
    @@interfaces.find do |interface_class|
      interface_class.system == system && interface_class.service == service
    end
  end

  ###
  # get_interpreter for the services requested in an experiment file
  def self.get_interpreter_according_to_exp(exp)
    SynInterfaces.get_interpreter(SynInterfaces.requested_services(exp))
  end

  ###
  # find the interpreter class for a set of systems
  #
  # An interpreter class matches if
  # - each of its obligatory systems is in the given set,
  # - each of its optional systems is either in the given set, or its
  #   service is not in the set at all,
  # - each entry of the given set is one of its obligatory or optional
  #   systems.
  #
  # systems: hash service(string) -> system name(string)
  #
  # returns: the first matching registered class, or nil
  def self.get_interpreter(systems)
    @@interpreters.find do |interpreter_class|
      interpreter_class.systems.to_a.big_and { |service, system|
        systems[service] == system
      } &&
        interpreter_class.optional_systems.to_a.big_and { |service, system|
          systems[service].nil? || systems[service] == system
        } &&
        systems.to_a.big_and { |service, system|
          interpreter_class.systems[service] == system ||
            interpreter_class.optional_systems[service] == system
        }
    end
  end

  ################
  # NOTE: this keyword only applies to instance methods defined after it;
  # the singleton method below stays publicly callable.
  protected

  ###
  # Knows which services can be switched on in the experiment file and
  # under which key the name of the matching system is stored.
  #
  # WARNING: adapt this when you introduce new services!
  #
  # returns: hash service -> system name, with one entry for each service
  # whose flag is set in the experiment file
  def self.requested_services(exp)
    [
      ["do_postag", "pos_tagger"],
      ["do_lemmatize", "lemmatizer"],
      ["do_parse", "parser"]
    ].each_with_object({}) do |(flag, service), requested|
      requested[service] = exp.get(service) if exp.get(flag)
    end
  end
end
246
-
247
-
248
- require "frprep/CollinsInterface"
249
- require "frprep/BerkeleyInterface"
250
- require "frprep/SleepyInterface"
251
- require "frprep/MiniparInterface"
252
- require "frprep/TntInterface"
253
- require "frprep/TreetaggerInterface"
254
-
255
-
256
# Interpreter that names no system at all: both its obligatory and its
# optional system lists are empty.
class EmptyInterpreter < SynInterpreter
  # announce_me is inherited from SynInterpreter; presumably it registers
  # this class with SynInterfaces -- TODO confirm in AbstractSynInterface.rb
  EmptyInterpreter.announce_me()

  ###
  # systems interpreted by this class:
  # returns a hash service(string) -> system name (string),
  # e.g.
  # { "parser" => "collins", "lemmatizer" => "treetagger" }
  # Here: empty.
  def EmptyInterpreter.systems()
    return {}
  end

  ###
  # additional systems that may be interpreted by this class:
  # returns a hash service(string) -> system name(string),
  # same format as systems(). Here: empty.
  #
  # This has to be defined on EmptyInterpreter itself. Defining it on
  # SynInterpreter here would overwrite the base class's method.
  def EmptyInterpreter.optional_systems()
    return {}
  end
end
@@ -1,720 +0,0 @@
1
- # TabFormat.rb
2
- # Katrin Erk, Jan 2004
3
- #
4
- # classes to be used with tabular format text files.
5
- # originally CoNLL2.rb
6
- # Original: Katrin Erk, Jan 2004 for CoNLL '04 data
7
- # Rewrite: Sebastian Pado, Mar 2004 for Gemmas FrameNet data (no NEs etc.)
8
-
9
- # Extensions SP Jun/Jul 04
10
- # renamed GemmaCorpus to FNTabFormat
11
-
12
- # partial rewrite SP 250804: made things cleaner & leaner: no RawFormat, for example
13
-
14
- # sp 04/05: add a "frame" column to FNTabFormat
15
- #
16
- # Substantial changes KE 12/06:
17
- # variable number of columns to accommodate more than one frame per sentence
18
-
19
- #################################################
20
- # class for reading a file
21
- # containing data in tabular
22
-
23
- require "tempfile"
24
-
25
- require "frprep/ISO-8859-1"
26
- require "frprep/ruby_class_extensions"
27
-
28
- #######################
29
- # This function takes a variable number of arguments and
30
- # returns them as an array
31
- # Idea: make formulation of tab format entries easier to read,
32
- # enclose variable arguments in a repeat() call,
33
- # which immediately gets transformed into a list
34
# Collects its arguments into an array.
# Lets a repeated column group in a tab format description be written as
# repeat("frame", "target", ...) instead of a bare array literal.
def repeat(*columns)
  columns
end
37
-
38
- #######################
39
# Reader for one or more parallel tabular text files.
# Lines with the same line number in all files are joined (by tab) into
# one line; runs of such lines between blank lines form a sentence.
class TabFormatFile
  #######
  # initialize:
  # open the files for reading.
  #
  # fp is a flat list that alternates file name and format:
  #   [filename1, format1, filename2, format2, ...]
  # Each format is a list of strings naming the columns of its file.
  # The formats of all files are concatenated into one pattern.
  #
  # A format may contain _one_ entry that is an array (or a call to
  # repeat()), e.g.:
  # ["word", "pos", "lemma", repeat("frame", "target", "gf", "pt")]
  #
  # raises a RuntimeError if a file cannot be opened
  def initialize(fp)
    @files = []
    @patterns = []
    @no_of_read_lines = 0

    fp.each_with_index do |entry, position|
      if position.even?
        # even positions hold file names
        begin
          @files << File.new(entry)
        rescue
          raise 'Sorry, could not read input file ' + entry + "\n"
        end
      else
        # odd positions hold the column names of the preceding file
        @patterns += entry
      end
    end

    @my_sentence_class = TabFormatSentence
  end

  ########
  # each_sentence:
  # yield the sentences of the files one after the other, each as an
  # object of the sentence class (TabFormatSentence unless changed).
  #
  # A sentence ends at a line that is empty (nothing but whitespace) in
  # all files; if such a line is empty in only some of the files, an
  # error is printed and the program exits with status 1.
  # Reading stops as soon as any of the files is at EOF; a sentence that
  # is still pending at that point is yielded too.
  # Once the end has been reached, further calls do nothing.
  def each_sentence
    return if @read_completely

    sentence = @my_sentence_class.new(@patterns)
    begin
      while true
        # one line from each file; EOFError ends the loop
        row = @files.map { |file| file.readline().chomp() }
        @no_of_read_lines += 1

        unless row.any? { |cell| cell.strip == "" }
          # ordinary line: belongs to the current sentence
          sentence.add_line(row.join("\t"))
          next
        end

        if row.any? { |cell| cell.strip != "" }
          STDERR.puts "Error: Mismatching empty lines!"
          exit(1)
        end

        # separator line: hand out the sentence, start the next one
        yield sentence unless sentence.empty?
        sentence = @my_sentence_class.new(@patterns)
      end
    rescue EOFError
      # the last sentence need not be followed by an empty line
      yield sentence unless sentence.empty?
      @read_completely = true
    end
  end
end
131
-
132
- #################################################
133
- # class for keeping one line,
134
- # parsed.
135
- # The line is kept as follows:
136
- # - normal features: in a hash @f mapping feature names to values
137
- # - features of the repeated group: in an array @r of
138
- # TabFormatNamedArgs objects, one per group
139
- #
140
- # each feature of the line is available by name
141
- # via the method "get".
142
- # Additional features (from other input files) can be
143
- # added to the TabFormatNamedArgs object via the method
144
- # add_feature
145
- #
146
- # methods:
147
- #
148
- # new: initialize.
149
- # values: array of strings
150
- # features: how to access the strings by name
151
- # 'features' is an array of strings
152
- # later the i-th feature will be used to access
153
- # the i-th value,
154
- # except for repeated groups
155
- #
156
- # get: returns one feature by its name
157
- # name: a string
158
- #
159
- # add_feature: add another feature to this object,
160
- # which can be accessed via "get"
161
- # name: name for the new feature, should be distinct
162
- # from the ones already used in new()
163
- # feature: a string, the value of the feature
164
- ##
165
-
166
# One parsed line of a tabular file.
# Ordinary columns are kept in the hash @f (feature name -> value), the
# columns of the repeated group in the array @r, which holds one
# TabFormatNamedArgs object per repetition of the group.
class TabFormatNamedArgs
  ############
  # values:   array of strings, the columns of the line
  # features: array of column names; at most one entry may itself be an
  #           array, naming the columns of a group that can occur any
  #           number of times in a row at that position
  # group:    number of the group that get() falls back to for names it
  #           does not find among the ordinary columns (nil: none)
  #
  # raises a RuntimeError if features contains more than one group
  def initialize(values, features, group = nil)
    @f = {}
    @r = []
    @group = group

    # remember the names, with the group replaced by the marker "GROUP"
    @group_feature_names = nil
    @feature_names = features.map do |feature|
      if feature.instance_of? Array
        @group_feature_names = feature
        "GROUP"
      else
        feature
      end
    end

    if @feature_names.count("GROUP") > 1
      $stderr.puts "More than one group in feature set:" + features.join(" ")
      raise "Cannot handle this."
    end

    # position of the group among the features; without a group,
    # all features count as "before the group"
    group_index = @feature_names.index("GROUP") || @feature_names.length()
    num_features_after_group = [0, (@feature_names.length() - 1) - group_index].max()
    # index of the first value that follows the last group repetition
    index_after_groups = values.length() - num_features_after_group

    # columns before the group
    group_index.times do |i|
      @f[features[i]] = values[i]
    end

    # the group repetitions, each as an object of its own
    if @group_feature_names
      group_width = @group_feature_names.length()
      group_no = 0
      group_index.step(index_after_groups - 1, group_width) do |group_start|
        @r << TabFormatNamedArgs.new(values.slice(group_start, group_width),
                                     @group_feature_names,
                                     group_no)
        group_no += 1
      end
    end

    # columns after the group
    (index_after_groups...values.length()).each_with_index do |value_index, offset|
      @f[features[group_index + 1 + offset]] = values[value_index]
    end
  end

  ############
  # Format feature/value pairs as a tab-separated line.
  # Columns are in the order of the 'features' list; a feature without a
  # value in the hash becomes "-". A group entry in the feature list is
  # left out, i.e. zero repetitions of the group are written.
  # Keys of the hash that are not in the feature list are reported on
  # stderr and ignored.
  #
  # hash:     feature -> value
  # features: feature list as for new(); nil yields ""
  def self.format_str(hash, features)
    return "" if features.nil?

    hash.keys().each do |key|
      next if features.include? key
      $stderr.puts "Error: unknown feature #{key} in format_str: ignoring."
    end

    plain_features = features.reject { |f| f.instance_of? Array }
    cells = plain_features.map do |feature|
      value = hash[feature]
      value ? value : "-"
    end
    cells.join("\t")
  end

  #############
  # add a feature that was not part of the original line
  # raises a RuntimeError if a feature of that name exists already
  def add_feature(name, feature)
    raise "Trying to add a feature twice: "+name if @f.has_key? name

    @f[name] = feature
  end

  #############
  # value of the feature with the given name: taken from the ordinary
  # columns if present there, else from the group chosen in new()
  def get(name)
    get_nongroup(name) || get_from_group(name, @group)
  end

  #############
  # set (or overwrite) an ordinary feature
  def set(name, feature)
    @f[name] = feature
  end

  #############
  # number of repetitions of the group in this line
  def num_groups()
    @r.length()
  end

  #############
  # the line as a string: entries joined by tab,
  # in the order of the feature list
  def to_s()
    cells = @feature_names.map do |feature|
      if "GROUP" == feature
        @r.map { |group_obj| group_obj.to_s }.join("\t")
      else
        @f[feature]
      end
    end
    cells.join("\t")
  end

  protected

  # value of an ordinary (non-group) feature, nil if unknown
  def get_nongroup(feature)
    @f[feature]
  end

  # value of a feature within the group repetition number group_no,
  # nil if there is no such repetition
  def get_from_group(name, group_no)
    return nil if !group_no || group_no >= @r.length()

    @r[group_no].get_nongroup(name)
  end
end
322
-
323
-
324
- #################################################
325
- # class for keeping and yielding one sentence
326
- # in tabular format
327
# One sentence in tabular format: an ordered list of lines, one per word,
# each holding the tab-separated information for that word.
class TabFormatSentence
  ############
  # initialize:
  # start with no lines.
  # pattern: the feature list used to parse the lines (see
  #          TabFormatNamedArgs)
  def initialize(pattern)
    @lines = []
    @pattern = pattern

    # stays nil here; kept so that subclasses can select a group
    @group_no = nil
  end

  #####
  # length: number of lines (words) stored
  def length
    @lines.size
  end

  #################
  # add_line:
  # append the line for one more word
  def add_line(line)
    @lines << line
  end

  ###################
  # empty?:
  # true if no lines are stored, else false
  def empty?
    @lines.empty?
  end

  ######################
  # empty!:
  # discard all stored lines
  def empty!
    @lines.clear
  end

  #####################
  # each_line:
  # yield each line as the string it was added as
  def each_line
    @lines.each { |entry| yield entry }
  end

  ######################
  # each_line_parsed:
  # yield each line as a TabFormatNamedArgs object, built by splitting
  # the line at tabs and naming the pieces according to the pattern.
  # Each object additionally carries the feature "lineno", the index of
  # the line within the sentence, counting from 0.
  def each_line_parsed
    @lines.each_with_index do |raw, lineno|
      parsed = TabFormatNamedArgs.new(raw.split("\t"), @pattern, @group_no)
      parsed.add_feature("lineno", lineno)
      yield parsed
    end
  end

  ###
  # read_one_line:
  # the line with the given number, as a string
  def read_one_line(number)
    @lines[number]
  end

  ###
  # read_one_line_parsed:
  # the line with the given number as a TabFormatNamedArgs object,
  # as in each_line_parsed; nil if there is no such line
  def read_one_line_parsed(number)
    raw = @lines[number]
    return nil if raw.nil?

    parsed = TabFormatNamedArgs.new(raw.split("\t"), @pattern, @group_no)
    parsed.add_feature("lineno", number)
    parsed
  end

  # no longer supported: always raises
  def set_starting_line(n)
    raise "Deprecated"
  end

  # no longer supported: always raises
  def get_starting_line()
    raise "Deprecated"
  end
end
440
-
441
- ########################################################
442
- # TabFormat files containing everything that's in the FN lexunit files
443
- #
444
- # one target per sentence
445
-
446
# Tab format file with the columns of the FrameNet lexical unit data,
# optionally combined with separate lemma and part-of-speech files.
# Its sentences are FNTabSentence objects.
class FNTabFormatFile < TabFormatFile
  # filename:     the main file; companion file names are built from its
  #               directory and its base name without ".tab"
  # tag_suffix:   if given, the file <corpus name><tag_suffix> is read
  #               as well and supplies the column "pos"
  # lemma_suffix: if given, the file <corpus name><lemma_suffix> is read
  #               as well and supplies the column "lemma"
  #
  # The files are opened by the superclass, which raises if one of them
  # cannot be read.
  def initialize(filename, tag_suffix = nil, lemma_suffix = nil)
    corpusname = "#{File.dirname(filename)}/#{File.basename(filename, ".tab")}"

    filename_label_pairs = [filename, FNTabFormatFile.fntab_format()]
    # lemma columns come before part-of-speech columns
    filename_label_pairs.push(corpusname + lemma_suffix, ["lemma"]) if lemma_suffix
    filename_label_pairs.push(corpusname + tag_suffix, ["pos"]) if tag_suffix
    super(filename_label_pairs)

    @my_sentence_class = FNTabSentence
  end

  # column layout of the main file: the word, then the frame columns
  # as a repeatable group, then named entity and sentence ID
  def self.fntab_format()
    ["word", FNTabFormatFile.frametab_format(), "ne", "sent_id"]
  end

  # columns that describe one frame
  def self.frametab_format()
    ["pt", "gf", "role", "target", "frame", "stuff"]
  end

  ##########
  # format a hash feature -> value as a line in fntab_format
  def self.format_str(hash)
    TabFormatNamedArgs.format_str(hash, FNTabFormatFile.fntab_format())
  end
end
485
-
486
- ############################################
487
- class FNTabSentence < TabFormatSentence
488
-
489
- ####
490
- # overwrite this to get a feature from
491
- # a group rather than from the main feature list
492
# Look up a feature of a parsed line by name.
# Exists as a separate method so that it can be overwritten to read the
# feature from a group instead of the main feature list.
def get_this(l, feature_name)
  l.get(feature_name)
end
495
-
496
- ####
497
# Check that the sentence looks like FN tab format: the first line must
# have a value for "sent_id". Only the first line is inspected.
# raises a RuntimeError otherwise
def sanity_check()
  each_line_parsed do |first_row|
    if first_row.get("sent_id").nil?
      raise "Error: corpus file does not conform to FN format."
    end
    return
  end
end
506
-
507
- ####
508
- # returns the sentence ID, a string, as set by FrameNet
509
# The sentence ID, read from the "sent_id" feature of the first line
# after sanity_check has passed.
def get_sent_id()
  sanity_check
  each_line_parsed { |first_row| return first_row.get("sent_id") }
end
515
-
516
- ####
517
- # iterator, yields each frame of the sentence as a FNTabFrame
518
- # object. They contain the complete sentence, but provide
519
- # access to exactly one frame of that sentence.
520
# Iterator: yields each frame of the sentence as an FNTabFrame object.
# Every yielded object receives all lines of the sentence and the number
# of the frame it stands for.
#
# The number of frames is read off the first line, on the assumption that
# all lines have the same number of frames. A sentence without any lines
# has no first line (read_one_line_parsed returns nil) and therefore
# yields nothing.
def each_frame()
  first_row = read_one_line_parsed(0)
  return if first_row.nil?

  num_frames = first_row.num_groups()
  0.upto(num_frames - 1) { |frame_no|
    frame_obj = FNTabFrame.new(@pattern, frame_no)
    each_line { |l| frame_obj.add_line(l) }
    yield frame_obj
  }
end
530
-
531
- ####
532
- # computes a mapping from word indices to labels on these words
533
- #
534
- # returns a hash: index_list(array:integer) -> label(string)
535
- # An entry il->label means that all the lines whose line
536
- # numbers are listed in il are labeled with label.
537
- #
538
- # Line numbers correspond to words of the sentence. Counting starts at 0.
539
- #
540
- # By default, "markables" looks for role labels, i.e. labels in the
541
- # column "role", but it can also look in another column.
542
- # To change the default, give the column name as a parameter.
543
# Collects the spans marked with B-<label> / E-<label> in one column.
#
# A cell holds "-" or "--" (no boundary), a single "B-x" or "E-x", or
# "B-x:E-x" for a one-word markable. Malformed or overlapping markup is
# reported on $stderr (a warning line plus a "Debug data" line) and skipped.
#
# @param use_this_column [String] name of the column whose markup is read
# @return [Hash] maps each list of line numbers (values of the "lineno"
#   column) to the label covering exactly those lines
def markables(use_this_column="role")
  sanity_check

  idlist_to_annotation_list = {}

  # Prints a warning as ONE line, followed by the "Debug data" line. The list
  # of ids collected so far is appended only when the caller passes it.
  complain = lambda do |message, collected = nil|
    $stderr.puts message
    debug = "Debug data: Sentence id #{get_sent_id}"
    debug += ", current ID list #{collected.join(' ')}" if collected
    $stderr.puts debug
  end

  ids = []     # line numbers of the markable that is currently open
  label = nil  # label of the open markable, nil while outside any markable

  each_line_parsed do |l|
    this_id = get_this(l, "lineno")

    this_col = get_this(l, use_this_column)
    unless this_col
      $stderr.puts "nil entry #{use_this_column} in line #{this_id} of sent #{get_sent_id}. Skipping."
      next
    end
    this_fe_ann = this_col.split(":")

    case this_fe_ann.length
    when 1 # nothing at all, or a single begin or end
      markup = this_fe_ann.first
      if markup == "-" || markup == "--"
        # no boundary here: the word belongs to the open markable, if any
        ids << this_id if label
      elsif markup =~ /^B-(\S+)$/
        opened = Regexp.last_match(1)
        if label
          # overlapping start: ignore the second label, keep the first open
          complain.call("[TabFormat] Warning: Markable #{opened} starts while within markable #{label}", ids)
        else
          label = opened
          ids << this_id
        end
      elsif markup =~ /^E-(\S+)$/
        closed = Regexp.last_match(1)
        if label == closed
          # closing the markable opened earlier: store it and reset the state
          ids << this_id
          idlist_to_annotation_list[ids] = label
          label = nil
          ids = []
        else
          complain.call("[TabFormat] Warning: Markable #{closed} closes while within markable #{label}", ids)
        end
      else
        complain.call("[TabFormat] Warning: cannot analyse markup #{markup}")
      end
    when 2 # this should be a one-word markable: "B-x:E-x"
      b_markup, e_markup = this_fe_ann
      if label
        complain.call("[TabFormat] Warning: Finding new markable at word #{this_id} while within markable #{label}", ids)
      elsif b_markup !~ /^B-(\S+)$/
        complain.call("[TabFormat] Warning: Unknown start markup #{b_markup}", ids)
      else
        b_label = b_markup[/^B-(\S+)$/, 1]
        e_label = e_markup[/^E-(\S+)$/, 1]
        if e_label.nil?
          complain.call("[TabFormat] Warning: Unknown end markup #{e_markup}", ids)
        elsif b_label == e_label
          idlist_to_annotation_list[[this_id]] = b_label
        else
          complain.call("[TabFormat] Warning: Starting markable #{b_label}, closing markable #{e_label}", ids)
        end
      end
    else
      # Reached for anything that does not split into one or two parts.
      # (Previously referenced an undefined local and raised NameError.)
      complain.call("Warning: cannot analyse markup with more than two colon-separated parts like #{this_fe_ann.join(':')}")
    end
  end

  unless label.nil?
    complain.call("[TabFormat] Warning: Markable #{label} did not end in sentence.", ids)
  end

  idlist_to_annotation_list
end
642
-
643
- #######
644
# Renders the sentence as text: the "word" entry of every parsed line,
# in iteration order, separated by single spaces.
#
# @return [String]
def to_s
  sanity_check

  words = []
  each_line_parsed do |line|
    words.push(line.get("word"))
  end

  words.join(" ")
end
652
-
653
- end
654
-
655
# One frame group of a tab-format sentence. The object holds the complete
# sentence like its parent class, but its accessors read the columns of a
# single frame group.
class FNTabFrame < FNTabSentence
  # Same as the parent constructor, plus the number of the frame group.
  #
  # @param pattern passed unchanged to the parent constructor
  # @param frameno number of the frame group this object gives access to
  def initialize(pattern, frameno)
    super(pattern)
    # Setting @group_no makes each_line_parsed() / read_one_line_parsed()
    # initialize every TabFormatNamedArgs object with this group number,
    # so that all calls to TabFormatNamedArgs.get() access the right group.
    @group_no = frameno
  end

  # Returns the frame introduced by the target word(s) of this frame group.
  #
  # @return the "frame" entry of the first parsed line, or nil when the
  #   iteration yields no line at all
  def get_frame
    sanity_check
    each_line_parsed { |l| return l.get("frame") }
    nil
  end

  # Returns the indices of the target of the frame. A target may span more
  # than one word, so several lines can qualify.
  #
  # @return [Array] the "lineno" entry of every line whose "target" entry
  #   is not "-"
  def get_target_indices
    sanity_check
    idx = []
    each_line_parsed do |l|
      idx << l.get("lineno") unless l.get("target") == "-"
    end
    idx
  end

  # Returns the target. For multiword targets the complete target is found
  # at every target index, so the first non-"-" entry is enough.
  #
  # NOTE(review): unlike get_frame and get_target_indices this method does
  # not call sanity_check; left as is because sanity_check is not visible here.
  #
  # @return the first "target" entry that is not "-", or nil when no line
  #   carries a target
  def get_target
    each_line_parsed do |l|
      t = l.get("target")
      return t unless t == "-"
    end
    nil
  end

  # Returns the target POS according to FrameNet: the word characters after
  # the dot in a target of the form "<lemma>.<pos>".
  #
  # @return [String, nil] nil when there is no target or it has another shape
  def get_target_fn_pos
    target = get_target
    return nil unless target

    Regexp.last_match(1) if target =~ /^[^\.]+\.(\w+)$/
  end
end