frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,213 @@
1
+ # Alexander Koller 2003
2
+ # extended Katrin Erk June 2003
3
+ #
4
+ # Classes that return a list of sentence DOMs, from various sources
5
+ #
6
+ # Each class in this file defines the following methods:
7
+ #
8
+ # initialize(...) "..." depends on the class
9
+ # extractDOMs() return list of all s nodes as DOM objects
10
+ # each_s() iterate over s nodes; may take less memory
11
+
12
+
13
+ require "rexml/document"
14
+
15
# Reads a complete corpus XML file into a single REXML document and hands
# out its sentence nodes (the <s> elements below /corpus/body).
class FileParser

  include REXML

  # filename: path of the corpus XML file.
  # The file is opened right away (so a missing file fails here), but it is
  # only parsed on first access to the sentences.
  def initialize(filename)
    @file = File.new(filename)
    @doc = nil
  end

  # returns an array of DOMs for the sentences
  def extractDOMs()
    ensureParsedDocument()
    @doc.get_elements("/corpus/body/s")
  end

  # Iterates over all sentence nodes, yielding each one to the block.
  # This may be more memory efficient than using extractDOMs(),
  # but isn't in this case: the whole document is parsed either way.
  def each_s()
    extractDOMs().each { |dom| yield(dom) }
  end

  # Iterates over all sentence nodes. The block passed to this
  # method should return a DOM object as a value. After the iteration
  # has been completed, the contents of /corpus/body are replaced
  # by the list of these results.
  # At the moment, this changes the FileParser object (no cloning).
  #
  # Returns the modified document.
  def process_s!()
    newBody = Element.new('body')
    each_s { |dom| newBody.add_element( yield(dom) ) }

    @doc.delete_element("/corpus/body")
    @doc.elements["corpus"].add_element(newBody)

    return @doc
  end

  private

  # Parses the file on first use and caches the resulting document.
  # The file handle is released as soon as parsing is over (successful
  # or not), so that parsers do not leak one open file each.
  def ensureParsedDocument()
    return unless @doc.nil?

    begin
      @doc = Document.new(@file)
    ensure
      @file.close()
    end
  end

end
65
+
66
+
67
+
68
+
69
+ #####################################################################
70
+
71
+
72
+
73
+
74
# Reads a corpus XML file sentence by sentence instead of parsing the whole
# document at once: only one <s> ... </s> element is held (and parsed) at
# a time.
#
# Instance state:
# @file = File object for the corpus
# @head = string up to the first <s> tag
# @tail = string after the last </s> tag
# @rest = string starting with the latest <s> tag (complete this to
#         a <s>...</s> structure by reading up to next </s> tag)
# @readCompletely = boolean specifying whether there's still something
#                   left to read in the file
class FilePartsParser

  attr_reader :head, :tail

  # filename: path of the corpus XML file.
  # Reads up to the first sentence start tag ("<s" followed by whitespace)
  # and stores everything before it in @head. If the closing body tag is
  # met first, the corpus is empty and the remaining file becomes @tail.
  def initialize(filename)
    @file = File.new(filename)
    @readCompletely = false
    # read stuff into @head and initialize @rest
    @head = ''
    # never leave @rest nil: get_rest() appends to it even when the
    # file contains no sentence at all
    @rest = ''
    begin
      while true do
        line = @file.readline()
        # lazy prefix: stop at the FIRST sentence start on this line,
        # so that further sentences on the same line are not skipped
        if line =~ /(.*?)(<s\s.*)/ then
          @head << $1
          @rest = $2
          break
        elsif line =~ /^(.*)(<\/body[\s>].*)$/
          # empty corpus
          @head << $1
          @tail = $2
          # the match above excludes the line terminator; put it back once,
          # the following lines still carry their own
          @tail << "\n" if line.end_with?("\n")
          @file.each_line { |remaining| @tail << remaining }
          @readCompletely = true
          break
        else
          @head << line
        end
      end
    rescue EOFError
      # neither a sentence nor a closing body tag was found
      @readCompletely = true
    end
  end

  # Closes the underlying corpus file.
  def close()
    @file.close()
  end

  # Returns an array with the parsed DOM of every sentence that has not
  # been read yet (empty if the file has been read completely).
  def extractDOMs()
    allDOMs = Array.new

    process_s!() { |dom|
      allDOMs.push(dom)
      # process_s! expects an element back; the result is thrown away
      REXML::Element.new("x")
    }
    return allDOMs
  end

  # Yields the parsed DOM of every sentence that has not been read yet.
  def each_s()
    process_s!() { |dom|
      yield(dom)
      # process_s! expects an element back; the result is thrown away
      REXML::Element.new("x")
    }
  end

  # This function returns the string for the modified corpus sentences.
  # It is much more memory (and probably time) efficient than
  # FileParser#process_s!, since only one sentence is parsed at a time.
  # The block that is called by the method is given an element
  # as its argument and is expected to return a changed element.
  #
  # Returns nil if the file has already been read completely.
  def process_s!()
    if @readCompletely
      return
    end

    ret = ''
    scan_s() { |element|
      # Process the <s> ... </s> element.
      # REXML is referenced explicitly: this class does not include it.
      doc = REXML::Document.new(element)
      elt = doc.root
      changedElt = yield(elt)

      changedEltAsString = ''
      changedElt.write(changedEltAsString, 0)
      ret << changedEltAsString
    }

    return ret
  end

  # KE 12.6.03: scan_s :
  # doesn't parse a sentence before yielding it
  # doesn't allow for any changes
  # but otherwise the same as process_s!
  #
  # Yields each remaining sentence as an unparsed string. Once the end of
  # the file is reached, whatever followed the last sentence is stored
  # in @tail.
  def scan_s()
    if @readCompletely
      return
    end

    begin
      while true do
        # Invariant: At this point, @rest always starts with an
        # unseen <s> tag.

        # First, we continue reading until we find the closing </s>
        # No exception should occur in this loop if we're parsing
        # a valid XML document.
        # The match is lazy so that it ends at the FIRST </s>, even if
        # several sentences share one line.
        until (sentence = /^(.*?<\/s>)(.*)/m.match(@rest))
          @rest << @file.readline()
        end

        element = sentence[1]
        @rest = sentence[2]

        yield(element) # change HERE: element not parsed!

        # Read on up to the next <s> (again the first one available)
        until (following = /(.*?)(<s\s.*)/m.match(@rest))
          @rest << @file.readline()
        end

        @rest = following[2]
      end
    rescue EOFError
      @tail = @rest
      @readCompletely = true
    end
  end

  # KE 5.11.03: get_rest: read all of the file not processed up to this point
  # and return it as a string
  def get_rest()
    @file.each_line { |line| @rest << line }
    @readCompletely = true
    return @rest
  end
end
@@ -0,0 +1,269 @@
1
+ # RegXML
2
+ #
3
+ # Katrin Erk June 2005
4
+
5
+ # SalsaTigerRegXML: take control of the data structure, no underlying xml
6
+ # representation anymore, re-generation of xml on demand
7
+
8
# Regular-expression based view on a single XML element (or a piece of
# text) that is kept as a string. Name, attributes and children are
# extracted from the string on demand; no DOM is built.
class RegXML

  # string:    string representing a single XML element (or a piece of text)
  # i_am_text: boolean: xml element (false) or text (true)
  #
  # Raises a RuntimeError if string is not a String, or if it is meant to
  # be an element but does not look like exactly one well-bracketed element.
  def initialize(string, i_am_text = false)
    # is_a? rather than an exact class comparison, so that String
    # subclasses are accepted as well
    unless string.is_a?(String)
      raise "First argument to RegXML.new must be string. I got #{string.class.to_s}"
    end

    if i_am_text
      @s = string
      @i_am_text = true
    else
      # all patterns below assume that the element sits on one line
      @s = string.gsub(/\n/, " ").freeze
      @i_am_text = false

      element_test()
      dyck_test()
    end
  end

  # Returns the stored string with a line break after every ">".
  def to_s()
    return xml_readable(@s)
  end

  # true if this object represents text rather than an XML element
  def text?
    return @i_am_text
  end

  # Returns the element name, or nil for text.
  # Raises a RuntimeError if no element name can be found.
  def name()
    # text has no name
    return nil if @i_am_text

    # xml element
    if @s =~ /^\s*<\s*([\w-]+)[\s\/>]/
      return $1
    else
      raise "Cannot parse:\n#{xml_readable(@s)}"
    end
  end

  # Returns a hash attribute name => attribute value (both strings) for
  # the start tag of this element; an empty hash for text.
  # Raises a RuntimeError if the start tag cannot be parsed.
  def attributes()
    # text has no attributes
    return {} if @i_am_text

    # remove <element_name from the beginning of @s,
    # place the rest into elt_contents:
    # it starts with a string of the form
    # - either (name=value)*>
    # - or (name=value)*/>
    unless @s =~ /^\s*<\s*#{name()}(.*)$/
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end

    elt_contents = $1
    retv = Hash.new

    # repeat until only > or /> is left at the front
    while elt_contents !~ /^\s*\/?>/

      # shave off the next name=value pair
      # put the rest into elt_contents
      # make sure that if the value is quoted with ',
      # we accept " inside the value, and vice versa.
      unless elt_contents =~ /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/
        raise "Cannot parse:\n #{xml_readable(elt_contents)}"
      end
      retv[$1] = $3
      elt_contents = $4
    end

    return retv
  end

  # Returns an array of RegXML objects, one for each child element and
  # each non-blank stretch of text directly below this element, in
  # document order. Text and unary elements (<bla/>) have no children.
  # Raises a RuntimeError if the content cannot be parsed.
  def children_and_text()
    return [] if @i_am_text

    # <bla/>, no children
    return [] if unary_element()

    # @s has the form <bla...> ... </bla>.
    # remove <bla ...> from the beginning of @s,
    # place the rest up to </bla> into children_s:
    mainname = name()
    unless @s =~ /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end

    retv = Array.new
    children_s = $3

    # repeat until only whitespace is left
    while children_s !~ /^\s*$/

      # shave off the next bit of text
      # put the rest into children_s
      unless children_s =~ /^\s*(.*?)(<.*$|$)/
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      text = $1
      after_text = $2
      unless text.strip.empty?
        children_s = after_text
        retv << RegXML.new(text, true)
      end

      # anything left after we've parsed text?
      # (whitespace class \s, not a literal "s")
      if children_s =~ /^\s*$/
        break
      end

      # shave off the next child
      # and put the rest into children_s

      # determine the next child's name, and the string index at which
      # the element start tag ends with either / or >
      unless children_s =~ /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      childname = $2
      child = $1
      endofelt_ix = $&.length()

      # and remove it
      case children_s[endofelt_ix..-1]
      when /^\/>(.*)$/
        # next child is a unary element
        children_s = $1
        retv << RegXML.new(child + "/>")

      when /^(>.*?<\s*\/\s*#{childname}\s*>)(.*)$/
        # next child runs up to the first closing tag with its name
        children_s = $2
        retv << RegXML.new(child + $1)

      else
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n#{xml_readable(children_s)}"
      end
    end

    return retv
  end

  # Manual smoke test: prints name, readable form, attributes and
  # children of two sample elements to stdout.
  def RegXML.test()
    nested_sample = " <bla blupp='a\"b'
      lalala=\"c\">
      <lalala> </lalala>
      texttext
      <lala blupp='b'/>
      nochtext
      <la> <l/> </la>
      </ bla >
      "
    unary_sample = " < bla blupp='a\"'/> "

    [nested_sample, unary_sample].each_with_index { |sample, ix|
      puts "NEU" if ix > 0

      bla = RegXML.new(sample)
      puts "name " + bla.name()
      puts
      puts bla.to_s()
      puts
      bla.attributes.each { |attr, val|
        puts "attr " + attr + "=" + val
      }
      puts
      bla.children_and_text.each { |child_obj|
        if child_obj.text?
          puts "da text " + child_obj.to_s
        else
          puts "da child " + child_obj.to_s
        end
      }
      puts
    }
  end

  ##############
  protected

  # true if @s has the form <bla/>, i.e. starts with < and ends with />
  def unary_element()
    if @s =~ /^\s*<.*\/>\s*$/
      return true
    else
      return false
    end
  end

  # make sure we have a single XML element, either <bla/> or
  # <bla>...</bla>; raises a RuntimeError otherwise
  def element_test()
    # <bla/>
    return if unary_element()

    # <bla > ... </bla>
    return if @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/

    raise "Cannot parse:\n #{xml_readable(@s)}"
  end

  # Bracket balance check; raises a RuntimeError on failure.
  def dyck_test()
    # every prefix of @s must have at least as many < as >
    opening = 0
    closing = 0
    @s.scan(/[<>]/) { |bracket|
      case bracket
      when "<"
        opening += 1
      when ">"
        closing += 1
        if closing > opening
          raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
        end
      end
    }

    # and in total, @s must have equally many < and >
    unless @s.count("<") == @s.count(">")
      raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
    end
  end

  # Returns a copy of string with a line break after every ">".
  def xml_readable(string)
    return string.gsub(/>/, ">\n")
  end
end
267
+
268
+ # RegXML.test()
269
+