frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,195 @@
1
+ # sp 29 07 04
2
+ # "optimise" c4.5 files by replacing all feature values which only
3
+ # occur with one label by a new, common value.
4
+ #
5
+ # two modes of operation:
6
+ # optimise <file> -- optimise file and store optimisations in <file>.opts
7
+ # optimise <file> <file.opts> -- apply optimisation from file.opts to file
8
+
9
# Collapses feature values in comma-separated feature files (one instance
# per line, the label in the last column): every feature value that has
# only ever been seen together with a single label is replaced by a common
# synthetic value "f<feature index>_<label without dots>".
#
# Typical use:
#   opt = Optimise.new
#   opt.init_from_data(file)      # learn replacements from a data file
#   opt.store(optsfile)           # ... and remember them
#   opt.apply(file, outfile)      # rewrite a file using the replacements
# or
#   opt.init_from_file(optsfile)  # re-use stored replacements
#   opt.apply(file, outfile)
class Optimise

  def initialize
    @ready = false
    # one Hash per column of the data file: old feature value -> new value
    @replacements = []
  end

  # Compute a new optimisation from the data file named +infile+.
  #
  # The replacement table gets one entry per column of the first line,
  # i.e. including the label column (whose entry always stays empty);
  # apply() relies on this when it checks file compatibility.
  # An empty file yields an empty replacement table.
  def init_from_data(infile) # find new optimisation
    STDERR.puts "[Optimise] computing new feature optimisation"

    labels = []
    features = nil # for each feature: array of feature values from file
    @replacements = []

    File.open(infile) do |file|
      file.each_line do |line|
        tokens = line.chomp.split(",")

        if features.nil? # first line: initialisation
          features = Array.new(tokens.length) { [] }
          @replacements = Array.new(tokens.length) { {} }
        end

        labels << tokens.pop
        tokens.each_with_index { |fval, i| features[i] << fval }
      end
    end

    (features || []).each_with_index do |fvalues, findex|
      fval_label_map(fvalues, labels).each_pair do |fval, label|
        # label is false for values seen with more than one label
        new_fval = label ? "f#{findex}_#{label.delete('.')}" : nil

        # debugging output for the "[U]" value; must not fail when the
        # value is ambiguous (label == false)
        puts "[U]: #{label} #{new_fval}" if fval == "[U]"

        @replacements[findex][fval] = new_fval if label
      end
    end

    @ready = true
  end

  # Load an optimisation previously written by store() from +optsfile+.
  def init_from_file(optsfile) # use old optimisation
    @replacements = File.open(optsfile) { |optsinfile| read(optsinfile) }
    @ready = true
  end

  # Write the data necessary to recreate this optimisation to +outfilename+.
  # Format: for each column i a section "<i>" ... "</i>" containing one
  # "old<TAB>new" line per replacement.
  # Raises RuntimeError if the object has not been initialised.
  def store(outfilename) # store data necessary to recreate optimisation
    unless @ready
      raise "[Optimise] Error: Cannot store un-initialised optimisation"
    end

    File.open(outfilename, "w") do |outfile|
      @replacements.each_with_index do |reps, i| # for each feature
        outfile.puts "<#{i}>"
        reps.each_pair { |old, new| outfile.puts [old, new].join("\t") }
        outfile.puts "</#{i}>"
      end
    end
  end

  # Apply the optimisation to +infilename+ and write the result to
  # +outfilename+. Raises RuntimeError if the object has not been
  # initialised or if a line does not have as many columns as the
  # replacement table.
  def apply(infilename, outfilename)
    unless @ready
      raise "[Optimise] Error: Cannot apply un-initialised optimisation"
    end

    STDERR.puts "[Optimise] applying feature optimisation"

    File.open(infilename) do |infile|
      File.open(outfilename, "w") do |outfile|
        infile.each_line do |line|
          tokens = line.chomp.split(",")

          unless tokens.length == @replacements.length
            raise "[Optimise] Error: trying to optimise incompatible feature file!\nFile has " +
                  tokens.length.to_s + " features, and we know replacements for " +
                  @replacements.length.to_s + " features."
          end

          label = tokens.pop
          optimised = tokens.each_with_index.map do |fval, f_idx|
            @replacements[f_idx].fetch(fval, fval)
          end
          outfile.puts((optimised << label).join(","))
        end
      end
    end
  end

  # return recommended filename to store optimisation patterns for basefile
  def self.recommended_filename(basefile)
    basefile + ".optimisations"
  end

  private

  # Map each feature value in +fvalues+ to
  #   <label>  if it has been seen with exactly one label,
  #   false    if it has been seen with two or more different labels.
  # +labels+ holds the label of each instance, parallel to +fvalues+.
  def fval_label_map(fvalues, labels)
    mapping = {}
    fvalues.each_with_index do |fval, inst_idx|
      label = labels[inst_idx]
      seen_label = mapping[fval]
      if seen_label.nil?
        mapping[fval] = label
      elsif seen_label && seen_label != label
        mapping[fval] = false
      end
    end
    mapping
  end

  # Parse the format written by store() from the open IO +infile+ and
  # return the replacement table (Array of Hashes). The caller owns and
  # closes the IO.
  def read(infile)
    replacements = []
    reps = nil

    infile.each_line do |line|
      line = line.chomp
      case line
      when /\A<(\d+)>\z/ # start of a section
        reps = {}
      when /\A<\/(\d+)>\z/ # end of a section: file the collected pairs
        replacements[Regexp.last_match(1).to_i] = reps
      else
        if reps.nil?
          raise "[Optimise] Error: malformed optimisation file, replacement outside of section: " + line
        end
        old, new = line.split("\t")
        reps[old] = new
      end
    end

    replacements
  end

end
@@ -0,0 +1,213 @@
1
+ # Alexander Koller 2003
2
+ # extended Katrin Erk June 2003
3
+ #
4
+ # Classes that return a list of sentence DOMs, from various sources
5
+ #
6
+ # Each class in this file defines the following methods:
7
+ #
8
+ # initialize(...) "..." depends on the class
9
+ # extractDOMs() return list of all s nodes as DOM objects
10
+ # each_s() iterate over s nodes; may take less memory
11
+
12
+
13
+ require "rexml/document"
14
+
15
# Reads a whole corpus file into a REXML document and hands out its
# sentence nodes (/corpus/body/s).
class FileParser

  include REXML

  # filename: path of the XML corpus file. The file is opened right away
  # (so a missing file is reported here) but parsed only on first access.
  def initialize(filename)
    @file = File.new(filename)
    @doc = nil
  end

  # returns an array of DOMs for the sentences
  def extractDOMs()
    ensureParsedDocument()
    @doc.get_elements("/corpus/body/s")
  end

  # Iterates over all sentence nodes. This may be more memory
  # efficient than using extractDOMs(), but isn't in this case.
  def each_s()
    extractDOMs().each { |dom| yield(dom) }
  end

  # Iterates over all sentence nodes. The block passed to this
  # method should return a DOM object as a value. After the iteration
  # has been completed, the contents of /corpus/body are then replaced
  # by the list of these results.
  # At the moment, this changes the FileParser object. This should
  # probably change in the future, but I don't want to mess with
  # cloning now.
  # Returns the modified document.
  def process_s!()
    new_body = Element.new('body')
    each_s { |dom| new_body.add_element(yield(dom)) }

    @doc.delete_element("/corpus/body")
    @doc.elements["corpus"].add_element(new_body)

    @doc
  end

  private

  # Parse the file on first use. The document is built completely by
  # Document.new, so the file handle is released as soon as parsing is
  # over (it used to stay open for the lifetime of the object).
  def ensureParsedDocument()
    return unless @doc.nil?

    begin
      @doc = Document.new(@file)
    ensure
      @file.close
    end
  end

end
65
+
66
+
67
+
68
+
69
+ #####################################################################
70
+
71
+
72
+
73
+
74
# Reads a corpus file sentence by sentence instead of parsing it as a whole.
#
# @file = File object for the corpus
# @head = string up to the first <s> tag
# @tail = string after the last </s> tag
# @rest = string starting with the latest <s> tag (complete this to
#         a <s>...</s> structure by reading up to next </s> tag)
# @readCompletely = boolean specifying whether there's still something
#         left to read in the file
class FilePartsParser

  attr_reader :head, :tail

  # filename: path of the corpus file. Reads the file up to the first
  # <s ...> tag (or, for a corpus without sentences, to the end).
  def initialize(filename)
    @file = File.new(filename)
    @readCompletely = false
    @rest = nil
    @tail = nil
    # read stuff into @head and initialize @rest
    @head = +''
    begin
      loop do
        line = @file.readline
        if line =~ /(.*)(<s\s.*)/
          @head << $1
          @rest = $2
          break
        elsif line =~ /^(.*)(<\/body[\s>].*)$/
          # empty corpus: everything from </body> on is the tail
          @head << $1
          @tail = $2
          while (line = @file.gets)
            @tail << "\n" + line
          end
          @readCompletely = true
          break
        else
          @head << line
        end
      end
    rescue EOFError
      # neither a sentence nor </body> found
      @readCompletely = true
    end
  end

  def close()
    @file.close()
  end

  # Returns an array with the parsed DOM of every remaining sentence.
  def extractDOMs()
    all_doms = []

    process_s!() { |dom|
      all_doms.push(dom)
      # process_s! wants an element back; the result string is discarded
      REXML::Element.new("x")
    }
    all_doms
  end

  # Yields the parsed DOM of every remaining sentence.
  def each_s()
    process_s!() { |dom|
      yield(dom)
      # process_s! wants an element back; the result string is discarded
      REXML::Element.new("x")
    }
  end

  # This function returns the string for the modified sentences.
  # It consumes the rest of the file (afterwards @tail is set and further
  # calls return nil), but keeps only one sentence in memory at a time and
  # is therefore much more memory (and probably time) efficient than
  # FileParser#process_s!.
  # The block that is called by the method is given an element
  # as its argument and is expected to return a changed element.
  def process_s!()
    return if @readCompletely

    ret = +''
    scan_s() { |element|
      # Process the <s> ... </s> element
      elt = REXML::Document.new(element).root
      changed_elt = yield(elt)

      changed_elt_as_string = +''
      changed_elt.write(changed_elt_as_string, 0)
      ret << changed_elt_as_string
    }

    ret
  end

  # KE 12.6.03: scan_s :
  #   doesn't parse a sentence before yielding it
  #   doesn't allow for any changes
  # but otherwise the same as process_s!
  def scan_s()
    return if @readCompletely

    begin
      loop do
        # Invariant: At this point, @rest always starts with an
        # unseen <s> tag.

        # First, we continue reading until we find the closing </s>
        # No exception should occur in this loop if we're parsing
        # a valid XML document.
        while @rest !~ /^(.*<\/s>)(.*)/m
          @rest << @file.readline
        end

        element = $1
        @rest = $2

        yield(element) # change HERE: element not parsed!

        # Read on up to the next <s>; reaching the end of the file here
        # is the regular way out of the loop
        while @rest !~ /(.*)(<s\s.*)/m
          @rest << @file.readline
        end

        @rest = $2
      end
    rescue EOFError
      @tail = @rest
      @readCompletely = true
    end
  end

  # KE 5.11.03: get_rest: read all of the file not processed up to this point
  # and return it as a string
  def get_rest()
    begin
      loop do
        @rest = @rest << @file.readline
      end
    rescue EOFError
      @readCompletely = true
    end
    @rest
  end
end
@@ -0,0 +1,269 @@
1
+ # RegXML
2
+ #
3
+ # Katrin Erk June 2005
4
+
5
+ # SalsaTigerRegXML: take control of the data structure, no underlying xml
6
+ # representation anymore, re-generation of xml on demand
7
+
8
+ class RegXML
9
+
10
+ def initialize(string, # string representing a single XML element
11
+ i_am_text = false) # boolean: xml element (false) or text (true)
12
+
13
+ unless string.class == String
14
+ raise "First argument to RegXML.new must be string. I got #{string.class.to_s}"
15
+ end
16
+ if i_am_text
17
+ @s = string
18
+ @i_am_text = true
19
+ else
20
+ @s = string.gsub(/\n/, " ").freeze
21
+ @i_am_text = false
22
+
23
+ element_test()
24
+ dyck_test()
25
+ end
26
+ end
27
+
28
+ def to_s()
29
+ return xml_readable(@s)
30
+ end
31
+
32
+ def text?
33
+ return @i_am_text
34
+ end
35
+
36
+ def name()
37
+ if @i_am_text
38
+ # text
39
+ return nil
40
+
41
+ else
42
+ # xml element
43
+ if @s =~ /^\s*<\s*([\w-]+)[\s\/>]/
44
+ return $1
45
+ else
46
+ raise "Cannot parse:\n#{xml_readable(@s)}"
47
+ end
48
+ end
49
+ end
50
+
51
+ def attributes()
52
+ if @i_am_text
53
+ # text
54
+ return {}
55
+
56
+ else
57
+ # xml element
58
+
59
+ # remove <element_name from the beginning of @s,
60
+ # place the rest up to the first > into elt_contents:
61
+ # this is a string of the form
62
+ # - either (name=value)*
63
+ # - or (name=value)*/
64
+ unless @s =~ /^\s*<\s*#{name()}(.*)$/
65
+ raise "Cannot parse:\n #{xml_readable(@s)}"
66
+ end
67
+
68
+ retv = Hash.new
69
+ elt_contents = $1
70
+
71
+ # repeat until only > or /> is left
72
+ while elt_contents !~ /^\s*\/?>/
73
+
74
+ # shave off the next name=value pair
75
+ # put the rest into elt_contents
76
+ # make sure that if the value is quoted with ',
77
+ # we accept " inside the value, and vice versa.
78
+ unless elt_contents =~ /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/
79
+ raise "Cannot parse:\n #{xml_readable(elt_contents)}"
80
+ end
81
+ retv[$1] = $3
82
+ elt_contents = $4
83
+ end
84
+
85
+ return retv
86
+ end
87
+ end
88
+
89
+ def children_and_text()
90
+ if @i_am_text
91
+ return []
92
+
93
+ else
94
+ if unary_element()
95
+ # <bla/>, no children
96
+ return []
97
+ end
98
+
99
+ # @s has the form <bla...> ... </bla>.
100
+ # remove <bla ...> from the beginning of @s,
101
+ # place the rest up to </bla> into children_s:
102
+
103
+ mainname = name()
104
+ unless @s =~ /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/
105
+ raise "Cannot parse:\n #{xml_readable(@s)}"
106
+ end
107
+
108
+ retv = Array.new
109
+ children_s = $3
110
+
111
+ # repeat until only whitespace is left
112
+ while children_s !~ /^\s*$/
113
+
114
+ # shave off the next bit of text
115
+ # put the rest into children_s
116
+ unless children_s =~ /^\s*(.*?)(<.*$|$)/
117
+ $stderr.puts "Whole was:\n #{xml_readable(@s)}"
118
+ $stderr.puts
119
+ raise "Cannot parse:\n #{xml_readable(children_s)}"
120
+ end
121
+ unless $1.strip.empty?
122
+ children_s = $2
123
+ retv << RegXML.new($1, true)
124
+ end
125
+
126
+ # anything left after we've parsed text?
127
+ if children_s =~ /^s*$/
128
+ break
129
+ end
130
+
131
+ # shave off the next child
132
+ # and put the rest into children_s
133
+
134
+ # determine the next child's name, and the string index at which
135
+ # the element start tag ends with either / or >
136
+ unless children_s =~ /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/
137
+ $stderr.puts "Whole was:\n #{xml_readable(@s)}"
138
+ $stderr.puts
139
+ raise "Cannot parse:\n #{xml_readable(children_s)}"
140
+ end
141
+ childname = $2
142
+ child = $1
143
+ endofelt_ix = $&.length()
144
+
145
+
146
+ # and remove it
147
+ case children_s[endofelt_ix..-1]
148
+ when /^\/>(.*)$/
149
+ # next child is a unary element
150
+ children_s = $1
151
+ retv << RegXML.new(child + "/>")
152
+
153
+ when /^(>.*?<\s*\/\s*#{childname}\s*>)(.*)$/
154
+ children_s = $2
155
+ retv << RegXML.new(child + $1)
156
+
157
+ else
158
+ $stderr.puts "Whole was:\n #{xml_readable(@s)}"
159
+ $stderr.puts
160
+ raise "Cannot parse:\n#{xml_readable(children_s)}"
161
+ end
162
+ end
163
+
164
+ return retv
165
+ end
166
+ end
167
+
168
+ def RegXML.test()
169
+ bla = RegXML.new(" <bla blupp='a\"b'
170
+ lalala=\"c\">
171
+ <lalala> </lalala>
172
+ texttext
173
+ <lala blupp='b'/>
174
+ nochtext
175
+ <la> <l/> </la>
176
+ </ bla >
177
+ ")
178
+ puts "name " + bla.name()
179
+ puts
180
+ puts bla.to_s()
181
+ puts
182
+ bla.attributes.each { |attr, val|
183
+ puts "attr " + attr + "=" + val
184
+ }
185
+ puts
186
+ bla.children_and_text.each { |child_obj|
187
+ if child_obj.text?
188
+ puts "da text " + child_obj.to_s
189
+ else
190
+ puts "da child " + child_obj.to_s
191
+ end
192
+ }
193
+ puts
194
+
195
+ puts "NEU"
196
+ bla = RegXML.new(" < bla blupp='a\"'/> ")
197
+ puts "name " + bla.name()
198
+ puts
199
+ puts bla.to_s()
200
+ puts
201
+ bla.attributes.each { |attr, val|
202
+ puts "attr " + attr + "=" + val
203
+ }
204
+ puts
205
+ bla.children_and_text.each { |child_obj|
206
+ if child_obj.text?
207
+ puts "da text " + child_obj.to_s
208
+ else
209
+ puts "da child " + child_obj.to_s
210
+ end
211
+ }
212
+ puts
213
+
214
+ end
215
+
216
+ ##############
217
+ protected
218
+
219
+ def unary_element()
220
+ # <bla/>
221
+ if @s =~ /^\s*<.*\/>\s*$/
222
+ return true
223
+ else
224
+ return false
225
+ end
226
+ end
227
+
228
+ def element_test()
229
+ # make sure we have a single XML element, either <bla/> or
230
+ # <bla>...</bla>
231
+
232
+ if unary_element()
233
+ # <bla/>
234
+ elsif @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/
235
+ # <bla > ... </bla>
236
+ else
237
+ raise "Cannot parse:\n #{xml_readable(@s)}"
238
+ end
239
+ end
240
+
241
+ def dyck_test()
242
+ # every prefix of @s must have at least as many < as >
243
+ opening = 0
244
+ closing = 0
245
+ @s.scan(/[<>]/) { |bracket|
246
+ case bracket
247
+ when "<"
248
+ opening += 1
249
+ when ">"
250
+ closing += 1
251
+ if closing > opening
252
+ raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
253
+ end
254
+ end
255
+ }
256
+
257
+ # and in total, @s must have equally many < and >
258
+ unless @s.count("<") == @s.count(">")
259
+ raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
260
+ end
261
+ end
262
+
263
+ def xml_readable(string)
264
+ return string.gsub(/>/, ">\n")
265
+ end
266
+ end
267
+
268
+ # RegXML.test()
269
+