frprep 0.0.1.prealpha
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,195 @@
|
|
1
|
+
# sp 29 07 04
|
2
|
+
# "optimise" c4.5 files by replacing all feature values which only
|
3
|
+
# occur with one label by a new, common value.
|
4
|
+
#
|
5
|
+
# two modes of operation:
|
6
|
+
# optimise <file> -- optimise file and store optimisations in <file>.opts
|
7
|
+
# optimise <file> <file.opts> -- apply optimisation from file.opts to file
|
8
|
+
|
9
|
+
# "Optimises" comma-separated feature files (one instance per line, label in
# the last column): every feature value that co-occurs with exactly one label
# is replaced by a common value named "f<feature index>_<label without dots>".
#
# Usage:
#   opt = Optimise.new
#   opt.init_from_data(file)       # compute replacements from a data file, or
#   opt.init_from_file(optsfile)   # reload replacements written by #store
#   opt.store(optsfile)            # persist the replacements
#   opt.apply(infile, outfile)     # rewrite a feature file
class Optimise

  def initialize
    @ready = false
  end

  # Compute a new optimisation from the data in the file named +infile+.
  #
  # @replacements gets one hash per column of the first line (the label
  # column included, whose hash stays empty), so that #apply can compare the
  # column count of a file against @replacements.length.
  def init_from_data(infile) # find new optimisation
    STDERR.puts "[Optimise] computing new feature optimisation"

    labels = []
    features = nil     # for each feature: array of feature values from file
    @replacements = [] # for each feature: hash old value -> new value

    File.foreach(infile) do |line|
      values = line.chomp.split(",")

      if features.nil? # first line: initialisation
        features = Array.new(values.length) { [] }
        @replacements = Array.new(values.length) { {} }
      end

      labels << values.pop
      values.each_with_index { |value, i| features[i] << value }
    end

    # an empty data file yields no features and hence no replacements
    (features || []).each_with_index do |fvalues, findex|
      # fval -> label mapping:
      #   value not seen yet            : nil
      #   seen with exactly one label   : <label>
      #   seen with more than one label : false
      fval_to_label = {}
      fvalues.each_with_index do |fval, inst_idx|
        label = labels[inst_idx]
        seen_label = fval_to_label[fval]
        if seen_label.nil?
          fval_to_label[fval] = label
        elsif seen_label && seen_label != label
          fval_to_label[fval] = false
        end
      end

      # record all feature values for which we have only seen one label
      fval_to_label.each_pair do |fval, label|
        new_fval = label ? "f#{findex}_#{label.gsub(/\./, "")}" : nil

        # diagnostic output for the "[U]" value; interpolation keeps this
        # from raising when the value occurred with several labels
        puts "[U]: #{label} #{new_fval}" if fval == "[U]"

        @replacements[findex][fval] = new_fval if label
      end
    end

    @ready = true
  end

  # Re-use an optimisation previously written by #store to the file named
  # +optsfile+.
  def init_from_file(optsfile) # use old optimisation
    @replacements = File.open(optsfile) { |optsinfile| read(optsinfile) }
    @ready = true
  end

  # Store the data necessary to recreate the optimisation in the file named
  # +outfilename+. For each feature index i the file contains a line "<i>",
  # one "old<TAB>new" line per replacement, and a line "</i>".
  #
  # Raises a RuntimeError if no optimisation has been initialised.
  def store(outfilename)
    unless @ready
      raise "[Optimise] Error: Cannot store un-initialised optimisation"
    end

    File.open(outfilename, "w") do |outfile|
      @replacements.each_with_index do |reps, i| # for each feature
        outfile.puts "<#{i}>"
        reps.each_pair do |old_fval, new_fval|
          outfile.puts [old_fval, new_fval].join("\t")
        end
        outfile.puts "</#{i}>"
      end
    end
  end

  # Apply the optimisation to the file named +infilename+ and write the
  # result to +outfilename+. The last column of each line (the label) is
  # copied unchanged.
  #
  # Raises a RuntimeError if no optimisation has been initialised, or if a
  # line does not have as many columns as there are replacement tables.
  def apply(infilename, outfilename)
    unless @ready
      raise "[Optimise] Error: Cannot apply un-initialised optimisation"
    end

    STDERR.puts "[Optimise] applying feature optimisation"

    File.open(outfilename, "w") do |outfile|
      File.foreach(infilename) do |line|
        tokens = line.chomp.split(",")

        unless tokens.length == @replacements.length
          raise "[Optimise] Error: trying to optimise incompatible feature file!\nFile has "+tokens.length.to_s+" features, and we know replacements for "+@replacements.length.to_s+" features."
        end

        label = tokens.pop
        mapped = tokens.each_with_index.map do |fval, f_idx|
          @replacements[f_idx].fetch(fval, fval)
        end
        outfile.puts((mapped << label).join(","))
      end
    end
  end

  # return recommended filename to store optimisation patterns for basefile
  def Optimise.recommended_filename(basefile)
    return basefile+".optimisations"
  end

  private

  # Parse the format written by #store from the open IO +infile+ and return
  # the array of replacement hashes. The caller owns (and closes) +infile+.
  #
  # The tag patterns are anchored to the whole line so that a replacement
  # line whose values merely contain "<3>" is not mistaken for a tag.
  def read(infile)
    replacements = []
    reps = nil

    infile.each_line do |raw_line|
      line = raw_line.chomp
      case line
      when /\A<(\d+)>\z/
        reps = {}
      when /\A<\/(\d+)>\z/
        replacements[$1.to_i] = reps
      else
        if reps.nil?
          raise "[Optimise] Error: malformed optimisation file, replacement found before any <n> tag"
        end
        old_fval, new_fval = line.split("\t")
        reps[old_fval] = new_fval
      end
    end

    replacements
  end

end
|
@@ -0,0 +1,213 @@
|
|
1
|
+
# Alexander Koller 2003
|
2
|
+
# extended Katrin Erk June 2003
|
3
|
+
#
|
4
|
+
# Classes that return a list of sentence DOMs, from various sources
|
5
|
+
#
|
6
|
+
# Each class in this file defines the following methods:
|
7
|
+
#
|
8
|
+
# initialize(...) "..." depends on the class
|
9
|
+
# extractDOMs() return list of all s nodes as DOM objects
|
10
|
+
# each_s() iterate over s nodes; may take less memory
|
11
|
+
|
12
|
+
|
13
|
+
require "rexml/document"
|
14
|
+
|
15
|
+
# Reads a whole corpus file into one REXML document and hands out its
# sentence nodes (the elements matching /corpus/body/s).
class FileParser

  include REXML

  # filename: path of the XML corpus file. The file is opened here but only
  # parsed on first access to the sentences.
  def initialize(filename)
    @file = File.new(filename)
    @doc = nil
  end

  # returns an array of DOMs for the sentences
  def extractDOMs()
    ensureParsedDocument()
    @doc.get_elements("/corpus/body/s")
  end

  # Iterates over all sentence nodes. This may be more memory
  # efficient than using extractDOMs(), but isn't in this case.
  def each_s()
    extractDOMs().each { |dom| yield(dom) }
  end

  # Iterates over all sentence nodes. The block passed to this
  # method should return a DOM object as a value. After the iteration
  # has been completed, the contents of /corpus/body are replaced
  # by the list of these results.
  # This changes the document held by the FileParser object; the changed
  # document is returned.
  def process_s!()
    newBody = Element.new('body')
    each_s { |dom| newBody.add_element( yield(dom) ) }

    @doc.delete_element("/corpus/body")
    @doc.elements["corpus"].add_element(newBody)

    return @doc
  end

  private

  # Parse the file on first use. The file handle is closed as soon as the
  # document has been built (or parsing has failed), so that it is not kept
  # open for the lifetime of the object.
  def ensureParsedDocument()
    return unless @doc.nil?

    begin
      @doc = Document.new(@file)
    ensure
      @file.close
    end
  end

end
|
65
|
+
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
#####################################################################
|
70
|
+
|
71
|
+
|
72
|
+
|
73
|
+
|
74
|
+
# Reads a corpus file sentence by sentence instead of parsing it as a whole.
#
# Instance variables:
# @file = File object for the corpus
# @head = string up to the first <s> tag
# @tail = string after the last </s> tag
# @rest = string starting with the latest <s> tag (complete this to
#         a <s>...</s> structure by reading up to next </s> tag)
# @readCompletely = boolean specifying whether there's still something
#         left to read in the file
class FilePartsParser

  attr_reader :head, :tail

  # filename: path of the corpus file.
  # Reads up to the first <s ...> tag: everything before it goes into @head,
  # the remainder of that line into @rest. If a </body> tag is met first, the
  # corpus is empty: the remainder of the file goes into @tail.
  def initialize(filename)
    @file = File.new(filename)
    @readCompletely = false
    # read stuff into @head and initialize @rest
    @head = ''
    begin
      loop do
        line = @file.readline()
        # non-greedy prefix: stop at the FIRST <s ...> on the line, so that
        # no sentence ends up in @head when several share a line
        if line =~ /(.*?)(<s\s.*)/
          @head << $1
          @rest = $2
          break
        elsif line =~ /\A(.*?)(<\/body[\s>].*)\z/m
          # empty corpus: keep the remainder of the file verbatim
          @head << $1
          @tail = $2 + @file.read
          @readCompletely = true
          break
        else
          @head << line
        end
      end
    rescue EOFError
      @readCompletely = true
    end
  end

  def close()
    @file.close()
  end

  # Returns an array with the parsed sentence elements that have not been
  # read yet.
  def extractDOMs()
    allDOMs = Array.new

    process_s!() { |dom|
      allDOMs.push(dom)
      REXML::Element.new("x")
    }
    return allDOMs
  end

  # Yields each not yet read sentence as a parsed element.
  def each_s()
    process_s!() { |dom|
      yield(dom)
      REXML::Element.new("x")
    }
  end

  # This function returns the string for the modified corpus sentences.
  # It does not build a document for the whole corpus; each sentence is
  # parsed on its own.
  # The block that is called by the method is given an element
  # as its argument and is expected to return a changed element.
  # Returns nil if the file has already been read completely.
  def process_s!()
    return if @readCompletely

    ret = ''
    scan_s() { |element|
      # Process the <s> ... </s> element
      elt = REXML::Document.new(element).root
      changedElt = yield(elt)

      changedEltAsString = ''
      changedElt.write(changedEltAsString, 0)
      ret << changedEltAsString
    }

    return ret
  end

  # KE 12.6.03: scan_s :
  #  doesn't parse a sentence before yielding it (the block gets a string)
  #  doesn't allow for any changes
  # but otherwise the same as process_s!
  # When the end of the file is reached, the unread remainder becomes @tail.
  def scan_s()
    return if @readCompletely

    begin
      loop do
        # Invariant: At this point, @rest always starts with an
        # unseen <s> tag.

        # First, we continue reading until we find the closing </s>.
        # No exception should occur in this loop if we're parsing
        # a valid XML document. The match is non-greedy so that only ONE
        # sentence is cut off even if @rest holds several </s> tags.
        @rest << @file.readline() until @rest =~ /\A(.*?<\/s>)(.*)\z/m

        element = $1
        @rest = $2

        yield(element) # element not parsed!

        # Read on up to the next <s> (non-greedy: the next one, not the last)
        @rest << @file.readline() until @rest =~ /(.*?)(<s\s.*)/m

        @rest = $2
      end
    rescue EOFError
      @tail = @rest
      @readCompletely = true
    end
  end

  # KE 5.11.03: get_rest: read all of the file not processed up to this point
  # and return it as a string
  def get_rest()
    begin
      loop do
        @rest << @file.readline()
      end
    rescue EOFError
      @readCompletely = true
    end
    return @rest
  end
end
|
@@ -0,0 +1,269 @@
|
|
1
|
+
# RegXML
|
2
|
+
#
|
3
|
+
# Katrin Erk June 2005
|
4
|
+
|
5
|
+
# SalsaTigerRegXML: take control of the data structure, no underlying xml
|
6
|
+
# representation anymore, re-generation of xml on demand
|
7
|
+
|
8
|
+
# Regular-expression based view on a string that holds either a single XML
# element or a piece of text. Name, attributes and children are extracted
# from the string on demand.
#
# Limitation visible in children_and_text: the end of a child element is
# found by a non-greedy search for the first closing tag of the same name,
# so a child that contains an element of its own name is not split correctly.
class RegXML

  def initialize(string,            # string representing a single XML element
                 i_am_text = false) # boolean: xml element (false) or text (true)

    unless string.is_a?(String)
      raise "First argument to RegXML.new must be string. I got #{string.class.to_s}"
    end
    if i_am_text
      @s = string
      @i_am_text = true
    else
      # newlines are replaced by spaces so that the line-based patterns
      # below see the whole element
      @s = string.gsub(/\n/, " ").freeze
      @i_am_text = false

      element_test()
      dyck_test()
    end
  end

  # The underlying string, with a newline inserted after every ">".
  def to_s()
    return xml_readable(@s)
  end

  def text?
    return @i_am_text
  end

  # Element name, or nil for text.
  # Raises a RuntimeError if no element name can be found.
  def name()
    # text
    return nil if @i_am_text

    # xml element
    return $1 if @s =~ /^\s*<\s*([\w-]+)[\s\/>]/

    raise "Cannot parse:\n#{xml_readable(@s)}"
  end

  # Hash attribute name -> attribute value (empty for text).
  # Raises a RuntimeError if the start tag cannot be parsed.
  def attributes()
    # text
    return {} if @i_am_text

    # xml element

    # remove <element_name from the beginning of @s,
    # place the rest into elt_contents:
    # this is a string starting with
    # - either (name=value)* >
    # - or (name=value)* />
    unless @s =~ /^\s*<\s*#{name()}(.*)$/
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end

    retv = Hash.new
    elt_contents = $1

    # repeat until > or /> is next
    while elt_contents !~ /^\s*\/?>/

      # shave off the next name=value pair
      # put the rest into elt_contents
      # make sure that if the value is quoted with ',
      # we accept " inside the value, and vice versa.
      unless elt_contents =~ /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/
        raise "Cannot parse:\n #{xml_readable(elt_contents)}"
      end
      retv[$1] = $3
      elt_contents = $4
    end

    return retv
  end

  # Array of RegXML objects, one per child element and per stretch of
  # non-whitespace text, in document order. Empty for text and for unary
  # elements. Raises a RuntimeError if the contents cannot be parsed.
  def children_and_text()
    return [] if @i_am_text

    # <bla/>, no children
    return [] if unary_element()

    # @s has the form <bla...> ... </bla>.
    # remove <bla ...> from the beginning of @s,
    # place the rest up to </bla> into children_s:

    mainname = name()
    unless @s =~ /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end

    retv = Array.new
    children_s = $3

    # repeat until only whitespace is left
    while children_s !~ /^\s*$/

      # shave off the next bit of text
      # put the rest into children_s
      unless children_s =~ /^\s*(.*?)(<.*$|$)/
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      text = $1
      after_text = $2
      unless text.strip.empty?
        children_s = after_text
        retv << RegXML.new(text, true)
      end

      # anything left after we've parsed text?
      break if children_s =~ /^\s*$/

      # shave off the next child
      # and put the rest into children_s

      # determine the next child's name, and the string index at which
      # the element start tag ends with either / or >
      unless children_s =~ /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      childname = $2
      child = $1
      endofelt_ix = $&.length()

      # and remove it
      case children_s[endofelt_ix..-1]
      when /^\/>(.*)$/
        # next child is a unary element
        children_s = $1
        retv << RegXML.new(child + "/>")

      when /^(>.*?<\s*\/\s*#{childname}\s*>)(.*)$/
        # next child runs up to the first closing tag with its name
        children_s = $2
        retv << RegXML.new(child + $1)

      else
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n#{xml_readable(children_s)}"
      end
    end

    return retv
  end

  # Manual smoke test: prints name, string form, attributes and children
  # of two sample elements to standard output.
  def RegXML.test()
    report = lambda { |elt|
      puts "name " + elt.name()
      puts
      puts elt.to_s()
      puts
      elt.attributes.each { |attr, val|
        puts "attr " + attr + "=" + val
      }
      puts
      elt.children_and_text.each { |child_obj|
        if child_obj.text?
          puts "da text " + child_obj.to_s
        else
          puts "da child " + child_obj.to_s
        end
      }
      puts
    }

    report.call(RegXML.new([" <bla blupp='a\"b'",
                            "lalala=\"c\">",
                            "<lalala> </lalala>",
                            "texttext",
                            "<lala blupp='b'/>",
                            "nochtext",
                            "<la> <l/> </la>",
                            "</ bla >",
                            ""].join("\n")))

    puts "NEU"
    report.call(RegXML.new(" < bla blupp='a\"'/> "))
  end

  ##############
  protected

  # true iff @s is a single unary element <bla ... />.
  # No further "<" may occur after the opening one; otherwise a string like
  # "<a><b/>" would pass as a unary element named "a".
  def unary_element()
    if @s =~ /^\s*<[^<]*\/>\s*$/
      return true
    else
      return false
    end
  end

  # make sure we have a single XML element, either <bla/> or
  # <bla>...</bla>; raises a RuntimeError otherwise
  def element_test()
    # <bla/>
    return if unary_element()

    # <bla > ... </bla>
    return if @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/

    raise "Cannot parse:\n #{xml_readable(@s)}"
  end

  # Bracket balance check; raises a RuntimeError on failure.
  def dyck_test()
    # every prefix of @s must have at least as many < as >
    balance = 0
    @s.scan(/[<>]/) { |bracket|
      balance += (bracket == "<" ? 1 : -1)
      if balance < 0
        raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
      end
    }

    # and in total, @s must have equally many < and >
    unless balance == 0
      raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
    end
  end

  # Insert a newline after every ">" to make error messages readable.
  def xml_readable(string)
    return string.gsub(/>/, ">\n")
  end
end
|
267
|
+
|
268
|
+
# RegXML.test()
|
269
|
+
|