frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,195 @@
|
|
1
|
+
# sp 29 07 04
|
2
|
+
# "optimise" c4.5 files by replacing all feature values which only
|
3
|
+
# occur with one label by a new, common value.
|
4
|
+
#
|
5
|
+
# two modes of operation:
|
6
|
+
# optimise <file> -- optimise file and store optimisations in <file>.opts
|
7
|
+
# optimise <file> <file.opts> -- apply optimisation from file.opts to file
|
8
|
+
|
9
|
+
# Replaces, per feature column of a comma-separated (c4.5 style) data file,
# every feature value that only ever occurs with a single label by a common
# value "f<column>_<label>" (dots removed from the label).
#
# An optimisation is either computed from a data file (init_from_data) or
# re-loaded from a file written by store (init_from_file), and can then be
# applied to a data file with the same number of columns (apply).
class Optimise

  # Create an un-initialised optimiser. store and apply raise until one of
  # init_from_data / init_from_file has been called.
  def initialize
    @ready = false
  end

  # Compute a new optimisation from the data file named +infile+.
  # Each line is split at ","; the last field is the label, all other
  # fields are feature values.
  def init_from_data(infile) # find new optimisation

    STDERR.puts "[Optimise] computing new feature optimisation"

    labels = []        # label of every instance, in file order
    features = nil     # for each column: array of feature values from file
    @replacements = [] # for each column: hash <old value> -> <new value>

    # read data from infile and initialise replacements array;
    # the block form closes the file even if a line cannot be processed
    File.open(infile) do |file|
      file.each_line do |line|
        f_l = line.chomp.split(",")

        if features.nil? # first line: initialisation
          # One slot per column *including* the label column: apply()
          # compares the full column count of a line against
          # @replacements.length, so the (always empty) last slot is needed.
          features = Array.new(f_l.length) { [] }
          @replacements = Array.new(f_l.length) { {} }
        end

        labels << f_l.pop
        f_l.each_with_index { |fval, i| features[i] << fval }
      end
    end

    # The replacement names only depend on the distinct labels, so there is
    # no need to walk over the label of every single instance per feature.
    unique_labels = labels.uniq

    # An empty data file leaves features at nil: nothing to optimise.
    (features || []).each_with_index do |fvalues, findex| # traverse all features

      # record fval -> label mappings
      #   no label   : nil
      #   one label  : <label>
      #   two labels : false
      fval_to_label = {}
      fvalues.each_with_index do |fval, inst_idx|
        label = labels[inst_idx]         # current label
        seen_label = fval_to_label[fval] # previously seen label
        if seen_label.nil?
          fval_to_label[fval] = label
        elsif seen_label && seen_label != label
          fval_to_label[fval] = false
        end
      end # now all fvals are mapped to either <label> or false

      # construct new feature value names
      new_fvals = {}
      unique_labels.each do |label|
        new_fvals[label] = "f" + findex.to_s + "_" + label.gsub(/\./, "")
      end

      # record all feature values for which we have only seen one label
      fval_to_label.each_pair do |fval, label|
        if fval == "[U]"
          # label is false for an ambiguous value, in which case there is
          # no entry in new_fvals; to_s keeps this trace from raising.
          puts "[U]: " + label.to_s + " " + new_fvals[label].to_s
        end
        @replacements[findex][fval] = new_fvals[label] if label
      end
    end

    @ready = true
  end

  # Re-use an optimisation previously written by store to the file
  # named +optsfile+.
  def init_from_file(optsfile) # use old optimisation
    @replacements = File.open(optsfile) { |optsinfile| read(optsinfile) }
    @ready = true
  end

  # Write the replacement tables to the file named +outfilename+:
  # for each column i a section "<i>" ... "</i>" containing one
  # "old<TAB>new" line per replacement.
  # Raises a RuntimeError if the optimiser has not been initialised.
  def store(outfilename) # store data necessary to recreate optimisation
    unless @ready
      raise "[Optimise] Error: Cannot store un-initialised optimisation"
    end

    File.open(outfilename, "w") do |outfile|
      @replacements.each_with_index do |reps, i| # for each feature
        outfile.puts "<" + i.to_s + ">"
        reps.each_pair { |old, new| outfile.puts [old, new].join("\t") }
        outfile.puts "</" + i.to_s + ">"
      end
    end
    nil
  end

  # Copy the data file +infilename+ to +outfilename+, replacing every
  # feature value for which a replacement is known. The label (last field
  # of each line) is never changed.
  # Raises a RuntimeError if the optimiser has not been initialised or if a
  # line does not have as many columns as the optimisation was built for.
  def apply(infilename, outfilename)
    unless @ready
      raise "[Optimise] Error: Cannot apply un-initialised optimisation"
    end

    STDERR.puts "[Optimise] applying feature optimisation"

    File.open(infilename) do |infile|
      File.open(outfilename, "w") do |outfile|
        infile.each_line do |line|
          tokens = line.chomp.split(",")

          unless tokens.length == @replacements.length
            raise "[Optimise] Error: trying to optimise incompatible feature file!\nFile has " + tokens.length.to_s + " features, and we know replacements for " + @replacements.length.to_s + " features."
          end

          label = tokens.pop
          tokens.each_index do |f_idx|
            fval = tokens[f_idx]
            if @replacements[f_idx].key?(fval)
              tokens[f_idx] = @replacements[f_idx][fval]
            end
          end
          tokens.push label
          outfile.puts tokens.join(",")
        end
      end
    end
    nil
  end

  # return recommended filename to store optimisation patterns for basefile
  def Optimise.recommended_filename(basefile)
    return basefile + ".optimisations"
  end

  private

  # Parse the format written by store from the open IO +infile+ and return
  # the array of replacement hashes. The caller owns (and closes) +infile+.
  def read(infile)
    replacements = []
    reps = nil
    infile.each_line do |line|
      line = line.chomp
      # The section markers are matched against the whole line so that a
      # feature value which merely contains "<3>" is not taken for one.
      if line =~ /\A<(\d+)>\z/
        reps = {}
      elsif line =~ /\A<\/(\d+)>\z/
        replacements[$1.to_i] = reps
      else
        if reps.nil?
          raise "[Optimise] Error: malformed optimisation file, replacement outside of a feature section"
        end
        tokens = line.split("\t")
        reps[tokens[0]] = tokens[1]
      end
    end
    replacements
  end

end
|
@@ -0,0 +1,213 @@
|
|
1
|
+
# Alexander Koller 2003
|
2
|
+
# extended Katrin Erk June 2003
|
3
|
+
#
|
4
|
+
# Classes that return a list of sentence DOMs, from various sources
|
5
|
+
#
|
6
|
+
# Each class in this file defines the following methods:
|
7
|
+
#
|
8
|
+
# initialize(...) "..." depends on the class
|
9
|
+
# extractDOMs() return list of all s nodes as DOM objects
|
10
|
+
# each_s() iterate over s nodes; may take less memory
|
11
|
+
|
12
|
+
|
13
|
+
require "rexml/document"
|
14
|
+
|
15
|
+
# Reads a complete corpus file into one REXML document and hands out its
# sentence nodes (the elements matching /corpus/body/s).
class FileParser

  include REXML

  # Open the corpus file +filename+. The file is only parsed when the
  # sentences are requested for the first time.
  def initialize(filename)
    @file = File.new(filename)
    @doc = nil
  end

  # returns an array of DOMs for the sentences
  def extractDOMs()
    ensureParsedDocument()
    @doc.get_elements("/corpus/body/s")
  end

  # Iterates over all sentence nodes. The whole document is parsed first,
  # so this does not save memory compared to extractDOMs().
  def each_s()
    extractDOMs().each { |dom| yield(dom) }
  end

  # Iterates over all sentence nodes. The block passed to this
  # method should return a DOM object as a value. After the iteration
  # has been completed, the contents of /corpus/body are replaced
  # by the list of these results.
  # This changes the FileParser object itself; the modified document
  # is returned.
  def process_s!()
    new_body = Element.new('body')
    each_s { |dom| new_body.add_element(yield(dom)) }

    @doc.delete_element("/corpus/body")
    @doc.elements["corpus"].add_element(new_body)

    return @doc
  end

  private

  # Parse the corpus file on first use. The file handle is released as
  # soon as the parse attempt is over instead of staying open for the
  # lifetime of the object.
  def ensureParsedDocument()
    return unless @doc.nil?

    begin
      @doc = Document.new(@file)
    ensure
      @file.close unless @file.closed?
    end
  end

end
|
65
|
+
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
#####################################################################
|
70
|
+
|
71
|
+
|
72
|
+
|
73
|
+
|
74
|
+
# Reads a corpus file piecewise: only one <s ...> ... </s> element is held
# and parsed at a time.
#
# @file = File object for the corpus
# @head = string up to the first <s ...> tag
# @tail = string after the last </s> tag (nil until it has been reached)
# @rest = string starting with the latest <s ...> tag (complete this to
#         a <s>...</s> structure by reading up to next </s> tag)
# @readCompletely = boolean specifying whether there's still something
#         left to read in the file
class FilePartsParser

  attr_reader :head, :tail

  # Open +filename+ and read up to the first sentence start tag (into @head).
  # If the closing body tag is met before any sentence, the corpus is empty:
  # the remainder of the file becomes @tail and the parser is exhausted.
  def initialize(filename)
    @file = File.new(filename)
    @readCompletely = false
    @head = ''
    @rest = nil
    @tail = nil

    while (line = @file.gets)
      # Non-greedy prefix: stop at the FIRST sentence start on this line,
      # otherwise earlier sentences on the same line would end up in @head.
      if line =~ /(.*?)(<s\s.*)/ then
        @head << $1
        @rest = $2
        return

      elsif line =~ /^(.*)(<\/body[\s>].*)$/
        # empty corpus
        @head << $1
        @tail = $2
        # $2 stops before the line terminator; put it back so that the
        # tail is the unchanged remainder of the file.
        @tail << "\n" if line.end_with?("\n")
        while (more = @file.gets)
          @tail << more
        end
        @readCompletely = true
        return

      else
        @head << line
      end
    end

    # end of file without sentence and without closing body tag
    @readCompletely = true
  end

  # Close the underlying corpus file.
  def close()
    @file.close()
  end

  # Returns an array with the parsed root elements of all sentences that
  # have not been read yet.
  def extractDOMs()
    allDOMs = Array.new

    process_s!() { |dom|
      allDOMs.push(dom)
      # process_s! serialises the block's value, so hand it a dummy element
      REXML::Element.new("x")
    }
    return allDOMs
  end

  # Yields the parsed root element of each sentence not read yet.
  def each_s()
    process_s!() { |dom|
      yield(dom)
      # process_s! serialises the block's value, so hand it a dummy element
      REXML::Element.new("x")
    }
  end

  # This function returns the string for the modified corpus body:
  # the concatenation of the serialised block results.
  # Only one sentence is parsed at a time. The read position of the
  # parser advances, so every sentence is delivered once.
  # The block that is called by the method is given an element
  # as its argument and is expected to return a changed element.
  # Returns nil if the file has already been read completely.
  def process_s!()
    if @readCompletely
      return
    end

    ret = ''
    scan_s() { |element|
      # Process the <s> ... </s> element
      doc = REXML::Document.new(element)
      changedElt = yield(doc.root)

      changedEltAsString = ''
      changedElt.write(changedEltAsString, 0)
      ret << changedEltAsString
    }

    return ret
  end

  # KE 12.6.03: scan_s :
  # doesn't parse a sentence before yielding it
  # doesn't allow for any changes
  # but otherwise the same as process_s!
  # Yields each remaining sentence as an unparsed string. When the end of
  # the file is reached, the unread remainder is stored in @tail.
  def scan_s()
    if @readCompletely
      return
    end

    begin
      while true do
        # Invariant: At this point, @rest always starts with an
        # unseen <s> tag.

        # First, we continue reading until we find the closing </s>.
        # No exception should occur in this loop if we're parsing
        # a valid XML document.
        # Non-greedy: cut at the FIRST </s>, so that several sentences on
        # one line are delivered one by one instead of as one chunk.
        while @rest !~ /\A(.*?<\/s>)(.*)/m do
          @rest << @file.readline()
        end

        element = $1
        @rest = $2

        yield(element) # element not parsed!

        # Read on up to the next <s>; non-greedy so that the nearest
        # sentence start is used and no sentence is skipped.
        while @rest !~ /(.*?)(<s\s.*)/m do
          @rest << @file.readline()
        end

        @rest = $2
      end
    rescue EOFError
      # readline signals the end of the file: what is left is the tail
      @tail = @rest
      @readCompletely = true
    end
  end

  # KE 5.11.03: get_rest: read all of the file not processed up to this point
  # and return it as a string
  def get_rest()
    begin
      while true do
        @rest << @file.readline()
      end
    rescue EOFError
      @readCompletely = true
    end
    return @rest
  end
end
|
@@ -0,0 +1,269 @@
|
|
1
|
+
# RegXML
|
2
|
+
#
|
3
|
+
# Katrin Erk June 2005
|
4
|
+
|
5
|
+
# SalsaTigerRegXML: take control of the data structure, no underlying xml
|
6
|
+
# representation anymore, re-generation of xml on demand
|
7
|
+
|
8
|
+
# A single XML element (or a piece of text between elements) that is kept
# as a plain string and taken apart with regular expressions on demand.
#
# NOTE(review): the construction-time checks reject any element containing
# more ">" than "<" in a prefix (e.g. ">" inside text or attribute values),
# and children are cut at the first closing tag with the child's name, so
# directly nested elements of the same name are not supported.
class RegXML

  # string:    string representing a single XML element (or a text)
  # i_am_text: boolean: xml element (false) or text (true)
  #
  # For an element, newlines are replaced by blanks and the string is
  # checked to be one well-bracketed element; raises a RuntimeError if not.
  def initialize(string,
                 i_am_text = false)

    unless string.is_a?(String)
      raise "First argument to RegXML.new must be string. I got #{string.class.to_s}"
    end

    if i_am_text
      @s = string
      @i_am_text = true
    else
      @s = string.gsub(/\n/, " ").freeze
      @i_am_text = false

      element_test()
      dyck_test()
    end
  end

  # The stored string, with a line break after every ">".
  def to_s()
    return xml_readable(@s)
  end

  # true for a text object, false for an XML element
  def text?
    return @i_am_text
  end

  # Name of the XML element; nil for a text object.
  def name()
    # text
    return nil if @i_am_text

    # xml element
    if @s =~ /^\s*<\s*([\w-]+)[\s\/>]/
      return $1
    else
      raise "Cannot parse:\n#{xml_readable(@s)}"
    end
  end

  # Hash attribute name -> attribute value (both strings) of the start tag;
  # empty hash for a text object.
  def attributes()
    # text
    return {} if @i_am_text

    # xml element

    # remove <element_name from the beginning of @s,
    # place the rest into elt_contents:
    # this is a string starting with
    # - either (name=value)*>
    # - or (name=value)*/>
    elt_name = name()
    unless @s =~ /^\s*<\s*#{elt_name}(.*)$/
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end

    retv = Hash.new
    elt_contents = $1

    # repeat until > or /> is at the front
    while elt_contents !~ /^\s*\/?>/

      # shave off the next name=value pair
      # put the rest into elt_contents
      # make sure that if the value is quoted with ',
      # we accept " inside the value, and vice versa.
      unless elt_contents =~ /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/
        raise "Cannot parse:\n #{xml_readable(elt_contents)}"
      end
      retv[$1] = $3
      elt_contents = $4
    end

    return retv
  end

  # Array of RegXML objects, one per child element and per non-blank
  # stretch of text, in document order. Empty for text objects and for
  # unary elements (<bla/>).
  def children_and_text()
    return [] if @i_am_text

    # <bla/>, no children
    return [] if unary_element()

    # @s has the form <bla...> ... </bla>.
    # remove <bla ...> from the beginning of @s,
    # place the rest up to </bla> into children_s:

    mainname = name()
    unless @s =~ /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end

    retv = Array.new
    children_s = $3

    # repeat until only whitespace is left
    while children_s !~ /^\s*$/

      # shave off the next bit of text
      # put the rest into children_s
      unless children_s =~ /^\s*(.*?)(<.*$|$)/
        report_whole()
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      text = $1
      after_text = $2
      unless text.strip.empty?
        children_s = after_text
        retv << RegXML.new(text, true)
      end

      # anything left after we've parsed text?
      # (whitespace class \s; the original pattern read "s*")
      if children_s =~ /^\s*$/
        break
      end

      # shave off the next child
      # and put the rest into children_s

      # determine the next child's name, and the string index at which
      # the element start tag ends with either / or >
      unless children_s =~ /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/
        report_whole()
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      child = $1
      childname = $2
      endofelt_ix = $&.length()

      # and remove it
      case children_s[endofelt_ix..-1]
      when /^\/>(.*)$/
        # next child is a unary element
        children_s = $1
        retv << RegXML.new(child + "/>")

      when /^(>.*?<\s*\/\s*#{childname}\s*>)(.*)$/
        # next child reaches up to the first closing tag with its name
        children_s = $2
        retv << RegXML.new(child + $1)

      else
        report_whole()
        raise "Cannot parse:\n#{xml_readable(children_s)}"
      end
    end

    return retv
  end

  # Prints name, string form, attributes and children of two sample
  # elements to stdout.
  def RegXML.test()
    bla = RegXML.new(" <bla blupp='a\"b'\n" \
                     "lalala=\"c\">\n" \
                     "<lalala> </lalala>\n" \
                     "texttext\n" \
                     "<lala blupp='b'/>\n" \
                     "nochtext\n" \
                     "<la> <l/> </la>\n" \
                     "</ bla >\n")
    RegXML.show(bla)

    puts "NEU"
    bla = RegXML.new(" < bla blupp='a\"'/> ")
    RegXML.show(bla)
  end

  # Helper of RegXML.test: dump one RegXML object to stdout.
  def RegXML.show(obj)
    puts "name " + obj.name()
    puts
    puts obj.to_s()
    puts
    obj.attributes.each { |attr, val|
      puts "attr " + attr + "=" + val
    }
    puts
    obj.children_and_text.each { |child_obj|
      if child_obj.text?
        puts "da text " + child_obj.to_s
      else
        puts "da child " + child_obj.to_s
      end
    }
    puts
  end

  ##############
  protected

  # true if @s starts with "<" and ends with "/>", i.e. has the form <bla/>
  def unary_element()
    if @s =~ /^\s*<.*\/>\s*$/
      return true
    else
      return false
    end
  end

  # make sure we have a single XML element, either <bla/> or
  # <bla>...</bla>; raises a RuntimeError otherwise
  def element_test()
    return if unary_element()                            # <bla/>
    return if @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/ # <bla > ... </bla>

    raise "Cannot parse:\n #{xml_readable(@s)}"
  end

  # Bracket check; raises a RuntimeError on failure.
  def dyck_test()
    # every prefix of @s must have at least as many < as >
    opening = 0
    closing = 0
    @s.scan(/[<>]/) { |bracket|
      case bracket
      when "<"
        opening += 1
      when ">"
        closing += 1
        if closing > opening
          raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
        end
      end
    }

    # and in total, @s must have equally many < and >
    unless @s.count("<") == @s.count(">")
      raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
    end
  end

  # string with a line break inserted after every ">"
  def xml_readable(string)
    return string.gsub(/>/, ">\n")
  end

  # Print the complete element to stderr, followed by an empty line,
  # before a parse error on one of its parts is raised.
  def report_whole()
    $stderr.puts "Whole was:\n #{xml_readable(@s)}"
    $stderr.puts
  end
end
|
267
|
+
|
268
|
+
# RegXML.test()
|
269
|
+
|