hutch-xamplr-pp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +504 -0
- data/README.rdoc +94 -0
- data/Rakefile +56 -0
- data/VERSION.yml +4 -0
- data/lib/xampl-pp-dtd.rb +126 -0
- data/lib/xampl-pp-wf.rb +1037 -0
- data/lib/xamplr-pp/ANNOUNCE.TXT +47 -0
- data/lib/xamplr-pp/LICENSE +504 -0
- data/lib/xamplr-pp/Makefile +122 -0
- data/lib/xamplr-pp/examples/parse-wf.rb +55 -0
- data/lib/xamplr-pp/examples/parse.rb +59 -0
- data/lib/xamplr-pp/license.inc +17 -0
- data/lib/xamplr-pp/saxdemo.rb +214 -0
- data/lib/xamplr-pp/saxish.rb +298 -0
- data/lib/xamplr-pp/saxishHandler.rb +58 -0
- data/lib/xamplr-pp/toys/chew.rb +62 -0
- data/lib/xamplr-pp/toys/chewMultibyte.rb +44 -0
- data/lib/xamplr-pp/toys/dump.rb +58 -0
- data/lib/xamplr-pp/xmlName.defn +67 -0
- data/lib/xamplr-pp/xpp.rb +908 -0
- data/lib/xamplr-pp/xppDeluxe.rb +49 -0
- data/lib/xamplr-pp/xppIter.rb +845 -0
- data/lib/xamplr-pp.rb +991 -0
- data/test/test_helper.rb +10 -0
- data/test/xamplr_pp_gem_test.rb +7 -0
- metadata +79 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
#!/usr/local/bin/ruby
|
2
|
+
require "xampl-pp-wf"
|
3
|
+
|
4
|
+
#
# Counts the pull-parser events produced while reading one XML file.
# Used by the script below purely to time the parser.
#
class Listener

  # Number of recognised events seen by the most recent call to #parse.
  attr_reader :count

  # Parses the file called +filename+ with a fresh Xampl_PP instance and
  # counts every event whose type is one of the Xampl_PP event constants.
  # Events of any other type are read but not counted.
  def parse(filename)
    @xpp = Xampl_PP.new
    source = File.new(filename)
    begin
      @xpp.input = source
      @count = 0

      until @xpp.endDocument?
        case @xpp.nextEvent
        when Xampl_PP::START_DOCUMENT, Xampl_PP::END_DOCUMENT,
             Xampl_PP::START_ELEMENT, Xampl_PP::END_ELEMENT,
             Xampl_PP::TEXT, Xampl_PP::CDATA_SECTION,
             Xampl_PP::ENTITY_REF, Xampl_PP::IGNORABLE_WHITESPACE,
             Xampl_PP::PROCESSING_INSTRUCTION, Xampl_PP::COMMENT,
             Xampl_PP::DOCTYPE
          @count += 1
        end
      end
    ensure
      # Release the file handle even when parsing fails part way through.
      # The guard keeps this safe if the stream has already been closed.
      source.close unless source.closed?
    end
  end

end
|
45
|
+
|
46
|
+
# Parse every file named on the command line, then print the total
# elapsed wall-clock time.
started_at = Time.now
ARGV.each do |filename|
  Listener.new.parse(filename)
end
puts "Time: #{Time.now - started_at}"
|
53
|
+
|
54
|
+
|
55
|
+
|
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/local/bin/ruby
|
2
|
+
require "xampl-pp"
|
3
|
+
|
4
|
+
#
# Counts the pull-parser events produced while reading one XML file,
# printing a progress line from time to time.
#
class Listener

  # Number of recognised events seen by the most recent call to #parse.
  attr_reader :count

  # Parses the file called +filename+ with a fresh Xampl_PP instance and
  # counts every event whose type is one of the Xampl_PP event constants.
  # Events of any other type are read but not counted.
  def parse(filename)
    @xpp = Xampl_PP.new
    source = File.new(filename)
    begin
      @xpp.input = source
      @count = 0

      until @xpp.endDocument?
        event = @xpp.nextEvent

        # Progress report: the check runs after each event is read but
        # before it is counted, so the first line printed is "count: 0".
        printf("count: %d\n", count) if 0 == (@count % 10001)

        case event
        when Xampl_PP::START_DOCUMENT, Xampl_PP::END_DOCUMENT,
             Xampl_PP::START_ELEMENT, Xampl_PP::END_ELEMENT,
             Xampl_PP::TEXT, Xampl_PP::CDATA_SECTION,
             Xampl_PP::ENTITY_REF, Xampl_PP::IGNORABLE_WHITESPACE,
             Xampl_PP::PROCESSING_INSTRUCTION, Xampl_PP::COMMENT,
             Xampl_PP::DOCTYPE
          @count += 1
        end
      end
    ensure
      # Release the file handle even when parsing fails part way through.
      # The guard keeps this safe if the stream has already been closed.
      source.close unless source.closed?
    end
  end

end
|
50
|
+
|
51
|
+
# Parse every file named on the command line and print how many events
# each one produced.
ARGV.each do |filename|
  counter = Listener.new
  counter.parse(filename)

  printf("EVENTS: %d\n", counter.count)
end
|
57
|
+
|
58
|
+
|
59
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# xampl-pp : XML pull parser
|
2
|
+
# Copyright (C) 2002-2009 Bob Hutchison
|
3
|
+
#
|
4
|
+
# This library is free software; you can redistribute it and/or
|
5
|
+
# modify it under the terms of the GNU Lesser General Public
|
6
|
+
# License as published by the Free Software Foundation; either
|
7
|
+
# version 2.1 of the License, or (at your option) any later version.
|
8
|
+
#
|
9
|
+
# This library is distributed in the hope that it will be useful,
|
10
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
12
|
+
# Lesser General Public License for more details.
|
13
|
+
#
|
14
|
+
# You should have received a copy of the GNU Lesser General Public
|
15
|
+
# License along with this library; if not, write to the Free Software
|
16
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
17
|
+
#
|
@@ -0,0 +1,214 @@
|
|
1
|
+
#!/usr/local/bin/ruby
|
2
|
+
#
|
3
|
+
# xampl-pp : XML pull parser
|
4
|
+
# Copyright (C) 2002-2009 Bob Hutchison
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
|
21
|
+
require "xampl-pp"
|
22
|
+
require "saxish"
|
23
|
+
require "saxishHandler"
|
24
|
+
|
25
|
+
#
|
26
|
+
# This module uses the saxish api. The saxish api is meant to demonstrate
|
27
|
+
# the use of Xampl_PP pull parser, while being useful in itself. If you are
|
28
|
+
# meaning to learn about the Xampl_PP parser, you should look at the
|
29
|
+
# saxish.rb file -- this one isn't going to help you a lot.
|
30
|
+
#
|
31
|
+
|
32
|
+
#
# SAXdemo is a SAXishHandler that tallies the events reported by a SAXish
# parser -- optionally echoing each one -- and prints the totals afterwards.
# The two entry points are #fileNames and #string.
#
class SAXdemo
  include SAXishHandler

  # Set to true to have every event printed as it arrives.
  attr_accessor :verbose

  # Entity resolver hook. Counts the request and answers with the same
  # placeholder text whatever the entity name is.
  def resolve(name)
    @resolverCount += 1
    "fake it"
  end

  # Counts a start tag and, when it is deeper than anything seen so far,
  # records the depth together with the parser's current line and column.
  # When verbose, the tag and its attributes are printed; the attributes
  # are read back from +saxparser+ by index.
  def startElement(name, namespace, qname, prefix, attributeCount, isEmpty, saxparser)
    if verbose
      printf("StartElement -- name: '%s'\n", name)
      printf(" namespace: '%s'\n", namespace)
      printf(" qname: '%s'\n", qname)
      printf(" prefix: '%s'\n", prefix)
      printf(" attributeCount: %d\n", attributeCount)
      printf(" isEmpty: %s\n", isEmpty)
      attributeCount.times do |i|
        printf(" attribute[%d] -- name: '%s'\n", i, saxparser.attributeName(i))
        printf(" attribute[%d] -- namespace: '%s'\n", i, saxparser.attributeNamespace(i))
        printf(" attribute[%d] -- qname: '%s'\n", i, saxparser.attributeQName(i))
        printf(" attribute[%d] -- prefix: '%s'\n", i, saxparser.attributePrefix(i))
        printf(" attribute[%d] -- value: '%s'\n", i, saxparser.attributeValue(i))
      end
    end

    @startElementEventCount += 1
    @eventCount += 1

    if @maxDepth < saxparser.depth
      @maxDepth = saxparser.depth
      @maxDepthLine = saxparser.line
      @maxDepthColumn = saxparser.column
    end
  end

  # Counts an end tag, printing its names when verbose.
  def endElement(name, namespace, qname, prefix)
    if verbose
      printf("EndElement -- name: '%s'\n", name)
      printf(" namespace: '%s'\n", namespace)
      printf(" qname: '%s'\n", qname)
      printf(" prefix: '%s'\n", prefix)
    end
    @endElementEventCount += 1
    @eventCount += 1
  end

  # Counts an entity reference.
  def entityRef(name, text)
    printf("EntityRef -- name '%s' text '%s'\n", name, text) if verbose
    @entityRefCount += 1
    @eventCount += 1
  end

  # Counts a run of text, keeping whitespace-only runs in a separate tally.
  def text(text, isWhitespace)
    if isWhitespace
      printf("Text -- length: %d WHITESPACE\n", text.length) if verbose
      @whitespaceTextEventCount += 1
    else
      printf("Text -- length: %d\n", text.length) if verbose
      @textEventCount += 1
    end
    @eventCount += 1
  end

  # Counts a CDATA section.
  def cdataSection(text)
    printf("CDATA -- length: %s\n", text.length) if verbose
    @cdataEventCount += 1
    @eventCount += 1
  end

  # Counts a run of ignorable whitespace.
  def ignoreableWhitespace(text)
    printf("IgnoreableWhitespace -- length: %s\n", text.length) if verbose
    @ignorableWhitespaceEventCount += 1
    @eventCount += 1
  end

  # Counts a processing instruction.
  def processingInstruction(text)
    printf("ProcessingInstruction -- [%s]\n", text) if verbose
    @processingInstructionEventCount += 1
    @eventCount += 1
  end

  # Counts a comment.
  def comment(text)
    printf("comment -- [%s]\n", text) if verbose
    @commentEventCount += 1
    @eventCount += 1
  end

  # Counts a doctype declaration.
  def doctype(text)
    printf("doctype -- [%s]\n", text) if verbose
    @doctypeEventCount += 1
    @eventCount += 1
  end

  # Resets every counter. Called at the start of #fileNames and #string.
  def init
    @startElementEventCount = 0
    @endElementEventCount = 0
    @entityRefCount = 0
    @resolverCount = 0
    @textEventCount = 0
    @cdataEventCount = 0
    @whitespaceTextEventCount = 0
    @ignorableWhitespaceEventCount = 0
    @processingInstructionEventCount = 0
    @doctypeEventCount = 0
    @commentEventCount = 0
    @eventCount = 0
    @failureCount = 0
    @successCount = 0
    @maxDepth = -1
    # These two are only assigned when an element is seen. Without a
    # starting value #report would hand nil to a %d format and raise
    # TypeError, e.g. when #fileNames is given an empty list.
    @maxDepthLine = 0
    @maxDepthColumn = 0
  end

  # Prints one line per counter.
  def report
    printf("%5d eventCount\n", @eventCount)
    printf("%5d successCount\n", @successCount)
    printf("%5d maxDepth [%d, %d]\n", @maxDepth, @maxDepthLine, @maxDepthColumn)
    printf("%5d failureCount\n", @failureCount)
    printf("%5d startElementEventCount\n", @startElementEventCount)
    printf("%5d endElementEventCount\n", @endElementEventCount)
    printf("%5d entityRefCount\n", @entityRefCount)
    printf("%5d resolverCount\n", @resolverCount)
    printf("%5d textEventCount\n", @textEventCount)
    printf("%5d cdataEventCount\n", @cdataEventCount)
    printf("%5d whitespaceTextEventCount\n", @whitespaceTextEventCount)
    printf("%5d ignorableWhitespaceEventCount\n", @ignorableWhitespaceEventCount)
    printf("%5d processingInstructionEventCount\n", @processingInstructionEventCount)
    printf("%5d doctypeEventCount\n", @doctypeEventCount)
    printf("%5d commentEventCount\n", @commentEventCount)
  end

  # Parses each file named in +fileNames+ with a single SAXish parser,
  # then prints the report. A file that fails to parse is counted as a
  # failure and the remaining files are still processed.
  def fileNames(fileNames)
    init
    @saxparser = newParser

    fileNames.each do |filename|
      attempt(filename) { @saxparser.parse filename }
    end

    report
  end

  # Parses the XML text in +string+, then prints the report.
  def string(string)
    init
    @saxparser = newParser

    attempt(string) { @saxparser.parseString string }

    report
  end

  private

  # Builds a SAXish parser that reports to this object, processes
  # namespaces and does not report the namespace attributes.
  def newParser
    parser = SAXish.new
    parser.handler = self
    parser.processNamespace = true
    parser.reportNamespaceAttributes = false
    parser
  end

  # Runs the block and records a success, or records a failure and prints
  # the backtrace followed by a FAILED line naming +label+. Only
  # StandardError is rescued, so an interrupt or exit request still stops
  # the program instead of being counted as a parse failure.
  def attempt(label)
    yield
    @successCount += 1
  rescue StandardError => message
    @failureCount += 1
    # puts (not print) so the FAILED line starts on a line of its own.
    puts message.backtrace.join("\n")
    printf("FAILED [%s] '%s'\n", message, label)
  end
end
|
202
|
+
|
203
|
+
# Run the demo handler over a small in-memory document first, then over
# whatever files were named on the command line.
sample = <<EOS
<root>
<a>
</a>
<b>hello</b>
<c>hello &there;</c>
</root>
EOS

SAXdemo.new.string(sample)
SAXdemo.new.fileNames(ARGV)
|
214
|
+
|
@@ -0,0 +1,298 @@
|
|
1
|
+
#
|
2
|
+
# xampl-pp : XML pull parser
|
3
|
+
# Copyright (C) 2002-2009 Bob Hutchison
|
4
|
+
#
|
5
|
+
# This library is free software; you can redistribute it and/or
|
6
|
+
# modify it under the terms of the GNU Lesser General Public
|
7
|
+
# License as published by the Free Software Foundation; either
|
8
|
+
# version 2.1 of the License, or (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This library is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
13
|
+
# Lesser General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU Lesser General Public
|
16
|
+
# License along with this library; if not, write to the Free Software
|
17
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
18
|
+
#
|
19
|
+
require "xampl-pp"
|
20
|
+
|
21
|
+
##
|
22
|
+
## It may seem strange, but it seems that a good way to demonstrate the use
|
23
|
+
## of the xampl-pp pull parser is to show how to build a SAX-like XML
|
24
|
+
## parser. Both pull parsers and SAX parsers are stream based -- they parse
|
25
|
+
## the XML file bit by bit informing its client of interesting events as
|
26
|
+
## they are encountered. The whole XML document is not required to be in
|
27
|
+
## memory. The significant difference between pull parsers and SAX parsers
|
28
|
+
## is in where the 'main loop' is located: in the client for pull parsers,
|
29
|
+
## in the parser for SAX parsers. Clients call a method of the pull parser
|
30
|
+
## to get the next event. SAX parsers call methods of the client to notify
|
31
|
+
## it of events (so these are 'push parsers').
|
32
|
+
##
|
33
|
+
## It turns out to be quite easy to build a SAX-like parser from a pull
|
34
|
+
## parser. It is quite a lot harder to build a pull parser from a SAX-like
|
35
|
+
## parser.
|
36
|
+
##
|
37
|
+
## This class demonstrates (most) of the xampl-pp interface by implementing a
|
38
|
+
## SAX-like parser. No attempt has been made to provide all the functionality
|
39
|
+
## provided by a good Java SAX parser, though the equivalent of a significant,
|
40
|
+
## and useful, subset is implemented.
|
41
|
+
##
|
42
|
+
## The program text is annotated. Note, that the annotations generally
|
43
|
+
## follow the code being described.
|
44
|
+
##
|
45
|
+
|
46
|
+
|
47
|
+
class SAXish

  ##
  ## The Ruby implementation of the xampl-pp parser is called Xampl_PP, and
  ## SAXish will be the name of our SAX-like parser.
  ##

  attr_accessor :handler

  ##
  ## Sax parsers need an event handler. 'handler' is it. Handler is expected to
  ## implement the methods defined in the module 'SAXishHandler'. SAXishHandler
  ## is intended to be an adapter (so you can include it in any handler you
  ## write), so only the event-handlers for those events in which you are
  ## interested need to be re-defined. SAXdemo is an implementation of
  ## SAXishHandler that gathers some statistics.
  ##
  ## Xampl-pp requires something it calls a resolver. This is a class that
  ## implements a method called resolve. There are a number of predefined
  ## entities in xampl-pp: &amp; &apos; &gt; &lt; and &quot;. It is possible
  ## to add more entities by adding entries to the entityMap hashtable. If an
  ## entity is encountered that is not in entityMap then the resolve method on
  ## the resolver is called. The default resolver returns nil, which causes
  ## an exception to be thrown. If you specify your own resolver you can do
  ## anything you like to obtain a value for the entity, or you can return nil
  ## (and an exception will be thrown). Xampl-pp, by default, is its own
  ## resolver and simply returns nil.
  ##
  ## We are going to require that our saxish handler also be the entity
  ## resolver. This is reflected in the SAXishHandler module, which implements
  ## a resolve method that always returns nil.
  ##

  attr_accessor :processNamespace
  attr_accessor :reportNamespaceAttributes

  ##
  ## This block of comments can be ignored, certainly for the first reading.
  ## It talks about some control you have over how the xampl-pp works. The
  ## default behaviour is the most commonly used.
  ##
  ## There are two main controls used here: processNamespace, and
  ## reportNamespaceAttributes. If processNamespace is true, then namespaces
  ## in the XML file being parsed will be processed. Processing means that if
  ## an element <prefix:name/> is encountered, then four variables will be
  ## set up in the parser instance: name is 'name', prefix is 'prefix',
  ## qname is 'prefix:name', and namespace is defined. If the namespace cannot
  ## be defined an exception is thrown. In addition the xmlns attributes
  ## are processed. If processNamespace is false then name and qname
  ## will both be 'prefix:name', and both prefix and namespace undefined.
  ## If reportNamespaceAttributes is true then the xmlns attributes will be
  ## reported along with all the other attributes, if false then they will
  ## be hidden. The default behaviour is to process namespaces but to not
  ## report the namespace attributes.
  ##
  ## There are two other controls that should be mentioned. They are not
  ## used here.
  ##
  ## Pull parsers are pretty low level tools. They are meant to be fast. While
  ## many well-formedness constraints are enforced, not all are. If the control
  ## checkWellFormed is true then additional checks are made. Xampl-pp does
  ## not guarantee that it will parse only well formed XML documents. It
  ## will parse some XML files that are not well formed without objecting. In
  ## future releases, it will be possible to have xampl-pp accept only
  ## well formed documents. If checkWellFormed is false, then the parser
  ## doesn't go out of its way to notice ill formed documents. The default
  ## is true.
  ##
  ## The fourth control is 'utf8encode'. If this is true (and it defaults to
  ## true) and an entity like &#1234; is encountered, then it will be encoded
  ## using utf8 rules. Given the current state of the parser, it would be best
  ## to leave it set to true. If you want to change this then you must either
  ## never use &#; encodings with numbers greater than 255 (Ruby will throw an
  ## exception), or you must redefine xampl-pp's encode method to do the right
  ## thing.
  ##

  # Parses the XML file called +filename+, reporting every event to the
  # handler. The file is opened here and is always closed again before
  # this method returns, whether parsing finished or raised.
  def parse(filename)
    @xpp = Xampl_PP.new
    source = File.new(filename)
    begin
      feed(source)
    ensure
      # The guard keeps this safe if the stream has already been closed.
      source.close unless source.closed?
    end
  end

  # Parses the XML text held in +string+, reporting every event to the
  # handler.
  def parseString(string)
    @xpp = Xampl_PP.new
    feed(string)
  end

  # Shared tail of parse and parseString: hands +input+ to the current
  # parser, copies this object's settings onto it, installs the handler
  # as the entity resolver and runs the main loop.
  def feed(input)
    @xpp.input = input
    @xpp.processNamespace = @processNamespace
    @xpp.reportNamespaceAttributes = @reportNamespaceAttributes
    @xpp.resolver = @handler

    work
  end
  private :feed

  #
  # Constructing an instance of xampl-pp is pretty straightforward: Xampl_PP.new
  #
  # Xampl_PP accepts two kinds of input: IO and String. The same method,
  # 'input', is used to specify the input. It is possible to set the input
  # anytime, but if you do, the current input will be closed if it is of
  # type IO, and the parsing will begin at the current location of the input.
  #
  # The methods parse and parseString illustrate.
  #

  # The main loop: pulls events from the parser until the end of the
  # document and forwards each one to the matching handler method.
  def work
    until @xpp.endDocument?
      case @xpp.nextEvent
      when Xampl_PP::START_DOCUMENT
        @handler.startDocument
      when Xampl_PP::END_DOCUMENT
        @handler.endDocument
      when Xampl_PP::START_ELEMENT
        @handler.startElement(@xpp.name,
                              @xpp.namespace,
                              @xpp.qname,
                              @xpp.prefix,
                              attributeCount,
                              @xpp.emptyElement,
                              self)
      when Xampl_PP::END_ELEMENT
        @handler.endElement(@xpp.name,
                            @xpp.namespace,
                            @xpp.qname,
                            @xpp.prefix)
      when Xampl_PP::TEXT
        @handler.text(@xpp.text, @xpp.whitespace?)
      when Xampl_PP::CDATA_SECTION
        @handler.cdataSection(@xpp.text)
      when Xampl_PP::ENTITY_REF
        @handler.entityRef(@xpp.name, @xpp.text)
      when Xampl_PP::IGNORABLE_WHITESPACE
        @handler.ignoreableWhitespace(@xpp.text)
      when Xampl_PP::PROCESSING_INSTRUCTION
        @handler.processingInstruction(@xpp.text)
      when Xampl_PP::COMMENT
        @handler.comment(@xpp.text)
      when Xampl_PP::DOCTYPE
        @handler.doctype(@xpp.text)
      end
    end
  end

  # Number of attributes on the current element.
  def attributeCount
    @xpp.attributeName.length
  end

  # Name of the i-th attribute of the current element.
  def attributeName(i)
    @xpp.attributeName[i]
  end

  # Namespace of the i-th attribute of the current element.
  def attributeNamespace(i)
    @xpp.attributeNamespace[i]
  end

  # Qualified name of the i-th attribute of the current element.
  def attributeQName(i)
    @xpp.attributeQName[i]
  end

  # Prefix of the i-th attribute of the current element.
  def attributePrefix(i)
    @xpp.attributePrefix[i]
  end

  # Value of the i-th attribute of the current element.
  def attributeValue(i)
    @xpp.attributeValue[i]
  end

  # The parser's current depth.
  def depth
    @xpp.depth
  end

  # The parser's current line.
  def line
    @xpp.line
  end

  # The parser's current column.
  def column
    @xpp.column
  end


  ##
  ## There is one method used to parse the XML document: nextEvent. It returns
  ## the type of the event (described below). There are corresponding queries
  ## defined for each event type. The event is described by variables in the
  ## xampl-pp instance.
  ##
  ## It is possible to obtain the depth in the XML file (i.e. how many elements
  ## are currently open) using the xampl-pp method 'depth'. This is made
  ## available to the saxish client using a method on the saxish parser with the
  ## same name.
  ##
  ## The line and column number of the next unparsed character is available
  ## using the line and column methods. Note that line is always 1 for
  ## string input.
  ##
  ## There is a method, whitespace?, that will tell you if the current text
  ## value is whitespace.
  ##
  ## The event types are:
  ##
  ## START_DOCUMENT, END_DOCUMENT -- informational
  ##
  ## START_ELEMENT -- on this event several features are defined in the parser
  ## that are pertinent. name, namespace, qname, prefix describe the element
  ## tag name. emptyElement is true if the element is of the form <element/>,
  ## false otherwise. And the arrays attributeName, attributeNamespace,
  ## attributeQName, attributePrefix, and attributeValue contain attribute
  ## information. The number of attributes is obtained from the length of
  ## any of these arrays. Attribute information is presented to the sax
  ## client using six methods: attributeCount, attributeName(i),
  ## attributeNamespace(i), attributeQName(i), attributePrefix(i),
  ## attributeValue(i).
  ##
  ## END_ELEMENT -- name, namespace, qname, and prefix are defined. NOTE that
  ## emptyElement will always be false for this event, even though it is called
  ## for elements of the form <element/>.
  ##
  ## TEXT -- upon plain text found in an element. Note that it is
  ## quite possible that several text events in succession may be made for a
  ## single run of text in the XML file.
  ##
  ## CDATA_SECTION -- upon a CDATA section. Note that it is quite possible
  ## that several CDATA events in succession may be made for a single CDATA
  ## section.
  ##
  ## ENTITY_REF -- for each entity encountered. It will have the
  ## value in the text field, and the name in the name field.
  ##
  ## IGNORABLE_WHITESPACE -- for whitespace that occurs at the document
  ## level of the XML file (i.e. outside the root element). This whitespace is
  ## meaningless in XML and so can be ignored (and so the name). If you are
  ## interested in it, the whitespace is in the text field.
  ##
  ## PROCESSING_INSTRUCTION -- upon a processing instruction. The content of
  ## the processing instruction (with the <? and ?> removed) is provided in
  ## the text field.
  ##
  ## COMMENT -- upon a comment. The content of the comment (with the <!--
  ## and --> removed) is provided in the text field.
  ##
  ## DOCTYPE -- upon encountering a doctype. The content of the doctype
  ## (with the <!DOCTYPE and trailing > removed) is provided in the text field.
  ##
  ## The event query methods are: cdata?, comment?, doctype?, endDocument?,
  ## endElement?, entityRef?, ignorableWhitespace?, processingInstruction?,
  ## startDocument?, startElement?, and text?
  ##

end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# xampl-pp : XML pull parser
|
2
|
+
# Copyright (C) 2002-2009 Bob Hutchison
|
3
|
+
#
|
4
|
+
# This library is free software; you can redistribute it and/or
|
5
|
+
# modify it under the terms of the GNU Lesser General Public
|
6
|
+
# License as published by the Free Software Foundation; either
|
7
|
+
# version 2.1 of the License, or (at your option) any later version.
|
8
|
+
#
|
9
|
+
# This library is distributed in the hope that it will be useful,
|
10
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
12
|
+
# Lesser General Public License for more details.
|
13
|
+
#
|
14
|
+
# You should have received a copy of the GNU Lesser General Public
|
15
|
+
# License along with this library; if not, write to the Free Software
|
16
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
17
|
+
#
|
18
|
+
|
19
|
+
#
# Do-nothing implementation of every callback that SAXish invokes on its
# handler. Include it in a handler class and redefine only the callbacks
# for the events of interest.
#
module SAXishHandler

  # Entity resolver hook. This default supplies no value for any entity.
  def resolve(name)
    nil
  end

  # Start of the document.
  def startDocument; end

  # End of the document.
  def endDocument; end

  # A start tag. The last argument is the parser that made the call.
  def startElement(name, namespace, qname, prefix, attributeCount, isEmptyElement, saxParser); end

  # An end tag.
  def endElement(name, namespace, qname, prefix); end

  # An entity reference.
  def entityRef(name, text); end

  # A run of text.
  def text(text, isWhitespace); end

  # A CDATA section.
  def cdataSection(text); end

  # A run of ignorable whitespace.
  def ignoreableWhitespace(text); end

  # A processing instruction.
  def processingInstruction(text); end

  # A doctype declaration.
  def doctype(text); end

  # A comment.
  def comment(text); end
end
|
58
|
+
|