xmlscan 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +1276 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +31 -0
- data/README.rdoc +365 -0
- data/Rakefile +65 -0
- data/THANKS +11 -0
- data/VERSION +1 -0
- data/install.rb +41 -0
- data/lib/xmlscan/htmlscan.rb +290 -0
- data/lib/xmlscan/namespace.rb +353 -0
- data/lib/xmlscan/parser.rb +300 -0
- data/lib/xmlscan/scanner.rb +1123 -0
- data/lib/xmlscan/version.rb +23 -0
- data/lib/xmlscan/visitor.rb +162 -0
- data/lib/xmlscan/xmlchar.rb +248 -0
- data/test.rb +7 -0
- metadata +113 -0
data/Gemfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
# Add dependencies to develop your gem here.
|
7
|
+
# Include everything needed to run rake, tests, features, etc.
|
8
|
+
group :development do
|
9
|
+
gem "rspec", "~> 2.8.0"
|
10
|
+
gem "rdoc", "~> 3.12"
|
11
|
+
gem "bundler", "~> 1.0.0"
|
12
|
+
gem "jeweler", "~> 1.8.3"
|
13
|
+
#gem "rcov", ">= 0"
|
14
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.1.3)
|
5
|
+
git (1.2.5)
|
6
|
+
jeweler (1.8.3)
|
7
|
+
bundler (~> 1.0)
|
8
|
+
git (>= 1.2.5)
|
9
|
+
rake
|
10
|
+
rdoc
|
11
|
+
json (1.6.5)
|
12
|
+
rake (0.9.2.2)
|
13
|
+
rdoc (3.12)
|
14
|
+
json (~> 1.4)
|
15
|
+
rspec (2.8.0)
|
16
|
+
rspec-core (~> 2.8.0)
|
17
|
+
rspec-expectations (~> 2.8.0)
|
18
|
+
rspec-mocks (~> 2.8.0)
|
19
|
+
rspec-core (2.8.0)
|
20
|
+
rspec-expectations (2.8.0)
|
21
|
+
diff-lcs (~> 1.1.2)
|
22
|
+
rspec-mocks (2.8.0)
|
23
|
+
|
24
|
+
PLATFORMS
|
25
|
+
ruby
|
26
|
+
|
27
|
+
DEPENDENCIES
|
28
|
+
bundler (~> 1.0.0)
|
29
|
+
jeweler (~> 1.8.3)
|
30
|
+
rdoc (~> 3.12)
|
31
|
+
rspec (~> 2.8.0)
|
data/README.rdoc
ADDED
@@ -0,0 +1,365 @@
|
|
1
|
+
|
2
|
+
== xmlscan
|
3
|
+
|
4
|
+
The fastest XML parser written in 100% pure Ruby.
|
5
|
+
|
6
|
+
== Abstract
|
7
|
+
|
8
|
+
XMLscan is a non-validating XML parser written in 100% pure Ruby.
|
9
|
+
|
10
|
+
== XMLscan's features are as follows:
|
11
|
+
|
12
|
+
* 100% pure Ruby
|
13
|
+
|
14
|
+
XMLscan doesn't require any extension libraries, so
|
15
|
+
it works completely with only a Ruby interpreter, version
|
16
|
+
1.6 or above.
|
17
|
+
(It also needs no standard-bundled extension library.)
|
18
|
+
|
19
|
+
* Compliant to the specification
|
20
|
+
|
21
|
+
XMLscan has been developed to satisfy all conditions,
|
22
|
+
described in the XML 1.0 Specification and required of a
|
23
|
+
non-validating XML processor.
|
24
|
+
|
25
|
+
* High-speed
|
26
|
+
|
27
|
+
XMLscan is, probably, the fastest parser among all
|
28
|
+
existing XML/HTML parsers written in pure Ruby.
|
29
|
+
|
30
|
+
* Support for various CES.
|
31
|
+
|
32
|
+
XMLscan can parse an XML document encoded in at least
|
33
|
+
iso-8859-*, EUC-*, Shift_JIS, and UTF-8 as it is.
|
34
|
+
UTF-16 is not supported directly, though.
|
35
|
+
|
36
|
+
* Just parsing
|
37
|
+
|
38
|
+
The role of xmlscan is just to parse an XML document.
|
39
|
+
XMLscan doesn't provide high-level features to easily
|
40
|
+
handle an XML document. XMLscan is assumed to be used as
|
41
|
+
a core part of a library providing such features.
|
42
|
+
|
43
|
+
* HTML
|
44
|
+
|
45
|
+
XMLscan contains htmlscan, an HTML parser.
|
46
|
+
|
47
|
+
|
48
|
+
=== Character encodings
|
49
|
+
|
50
|
+
By default, the value of global variable $KCODE decides
|
51
|
+
which CES (character encoding scheme) is assumed for xmlscan
|
52
|
+
to parse an XML document.
|
53
|
+
|
54
|
+
You need to set $KCODE or XMLScan::XMLScanner#kcode=
|
55
|
+
to an appropriate value to parse an XML document encoded in EUC-*,
|
56
|
+
Shift_JIS, or UTF-8.
|
57
|
+
|
58
|
+
UTF-16 is not supported directly. You should convert it into
|
59
|
+
UTF-8 before parsing.
|
60
|
+
|
61
|
+
=== XML Namespaces
|
62
|
+
|
63
|
+
XML Namespaces have been already implemented in
|
64
|
+
xmlscan/namespace.rb. However, since its interface is going
|
65
|
+
to be modified, this feature is undocumented now.
|
66
|
+
|
67
|
+
== Class Reference
|
68
|
+
|
69
|
+
XMLScan::Error
|
70
|
+
The superclass for all exceptions related to xmlscan.
|
71
|
+
These exceptions are raised by XMLScan::Visitor
|
72
|
+
by default when it receives an error report from a parser,
|
73
|
+
such as XMLScan::XMLScanner or XMLScan::XMLParser.
|
74
|
+
Each parser never raises these exceptions by itself.
|
75
|
+
|
76
|
+
XMLScan::ParseError
|
77
|
+
|
78
|
+
An error other than a constraint violation, for example,
|
79
|
+
an XML document that does not match a production.
|
80
|
+
|
81
|
+
XMLScan::NotWellFormedError
|
82
|
+
|
83
|
+
Raised when an XML document violates a well-formedness
|
84
|
+
constraint.
|
85
|
+
|
86
|
+
XMLScan::NotValidError
|
87
|
+
|
88
|
+
Raised when an XML document violates a validity constraint.
|
89
|
+
|
90
|
+
|
91
|
+
XMLScan::Visitor
|
92
|
+
Mix-in for receiving the result of parsing an XML document.
|
93
|
+
Each parser included in xmlscan parses an XML document from
|
94
|
+
the beginning, and calls each specific method of the given instance of
|
95
|
+
XMLScan::Visitor for each syntactic element, such as a tag.
|
96
|
+
It is ensured that these calls are made in order of appearance
|
97
|
+
in the document from the beginning.
|
98
|
+
Methods:
|
99
|
+
Without special notice, the following methods do nothing by
|
100
|
+
default.
|
101
|
+
|
102
|
+
XMLScan::Visitor#parse_error(msg)
|
103
|
+
|
104
|
+
Called when the parser meets an error other than a constraint
|
105
|
+
violation, for example, when an XML document does not match
|
106
|
+
a production. By default, this method raises
|
107
|
+
XMLScan::ParseError exception. If no exception is
|
108
|
+
raised and this method returns normally, the parser recovers
|
109
|
+
from the error and continues to parse.
|
110
|
+
XMLScan::Visitor#wellformed_error(msg)
|
111
|
+
|
112
|
+
Called when the parser meets a well-formedness constraint
|
113
|
+
violation. By default, this method raises
|
114
|
+
XMLScan::NotWellFormedError exception. If no exception
|
115
|
+
is raised and this method returns normally, the parser recovers
|
116
|
+
from the error and continues to parse.
|
117
|
+
XMLScan::Visitor#valid_error(msg)
|
118
|
+
|
119
|
+
Called when the parser meets a validity constraint
|
120
|
+
violation. By default, this method raises
|
121
|
+
XMLScan::NotValidError exception. If no exception
|
122
|
+
is raised and this method returns normally, the parser recovers
|
123
|
+
from the error and continues to parse.
|
124
|
+
FYI, the current version of xmlscan includes no validating XML
|
125
|
+
processor. This method is reserved for future versions.
|
126
|
+
XMLScan::Visitor#warning(msg)
|
127
|
+
|
128
|
+
Called when the parser meets a non-error but unrecommended
|
129
|
+
thing or a syntax which xmlscan is not able to parse.
|
130
|
+
XMLScan::Visitor#on_start_document
|
131
|
+
|
132
|
+
Called just before the parser starts parsing an XML document.
|
133
|
+
After this method is called, the corresponding
|
134
|
+
XMLScan::Visitor#on_end_document method is always called.
|
135
|
+
XMLScan::Visitor#on_end_document
|
136
|
+
|
137
|
+
Called after the parser reaches the end of an XML document.
|
138
|
+
XMLScan::Visitor#on_xmldecl
|
139
|
+
XMLScan::Visitor#on_xmldecl_version(str)
|
140
|
+
XMLScan::Visitor#on_xmldecl_encoding(str)
|
141
|
+
XMLScan::Visitor#on_xmldecl_standalone(str)
|
142
|
+
XMLScan::Visitor#on_xmldecl_other(name, value)
|
143
|
+
XMLScan::Visitor#on_xmldecl_end
|
144
|
+
|
145
|
+
Called when the parser meets an XML declaration.
|
146
|
+
<?xml version="1.0" encoding="euc-jp" standalone="yes" ?>
|
147
|
+
^ ^ ^ ^ ^
|
148
|
+
1 2 3 4 5
|
149
|
+
|
150
|
+
method argument
|
151
|
+
--------------------------------------
|
152
|
+
1: on_xmldecl
|
153
|
+
2: on_xmldecl_version ("1.0")
|
154
|
+
3: on_xmldecl_encoding ("euc-jp")
|
155
|
+
4: on_xmldecl_standalone ("yes")
|
156
|
+
5: on_xmldecl_end
|
157
|
+
When an XML declaration is found, both on_xmldecl and
|
158
|
+
on_xmldecl_end methods are always called. Any other methods
|
159
|
+
are called only when the corresponding syntaxes are found.
|
160
|
+
When a declaration other than version, encoding, and standalone
|
161
|
+
is found in an XML declaration, on_xmldecl_other method is
|
162
|
+
called. Since such a declaration is not permitted, note that
|
163
|
+
the parser always calls XMLScan::Visitor#parse_error method
|
164
|
+
before calling on_xmldecl_other method.
|
165
|
+
XMLScan::Visitor#on_doctype(root, pubid, sysid)
|
166
|
+
|
167
|
+
Called when the parser meets a document type declaration.
|
168
|
+
document argument
|
169
|
+
--------------------------------------------------------------
|
170
|
+
1: <!DOCTYPE foo> ('foo', nil, nil)
|
171
|
+
2: <!DOCTYPE foo SYSTEM "bar"> ('foo', nil, 'bar')
|
172
|
+
3: <!DOCTYPE foo PUBLIC "bar"> ('foo', 'bar', nil )
|
173
|
+
4: <!DOCTYPE foo PUBLIC "bar" "baz"> ('foo', 'bar', 'baz')
|
174
|
+
XMLScan::Visitor#on_prolog_space(str)
|
175
|
+
|
176
|
+
Called when the parser meets whitespace in the prolog.
|
177
|
+
XMLScan::Visitor#on_comment(str)
|
178
|
+
|
179
|
+
Called when the parser meets a comment.
|
180
|
+
XMLScan::Visitor#on_pi(target, pi)
|
181
|
+
|
182
|
+
Called when the parser meets a processing instruction.
|
183
|
+
XMLScan::Visitor#on_chardata(str)
|
184
|
+
|
185
|
+
Called when the parser meets character data.
|
186
|
+
XMLScan::Visitor#on_cdata(str)
|
187
|
+
|
188
|
+
Called when the parser meets a CDATA section.
|
189
|
+
XMLScan::Visitor#on_entityref(ref)
|
190
|
+
|
191
|
+
Called when the parser meets a general entity reference
|
192
|
+
in a place other than an attribute value.
|
193
|
+
XMLScan::Visitor#on_charref(code)
|
194
|
+
XMLScan::Visitor#on_charref_hex(code)
|
195
|
+
|
196
|
+
Called when the parser meets a character reference
|
197
|
+
in a place other than an attribute value.
|
198
|
+
When the character code is represented by decimals,
|
199
|
+
on_charref is called. When by hexadecimals, on_charref_hex
|
200
|
+
is called. code is an integer.
|
201
|
+
XMLScan::Visitor#on_stag(name)
|
202
|
+
XMLScan::Visitor#on_attribute(name)
|
203
|
+
XMLScan::Visitor#on_attr_value(str)
|
204
|
+
XMLScan::Visitor#on_attr_entityref(ref)
|
205
|
+
XMLScan::Visitor#on_attr_charref(code)
|
206
|
+
XMLScan::Visitor#on_attr_charref_hex(code)
|
207
|
+
XMLScan::Visitor#on_attribute_end(name)
|
208
|
+
XMLScan::Visitor#on_stag_end_empty(name)
|
209
|
+
XMLScan::Visitor#on_stag_end(name)
|
210
|
+
|
211
|
+
Called when the parser meets a start tag.
|
212
|
+
<hoge fuga="foo&bar;&#38;&#x26;baz" >
|
213
|
+
^ ^ ^ ^ ^ ^ ^ ^ ^
|
214
|
+
1 2 3 4 5 6 7 8 9
|
215
|
+
|
216
|
+
method argument
|
217
|
+
------------------------------------
|
218
|
+
1: on_stag ('hoge')
|
219
|
+
2: on_attribute ('fuga')
|
220
|
+
3: on_attr_value ('foo')
|
221
|
+
4: on_attr_entityref ('bar')
|
222
|
+
5: on_attr_charref (38)
|
223
|
+
6: on_attr_charref_hex (38)
|
224
|
+
7: on_attr_value ('baz')
|
225
|
+
8: on_attribute_end ('fuga')
|
226
|
+
9: on_stag_end ('hoge')
|
227
|
+
or
|
228
|
+
on_stag_end_empty ('hoge')
|
229
|
+
When a start tag is found, both on_stag and the corresponding
|
230
|
+
either on_stag_end or on_stag_end_empty method are always
|
231
|
+
called. Any other methods are called only when at least one
|
232
|
+
attribute is found in the start tag.
|
233
|
+
When an attribute is found, both on_attribute and
|
234
|
+
on_attribute_end methods are always called. If the attribute
|
235
|
+
value is empty, only these two methods are called.
|
236
|
+
When the parser meets a general entity reference in an
|
237
|
+
attribute value, it calls on_attr_entityref method.
|
238
|
+
When the parser meets a character reference in an attribute
|
239
|
+
value, it calls either on_attr_charref or on_attr_charref_hex method.
|
240
|
+
If the tag is an empty element tag, on_stag_end_empty method
|
241
|
+
is called instead of on_stag_end method.
|
242
|
+
XMLScan::Visitor#on_etag(name)
|
243
|
+
|
244
|
+
Called when the parser meets an end tag.
|
245
|
+
|
246
|
+
XMLScan::XMLScanner
|
247
|
+
The scanner which tokenizes an XML document and recognizes tags,
|
248
|
+
and so on.
|
249
|
+
The conformance of XMLScan::XMLScanner to the specification
|
250
|
+
is described in another document.
|
251
|
+
SuperClass:
|
252
|
+
|
253
|
+
Object
|
254
|
+
|
255
|
+
Class Methods:
|
256
|
+
|
257
|
+
XMLScan::XMLScanner.new(visitor[, option ...])
|
258
|
+
|
259
|
+
Creates an instance. visitor is an instance of
|
260
|
+
XMLScan::Visitor and receives the result of parsing
|
261
|
+
from the XMLScan::XMLScanner object.
|
262
|
+
You can specify one or more options as strings or symbols.
|
263
|
+
XMLScan::XMLScanner's options are as follows:
|
264
|
+
|
265
|
+
'strict_char'
|
266
|
+
|
267
|
+
This option is enabled after
|
268
|
+
require 'xmlscan/xmlchar'.
|
269
|
+
XMLScan::XMLScanner checks whether an XML document includes
|
270
|
+
an illegal character. The performance decreases sharply.
|
271
|
+
|
272
|
+
|
273
|
+
|
274
|
+
Methods:
|
275
|
+
|
276
|
+
XMLScan::XMLScanner#kcode= arg
|
277
|
+
|
278
|
+
Sets CES. Available values for code are the same as $KCODE
|
279
|
+
except nil. If code is nil, $KCODE decides the CES.
|
280
|
+
XMLScan::XMLScanner#kcode
|
281
|
+
|
282
|
+
Returns CES. The format of the return value is the same as
|
283
|
+
Regexp#kcode. If this method returns nil, it represents that
|
284
|
+
$KCODE decides the CES.
|
285
|
+
XMLScan::XMLScanner#parse(source)
|
286
|
+
|
287
|
+
Parses source as an XML document. source must be
|
288
|
+
a string, an array of strings, or an object which responds to
|
289
|
+
a gets method which behaves the same as IO#gets does.
|
290
|
+
|
291
|
+
XMLScan::XMLParser
|
292
|
+
The non-validating XML parser.
|
293
|
+
The conformance of XMLScan::XMLParser to the specification
|
294
|
+
is described in another document.
|
295
|
+
SuperClass:
|
296
|
+
|
297
|
+
XMLScan::XMLScanner
|
298
|
+
|
299
|
+
Class Methods:
|
300
|
+
|
301
|
+
XMLScan::XMLParser.new(visitor[, option ...])
|
302
|
+
|
303
|
+
XMLScan::XMLParser makes sure the following for each
|
304
|
+
method of visitor:
|
305
|
+
|
306
|
+
XMLScan::Visitor#on_stag
|
307
|
+
|
308
|
+
After calling this method, XMLScan::XMLParser always calls the
|
309
|
+
corresponding XMLScan::Visitor#on_etag method.
|
310
|
+
|
311
|
+
|
312
|
+
In addition, if you never intend error recovery, method calls
|
313
|
+
which must not occur in a well-formed XML document are
|
314
|
+
all suppressed.
|
315
|
+
|
316
|
+
XMLScan::HTMLScanner
|
317
|
+
An HTML parser based on XMLScan::XMLScanner.
|
318
|
+
The conformance of XMLScan::HTMLScanner to the specification
|
319
|
+
is described in another document.
|
320
|
+
SuperClass:
|
321
|
+
|
322
|
+
XMLScan::XMLScanner
|
323
|
+
|
324
|
+
Class Methods:
|
325
|
+
|
326
|
+
XMLScan::HTMLScanner.new(visitor[, option ...])
|
327
|
+
|
328
|
+
XMLScan::HTMLScanner makes sure the following for each
|
329
|
+
method of visitor:
|
330
|
+
|
331
|
+
XMLScan::Visitor#on_xmldecl
|
332
|
+
XMLScan::Visitor#on_xmldecl_version
|
333
|
+
XMLScan::Visitor#on_xmldecl_encoding
|
334
|
+
XMLScan::Visitor#on_xmldecl_standalone
|
335
|
+
XMLScan::Visitor#on_xmldecl_end
|
336
|
+
|
337
|
+
An XML declaration never appears in an HTML document,
|
338
|
+
so XMLScan::HTMLScanner never calls these methods.
|
339
|
+
|
340
|
+
XMLScan::Visitor#on_stag_end_empty
|
341
|
+
|
342
|
+
An empty element tag never appears in an HTML document,
|
343
|
+
so XMLScan::HTMLScanner never calls this method.
|
344
|
+
An empty element tag causes a parse error.
|
345
|
+
|
346
|
+
XMLScan::Visitor#wellformed_error
|
347
|
+
|
348
|
+
There is no well-formedness constraint for HTML,
|
349
|
+
so XMLScan::HTMLScanner never calls this method.
|
350
|
+
|
351
|
+
|
352
|
+
|
353
|
+
|
354
|
+
|
355
|
+
|
356
|
+
=== LICENCE
|
357
|
+
|
358
|
+
This module's license is the same as Ruby's. You can redistribute
|
359
|
+
it and/or modify it under the same terms as Ruby.
|
360
|
+
|
361
|
+
* Japanese: http://www.ruby-lang.org/ja/LICENSE.txt
|
362
|
+
* English: http://www.ruby-lang.org/en/LICENSE.txt
|
363
|
+
|
364
|
+
UENO Katsuhiro
|
365
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'bundler'
|
6
|
+
|
7
|
+
begin
|
8
|
+
Bundler.setup(:default, :development)
|
9
|
+
rescue Bundler::BundlerError => e
|
10
|
+
$stderr.puts e.message
|
11
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
12
|
+
exit e.status_code
|
13
|
+
end
|
14
|
+
|
15
|
+
require 'rake'
|
16
|
+
|
17
|
+
begin
|
18
|
+
require 'jeweler'
|
19
|
+
Jeweler::Tasks.new do |gem|
|
20
|
+
gem.name = "xmlscan"
|
21
|
+
gem.version = '0.2.3'
|
22
|
+
gem.license = "MIT"
|
23
|
+
gem.summary = "The fastest XML parser written in 100% pure Ruby."
|
24
|
+
gem.email = "gerryg@inbox.com"
|
25
|
+
gem.homepage = "http://github.com/GerryG/xmlformat/"
|
26
|
+
gem.description = "The fastest XML parser written in 100% pure Ruby."
|
27
|
+
gem.authors = ["UENO Katsuhiro <katsu@blue.sky.or.jp>"]
|
28
|
+
gem.files = FileList[
|
29
|
+
'[A-Z]*',
|
30
|
+
'*.rb',
|
31
|
+
'lib/**/*.rb',
|
32
|
+
'spec/**/*.rb' ].to_a
|
33
|
+
gem.test_files = Dir.glob('spec/*_spec.rb')
|
34
|
+
gem.has_rdoc = true
|
35
|
+
gem.extra_rdoc_files = [ "README.rdoc", "CHANGES" ]
|
36
|
+
gem.rdoc_options = ["--main", "README.rdoc", "--inline-source", "--line-numbers"]
|
37
|
+
end
|
38
|
+
rescue LoadError
|
39
|
+
puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
|
40
|
+
end
|
41
|
+
|
42
|
+
Jeweler::RubygemsDotOrgTasks.new
|
43
|
+
|
44
|
+
require 'rspec/core'
|
45
|
+
require 'rspec/core/rake_task'
|
46
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
47
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
48
|
+
end
|
49
|
+
|
50
|
+
RSpec::Core::RakeTask.new(:rcov) do |spec|
|
51
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
52
|
+
spec.rcov = true
|
53
|
+
end
|
54
|
+
|
55
|
+
task :default => :spec
|
56
|
+
|
57
|
+
require 'rdoc/task'
|
58
|
+
Rake::RDocTask.new do |rdoc|
|
59
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
60
|
+
|
61
|
+
rdoc.rdoc_dir = 'rdoc'
|
62
|
+
rdoc.title = "xmlscan #{version}"
|
63
|
+
rdoc.rdoc_files.include('README*')
|
64
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
65
|
+
end
|