xmlscan 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +1276 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +31 -0
- data/README.rdoc +365 -0
- data/Rakefile +65 -0
- data/THANKS +11 -0
- data/VERSION +1 -0
- data/install.rb +41 -0
- data/lib/xmlscan/htmlscan.rb +290 -0
- data/lib/xmlscan/namespace.rb +353 -0
- data/lib/xmlscan/parser.rb +300 -0
- data/lib/xmlscan/scanner.rb +1123 -0
- data/lib/xmlscan/version.rb +23 -0
- data/lib/xmlscan/visitor.rb +162 -0
- data/lib/xmlscan/xmlchar.rb +248 -0
- data/test.rb +7 -0
- metadata +113 -0
data/Gemfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
# Add dependencies to develop your gem here.
|
7
|
+
# Include everything needed to run rake, tests, features, etc.
|
8
|
+
group :development do
|
9
|
+
gem "rspec", "~> 2.8.0"
|
10
|
+
gem "rdoc", "~> 3.12"
|
11
|
+
gem "bundler", "~> 1.0.0"
|
12
|
+
gem "jeweler", "~> 1.8.3"
|
13
|
+
#gem "rcov", ">= 0"
|
14
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.1.3)
|
5
|
+
git (1.2.5)
|
6
|
+
jeweler (1.8.3)
|
7
|
+
bundler (~> 1.0)
|
8
|
+
git (>= 1.2.5)
|
9
|
+
rake
|
10
|
+
rdoc
|
11
|
+
json (1.6.5)
|
12
|
+
rake (0.9.2.2)
|
13
|
+
rdoc (3.12)
|
14
|
+
json (~> 1.4)
|
15
|
+
rspec (2.8.0)
|
16
|
+
rspec-core (~> 2.8.0)
|
17
|
+
rspec-expectations (~> 2.8.0)
|
18
|
+
rspec-mocks (~> 2.8.0)
|
19
|
+
rspec-core (2.8.0)
|
20
|
+
rspec-expectations (2.8.0)
|
21
|
+
diff-lcs (~> 1.1.2)
|
22
|
+
rspec-mocks (2.8.0)
|
23
|
+
|
24
|
+
PLATFORMS
|
25
|
+
ruby
|
26
|
+
|
27
|
+
DEPENDENCIES
|
28
|
+
bundler (~> 1.0.0)
|
29
|
+
jeweler (~> 1.8.3)
|
30
|
+
rdoc (~> 3.12)
|
31
|
+
rspec (~> 2.8.0)
|
data/README.rdoc
ADDED
@@ -0,0 +1,365 @@
|
|
1
|
+
|
2
|
+
== xmlscan
|
3
|
+
|
4
|
+
The fastest XML parser written in 100% pure Ruby.
|
5
|
+
|
6
|
+
== Abstract
|
7
|
+
|
8
|
+
XMLscan is a non-validating XML parser written in 100% pure Ruby.
|
9
|
+
|
10
|
+
== XMLscan's features are as follows:
|
11
|
+
|
12
|
+
* 100% pure Ruby
|
13
|
+
|
14
|
+
XMLscan doesn't require any extension libraries, so
|
15
|
+
it completely works only with a Ruby interpreter version
|
16
|
+
1.6 or above.
|
17
|
+
(It also needs no standard-bundled extension library.)
|
18
|
+
|
19
|
+
* Compliant to the specification
|
20
|
+
|
21
|
+
XMLscan has been developed to satisfy all conditions,
|
22
|
+
described in the XML 1.0 Specification and required of a
|
23
|
+
non-validating XML processor
|
24
|
+
|
25
|
+
* High-speed
|
26
|
+
|
27
|
+
XMLscan is, probably, the fastest parser among all
|
28
|
+
existing XML/HTML parsers written in pure Ruby.
|
29
|
+
|
30
|
+
* Support for various CES.
|
31
|
+
|
32
|
+
XMLscan can parse an XML document encoded in at least
|
33
|
+
iso-8859-*, EUC-*, Shift_JIS, and UTF-8 as it is.
|
34
|
+
UTF-16 is not supported directly, though.
|
35
|
+
|
36
|
+
* Just parsing
|
37
|
+
|
38
|
+
The role of xmlscan is just to parse an XML document.
|
39
|
+
XMLscan doesn't provide high-level features to easily
|
40
|
+
handle an XML document. XMLscan is assumed to be used as
|
41
|
+
a core part of a library providing such features.
|
42
|
+
|
43
|
+
* HTML
|
44
|
+
|
45
|
+
XMLscan contains htmlscan, an HTML parser.
|
46
|
+
|
47
|
+
|
48
|
+
=== Character encodings
|
49
|
+
|
50
|
+
By default, the value of global variable $KCODE decides
|
51
|
+
which CES (character encoding scheme) is assumed for xmlscan
|
52
|
+
to parse an XML document.
|
53
|
+
|
54
|
+
You need to set $KCODE or XMLScan::XMLScanner#kcode=
|
55
|
+
to an appropriate value to parse an XML document encoded in EUC-*,
|
56
|
+
Shift_JIS, or UTF-8.
|
57
|
+
|
58
|
+
UTF-16 is not supported directly. You should convert it into
|
59
|
+
UTF-8 before parsing.
|
60
|
+
|
61
|
+
=== XML Namespaces
|
62
|
+
|
63
|
+
XML Namespaces have been already implemented in
|
64
|
+
xmlscan/namespace.rb. However, since its interface is going
|
65
|
+
to be modified, this feature is undocumented now.
|
66
|
+
|
67
|
+
== Class Reference
|
68
|
+
|
69
|
+
XMLScan::Error
|
70
|
+
The superclass for all exceptions related to xmlscan.
|
71
|
+
These exceptions are raised by XMLScan::Visitor
|
72
|
+
by default when it receives an error report from a parser,
|
73
|
+
such as XMLScan::XMLScanner or XMLScan::XMLParser.
|
74
|
+
Each parser never raises these exceptions by itself.
|
75
|
+
|
76
|
+
XMLScan::ParseError
|
77
|
+
|
78
|
+
An error other than a constraint violation, for example,
|
79
|
+
an XML document is unmatched with a production.
|
80
|
+
|
81
|
+
XMLScan::NotWellFormedError
|
82
|
+
|
83
|
+
Raised when an XML document violates a well-formedness
|
84
|
+
constraint.
|
85
|
+
|
86
|
+
XMLScan::NotValidError
|
87
|
+
|
88
|
+
Raised when an XML document violates a validity constraint.
|
89
|
+
|
90
|
+
|
91
|
+
XMLScan::Visitor
|
92
|
+
Mix-in for receiving the result of parsing an XML document.
|
93
|
+
Each parser included in xmlscan parses an XML document from
|
94
|
+
the beginning, and calls each specific method of given instance of
|
95
|
+
XMLScan::Visitor for each syntactic element, such as a tag.
|
96
|
+
It is ensured that these calls are made in order of appearance
|
97
|
+
in the document from the beginning.
|
98
|
+
Methods:
|
99
|
+
Without special notice, the following methods do nothing by
|
100
|
+
default.
|
101
|
+
|
102
|
+
XMLScan::Visitor#parse_error(msg)
|
103
|
+
|
104
|
+
Called when the parser meets an error other than a constraint
|
105
|
+
violation, for example, an XML document is unmatched with
|
106
|
+
a production. By default, this method raises
|
107
|
+
XMLScan::ParseError exception. If no exception is
|
108
|
+
raised and this method returns normally, the parser recovers
|
109
|
+
the error and continues to parse.
|
110
|
+
XMLScan::Visitor#wellformed_error(msg)
|
111
|
+
|
112
|
+
Called when the parser meets a well-formedness constraint
|
113
|
+
violation. By default, this method raises
|
114
|
+
XMLScan::NotWellFormedError exception. If no exception
|
115
|
+
is raised and this method returns normally, the parser recovers
|
116
|
+
the error and continues to parse.
|
117
|
+
XMLScan::Visitor#valid_error(msg)
|
118
|
+
|
119
|
+
Called when the parser meets a validity constraint
|
120
|
+
violation. By default, this method raises
|
121
|
+
XMLScan::NotValidError exception. If no exception
|
122
|
+
is raised and this method returns normally, the parser recovers
|
123
|
+
the error and continues to parse.
|
124
|
+
FYI, current version of xmlscan includes no validating XML
|
125
|
+
processor. This method is reserved for future versions.
|
126
|
+
XMLScan::Visitor#warning(msg)
|
127
|
+
|
128
|
+
Called when the parser meets a non-error but unrecommended
|
129
|
+
thing or a syntax which xmlscan is not able to parse.
|
130
|
+
XMLScan::Visitor#on_start_document
|
131
|
+
|
132
|
+
Called just before the parser starts parsing an XML document.
|
133
|
+
After this method is called, corresponding
|
134
|
+
XMLScan::Visitor#on_end_document method is always called.
|
135
|
+
XMLScan::Visitor#on_end_document
|
136
|
+
|
137
|
+
Called after the parser reaches the end of an XML document.
|
138
|
+
XMLScan::Visitor#on_xmldecl
|
139
|
+
XMLScan::Visitor#on_xmldecl_version(str)
|
140
|
+
XMLScan::Visitor#on_xmldecl_encoding(str)
|
141
|
+
XMLScan::Visitor#on_xmldecl_standalone(str)
|
142
|
+
XMLScan::Visitor#on_xmldecl_other(name, value)
|
143
|
+
XMLScan::Visitor#on_xmldecl_end
|
144
|
+
|
145
|
+
Called when the parser meets an XML declaration.
|
146
|
+
<?xml version="1.0" encoding="euc-jp" standalone="yes" ?>
|
147
|
+
^ ^ ^ ^ ^
|
148
|
+
1 2 3 4 5
|
149
|
+
|
150
|
+
method argument
|
151
|
+
--------------------------------------
|
152
|
+
1: on_xmldecl
|
153
|
+
2: on_xmldecl_version ("1.0")
|
154
|
+
3: on_xmldecl_encoding ("euc-jp")
|
155
|
+
4: on_xmldecl_standalone ("yes")
|
156
|
+
5: on_xmldecl_end
|
157
|
+
When an XML declaration is found, both on_xmldecl and
|
158
|
+
on_xmldecl_end method are always called. Any other methods
|
159
|
+
are called only when the corresponding syntaxes are found.
|
160
|
+
When a declaration other than version, encoding, and standalone
|
161
|
+
is found in an XML declaration, on_xmldecl_other method is
|
162
|
+
called. Since such a declaration is not permitted, note that
|
163
|
+
the parser always calls XMLScan::Visitor#parse_error method
|
164
|
+
before calling on_xmldecl_other method.
|
165
|
+
XMLScan::Visitor#on_doctype(root, pubid, sysid)
|
166
|
+
|
167
|
+
Called when the parser meets a document type declaration.
|
168
|
+
document argument
|
169
|
+
--------------------------------------------------------------
|
170
|
+
1: <!DOCTYPE foo> ('foo', nil, nil)
|
171
|
+
2: <!DOCTYPE foo SYSTEM "bar"> ('foo', nil, 'bar')
|
172
|
+
3: <!DOCTYPE foo PUBLIC "bar"> ('foo', 'bar', nil )
|
173
|
+
4: <!DOCTYPE foo PUBLIC "bar" "baz"> ('foo', 'bar', 'baz')
|
174
|
+
XMLScan::Visitor#on_prolog_space(str)
|
175
|
+
|
176
|
+
Called when the parser meets whitespace in the prolog.
|
177
|
+
XMLScan::Visitor#on_comment(str)
|
178
|
+
|
179
|
+
Called when the parser meets a comment.
|
180
|
+
XMLScan::Visitor#on_pi(target, pi)
|
181
|
+
|
182
|
+
Called when the parser meets a processing instruction.
|
183
|
+
XMLScan::Visitor#on_chardata(str)
|
184
|
+
|
185
|
+
Called when the parser meets character data.
|
186
|
+
XMLScan::Visitor#on_cdata(str)
|
187
|
+
|
188
|
+
Called when the parser meets a CDATA section.
|
189
|
+
XMLScan::Visitor#on_entityref(ref)
|
190
|
+
|
191
|
+
Called when the parser meets a general entity reference
|
192
|
+
in a place except an attribute value.
|
193
|
+
XMLScan::Visitor#on_charref(code)
|
194
|
+
XMLScan::Visitor#on_charref_hex(code)
|
195
|
+
|
196
|
+
Called when the parser meets a character reference
|
197
|
+
in a place except an attribute value.
|
198
|
+
When the character code is represented by decimals,
|
199
|
+
on_charref is called. When by hexadecimals, on_charref_hex
|
200
|
+
is called. code is an integer.
|
201
|
+
XMLScan::Visitor#on_stag(name)
|
202
|
+
XMLScan::Visitor#on_attribute(name)
|
203
|
+
XMLScan::Visitor#on_attr_value(str)
|
204
|
+
XMLScan::Visitor#on_attr_entityref(ref)
|
205
|
+
XMLScan::Visitor#on_attr_charref(code)
|
206
|
+
XMLScan::Visitor#on_attr_charref_hex(code)
|
207
|
+
XMLScan::Visitor#on_attribute_end(name)
|
208
|
+
XMLScan::Visitor#on_stag_end_empty(name)
|
209
|
+
XMLScan::Visitor#on_stag_end(name)
|
210
|
+
|
211
|
+
Called when the parser meets a start tag.
|
212
|
+
<hoge fuga="foo&bar;&#38;&#x26;baz" >
|
213
|
+
^ ^ ^ ^ ^ ^ ^ ^ ^
|
214
|
+
1 2 3 4 5 6 7 8 9
|
215
|
+
|
216
|
+
method argument
|
217
|
+
------------------------------------
|
218
|
+
1: on_stag ('hoge')
|
219
|
+
2: on_attribute ('fuga')
|
220
|
+
3: on_attr_value ('foo')
|
221
|
+
4: on_attr_entityref ('bar')
|
222
|
+
5: on_attr_charref (38)
|
223
|
+
6: on_attr_charref_hex (38)
|
224
|
+
7: on_attr_value ('baz')
|
225
|
+
8: on_attribute_end ('fuga')
|
226
|
+
9: on_stag_end ('hoge')
|
227
|
+
or
|
228
|
+
on_stag_end_empty ('hoge')
|
229
|
+
When a start tag is found, both on_stag and corresponding
|
230
|
+
either on_stag_end or on_stag_end_empty method are always
|
231
|
+
called. Any other methods are called only when at least one
|
232
|
+
attribute is found in the start tag.
|
233
|
+
When an attribute is found, both on_attribute and
|
234
|
+
on_attribute_end method are always called. If the attribute
|
235
|
+
value is empty, only these two methods are called.
|
236
|
+
When the parser meets a general entity reference in an
|
237
|
+
attribute value, it calls on_attr_entityref method.
|
238
|
+
When the parser meets a character reference in an attribute
|
239
|
+
value, it calls either on_attr_charref or on_attr_charref_hex method.
|
240
|
+
If the tag is an empty element tag, on_stag_end_empty method
|
241
|
+
is called instead of on_stag_end method.
|
242
|
+
XMLScan::Visitor#on_etag(name)
|
243
|
+
|
244
|
+
Called when the parser meets an end tag.
|
245
|
+
|
246
|
+
XMLScan::XMLScanner
|
247
|
+
The scanner which tokenizes an XML document and recognizes tags,
|
248
|
+
and so on.
|
249
|
+
The conformance of XMLScan::XMLScanner to the specification
|
250
|
+
is described in another document.
|
251
|
+
SuperClass:
|
252
|
+
|
253
|
+
Object
|
254
|
+
|
255
|
+
Class Methods:
|
256
|
+
|
257
|
+
XMLScan::XMLScanner.new(visitor[, option ...])
|
258
|
+
|
259
|
+
Creates an instance. visitor is an instance of
|
260
|
+
XMLScan::Visitor and receives the result of parsing
|
261
|
+
from the XMLScan::XMLScanner object.
|
262
|
+
You can specify one or more options as strings or symbols.
|
263
|
+
XMLScan::XMLScanner's options are as follows:
|
264
|
+
|
265
|
+
'strict_char'
|
266
|
+
|
267
|
+
This option is enabled after
|
268
|
+
require 'xmlscan/xmlchar'.
|
269
|
+
XMLScan::XMLScanner checks whether an XML document includes
|
270
|
+
an illegal character. The performance decreases sharply.
|
271
|
+
|
272
|
+
|
273
|
+
|
274
|
+
Methods:
|
275
|
+
|
276
|
+
XMLScan::XMLScanner#kcode= arg
|
277
|
+
|
278
|
+
Sets CES. Available values for arg are the same as for $KCODE
|
279
|
+
except nil. If arg is nil, $KCODE decides the CES.
|
280
|
+
XMLScan::XMLScanner#kcode
|
281
|
+
|
282
|
+
Returns CES. The format of the return value is the same as that of
|
283
|
+
Regexp#kcode. If this method returns nil, it represents that
|
284
|
+
$KCODE decides the CES.
|
285
|
+
XMLScan::XMLScanner#parse(source)
|
286
|
+
|
287
|
+
Parses source as an XML document. source must be
|
288
|
+
a string, an array of strings, or an object which responds to
|
289
|
+
a gets method which behaves the same as IO#gets does.
|
290
|
+
|
291
|
+
XMLScan::XMLParser
|
292
|
+
The non-validating XML parser.
|
293
|
+
The conformance of XMLScan::XMLParser to the specification
|
294
|
+
is described in another document.
|
295
|
+
SuperClass:
|
296
|
+
|
297
|
+
XMLScan::XMLScanner
|
298
|
+
|
299
|
+
Class Methods:
|
300
|
+
|
301
|
+
XMLScan::XMLParser.new(visitor[, option ...])
|
302
|
+
|
303
|
+
XMLScan::XMLParser makes sure the following for each
|
304
|
+
method of visitor:
|
305
|
+
|
306
|
+
XMLScan::Visitor#on_stag
|
307
|
+
|
308
|
+
After calling this method, XMLScan::XMLParser always calls the
|
309
|
+
corresponding XMLScan::Visitor#on_etag method.
|
310
|
+
|
311
|
+
|
312
|
+
In addition, if you never intend error recovery, method calls
|
313
|
+
which must not occur in a well-formed XML document are
|
314
|
+
all suppressed.
|
315
|
+
|
316
|
+
XMLScan::HTMLScanner
|
317
|
+
An HTML parser based on XMLScan::XMLScanner.
|
318
|
+
The conformance of XMLScan::HTMLScanner to the specification
|
319
|
+
is described in another document.
|
320
|
+
SuperClass:
|
321
|
+
|
322
|
+
XMLScan::XMLScanner
|
323
|
+
|
324
|
+
Class Methods:
|
325
|
+
|
326
|
+
XMLScan::HTMLScanner.new(visitor[, option ...])
|
327
|
+
|
328
|
+
XMLScan::HTMLScanner makes sure the following for each
|
329
|
+
method of visitor:
|
330
|
+
|
331
|
+
XMLScan::Visitor#on_xmldecl
|
332
|
+
XMLScan::Visitor#on_xmldecl_version
|
333
|
+
XMLScan::Visitor#on_xmldecl_encoding
|
334
|
+
XMLScan::Visitor#on_xmldecl_standalone
|
335
|
+
XMLScan::Visitor#on_xmldecl_end
|
336
|
+
|
337
|
+
An XML declaration never appears in an HTML document,
|
338
|
+
so XMLScan::HTMLScanner never calls these methods.
|
339
|
+
|
340
|
+
XMLScan::Visitor#on_stag_end_empty
|
341
|
+
|
342
|
+
An empty element tag never appears in an HTML document,
|
343
|
+
so XMLScan::HTMLScanner never calls this method.
|
344
|
+
An empty element tag causes a parse error.
|
345
|
+
|
346
|
+
XMLScan::Visitor#wellformed_error
|
347
|
+
|
348
|
+
There is no well-formedness constraint for HTML,
|
349
|
+
so XMLScan::HTMLScanner never calls this method.
|
350
|
+
|
351
|
+
|
352
|
+
|
353
|
+
|
354
|
+
|
355
|
+
|
356
|
+
=== LICENCE
|
357
|
+
|
358
|
+
This module's license is the same as Ruby's. You can redistribute
|
359
|
+
it and/or modify it under the same terms as Ruby.
|
360
|
+
|
361
|
+
* Japanese: http://www.ruby-lang.org/ja/LICENSE.txt
|
362
|
+
* English: http://www.ruby-lang.org/en/LICENSE.txt
|
363
|
+
|
364
|
+
UENO Katsuhiro
|
365
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'bundler'
|
6
|
+
|
7
|
+
begin
|
8
|
+
Bundler.setup(:default, :development)
|
9
|
+
rescue Bundler::BundlerError => e
|
10
|
+
$stderr.puts e.message
|
11
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
12
|
+
exit e.status_code
|
13
|
+
end
|
14
|
+
|
15
|
+
require 'rake'
|
16
|
+
|
17
|
+
begin
|
18
|
+
require 'jeweler'
|
19
|
+
Jeweler::Tasks.new do |gem|
|
20
|
+
gem.name = "xmlscan"
|
21
|
+
gem.version = '0.2.3'
|
22
|
+
gem.license = "MIT"
|
23
|
+
gem.summary = "The fastest XML parser written in 100% pure Ruby."
|
24
|
+
gem.email = "gerryg@inbox.com"
|
25
|
+
gem.homepage = "http://github.com/GerryG/xmlformat/"
|
26
|
+
gem.description = "The fastest XML parser written in 100% pure Ruby."
|
27
|
+
gem.authors = ["UENO Katsuhiro <katsu@blue.sky.or.jp>"]
|
28
|
+
gem.files = FileList[
|
29
|
+
'[A-Z]*',
|
30
|
+
'*.rb',
|
31
|
+
'lib/**/*.rb',
|
32
|
+
'spec/**/*.rb' ].to_a
|
33
|
+
gem.test_files = Dir.glob('spec/*_spec.rb')
|
34
|
+
gem.has_rdoc = true
|
35
|
+
gem.extra_rdoc_files = [ "README.rdoc", "CHANGES" ]
|
36
|
+
gem.rdoc_options = ["--main", "README.rdoc", "--inline-source", "--line-numbers"]
|
37
|
+
end
|
38
|
+
rescue LoadError
|
39
|
+
puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
|
40
|
+
end
|
41
|
+
|
42
|
+
Jeweler::RubygemsDotOrgTasks.new
|
43
|
+
|
44
|
+
require 'rspec/core'
|
45
|
+
require 'rspec/core/rake_task'
|
46
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
47
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
48
|
+
end
|
49
|
+
|
50
|
+
RSpec::Core::RakeTask.new(:rcov) do |spec|
|
51
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
52
|
+
spec.rcov = true
|
53
|
+
end
|
54
|
+
|
55
|
+
task :default => :spec
|
56
|
+
|
57
|
+
require 'rdoc/task'
|
58
|
+
Rake::RDocTask.new do |rdoc|
|
59
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
60
|
+
|
61
|
+
rdoc.rdoc_dir = 'rdoc'
|
62
|
+
rdoc.title = "xmlscan #{version}"
|
63
|
+
rdoc.rdoc_files.include('README*')
|
64
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
65
|
+
end
|