xmlscan 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +1276 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +31 -0
- data/README.rdoc +365 -0
- data/Rakefile +65 -0
- data/THANKS +11 -0
- data/VERSION +1 -0
- data/install.rb +41 -0
- data/lib/xmlscan/htmlscan.rb +290 -0
- data/lib/xmlscan/namespace.rb +353 -0
- data/lib/xmlscan/parser.rb +300 -0
- data/lib/xmlscan/scanner.rb +1123 -0
- data/lib/xmlscan/version.rb +23 -0
- data/lib/xmlscan/visitor.rb +162 -0
- data/lib/xmlscan/xmlchar.rb +248 -0
- data/test.rb +7 -0
- metadata +113 -0
@@ -0,0 +1,23 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# xmlscan/version.rb
|
4
|
+
#
|
5
|
+
# Copyright (C) UENO Katsuhiro 2002,2003
|
6
|
+
#
|
7
|
+
# $Id: version.rb,v 1.8.2.3 2003/05/01 15:50:00 katsu Exp $
|
8
|
+
#
|
9
|
+
|
10
|
+
module XMLScan

  # Version numbering convention:
  #
  # * A TEENY (third component) of 0, as in 'X.X.0', marks an unstable
  #   release. Incompatible changes may be applied to such a version
  #   without special notice, and it should be distributed as a
  #   snapshot only.
  #
  # * A TEENY of 1 or larger (e.g. 'X.X.1' or 'X.X.2') marks a stable
  #   release.

  RELEASE_DATE = '2003-05-02'
  VERSION = '0.2.3'

end
|
@@ -0,0 +1,162 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# xmlscan/visitor.rb
|
4
|
+
#
|
5
|
+
# Copyright (C) Ueno Katsuhiro 2002
|
6
|
+
#
|
7
|
+
# $Id: visitor.rb,v 1.2 2003/01/13 04:07:25 katsu Exp $
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'xmlscan/version'
|
11
|
+
|
12
|
+
|
13
|
+
module XMLScan
|
14
|
+
|
15
|
+
# Base class of the errors defined in this file. Besides the message it
# can carry the path and line number the error refers to.
class Error < StandardError

  attr_reader :path, :lineno

  # msg    - the error message.
  # path   - optional path to report in #to_s.
  # lineno - optional line number to report in #to_s.
  def initialize(msg, path = nil, lineno = nil)
    super(msg)
    @path = path
    @lineno = lineno
  end

  # Returns "path:lineno:message" when both a path and a line number
  # were given; otherwise returns the plain message.
  def to_s
    text = super
    return text unless @lineno && @path
    "#{@path}:#{@lineno}:#{text}"
  end

end
|
34
|
+
|
35
|
+
# Raised by Visitor#parse_error.
class ParseError < Error
end

# Raised by Visitor#wellformed_error.
class NotWellFormedError < Error
end

# Raised by Visitor#valid_error.
class NotValidError < Error
end
|
38
|
+
|
39
|
+
|
40
|
+
# Callback interface with default implementations.
#
# The three error reporters raise the matching exception class; every
# other method is a no-op that returns nil, so an including class only
# has to override the callbacks it is interested in.
module Visitor

  # --- error reporting ---------------------------------------------

  # Raises ParseError with the given message.
  def parse_error(msg)
    raise ParseError, msg
  end

  # Raises NotWellFormedError with the given message.
  def wellformed_error(msg)
    raise NotWellFormedError, msg
  end

  # Raises NotValidError with the given message.
  def valid_error(msg)
    raise NotValidError, msg
  end

  # Warnings are ignored by default.
  def warning(msg); end

  # --- XML declaration ---------------------------------------------

  def on_xmldecl; end

  def on_xmldecl_key(key, str); end

  def on_xmldecl_version(str); end

  def on_xmldecl_encoding(str); end

  def on_xmldecl_standalone(str); end

  def on_xmldecl_other(name, value); end

  def on_xmldecl_end; end

  # --- prolog and miscellaneous markup -----------------------------

  def on_doctype(root, pubid, sysid); end

  def on_prolog_space(str); end

  def on_comment(str); end

  def on_pi(target, pi); end

  # --- character content and references ----------------------------

  def on_chardata(str); end

  def on_cdata(str); end

  def on_etag(name); end

  def on_entityref(ref); end

  def on_charref(code); end

  def on_charref_hex(code); end

  # --- document boundaries -----------------------------------------

  def on_start_document; end

  def on_end_document; end

  # --- start tags and attributes -----------------------------------

  def on_stag(name); end

  def on_attribute(name); end

  def on_attr_value(str); end

  def on_attr_entityref(ref); end

  def on_attr_charref(code); end

  def on_attr_charref_hex(code); end

  def on_attribute_end(name); end

  def on_stag_end_empty(name); end

  def on_stag_end(name); end

end
|
142
|
+
|
143
|
+
|
144
|
+
# Wrapper around another visitor: every public instance method that
# Visitor provides is redefined here to forward its arguments to the
# wrapped visitor and return whatever that call returns.
class Decoration

  include Visitor

  # visitor - the object that receives every forwarded call.
  def initialize(visitor)
    @visitor = visitor
  end

  # Generate one forwarding method per Visitor method. The methods are
  # built from source text so that each one is an ordinary `def' making
  # a direct call; __FILE__/__LINE__ keep backtraces pointing here.
  Visitor.instance_methods.each do |handler|
    class_eval <<-END, __FILE__, __LINE__ + 1
      def #{handler}(*args)
        @visitor.#{handler}(*args)
      end
    END
  end

end
|
161
|
+
|
162
|
+
end
|
@@ -0,0 +1,248 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
#
|
3
|
+
# xmlscan/xmlchar.rb
|
4
|
+
#
|
5
|
+
# Copyright (C) Ueno Katsuhiro 2002
|
6
|
+
#
|
7
|
+
# $Id: xmlchar.rb,v 1.5.2.2 2003/05/01 14:25:55 katsu Exp $
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'xmlscan/scanner'
|
11
|
+
|
12
|
+
|
13
|
+
module XMLScan
|
14
|
+
|
15
|
+
ENC_UTF8 = Encoding.find('UTF-8')
|
16
|
+
|
17
|
+
# Character-level validity checks for XML text, names and declaration
# values.
#
# Most predicates are registered with module_function, so they can be
# called as XMLChar.valid_name?(...) as well as from an including
# module. valid_pubid? is only available as an instance method.
module XMLChar

  # Matches a whole string that contains only characters outside the
  # Unicode general category C ("Other"), plus tab, LF and CR.
  CharPattern = /\A[\P{C}\t\n\r]*\z/u
  # Matches a single character that CharPattern does not allow.
  NotCharPattern = /[^\P{C}\t\n\r]/u

  # Name characters are the same set NamePattern accepts after the
  # first character: ':', '_', '-', '.' and alphanumerics.
  NmtokenPattern = /\A[\:\_\-\.\p{Alnum}]+\z/u
  # Matches a single character that is not a name character.
  NotNameCharPattern = /[^\:\_\-\.\p{Alnum}]/u

  # A name starts with ':', '_' or a letter, followed by any number of
  # name characters.
  NamePattern = /\A[\:\_\p{Letter}][\:\_\-\.\p{Alnum}]*\z/u

  # Returns true if the integer code point +code+ denotes a character
  # accepted by CharPattern. Code points that cannot be encoded as a
  # valid UTF-8 character (negative, above 0x10ffff, or in the
  # surrogate range) are rejected up front instead of being handed to
  # pack / the regexp engine.
  def valid_char?(code)
    return false if code < 0 || code > 0x10ffff
    return false if code >= 0xd800 && code <= 0xdfff
    NotCharPattern !~ [code].pack('U')
  end

  # Returns true if +str+ contains no character rejected by
  # NotCharPattern.
  def valid_chardata?(str)
    NotCharPattern !~ str
  end

  # Returns true if +str+ contains no character other than name
  # characters. Note that an empty string also passes this check.
  def valid_nmtoken?(str)
    NotNameCharPattern !~ str
  end

  # Returns true if the whole of +str+ matches NamePattern.
  def valid_name?(str)
    !(NamePattern !~ str)
  end

  module_function :valid_char?, :valid_chardata?
  module_function :valid_nmtoken?, :valid_name?


  # Returns true if +str+ consists only of the characters permitted in
  # a public identifier (space, CR, LF, ASCII letters and digits, and
  # the punctuation listed in the pattern).
  def valid_pubid?(str)
    /[^\- \r\na-zA-Z0-9'()+,.\/:=?;!*#\@$_%]/u !~ str
  end


  # Returns true if +str+ consists only of ASCII letters, digits,
  # '-', '_', '.' and ':'.
  def valid_version?(str)
    /[^\-a-zA-Z0-9_.:]/u !~ str
  end
  module_function :valid_version?


  # Returns true if +str+ starts with an ASCII letter and continues
  # with ASCII letters, digits, '-', '.' or '_' only; false otherwise.
  def valid_encoding?(str)
    (/\A[A-Za-z]([\-A-Za-z0-9._])*\z/u =~ str) ? true : false
  end
  module_function :valid_encoding?

end
|
69
|
+
|
70
|
+
|
71
|
+
|
72
|
+
|
73
|
+
class XMLScanner

  # Mixin that validates names, character data and character
  # references before passing each event on with +super+.
  # It is installed on a scanner instance by apply_option_strict_char.
  #
  # The check_valid_* helpers report a failure through parse_error
  # (wellformed_error for character references), both of which must be
  # provided by the object this module is mixed into.
  module StrictChar

    include XMLChar

    private

    # Reports a parse error unless +name+ is a valid XML name.
    def check_valid_name(name)
      parse_error "`#{name}' is not valid for XML name" unless valid_name? name
    end

    # Reports a parse error if +str+ contains a character that is not
    # allowed by XMLChar#valid_chardata?.
    def check_valid_chardata(str)
      parse_error "invalid XML character is found" unless valid_chardata? str
    end

    # Reports a well-formedness error unless the code point +code+
    # is a valid XML character.
    def check_valid_char(code)
      wellformed_error "#{code} is not a valid XML character" unless valid_char? code
    end

    # Reports a parse error unless +str+ is a valid version string.
    def check_valid_version(str)
      parse_error "#{str} is not a valid XML version" unless valid_version? str
    end

    # Reports a parse error unless +str+ is a valid encoding name.
    def check_valid_encoding(str)
      parse_error "#{str} is not a valid XML encoding name" unless valid_encoding? str
    end

    # Reports a parse error unless +str+ is a valid public identifier.
    def check_valid_pubid(str)
      parse_error "#{str} is not a valid public ID" unless valid_pubid? str
    end


    # Each handler below runs its checks first and then continues with
    # the overridden implementation, forwarding the same arguments.

    def on_xmldecl_version(str)
      check_valid_version str
      super
    end

    def on_xmldecl_encoding(str)
      check_valid_encoding str
      super
    end

    def on_xmldecl_standalone(str)
      check_valid_chardata str
      super
    end

    # pubid and sysid are only checked when they are given.
    def on_doctype(root, pubid, sysid)
      check_valid_name root
      check_valid_pubid pubid if pubid
      check_valid_chardata sysid if sysid
      super
    end

    def on_comment(str)
      check_valid_chardata str
      super
    end

    def on_pi(target, pi)
      check_valid_name target
      check_valid_chardata pi
      super
    end

    def on_chardata(str)
      check_valid_chardata str
      super
    end

    def on_cdata(str)
      check_valid_chardata str
      super
    end

    def on_etag(name)
      check_valid_name name
      super
    end

    def on_entityref(ref)
      check_valid_name ref
      super
    end

    def on_charref(code)
      check_valid_char code
      super
    end

    def on_charref_hex(code)
      check_valid_char code
      super
    end

    def on_stag(name)
      check_valid_name name
      super
    end

    def on_attribute(name)
      check_valid_name name
      super
    end

    def on_attr_value(str)
      check_valid_chardata str
      super
    end

    def on_attr_entityref(ref)
      check_valid_name ref
      super
    end

    def on_attr_charref(code)
      check_valid_char code
      super
    end

    def on_attr_charref_hex(code)
      check_valid_char code
      super
    end

  end


  private

  # Enables strict character checking on this scanner instance by
  # extending it with StrictChar.
  # NOTE(review): presumably invoked when the :strict_char option is
  # passed to the scanner -- confirm against the option handling in
  # xmlscan/scanner.
  def apply_option_strict_char
    extend StrictChar
  end

end
|
221
|
+
|
222
|
+
|
223
|
+
end
|
224
|
+
|
225
|
+
|
226
|
+
|
227
|
+
|
228
|
+
|
229
|
+
|
230
|
+
# Stand-alone benchmark: when this file is run directly, scan the input
# given through ARGF with strict character checking enabled and print
# the consumed user CPU time to STDERR.
if $0 == __FILE__ then
  # Visitor that prints errors (only in verbose mode) instead of raising,
  # so that scanning continues past them.
  class TestVisitor
    include XMLScan::Visitor
    def parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def wellformed_error(msg)
      STDERR.printf("%s:%d: WFC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
  end

  # $s gives TestVisitor access to the scanner's current path and lineno.
  $s = scan = XMLScan::XMLScanner.new(TestVisitor.new, :strict_char)
  src = ARGF
  # Give ARGF a #path method that returns the current file name.
  def src.path; filename; end
  # Process.times replaces the obsolete Time.times, which Ruby versions
  # providing Encoding (required by this file) no longer offer.
  t1 = Process.times.utime
  scan.parse src
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end
|
data/test.rb
ADDED
metadata
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: xmlscan
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.3
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- UENO Katsuhiro <katsu@blue.sky.or.jp>
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-02-13 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: &8077620 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 2.8.0
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *8077620
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rdoc
|
27
|
+
requirement: &8076660 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '3.12'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *8076660
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: bundler
|
38
|
+
requirement: &8075620 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.0.0
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *8075620
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: jeweler
|
49
|
+
requirement: &8074720 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.8.3
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *8074720
|
58
|
+
description: The fastest XML parser written in 100% pure Ruby.
|
59
|
+
email: gerryg@inbox.com
|
60
|
+
executables: []
|
61
|
+
extensions: []
|
62
|
+
extra_rdoc_files:
|
63
|
+
- README.rdoc
|
64
|
+
files:
|
65
|
+
- ChangeLog
|
66
|
+
- Gemfile
|
67
|
+
- Gemfile.lock
|
68
|
+
- README.rdoc
|
69
|
+
- Rakefile
|
70
|
+
- THANKS
|
71
|
+
- VERSION
|
72
|
+
- install.rb
|
73
|
+
- lib/xmlscan/htmlscan.rb
|
74
|
+
- lib/xmlscan/namespace.rb
|
75
|
+
- lib/xmlscan/parser.rb
|
76
|
+
- lib/xmlscan/scanner.rb
|
77
|
+
- lib/xmlscan/version.rb
|
78
|
+
- lib/xmlscan/visitor.rb
|
79
|
+
- lib/xmlscan/xmlchar.rb
|
80
|
+
- test.rb
|
81
|
+
homepage: http://github.com/GerryG/xmlformat/
|
82
|
+
licenses:
|
83
|
+
- MIT
|
84
|
+
post_install_message:
|
85
|
+
rdoc_options:
|
86
|
+
- --main
|
87
|
+
- README.rdoc
|
88
|
+
- --inline-source
|
89
|
+
- --line-numbers
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
none: false
|
94
|
+
requirements:
|
95
|
+
- - ! '>='
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
segments:
|
99
|
+
- 0
|
100
|
+
hash: 3268123461909302440
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
none: false
|
103
|
+
requirements:
|
104
|
+
- - ! '>='
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0'
|
107
|
+
requirements: []
|
108
|
+
rubyforge_project:
|
109
|
+
rubygems_version: 1.8.15
|
110
|
+
signing_key:
|
111
|
+
specification_version: 3
|
112
|
+
summary: The fastest XML parser written in 100% pure Ruby.
|
113
|
+
test_files: []
|