xmlscan 0.2.3 → 0.3.0preb
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +7 -4
- data/VERSION +1 -1
- data/lib/xmlscan/htmlscan.rb +7 -7
- data/lib/xmlscan/namespace.rb +33 -33
- data/lib/xmlscan/parser.rb +17 -13
- data/lib/xmlscan/processor.rb +109 -0
- data/lib/xmlscan/scanner.rb +118 -89
- data/lib/xmlscan/version.rb +4 -10
- data/lib/xmlscan/visitor.rb +31 -29
- data/lib/xmlscan/xmlchar.rb +18 -18
- metadata +15 -16
- data/install.rb +0 -41
- data/test.rb +0 -7
data/Rakefile
CHANGED
@@ -3,6 +3,9 @@
|
|
3
3
|
|
4
4
|
require 'rubygems'
|
5
5
|
require 'bundler'
|
6
|
+
require 'xmlscan/version'
|
7
|
+
|
8
|
+
VERSION = XMLScan::VERSION # File.exist?('VERSION') ? File.read('VERSION') : ""
|
6
9
|
|
7
10
|
begin
|
8
11
|
Bundler.setup(:default, :development)
|
@@ -15,10 +18,11 @@ end
|
|
15
18
|
require 'rake'
|
16
19
|
|
17
20
|
begin
|
21
|
+
include XMLScan
|
18
22
|
require 'jeweler'
|
19
23
|
Jeweler::Tasks.new do |gem|
|
20
|
-
gem.name =
|
21
|
-
gem.version =
|
24
|
+
gem.name = 'xmlscan'
|
25
|
+
gem.version = XMLScan::VERSION
|
22
26
|
gem.license = "MIT"
|
23
27
|
gem.summary = "The fastest XML parser written in 100% pure Ruby."
|
24
28
|
gem.email = "gerryg@inbox.com"
|
@@ -56,10 +60,9 @@ task :default => :spec
|
|
56
60
|
|
57
61
|
require 'rdoc/task'
|
58
62
|
Rake::RDocTask.new do |rdoc|
|
59
|
-
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
60
63
|
|
61
64
|
rdoc.rdoc_dir = 'rdoc'
|
62
|
-
rdoc.title = "xmlscan #{version}"
|
65
|
+
rdoc.title = "xmlscan #{VERSION}"
|
63
66
|
rdoc.rdoc_files.include('README*')
|
64
67
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
65
68
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.3
|
1
|
+
0.3.0preb
|
data/lib/xmlscan/htmlscan.rb
CHANGED
@@ -47,7 +47,7 @@ module XMLScan
|
|
47
47
|
raise "[BUG] this method must be never called"
|
48
48
|
end
|
49
49
|
|
50
|
-
def on_stag_end_empty(name)
|
50
|
+
def on_stag_end_empty(name, *a)
|
51
51
|
raise "[BUG] this method must be never called"
|
52
52
|
end
|
53
53
|
|
@@ -127,7 +127,7 @@ module XMLScan
|
|
127
127
|
return found_empty_stag
|
128
128
|
else
|
129
129
|
parse_error "parse error at `<'"
|
130
|
-
return on_chardata
|
130
|
+
return on_chardata '<'
|
131
131
|
end
|
132
132
|
end
|
133
133
|
on_stag name
|
@@ -142,7 +142,7 @@ module XMLScan
|
|
142
142
|
if @src.close_tag then
|
143
143
|
s << '>'
|
144
144
|
end
|
145
|
-
return on_chardata
|
145
|
+
return on_chardata '<'+s
|
146
146
|
end
|
147
147
|
on_stag name
|
148
148
|
begin
|
@@ -156,9 +156,9 @@ module XMLScan
|
|
156
156
|
qmark = val.slice!(0,1)
|
157
157
|
if val[-1] == qmark[0] then
|
158
158
|
val.chop!
|
159
|
-
|
159
|
+
scan_attr_value val unless val.empty?
|
160
160
|
else
|
161
|
-
|
161
|
+
scan_attr_value val unless val.empty?
|
162
162
|
begin
|
163
163
|
s = @src.get
|
164
164
|
unless s then
|
@@ -167,8 +167,8 @@ module XMLScan
|
|
167
167
|
end
|
168
168
|
c = s[0]
|
169
169
|
val, s = s.split(qmark, 2)
|
170
|
-
|
171
|
-
|
170
|
+
scan_attr_value '>' unless c == ?< or c == ?>
|
171
|
+
scan_attr_value val if c
|
172
172
|
end until s
|
173
173
|
continue = s
|
174
174
|
end
|
data/lib/xmlscan/namespace.rb
CHANGED
@@ -54,16 +54,16 @@ module XMLScan
|
|
54
54
|
# on_stag_end_empty_ns ('foo:bar', { 'foo' => '', ... })
|
55
55
|
#
|
56
56
|
|
57
|
-
def on_stag_ns(qname, prefix, localpart)
|
57
|
+
def on_stag_ns(qname, prefix, localpart, *a)
|
58
58
|
end
|
59
59
|
|
60
|
-
def on_attribute_ns(qname, prefix, localpart)
|
60
|
+
def on_attribute_ns(qname, prefix, localpart, *a)
|
61
61
|
end
|
62
62
|
|
63
|
-
def on_stag_end_ns(qname, namespaces)
|
63
|
+
def on_stag_end_ns(qname, namespaces, *a)
|
64
64
|
end
|
65
65
|
|
66
|
-
def on_stag_end_empty_ns(qname, namespaces)
|
66
|
+
def on_stag_end_empty_ns(qname, namespaces, *a)
|
67
67
|
end
|
68
68
|
|
69
69
|
end
|
@@ -99,7 +99,7 @@ module XMLScan
|
|
99
99
|
end
|
100
100
|
|
101
101
|
|
102
|
-
def on_start_document
|
102
|
+
def on_start_document(*a)
|
103
103
|
@namespace = {} #PredefinedNamespace.dup
|
104
104
|
@ns_hist = []
|
105
105
|
@ns_undeclared = {} # for checking undeclared namespace prefixes.
|
@@ -107,14 +107,14 @@ module XMLScan
|
|
107
107
|
@dont_same = [] # ditto.
|
108
108
|
@xmlns = NamespaceDeclaration.new(self)
|
109
109
|
@orig_visitor = @visitor
|
110
|
-
@visitor.on_start_document
|
110
|
+
@visitor.on_start_document *a
|
111
111
|
end
|
112
112
|
|
113
113
|
|
114
|
-
def on_stag(name)
|
114
|
+
def on_stag(name, *a)
|
115
115
|
@ns_hist.push nil
|
116
116
|
unless /:/n =~ name then
|
117
|
-
@visitor.on_stag_ns name, '', name
|
117
|
+
@visitor.on_stag_ns name, '', name, *a
|
118
118
|
else
|
119
119
|
prefix, localpart = $`, $'
|
120
120
|
if localpart.include? ?: then
|
@@ -131,12 +131,12 @@ module XMLScan
|
|
131
131
|
@ns_undeclared[prefix] = true
|
132
132
|
end
|
133
133
|
end
|
134
|
-
@visitor.on_stag_ns name, prefix, localpart
|
134
|
+
@visitor.on_stag_ns name, prefix, localpart, *a
|
135
135
|
end
|
136
136
|
end
|
137
137
|
|
138
138
|
|
139
|
-
def on_attribute(name)
|
139
|
+
def on_attribute(name, *a)
|
140
140
|
if /:/n =~ name then
|
141
141
|
prefix, localpart = $`, $'
|
142
142
|
if localpart.include? ?: then
|
@@ -157,13 +157,13 @@ module XMLScan
|
|
157
157
|
@dont_same.push [ prev, prefix, localpart ]
|
158
158
|
end
|
159
159
|
@prev_prefix[localpart] = prefix
|
160
|
-
@visitor.on_attribute_ns name, prefix, localpart
|
160
|
+
@visitor.on_attribute_ns name, prefix, localpart, *a
|
161
161
|
end
|
162
162
|
elsif name == 'xmlns' then
|
163
163
|
@visitor = @xmlns
|
164
164
|
@xmlns.on_xmlns_start ''
|
165
165
|
else
|
166
|
-
@visitor.on_attribute_ns name, nil, name
|
166
|
+
@visitor.on_attribute_ns name, nil, name, *a
|
167
167
|
end
|
168
168
|
end
|
169
169
|
|
@@ -176,36 +176,36 @@ module XMLScan
|
|
176
176
|
@parent = parent
|
177
177
|
end
|
178
178
|
|
179
|
-
def on_xmlns_start(prefix)
|
179
|
+
def on_xmlns_start(prefix, *a)
|
180
180
|
@prefix = prefix
|
181
181
|
@nsdecl = ''
|
182
182
|
end
|
183
183
|
|
184
|
-
def on_attr_value(str)
|
184
|
+
def on_attr_value(str, *a)
|
185
185
|
@nsdecl << str
|
186
186
|
end
|
187
187
|
|
188
|
-
def on_attr_entityref(ref)
|
188
|
+
def on_attr_entityref(ref, *a)
|
189
189
|
@parent.ns_wellformed_error \
|
190
190
|
"xmlns includes undeclared entity reference"
|
191
191
|
end
|
192
192
|
|
193
|
-
def on_attr_charref(code)
|
193
|
+
def on_attr_charref(code, *a)
|
194
194
|
@nsdecl << [code].pack('U')
|
195
195
|
end
|
196
196
|
|
197
|
-
def on_attr_charref_hex(code)
|
197
|
+
def on_attr_charref_hex(code, *a)
|
198
198
|
@nsdecl << [code].pack('U')
|
199
199
|
end
|
200
200
|
|
201
|
-
def on_attribute_end(name)
|
201
|
+
def on_attribute_end(name, *a)
|
202
202
|
@parent.on_xmlns_end @prefix, @nsdecl
|
203
203
|
end
|
204
204
|
|
205
205
|
end
|
206
206
|
|
207
207
|
|
208
|
-
def on_xmlns_end(prefix, uri)
|
208
|
+
def on_xmlns_end(prefix, uri, *a)
|
209
209
|
@visitor = @orig_visitor
|
210
210
|
if PredefinedNamespace.key? prefix then
|
211
211
|
if prefix == 'xmlns' then
|
@@ -254,54 +254,54 @@ module XMLScan
|
|
254
254
|
end
|
255
255
|
|
256
256
|
|
257
|
-
def on_stag_end(name)
|
257
|
+
def on_stag_end(name, *a)
|
258
258
|
fix_namespace
|
259
|
-
@visitor.on_stag_end_ns name, @namespace
|
259
|
+
@visitor.on_stag_end_ns name, @namespace, *a
|
260
260
|
end
|
261
261
|
|
262
262
|
|
263
|
-
def on_etag(name)
|
263
|
+
def on_etag(name, *a)
|
264
264
|
h = @ns_hist.pop and @namespace.update h
|
265
|
-
@visitor.on_etag name
|
265
|
+
@visitor.on_etag name, *a
|
266
266
|
end
|
267
267
|
|
268
268
|
|
269
|
-
def on_stag_end_empty(name)
|
269
|
+
def on_stag_end_empty(name, *a)
|
270
270
|
fix_namespace
|
271
|
-
@visitor.on_stag_end_empty_ns name, @namespace
|
271
|
+
@visitor.on_stag_end_empty_ns name, @namespace, *a
|
272
272
|
h = @ns_hist.pop and @namespace.update h
|
273
273
|
end
|
274
274
|
|
275
275
|
|
276
|
-
def on_doctype(root, pubid, sysid)
|
276
|
+
def on_doctype(root, pubid, sysid, *a)
|
277
277
|
if root.count(':') > 1 then
|
278
278
|
ns_parse_error "qualified name `#{root}' includes `:'"
|
279
279
|
end
|
280
|
-
@visitor.on_doctype root, pubid, sysid
|
280
|
+
@visitor.on_doctype root, pubid, sysid, *a
|
281
281
|
end
|
282
282
|
|
283
283
|
|
284
|
-
def on_pi(target, pi)
|
284
|
+
def on_pi(target, pi, *a)
|
285
285
|
if target.include? ?: then
|
286
286
|
ns_parse_error "PI target `#{target}' includes `:'"
|
287
287
|
end
|
288
|
-
@visitor.on_pi target, pi
|
288
|
+
@visitor.on_pi target, pi, *a
|
289
289
|
end
|
290
290
|
|
291
291
|
|
292
|
-
def on_entityref(ref)
|
292
|
+
def on_entityref(ref, *a)
|
293
293
|
if ref.include? ?: then
|
294
294
|
ns_parse_error "entity reference `#{ref}' includes `:'"
|
295
295
|
end
|
296
|
-
@visitor.on_entityref ref
|
296
|
+
@visitor.on_entityref ref, *a
|
297
297
|
end
|
298
298
|
|
299
299
|
|
300
|
-
def on_attr_entityref(ref)
|
300
|
+
def on_attr_entityref(ref, *a)
|
301
301
|
if ref.include? ?: then
|
302
302
|
ns_parse_error "entity reference `#{ref}' includes `:'"
|
303
303
|
end
|
304
|
-
@visitor.on_attr_entityref ref
|
304
|
+
@visitor.on_attr_entityref ref, *a
|
305
305
|
end
|
306
306
|
|
307
307
|
end
|
data/lib/xmlscan/parser.rb
CHANGED
@@ -43,7 +43,7 @@ module XMLScan
|
|
43
43
|
|
44
44
|
private
|
45
45
|
|
46
|
-
def on_xmldecl_version(str)
|
46
|
+
def on_xmldecl_version(str, *a)
|
47
47
|
unless str == '1.0' then
|
48
48
|
warning "unsupported XML version `#{str}'"
|
49
49
|
end
|
@@ -51,7 +51,7 @@ module XMLScan
|
|
51
51
|
end
|
52
52
|
|
53
53
|
|
54
|
-
def on_xmldecl_standalone(str)
|
54
|
+
def on_xmldecl_standalone(str, *a)
|
55
55
|
if str == 'yes' then
|
56
56
|
@standalone = true
|
57
57
|
elsif str == 'no' then
|
@@ -63,7 +63,7 @@ module XMLScan
|
|
63
63
|
end
|
64
64
|
|
65
65
|
|
66
|
-
def on_doctype(name, pubid, sysid)
|
66
|
+
def on_doctype(name, pubid, sysid, *a)
|
67
67
|
if pubid and not sysid then
|
68
68
|
parse_error "public external ID must have both public ID and system ID"
|
69
69
|
end
|
@@ -71,12 +71,12 @@ module XMLScan
|
|
71
71
|
end
|
72
72
|
|
73
73
|
|
74
|
-
def on_prolog_space(s)
|
74
|
+
def on_prolog_space(s, *a)
|
75
75
|
# just ignore it.
|
76
76
|
end
|
77
77
|
|
78
78
|
|
79
|
-
def on_pi(target, pi)
|
79
|
+
def on_pi(target, pi, *a)
|
80
80
|
if target.downcase == 'xml' then
|
81
81
|
parse_error "reserved PI target `#{target}'"
|
82
82
|
end
|
@@ -114,39 +114,43 @@ module XMLScan
|
|
114
114
|
#end
|
115
115
|
|
116
116
|
|
117
|
-
def on_stag(name)
|
117
|
+
def on_stag(name, *a)
|
118
118
|
@elem.push name
|
119
119
|
@visitor.on_stag name
|
120
120
|
@attr.clear
|
121
121
|
end
|
122
122
|
|
123
|
-
def on_attribute(name)
|
123
|
+
def on_attribute(name, *a)
|
124
124
|
unless @attr.check_unique name then
|
125
125
|
wellformed_error "doubled attribute `#{name}'"
|
126
126
|
end
|
127
127
|
@visitor.on_attribute name
|
128
128
|
end
|
129
129
|
|
130
|
-
def on_attr_value(str)
|
130
|
+
def on_attr_value(str, *a)
|
131
131
|
str.tr! "\t\r\n", ' ' # normalize
|
132
132
|
@visitor.on_attr_value str
|
133
133
|
end
|
134
134
|
|
135
|
-
def on_stag_end_empty(name)
|
135
|
+
def on_stag_end(name, *a)
|
136
|
+
@visitor.on_stag_end name, *a
|
137
|
+
end
|
138
|
+
|
139
|
+
def on_stag_end_empty(name, *a)
|
136
140
|
# @visitor.on_stag_end name
|
137
141
|
# @elem.pop
|
138
142
|
# @visitor.on_etag name
|
139
|
-
@visitor.on_stag_end_empty name
|
143
|
+
@visitor.on_stag_end_empty name, *a
|
140
144
|
@elem.pop
|
141
145
|
end
|
142
146
|
|
143
|
-
def on_etag(name)
|
147
|
+
def on_etag(name, *a)
|
144
148
|
last = @elem.pop
|
145
149
|
if last == name then
|
146
|
-
@visitor.on_etag name
|
150
|
+
@visitor.on_etag name, *a
|
147
151
|
elsif last then
|
148
152
|
wellformed_error "element type `#{name}' is not matched"
|
149
|
-
@visitor.on_etag last
|
153
|
+
@visitor.on_etag last, *a
|
150
154
|
else
|
151
155
|
parse_error "end tag `#{name}' appears alone"
|
152
156
|
end
|
data/lib/xmlscan/processor.rb
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'xmlscan/parser'
|
3
|
+
require 'xmlscan/visitor'
|
4
|
+
|
5
|
+
module XMLScan
|
6
|
+
module ElementProcessor
|
7
|
+
include XMLScan::Visitor
|
8
|
+
|
9
|
+
SKIP = [:on_chardata, :on_stag, :on_etag, :on_attribute, :on_attr_entityref,
|
10
|
+
:on_attr_value, :on_start_document, :on_end_document, :on_attribute_end,
|
11
|
+
:on_stag_end, :on_stag_end_empty, :on_attr_charref, :on_attr_charref_hex]
|
12
|
+
|
13
|
+
MY_METHODS = XMLScan::Visitor.instance_methods.to_a - SKIP
|
14
|
+
|
15
|
+
def initialize(opts={}, mod=nil)
|
16
|
+
raise "No module" unless mod
|
17
|
+
STDERR << "init Element Processer #{mod}\n"
|
18
|
+
(MY_METHODS - mod.instance_methods).each do |i|
|
19
|
+
self.class.class_eval %{def #{i}(d, *a) d&&(@out << d) end}, __FILE__, __LINE__
|
20
|
+
end
|
21
|
+
self.class.send :include, mod
|
22
|
+
|
23
|
+
@element = opts[:element] || raise("need an element")
|
24
|
+
@key = opts[:key] || raise("need a key")
|
25
|
+
@extras = (ex = opts[:extras]) ? ex.map(&:to_sym) : []
|
26
|
+
|
27
|
+
@pairs = [] # output [name, content, value] * 1 or more
|
28
|
+
@context = '' # current key(name) of the element (card)
|
29
|
+
@stack = [] # stack of containing context cards
|
30
|
+
@out = [] # current output for name(card)
|
31
|
+
@parser = XMLScan::XMLParser.new(self)
|
32
|
+
self
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
class XMLProcessor
|
38
|
+
include ElementProcessor
|
39
|
+
|
40
|
+
def self.process(io, opts={}, mod=nil)
|
41
|
+
mod ||= ElementProcessing
|
42
|
+
STDERR << "process #{io.inspect}, #{opts.inspect}\n"
|
43
|
+
io = case io
|
44
|
+
when String; open(io)
|
45
|
+
when IO, StringIO; io
|
46
|
+
else raise "bad type file input #{io.inspect}"
|
47
|
+
end
|
48
|
+
|
49
|
+
visitor = new(opts, mod)
|
50
|
+
visitor.parser.parse(io)
|
51
|
+
visitor.pairs
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
|
56
|
+
module ElementProcessing
|
57
|
+
def on_chardata(s) @out << s end
|
58
|
+
def on_stag_end(name, s, h, *a)
|
59
|
+
if name.to_sym == @element
|
60
|
+
# starting a new context, first output our substitute string
|
61
|
+
key= h&&h[@key.to_s]||'*no-name*'
|
62
|
+
@tmpl = ":transclude|{{:name}}" # def: "{{:key}}"
|
63
|
+
STDERR << "templ #{@tmpl.inspect}\n"
|
64
|
+
#STDERR << "x> #{x.inspect}, #{h.inspect}, #{(!(/:\w[\w\d]*/ =~ x)) || h[$&[1..-1].to_s] }\n"
|
65
|
+
sub =
|
66
|
+
@tmpl.split('|').find {|x| !(/:\w[\w\d]*/ =~ x) ||
|
67
|
+
h[$&[1..-1].to_s] }.gsub(/:\w[\w\d]*/) {|m|
|
68
|
+
STDERR << "templ sub match #{m.inspect}, #{h[m[1..-1]]}\n"
|
69
|
+
h[m[1..-1]] }
|
70
|
+
#sub = h['transclude'] || "{{#{key}}}"
|
71
|
+
@out << sub
|
72
|
+
# then push the current context and initialize this one
|
73
|
+
@stack.push([@context, @out, *@ex])
|
74
|
+
@context = key; @out = []
|
75
|
+
@ex = @extras.map {|e| h[e]}
|
76
|
+
else @out << s end # pass through tags we aren't processing
|
77
|
+
end
|
78
|
+
|
79
|
+
def on_etag(name, s=nil)
|
80
|
+
if name.to_sym == @element
|
81
|
+
# output a card (name, content, type)
|
82
|
+
@pairs << [@context, @out, @stack[-1][0], *@ex]
|
83
|
+
# restore previous context from stack
|
84
|
+
last = @stack.pop
|
85
|
+
@context, @out, @ex = last.shift, last.shift, *last
|
86
|
+
else @out << s end
|
87
|
+
end
|
88
|
+
|
89
|
+
def on_stag_empty_end(name, s=nil, h={}, *a)
|
90
|
+
if name.to_sym == @element
|
91
|
+
|
92
|
+
key= h&&h[@key.to_s]||'*no-name*'
|
93
|
+
ex = @extras.map {|e| h[e]}
|
94
|
+
@pairs << [key, [], @context, *ex]
|
95
|
+
else @out << s end
|
96
|
+
end
|
97
|
+
|
98
|
+
attr_reader :pairs, :parser
|
99
|
+
end
|
100
|
+
|
101
|
+
end
|
102
|
+
ARGV.each do |a|
|
103
|
+
pairs = XMLScan::XMLProcessor.process(a, {:key=>:name, :element=>:card, :extras=>[:type]}, XMLScan::ElementProcessing)
|
104
|
+
STDOUT << "Result\n"
|
105
|
+
STDOUT << pairs.map do |p| n,o,c,t = p
|
106
|
+
"#{c&&c.size>0&&"#{c}::"||''}#{n}#{t&&"[#{t}]"}=>#{o*''}"
|
107
|
+
end * "\n"
|
108
|
+
STDOUT << "\nDone\n"
|
109
|
+
end
|