xmlscan 0.2.3 → 0.3.0preb
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +7 -4
- data/VERSION +1 -1
- data/lib/xmlscan/htmlscan.rb +7 -7
- data/lib/xmlscan/namespace.rb +33 -33
- data/lib/xmlscan/parser.rb +17 -13
- data/lib/xmlscan/processor.rb +109 -0
- data/lib/xmlscan/scanner.rb +118 -89
- data/lib/xmlscan/version.rb +4 -10
- data/lib/xmlscan/visitor.rb +31 -29
- data/lib/xmlscan/xmlchar.rb +18 -18
- metadata +15 -16
- data/install.rb +0 -41
- data/test.rb +0 -7
data/Rakefile
CHANGED
@@ -3,6 +3,9 @@
|
|
3
3
|
|
4
4
|
require 'rubygems'
|
5
5
|
require 'bundler'
|
6
|
+
require 'xmlscan/version'
|
7
|
+
|
8
|
+
VERSION = XMLScan::VERSION # File.exist?('VERSION') ? File.read('VERSION') : ""
|
6
9
|
|
7
10
|
begin
|
8
11
|
Bundler.setup(:default, :development)
|
@@ -15,10 +18,11 @@ end
|
|
15
18
|
require 'rake'
|
16
19
|
|
17
20
|
begin
|
21
|
+
include XMLScan
|
18
22
|
require 'jeweler'
|
19
23
|
Jeweler::Tasks.new do |gem|
|
20
|
-
gem.name =
|
21
|
-
gem.version =
|
24
|
+
gem.name = 'xmlscan'
|
25
|
+
gem.version = XMLScan::VERSION
|
22
26
|
gem.license = "MIT"
|
23
27
|
gem.summary = "The fastest XML parser written in 100% pure Ruby."
|
24
28
|
gem.email = "gerryg@inbox.com"
|
@@ -56,10 +60,9 @@ task :default => :spec
|
|
56
60
|
|
57
61
|
require 'rdoc/task'
|
58
62
|
Rake::RDocTask.new do |rdoc|
|
59
|
-
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
60
63
|
|
61
64
|
rdoc.rdoc_dir = 'rdoc'
|
62
|
-
rdoc.title = "xmlscan #{
|
65
|
+
rdoc.title = "xmlscan #{VERSION}"
|
63
66
|
rdoc.rdoc_files.include('README*')
|
64
67
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
65
68
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.3.0preb
|
data/lib/xmlscan/htmlscan.rb
CHANGED
@@ -47,7 +47,7 @@ module XMLScan
|
|
47
47
|
raise "[BUG] this method must be never called"
|
48
48
|
end
|
49
49
|
|
50
|
-
def on_stag_end_empty(name)
|
50
|
+
def on_stag_end_empty(name, *a)
|
51
51
|
raise "[BUG] this method must be never called"
|
52
52
|
end
|
53
53
|
|
@@ -127,7 +127,7 @@ module XMLScan
|
|
127
127
|
return found_empty_stag
|
128
128
|
else
|
129
129
|
parse_error "parse error at `<'"
|
130
|
-
return on_chardata
|
130
|
+
return on_chardata '<'
|
131
131
|
end
|
132
132
|
end
|
133
133
|
on_stag name
|
@@ -142,7 +142,7 @@ module XMLScan
|
|
142
142
|
if @src.close_tag then
|
143
143
|
s << '>'
|
144
144
|
end
|
145
|
-
return on_chardata
|
145
|
+
return on_chardata '<'+s
|
146
146
|
end
|
147
147
|
on_stag name
|
148
148
|
begin
|
@@ -156,9 +156,9 @@ module XMLScan
|
|
156
156
|
qmark = val.slice!(0,1)
|
157
157
|
if val[-1] == qmark[0] then
|
158
158
|
val.chop!
|
159
|
-
|
159
|
+
scan_attr_value val unless val.empty?
|
160
160
|
else
|
161
|
-
|
161
|
+
scan_attr_value val unless val.empty?
|
162
162
|
begin
|
163
163
|
s = @src.get
|
164
164
|
unless s then
|
@@ -167,8 +167,8 @@ module XMLScan
|
|
167
167
|
end
|
168
168
|
c = s[0]
|
169
169
|
val, s = s.split(qmark, 2)
|
170
|
-
|
171
|
-
|
170
|
+
scan_attr_value '>' unless c == ?< or c == ?>
|
171
|
+
scan_attr_value val if c
|
172
172
|
end until s
|
173
173
|
continue = s
|
174
174
|
end
|
data/lib/xmlscan/namespace.rb
CHANGED
@@ -54,16 +54,16 @@ module XMLScan
|
|
54
54
|
# on_stag_end_empty_ns ('foo:bar', { 'foo' => '', ... })
|
55
55
|
#
|
56
56
|
|
57
|
-
def on_stag_ns(qname, prefix, localpart)
|
57
|
+
def on_stag_ns(qname, prefix, localpart, *a)
|
58
58
|
end
|
59
59
|
|
60
|
-
def on_attribute_ns(qname, prefix, localpart)
|
60
|
+
def on_attribute_ns(qname, prefix, localpart, *a)
|
61
61
|
end
|
62
62
|
|
63
|
-
def on_stag_end_ns(qname, namespaces)
|
63
|
+
def on_stag_end_ns(qname, namespaces, *a)
|
64
64
|
end
|
65
65
|
|
66
|
-
def on_stag_end_empty_ns(qname, namespaces)
|
66
|
+
def on_stag_end_empty_ns(qname, namespaces, *a)
|
67
67
|
end
|
68
68
|
|
69
69
|
end
|
@@ -99,7 +99,7 @@ module XMLScan
|
|
99
99
|
end
|
100
100
|
|
101
101
|
|
102
|
-
def on_start_document
|
102
|
+
def on_start_document(*a)
|
103
103
|
@namespace = {} #PredefinedNamespace.dup
|
104
104
|
@ns_hist = []
|
105
105
|
@ns_undeclared = {} # for checking undeclared namespace prefixes.
|
@@ -107,14 +107,14 @@ module XMLScan
|
|
107
107
|
@dont_same = [] # ditto.
|
108
108
|
@xmlns = NamespaceDeclaration.new(self)
|
109
109
|
@orig_visitor = @visitor
|
110
|
-
@visitor.on_start_document
|
110
|
+
@visitor.on_start_document *a
|
111
111
|
end
|
112
112
|
|
113
113
|
|
114
|
-
def on_stag(name)
|
114
|
+
def on_stag(name, *a)
|
115
115
|
@ns_hist.push nil
|
116
116
|
unless /:/n =~ name then
|
117
|
-
@visitor.on_stag_ns name, '', name
|
117
|
+
@visitor.on_stag_ns name, '', name, *a
|
118
118
|
else
|
119
119
|
prefix, localpart = $`, $'
|
120
120
|
if localpart.include? ?: then
|
@@ -131,12 +131,12 @@ module XMLScan
|
|
131
131
|
@ns_undeclared[prefix] = true
|
132
132
|
end
|
133
133
|
end
|
134
|
-
@visitor.on_stag_ns name, prefix, localpart
|
134
|
+
@visitor.on_stag_ns name, prefix, localpart, *a
|
135
135
|
end
|
136
136
|
end
|
137
137
|
|
138
138
|
|
139
|
-
def on_attribute(name)
|
139
|
+
def on_attribute(name, *a)
|
140
140
|
if /:/n =~ name then
|
141
141
|
prefix, localpart = $`, $'
|
142
142
|
if localpart.include? ?: then
|
@@ -157,13 +157,13 @@ module XMLScan
|
|
157
157
|
@dont_same.push [ prev, prefix, localpart ]
|
158
158
|
end
|
159
159
|
@prev_prefix[localpart] = prefix
|
160
|
-
@visitor.on_attribute_ns name, prefix, localpart
|
160
|
+
@visitor.on_attribute_ns name, prefix, localpart, *a
|
161
161
|
end
|
162
162
|
elsif name == 'xmlns' then
|
163
163
|
@visitor = @xmlns
|
164
164
|
@xmlns.on_xmlns_start ''
|
165
165
|
else
|
166
|
-
@visitor.on_attribute_ns name, nil, name
|
166
|
+
@visitor.on_attribute_ns name, nil, name, *a
|
167
167
|
end
|
168
168
|
end
|
169
169
|
|
@@ -176,36 +176,36 @@ module XMLScan
|
|
176
176
|
@parent = parent
|
177
177
|
end
|
178
178
|
|
179
|
-
def on_xmlns_start(prefix)
|
179
|
+
def on_xmlns_start(prefix, *a)
|
180
180
|
@prefix = prefix
|
181
181
|
@nsdecl = ''
|
182
182
|
end
|
183
183
|
|
184
|
-
def on_attr_value(str)
|
184
|
+
def on_attr_value(str, *a)
|
185
185
|
@nsdecl << str
|
186
186
|
end
|
187
187
|
|
188
|
-
def on_attr_entityref(ref)
|
188
|
+
def on_attr_entityref(ref, *a)
|
189
189
|
@parent.ns_wellformed_error \
|
190
190
|
"xmlns includes undeclared entity reference"
|
191
191
|
end
|
192
192
|
|
193
|
-
def on_attr_charref(code)
|
193
|
+
def on_attr_charref(code, *a)
|
194
194
|
@nsdecl << [code].pack('U')
|
195
195
|
end
|
196
196
|
|
197
|
-
def on_attr_charref_hex(code)
|
197
|
+
def on_attr_charref_hex(code, *a)
|
198
198
|
@nsdecl << [code].pack('U')
|
199
199
|
end
|
200
200
|
|
201
|
-
def on_attribute_end(name)
|
201
|
+
def on_attribute_end(name, *a)
|
202
202
|
@parent.on_xmlns_end @prefix, @nsdecl
|
203
203
|
end
|
204
204
|
|
205
205
|
end
|
206
206
|
|
207
207
|
|
208
|
-
def on_xmlns_end(prefix, uri)
|
208
|
+
def on_xmlns_end(prefix, uri, *a)
|
209
209
|
@visitor = @orig_visitor
|
210
210
|
if PredefinedNamespace.key? prefix then
|
211
211
|
if prefix == 'xmlns' then
|
@@ -254,54 +254,54 @@ module XMLScan
|
|
254
254
|
end
|
255
255
|
|
256
256
|
|
257
|
-
def on_stag_end(name)
|
257
|
+
def on_stag_end(name, *a)
|
258
258
|
fix_namespace
|
259
|
-
@visitor.on_stag_end_ns name, @namespace
|
259
|
+
@visitor.on_stag_end_ns name, @namespace, *a
|
260
260
|
end
|
261
261
|
|
262
262
|
|
263
|
-
def on_etag(name)
|
263
|
+
def on_etag(name, *a)
|
264
264
|
h = @ns_hist.pop and @namespace.update h
|
265
|
-
@visitor.on_etag name
|
265
|
+
@visitor.on_etag name, *a
|
266
266
|
end
|
267
267
|
|
268
268
|
|
269
|
-
def on_stag_end_empty(name)
|
269
|
+
def on_stag_end_empty(name, *a)
|
270
270
|
fix_namespace
|
271
|
-
@visitor.on_stag_end_empty_ns name, @namespace
|
271
|
+
@visitor.on_stag_end_empty_ns name, @namespace, *a
|
272
272
|
h = @ns_hist.pop and @namespace.update h
|
273
273
|
end
|
274
274
|
|
275
275
|
|
276
|
-
def on_doctype(root, pubid, sysid)
|
276
|
+
def on_doctype(root, pubid, sysid, *a)
|
277
277
|
if root.count(':') > 1 then
|
278
278
|
ns_parse_error "qualified name `#{root}' includes `:'"
|
279
279
|
end
|
280
|
-
@visitor.on_doctype root, pubid, sysid
|
280
|
+
@visitor.on_doctype root, pubid, sysid, *a
|
281
281
|
end
|
282
282
|
|
283
283
|
|
284
|
-
def on_pi(target, pi)
|
284
|
+
def on_pi(target, pi, *a)
|
285
285
|
if target.include? ?: then
|
286
286
|
ns_parse_error "PI target `#{target}' includes `:'"
|
287
287
|
end
|
288
|
-
@visitor.on_pi target, pi
|
288
|
+
@visitor.on_pi target, pi, *a
|
289
289
|
end
|
290
290
|
|
291
291
|
|
292
|
-
def on_entityref(ref)
|
292
|
+
def on_entityref(ref, *a)
|
293
293
|
if ref.include? ?: then
|
294
294
|
ns_parse_error "entity reference `#{ref}' includes `:'"
|
295
295
|
end
|
296
|
-
@visitor.on_entityref ref
|
296
|
+
@visitor.on_entityref ref, *a
|
297
297
|
end
|
298
298
|
|
299
299
|
|
300
|
-
def on_attr_entityref(ref)
|
300
|
+
def on_attr_entityref(ref, *a)
|
301
301
|
if ref.include? ?: then
|
302
302
|
ns_parse_error "entity reference `#{ref}' includes `:'"
|
303
303
|
end
|
304
|
-
@visitor.on_attr_entityref ref
|
304
|
+
@visitor.on_attr_entityref ref, *a
|
305
305
|
end
|
306
306
|
|
307
307
|
end
|
data/lib/xmlscan/parser.rb
CHANGED
@@ -43,7 +43,7 @@ module XMLScan
|
|
43
43
|
|
44
44
|
private
|
45
45
|
|
46
|
-
def on_xmldecl_version(str)
|
46
|
+
def on_xmldecl_version(str, *a)
|
47
47
|
unless str == '1.0' then
|
48
48
|
warning "unsupported XML version `#{str}'"
|
49
49
|
end
|
@@ -51,7 +51,7 @@ module XMLScan
|
|
51
51
|
end
|
52
52
|
|
53
53
|
|
54
|
-
def on_xmldecl_standalone(str)
|
54
|
+
def on_xmldecl_standalone(str, *a)
|
55
55
|
if str == 'yes' then
|
56
56
|
@standalone = true
|
57
57
|
elsif str == 'no' then
|
@@ -63,7 +63,7 @@ module XMLScan
|
|
63
63
|
end
|
64
64
|
|
65
65
|
|
66
|
-
def on_doctype(name, pubid, sysid)
|
66
|
+
def on_doctype(name, pubid, sysid, *a)
|
67
67
|
if pubid and not sysid then
|
68
68
|
parse_error "public external ID must have both public ID and system ID"
|
69
69
|
end
|
@@ -71,12 +71,12 @@ module XMLScan
|
|
71
71
|
end
|
72
72
|
|
73
73
|
|
74
|
-
def on_prolog_space(s)
|
74
|
+
def on_prolog_space(s, *a)
|
75
75
|
# just ignore it.
|
76
76
|
end
|
77
77
|
|
78
78
|
|
79
|
-
def on_pi(target, pi)
|
79
|
+
def on_pi(target, pi, *a)
|
80
80
|
if target.downcase == 'xml' then
|
81
81
|
parse_error "reserved PI target `#{target}'"
|
82
82
|
end
|
@@ -114,39 +114,43 @@ module XMLScan
|
|
114
114
|
#end
|
115
115
|
|
116
116
|
|
117
|
-
def on_stag(name)
|
117
|
+
def on_stag(name, *a)
|
118
118
|
@elem.push name
|
119
119
|
@visitor.on_stag name
|
120
120
|
@attr.clear
|
121
121
|
end
|
122
122
|
|
123
|
-
def on_attribute(name)
|
123
|
+
def on_attribute(name, *a)
|
124
124
|
unless @attr.check_unique name then
|
125
125
|
wellformed_error "doubled attribute `#{name}'"
|
126
126
|
end
|
127
127
|
@visitor.on_attribute name
|
128
128
|
end
|
129
129
|
|
130
|
-
def on_attr_value(str)
|
130
|
+
def on_attr_value(str, *a)
|
131
131
|
str.tr! "\t\r\n", ' ' # normalize
|
132
132
|
@visitor.on_attr_value str
|
133
133
|
end
|
134
134
|
|
135
|
-
def
|
135
|
+
def on_stag_end(name, *a)
|
136
|
+
@visitor.on_stag_end name, *a
|
137
|
+
end
|
138
|
+
|
139
|
+
def on_stag_end_empty(name, *a)
|
136
140
|
# @visitor.on_stag_end name
|
137
141
|
# @elem.pop
|
138
142
|
# @visitor.on_etag name
|
139
|
-
@visitor.on_stag_end_empty name
|
143
|
+
@visitor.on_stag_end_empty name, *a
|
140
144
|
@elem.pop
|
141
145
|
end
|
142
146
|
|
143
|
-
def on_etag(name)
|
147
|
+
def on_etag(name, *a)
|
144
148
|
last = @elem.pop
|
145
149
|
if last == name then
|
146
|
-
@visitor.on_etag name
|
150
|
+
@visitor.on_etag name, *a
|
147
151
|
elsif last then
|
148
152
|
wellformed_error "element type `#{name}' is not matched"
|
149
|
-
@visitor.on_etag last
|
153
|
+
@visitor.on_etag last, *a
|
150
154
|
else
|
151
155
|
parse_error "end tag `#{name}' appears alone"
|
152
156
|
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'xmlscan/parser'
|
3
|
+
require 'xmlscan/visitor'
|
4
|
+
|
5
|
+
module XMLScan
|
6
|
+
module ElementProcessor
|
7
|
+
include XMLScan::Visitor
|
8
|
+
|
9
|
+
SKIP = [:on_chardata, :on_stag, :on_etag, :on_attribute, :on_attr_entityref,
|
10
|
+
:on_attr_value, :on_start_document, :on_end_document, :on_attribute_end,
|
11
|
+
:on_stag_end, :on_stag_end_empty, :on_attr_charref, :on_attr_charref_hex]
|
12
|
+
|
13
|
+
MY_METHODS = XMLScan::Visitor.instance_methods.to_a - SKIP
|
14
|
+
|
15
|
+
def initialize(opts={}, mod=nil)
|
16
|
+
raise "No module" unless mod
|
17
|
+
STDERR << "init Element Processer #{mod}\n"
|
18
|
+
(MY_METHODS - mod.instance_methods).each do |i|
|
19
|
+
self.class.class_eval %{def #{i}(d, *a) d&&(@out << d) end}, __FILE__, __LINE__
|
20
|
+
end
|
21
|
+
self.class.send :include, mod
|
22
|
+
|
23
|
+
@element = opts[:element] || raise("need an element")
|
24
|
+
@key = opts[:key] || raise("need a key")
|
25
|
+
@extras = (ex = opts[:extras]) ? ex.map(&:to_sym) : []
|
26
|
+
|
27
|
+
@pairs = [] # output [name, content, value] * 1 or more
|
28
|
+
@context = '' # current key(name) of the element (card)
|
29
|
+
@stack = [] # stack of containing context cards
|
30
|
+
@out = [] # current output for name(card)
|
31
|
+
@parser = XMLScan::XMLParser.new(self)
|
32
|
+
self
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
class XMLProcessor
|
38
|
+
include ElementProcessor
|
39
|
+
|
40
|
+
def self.process(io, opts={}, mod=nil)
|
41
|
+
mod ||= ElementProcessing
|
42
|
+
STDERR << "process #{io.inspect}, #{opts.inspect}\n"
|
43
|
+
io = case io
|
44
|
+
when String; open(io)
|
45
|
+
when IO, StringIO; io
|
46
|
+
else raise "bad type file input #{io.inspect}"
|
47
|
+
end
|
48
|
+
|
49
|
+
visitor = new(opts, mod)
|
50
|
+
visitor.parser.parse(io)
|
51
|
+
visitor.pairs
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
|
56
|
+
module ElementProcessing
|
57
|
+
def on_chardata(s) @out << s end
|
58
|
+
def on_stag_end(name, s, h, *a)
|
59
|
+
if name.to_sym == @element
|
60
|
+
# starting a new context, first output our substitute string
|
61
|
+
key= h&&h[@key.to_s]||'*no-name*'
|
62
|
+
@tmpl = ":transclude|{{:name}}" # def: "{{:key}}"
|
63
|
+
STDERR << "templ #{@tmpl.inspect}\n"
|
64
|
+
#STDERR << "x> #{x.inspect}, #{h.inspect}, #{(!(/:\w[\w\d]*/ =~ x)) || h[$&[1..-1].to_s] }\n"
|
65
|
+
sub =
|
66
|
+
@tmpl.split('|').find {|x| !(/:\w[\w\d]*/ =~ x) ||
|
67
|
+
h[$&[1..-1].to_s] }.gsub(/:\w[\w\d]*/) {|m|
|
68
|
+
STDERR << "templ sub match #{m.inspect}, #{h[m[1..-1]]}\n"
|
69
|
+
h[m[1..-1]] }
|
70
|
+
#sub = h['transclude'] || "{{#{key}}}"
|
71
|
+
@out << sub
|
72
|
+
# then push the current context and initialize this one
|
73
|
+
@stack.push([@context, @out, *@ex])
|
74
|
+
@context = key; @out = []
|
75
|
+
@ex = @extras.map {|e| h[e]}
|
76
|
+
else @out << s end # pass through tags we aren't processing
|
77
|
+
end
|
78
|
+
|
79
|
+
def on_etag(name, s=nil)
|
80
|
+
if name.to_sym == @element
|
81
|
+
# output a card (name, content, type)
|
82
|
+
@pairs << [@context, @out, @stack[-1][0], *@ex]
|
83
|
+
# restore previous context from stack
|
84
|
+
last = @stack.pop
|
85
|
+
@context, @out, @ex = last.shift, last.shift, *last
|
86
|
+
else @out << s end
|
87
|
+
end
|
88
|
+
|
89
|
+
def on_stag_empty_end(name, s=nil, h={}, *a)
|
90
|
+
if name.to_sym == @element
|
91
|
+
|
92
|
+
key= h&&h[@key.to_s]||'*no-name*'
|
93
|
+
ex = @extras.map {|e| h[e]}
|
94
|
+
@pairs << [key, [], @context, *ex]
|
95
|
+
else @out << s end
|
96
|
+
end
|
97
|
+
|
98
|
+
attr_reader :pairs, :parser
|
99
|
+
end
|
100
|
+
|
101
|
+
end
|
102
|
+
ARGV.each do |a|
|
103
|
+
pairs = XMLScan::XMLProcessor.process(a, {:key=>:name, :element=>:card, :extras=>[:type]}, XMLScan::ElementProcessing)
|
104
|
+
STDOUT << "Result\n"
|
105
|
+
STDOUT << pairs.map do |p| n,o,c,t = p
|
106
|
+
"#{c&&c.size>0&&"#{c}::"||''}#{n}#{t&&"[#{t}]"}=>#{o*''}"
|
107
|
+
end * "\n"
|
108
|
+
STDOUT << "\nDone\n"
|
109
|
+
end
|