xmlscan 0.2.3 → 0.3.0prec

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -3,6 +3,9 @@
3
3
 
4
4
  require 'rubygems'
5
5
  require 'bundler'
6
+ require 'xmlscan/version'
7
+
8
+ VERSION = XMLScan::VERSION # File.exist?('VERSION') ? File.read('VERSION') : ""
6
9
 
7
10
  begin
8
11
  Bundler.setup(:default, :development)
@@ -15,10 +18,11 @@ end
15
18
  require 'rake'
16
19
 
17
20
  begin
21
+ include XMLScan
18
22
  require 'jeweler'
19
23
  Jeweler::Tasks.new do |gem|
20
- gem.name = "xmlscan"
21
- gem.version = '0.2.3'
24
+ gem.name = 'xmlscan'
25
+ gem.version = XMLScan::VERSION
22
26
  gem.license = "MIT"
23
27
  gem.summary = "The fastest XML parser written in 100% pure Ruby."
24
28
  gem.email = "gerryg@inbox.com"
@@ -56,10 +60,9 @@ task :default => :spec
56
60
 
57
61
  require 'rdoc/task'
58
62
  Rake::RDocTask.new do |rdoc|
59
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
60
63
 
61
64
  rdoc.rdoc_dir = 'rdoc'
62
- rdoc.title = "xmlscan #{version}"
65
+ rdoc.title = "xmlscan #{VERSION}"
63
66
  rdoc.rdoc_files.include('README*')
64
67
  rdoc.rdoc_files.include('lib/**/*.rb')
65
68
  end
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.3
1
+ 0.3.0prec
@@ -47,7 +47,7 @@ module XMLScan
47
47
  raise "[BUG] this method must be never called"
48
48
  end
49
49
 
50
- def on_stag_end_empty(name)
50
+ def on_stag_end_empty(name, *a)
51
51
  raise "[BUG] this method must be never called"
52
52
  end
53
53
 
@@ -127,7 +127,7 @@ module XMLScan
127
127
  return found_empty_stag
128
128
  else
129
129
  parse_error "parse error at `<'"
130
- return on_chardata('<')
130
+ return on_chardata '<'
131
131
  end
132
132
  end
133
133
  on_stag name
@@ -142,7 +142,7 @@ module XMLScan
142
142
  if @src.close_tag then
143
143
  s << '>'
144
144
  end
145
- return on_chardata('<' << s)
145
+ return on_chardata '<'+s
146
146
  end
147
147
  on_stag name
148
148
  begin
@@ -156,9 +156,9 @@ module XMLScan
156
156
  qmark = val.slice!(0,1)
157
157
  if val[-1] == qmark[0] then
158
158
  val.chop!
159
- scan_attvalue val unless val.empty?
159
+ scan_attr_value val unless val.empty?
160
160
  else
161
- scan_attvalue val unless val.empty?
161
+ scan_attr_value val unless val.empty?
162
162
  begin
163
163
  s = @src.get
164
164
  unless s then
@@ -167,8 +167,8 @@ module XMLScan
167
167
  end
168
168
  c = s[0]
169
169
  val, s = s.split(qmark, 2)
170
- scan_attvalue '>' unless c == ?< or c == ?>
171
- scan_attvalue val if c
170
+ scan_attr_value '>' unless c == ?< or c == ?>
171
+ scan_attr_value val if c
172
172
  end until s
173
173
  continue = s
174
174
  end
@@ -54,16 +54,16 @@ module XMLScan
54
54
  # on_stag_end_empty_ns ('foo:bar', { 'foo' => '', ... })
55
55
  #
56
56
 
57
- def on_stag_ns(qname, prefix, localpart)
57
+ def on_stag_ns(qname, prefix, localpart, *a)
58
58
  end
59
59
 
60
- def on_attribute_ns(qname, prefix, localpart)
60
+ def on_attribute_ns(qname, prefix, localpart, *a)
61
61
  end
62
62
 
63
- def on_stag_end_ns(qname, namespaces)
63
+ def on_stag_end_ns(qname, namespaces, *a)
64
64
  end
65
65
 
66
- def on_stag_end_empty_ns(qname, namespaces)
66
+ def on_stag_end_empty_ns(qname, namespaces, *a)
67
67
  end
68
68
 
69
69
  end
@@ -99,7 +99,7 @@ module XMLScan
99
99
  end
100
100
 
101
101
 
102
- def on_start_document
102
+ def on_start_document(*a)
103
103
  @namespace = {} #PredefinedNamespace.dup
104
104
  @ns_hist = []
105
105
  @ns_undeclared = {} # for checking undeclared namespace prefixes.
@@ -107,14 +107,14 @@ module XMLScan
107
107
  @dont_same = [] # ditto.
108
108
  @xmlns = NamespaceDeclaration.new(self)
109
109
  @orig_visitor = @visitor
110
- @visitor.on_start_document
110
+ @visitor.on_start_document *a
111
111
  end
112
112
 
113
113
 
114
- def on_stag(name)
114
+ def on_stag(name, *a)
115
115
  @ns_hist.push nil
116
116
  unless /:/n =~ name then
117
- @visitor.on_stag_ns name, '', name
117
+ @visitor.on_stag_ns name, '', name, *a
118
118
  else
119
119
  prefix, localpart = $`, $'
120
120
  if localpart.include? ?: then
@@ -131,12 +131,12 @@ module XMLScan
131
131
  @ns_undeclared[prefix] = true
132
132
  end
133
133
  end
134
- @visitor.on_stag_ns name, prefix, localpart
134
+ @visitor.on_stag_ns name, prefix, localpart, *a
135
135
  end
136
136
  end
137
137
 
138
138
 
139
- def on_attribute(name)
139
+ def on_attribute(name, *a)
140
140
  if /:/n =~ name then
141
141
  prefix, localpart = $`, $'
142
142
  if localpart.include? ?: then
@@ -157,13 +157,13 @@ module XMLScan
157
157
  @dont_same.push [ prev, prefix, localpart ]
158
158
  end
159
159
  @prev_prefix[localpart] = prefix
160
- @visitor.on_attribute_ns name, prefix, localpart
160
+ @visitor.on_attribute_ns name, prefix, localpart, *a
161
161
  end
162
162
  elsif name == 'xmlns' then
163
163
  @visitor = @xmlns
164
164
  @xmlns.on_xmlns_start ''
165
165
  else
166
- @visitor.on_attribute_ns name, nil, name
166
+ @visitor.on_attribute_ns name, nil, name, *a
167
167
  end
168
168
  end
169
169
 
@@ -176,36 +176,36 @@ module XMLScan
176
176
  @parent = parent
177
177
  end
178
178
 
179
- def on_xmlns_start(prefix)
179
+ def on_xmlns_start(prefix, *a)
180
180
  @prefix = prefix
181
181
  @nsdecl = ''
182
182
  end
183
183
 
184
- def on_attr_value(str)
184
+ def on_attr_value(str, *a)
185
185
  @nsdecl << str
186
186
  end
187
187
 
188
- def on_attr_entityref(ref)
188
+ def on_attr_entityref(ref, *a)
189
189
  @parent.ns_wellformed_error \
190
190
  "xmlns includes undeclared entity reference"
191
191
  end
192
192
 
193
- def on_attr_charref(code)
193
+ def on_attr_charref(code, *a)
194
194
  @nsdecl << [code].pack('U')
195
195
  end
196
196
 
197
- def on_attr_charref_hex(code)
197
+ def on_attr_charref_hex(code, *a)
198
198
  @nsdecl << [code].pack('U')
199
199
  end
200
200
 
201
- def on_attribute_end(name)
201
+ def on_attribute_end(name, *a)
202
202
  @parent.on_xmlns_end @prefix, @nsdecl
203
203
  end
204
204
 
205
205
  end
206
206
 
207
207
 
208
- def on_xmlns_end(prefix, uri)
208
+ def on_xmlns_end(prefix, uri, *a)
209
209
  @visitor = @orig_visitor
210
210
  if PredefinedNamespace.key? prefix then
211
211
  if prefix == 'xmlns' then
@@ -254,54 +254,54 @@ module XMLScan
254
254
  end
255
255
 
256
256
 
257
- def on_stag_end(name)
257
+ def on_stag_end(name, *a)
258
258
  fix_namespace
259
- @visitor.on_stag_end_ns name, @namespace
259
+ @visitor.on_stag_end_ns name, @namespace, *a
260
260
  end
261
261
 
262
262
 
263
- def on_etag(name)
263
+ def on_etag(name, *a)
264
264
  h = @ns_hist.pop and @namespace.update h
265
- @visitor.on_etag name
265
+ @visitor.on_etag name, *a
266
266
  end
267
267
 
268
268
 
269
- def on_stag_end_empty(name)
269
+ def on_stag_end_empty(name, *a)
270
270
  fix_namespace
271
- @visitor.on_stag_end_empty_ns name, @namespace
271
+ @visitor.on_stag_end_empty_ns name, @namespace, *a
272
272
  h = @ns_hist.pop and @namespace.update h
273
273
  end
274
274
 
275
275
 
276
- def on_doctype(root, pubid, sysid)
276
+ def on_doctype(root, pubid, sysid, *a)
277
277
  if root.count(':') > 1 then
278
278
  ns_parse_error "qualified name `#{root}' includes `:'"
279
279
  end
280
- @visitor.on_doctype root, pubid, sysid
280
+ @visitor.on_doctype root, pubid, sysid, *a
281
281
  end
282
282
 
283
283
 
284
- def on_pi(target, pi)
284
+ def on_pi(target, pi, *a)
285
285
  if target.include? ?: then
286
286
  ns_parse_error "PI target `#{target}' includes `:'"
287
287
  end
288
- @visitor.on_pi target, pi
288
+ @visitor.on_pi target, pi, *a
289
289
  end
290
290
 
291
291
 
292
- def on_entityref(ref)
292
+ def on_entityref(ref, *a)
293
293
  if ref.include? ?: then
294
294
  ns_parse_error "entity reference `#{ref}' includes `:'"
295
295
  end
296
- @visitor.on_entityref ref
296
+ @visitor.on_entityref ref, *a
297
297
  end
298
298
 
299
299
 
300
- def on_attr_entityref(ref)
300
+ def on_attr_entityref(ref, *a)
301
301
  if ref.include? ?: then
302
302
  ns_parse_error "entity reference `#{ref}' includes `:'"
303
303
  end
304
- @visitor.on_attr_entityref ref
304
+ @visitor.on_attr_entityref ref, *a
305
305
  end
306
306
 
307
307
  end
@@ -43,7 +43,7 @@ module XMLScan
43
43
 
44
44
  private
45
45
 
46
- def on_xmldecl_version(str)
46
+ def on_xmldecl_version(str, *a)
47
47
  unless str == '1.0' then
48
48
  warning "unsupported XML version `#{str}'"
49
49
  end
@@ -51,7 +51,7 @@ module XMLScan
51
51
  end
52
52
 
53
53
 
54
- def on_xmldecl_standalone(str)
54
+ def on_xmldecl_standalone(str, *a)
55
55
  if str == 'yes' then
56
56
  @standalone = true
57
57
  elsif str == 'no' then
@@ -63,7 +63,7 @@ module XMLScan
63
63
  end
64
64
 
65
65
 
66
- def on_doctype(name, pubid, sysid)
66
+ def on_doctype(name, pubid, sysid, *a)
67
67
  if pubid and not sysid then
68
68
  parse_error "public external ID must have both public ID and system ID"
69
69
  end
@@ -71,12 +71,12 @@ module XMLScan
71
71
  end
72
72
 
73
73
 
74
- def on_prolog_space(s)
74
+ def on_prolog_space(s, *a)
75
75
  # just ignore it.
76
76
  end
77
77
 
78
78
 
79
- def on_pi(target, pi)
79
+ def on_pi(target, pi, *a)
80
80
  if target.downcase == 'xml' then
81
81
  parse_error "reserved PI target `#{target}'"
82
82
  end
@@ -114,39 +114,43 @@ module XMLScan
114
114
  #end
115
115
 
116
116
 
117
- def on_stag(name)
117
+ def on_stag(name, *a)
118
118
  @elem.push name
119
119
  @visitor.on_stag name
120
120
  @attr.clear
121
121
  end
122
122
 
123
- def on_attribute(name)
123
+ def on_attribute(name, *a)
124
124
  unless @attr.check_unique name then
125
125
  wellformed_error "doubled attribute `#{name}'"
126
126
  end
127
127
  @visitor.on_attribute name
128
128
  end
129
129
 
130
- def on_attr_value(str)
130
+ def on_attr_value(str, *a)
131
131
  str.tr! "\t\r\n", ' ' # normalize
132
132
  @visitor.on_attr_value str
133
133
  end
134
134
 
135
- def on_stag_end_empty(name)
135
+ def on_stag_end(name, *a)
136
+ @visitor.on_stag_end name, *a
137
+ end
138
+
139
+ def on_stag_end_empty(name, *a)
136
140
  # @visitor.on_stag_end name
137
141
  # @elem.pop
138
142
  # @visitor.on_etag name
139
- @visitor.on_stag_end_empty name
143
+ @visitor.on_stag_end_empty name, *a
140
144
  @elem.pop
141
145
  end
142
146
 
143
- def on_etag(name)
147
+ def on_etag(name, *a)
144
148
  last = @elem.pop
145
149
  if last == name then
146
- @visitor.on_etag name
150
+ @visitor.on_etag name, *a
147
151
  elsif last then
148
152
  wellformed_error "element type `#{name}' is not matched"
149
- @visitor.on_etag last
153
+ @visitor.on_etag last, *a
150
154
  else
151
155
  parse_error "end tag `#{name}' appears alone"
152
156
  end
@@ -0,0 +1,97 @@
1
+ # encoding: UTF-8
2
+ require 'xmlscan/parser'
3
+ require 'xmlscan/visitor'
4
+ require 'stringio'
5
+
6
+ module XMLScan
7
+ module ElementProcessor
8
+ include XMLScan::Visitor
9
+
10
+ SKIP = [:on_chardata, :on_stag, :on_etag, :on_attribute, :on_attr_entityref,
11
+ :on_attr_value, :on_start_document, :on_end_document, :on_attribute_end,
12
+ :on_stag_end, :on_stag_end_empty, :on_attr_charref, :on_attr_charref_hex]
13
+
14
+ MY_METHODS = XMLScan::Visitor.instance_methods.to_a - SKIP
15
+
16
+ def initialize(opts={}, mod=nil)
17
+ raise "No module" unless mod
18
+ (MY_METHODS - mod.instance_methods).each do |i|
19
+ self.class.class_eval %{def #{i}(d, *a) d&&(self << d) end}, __FILE__, __LINE__
20
+ end
21
+ self.class.send :include, mod
22
+
23
+ @element = opts[:element] || raise("need an element")
24
+ @key = opts[:key] || raise("need a key")
25
+ @extras = (ex = opts[:extras]) ? ex.map(&:to_sym) : []
26
+ @tmpl = opts[:substitute] || "{{:key}}"
27
+
28
+ @pairs = {} # output name=> [content, context, extra_values] * 1 or more
29
+ @context = '' # current key(name) of the element (card)
30
+ @stack = [] # stack of containing context cards
31
+ @out = [] # current output for name(card)
32
+ @parser = XMLScan::XMLParser.new(self)
33
+ self
34
+ end
35
+
36
+ end
37
+
38
+ class XMLProcessor
39
+ include ElementProcessor
40
+
41
+ def self.process(io, opts={}, mod=nil)
42
+ mod ||= ElementProcessing
43
+ STDERR << "process #{io.inspect}, #{opts.inspect}\n"
44
+ io = case io
45
+ when IO, StringIO; io
46
+ when String; open(io)
47
+ else raise "bad type file input #{io.inspect}"
48
+ end
49
+
50
+ visitor = new(opts, mod)
51
+ visitor.parser.parse(io)
52
+ visitor.pairs
53
+ end
54
+ end
55
+
56
+
57
+ module ElementProcessing
58
+ def <<(s) @out << s end
59
+ def on_chardata(s) self << s end
60
+ def on_stag_end(name, s, h, *a)
61
+ if name.to_sym == @element
62
+ # starting a new context, first output our substitute string
63
+ key= h&&h[@key.to_s]||'*no-name*'
64
+ self << @tmpl.split('|').find {
65
+ |x| !(/:\w[\w\d]*/ =~ x) || h[$&[1..-1].to_s] }.gsub(/:\w[\w\d]*/) {
66
+ |m| h[m[1..-1]]
67
+ }
68
+ # then push the current context and initialize this one
69
+ @stack.push([@context, @out, *@ex])
70
+ @pairs[key] = nil # insert it when first seen
71
+ @context = key; @out = []; @ex = @extras.map {|e| h[e.to_s]}
72
+ else self << s end # pass through tags we aren't processing
73
+ end
74
+
75
+ def on_etag(name, s=nil)
76
+ if name.to_sym == @element
77
+ # output a card (name, content, type)
78
+ @pairs[@context] = [@out, @stack[-1][0], *@ex]
79
+ # restore previous context from stack
80
+ last = @stack.pop
81
+ @context, @out, @ex = last.shift, last.shift, *last
82
+ else self << s end
83
+ end
84
+
85
+ def on_stag_empty_end(name, s=nil, h={}, *a)
86
+ if name.to_sym == @element
87
+
88
+ key= h&&h[@key.to_s]||'*no-name*'
89
+ ex = @extras.map {|e| h[e]}
90
+ @pairs[key] = [[], @context, *ex]
91
+ else self << s end
92
+ end
93
+
94
+ attr_reader :pairs, :parser
95
+ end
96
+
97
+ end
@@ -122,20 +122,29 @@ module XMLScan
122
122
  self
123
123
  end
124
124
 
125
-
125
+ =begin
126
+ Managing source in a private array.
127
+ * tag oriented (?< and ?> are the key tokens)
128
+ * ?> that aren't followed by another ?< or ?> are stripped in splitting
129
+ =end
126
130
  def get
127
131
  pop or
128
132
  unless @eof then
129
133
  last = @last
130
134
  begin
131
- src = @src.gets
132
- unless src then
135
+ unless chunk = @src.gets then
133
136
  @eof = true
134
- unshift last
135
- last = nil
136
- break
137
+ @last = nil
138
+ return last
139
+ #unshift last # to be popped after reverse!
140
+ #last = nil
141
+ #break
137
142
  end
138
- a = src.split(/(?=<|>[<>])|>/, -1)
143
+ # positive lookahead: < or >< or >>
144
+ # so don't consume those (but split leaving them always at the
145
+ # end of chunks)
146
+ # consume (>) and split on >
147
+ a = chunk.split(/(?=<|>[<>])|>/, -1)
139
148
  if last then
140
149
  unless /\A[<>]/ =~ a.first then
141
150
  a[0] = last << (a.first || '')
@@ -143,6 +152,7 @@ module XMLScan
143
152
  push last
144
153
  end
145
154
  end
155
+ raise "size #{size}" if size > 1
146
156
  concat a
147
157
  last = pop
148
158
  end while empty?
@@ -223,7 +233,7 @@ module XMLScan
223
233
  last.push @last.inspect
224
234
  end
225
235
  a.push '#eof' if @eof
226
- "((#{a.join(' ')}) (#{last.join(' ')}) . #{source.inspect})"
236
+ "((#{a*' '}) l(#{last*' '}) . #{source.inspect})"
227
237
  end
228
238
 
229
239
  def each
@@ -354,72 +364,72 @@ module XMLScan
354
364
  end
355
365
  end
356
366
 
357
- def on_xmldecl_version(str)
358
- @visitor.on_xmldecl_version str
367
+ def on_xmldecl_version(str, *a)
368
+ @visitor.on_xmldecl_version str, *a
359
369
  end
360
370
 
361
- def on_xmldecl_encoding(str)
362
- @visitor.on_xmldecl_encoding str
371
+ def on_xmldecl_encoding(str, *a)
372
+ @visitor.on_xmldecl_encoding str, *a
363
373
  end
364
374
 
365
- def on_xmldecl_standalone(str)
366
- @visitor.on_xmldecl_standalone str
375
+ def on_xmldecl_standalone(str, *a)
376
+ @visitor.on_xmldecl_standalone str, *a
367
377
  end
368
378
 
369
- def on_xmldecl_other(name, value)
370
- @visitor.on_xmldecl_other name, value
379
+ def on_xmldecl_other(name, value, *a)
380
+ @visitor.on_xmldecl_other name, value, *a
371
381
  end
372
382
 
373
- def on_xmldecl_end
374
- @visitor.on_xmldecl_end
383
+ def on_xmldecl_end(*a)
384
+ @visitor.on_xmldecl_end *a
375
385
  end
376
386
 
377
- def on_doctype(root, pubid, sysid)
378
- @visitor.on_doctype root, pubid, sysid
387
+ def on_doctype(root, pubid, sysid, *a)
388
+ @visitor.on_doctype root, pubid, sysid, *a
379
389
  end
380
390
 
381
- def on_prolog_space(str)
382
- @visitor.on_prolog_space str
391
+ def on_prolog_space(str, *a)
392
+ @visitor.on_prolog_space str, *a
383
393
  end
384
394
 
385
- def on_comment(str)
386
- @visitor.on_comment str
395
+ def on_comment(str, *a)
396
+ @visitor.on_comment str, *a
387
397
  end
388
398
 
389
- def on_pi(target, pi)
390
- @visitor.on_pi target, pi
399
+ def on_pi(target, pi, *a)
400
+ @visitor.on_pi target, pi, *a
391
401
  end
392
402
 
393
- def on_chardata(str)
394
- @visitor.on_chardata str
403
+ def on_chardata(str, *a)
404
+ @visitor.on_chardata str, *a
395
405
  end
396
406
 
397
- def on_cdata(str)
398
- @visitor.on_cdata str
407
+ def on_cdata(str, *a)
408
+ @visitor.on_cdata str, *a
399
409
  end
400
410
 
401
- def on_etag(name)
402
- @visitor.on_etag name
411
+ def on_etag(name, *a)
412
+ @visitor.on_etag name, *a
403
413
  end
404
414
 
405
- def on_entityref(ref)
406
- @visitor.on_entityref ref
415
+ def on_entityref(ref, *a)
416
+ @visitor.on_entityref ref, *a
407
417
  end
408
418
 
409
- def on_charref(code)
410
- @visitor.on_charref code
419
+ def on_charref(code, *a)
420
+ @visitor.on_charref code, *a
411
421
  end
412
422
 
413
- def on_charref_hex(code)
414
- @visitor.on_charref_hex code
423
+ def on_charref_hex(code, *a)
424
+ @visitor.on_charref_hex code, *a
415
425
  end
416
426
 
417
- def on_start_document
418
- @visitor.on_start_document
427
+ def on_start_document(*a)
428
+ @visitor.on_start_document *a
419
429
  end
420
430
 
421
- def on_end_document
422
- @visitor.on_end_document
431
+ def on_end_document(*a)
432
+ @visitor.on_end_document *a
423
433
  end
424
434
 
425
435
 
@@ -444,50 +454,51 @@ module XMLScan
444
454
  #
445
455
  # A: on_chardata ('HOGE')
446
456
 
447
- def on_stag(name)
448
- @visitor.on_stag name
457
+ def on_stag(name, *a)
458
+ @visitor.on_stag name, *a
449
459
  end
450
460
 
451
- def on_attribute(name)
452
- @visitor.on_attribute name
461
+ def on_attribute(name, *a)
462
+ @visitor.on_attribute name, *a
453
463
  end
454
464
 
455
- def on_attr_value(str)
456
- @visitor.on_attr_value str
465
+ def on_attr_value(str, *a)
466
+ @visitor.on_attr_value str, *a
457
467
  end
458
468
 
459
- def on_attr_entityref(ref)
460
- @visitor.on_attr_entityref ref
469
+ def on_attr_entityref(ref, *a)
470
+ @visitor.on_attr_entityref ref, *a
461
471
  end
462
472
 
463
- def on_attr_charref(code)
464
- @visitor.on_attr_charref code
473
+ def on_attr_charref(code, *a)
474
+ @visitor.on_attr_charref code, *a
465
475
  end
466
476
 
467
- def on_attr_charref_hex(code)
468
- @visitor.on_attr_charref_hex code
477
+ def on_attr_charref_hex(code, *a)
478
+ @visitor.on_attr_charref_hex code, *a
469
479
  end
470
480
 
471
- def on_attribute_end(name)
472
- @visitor.on_attribute_end name
481
+ def on_attribute_end(name, *a)
482
+ @visitor.on_attribute_end name, *a, *a
473
483
  end
474
484
 
475
- def on_stag_end_empty(name)
476
- @visitor.on_stag_end_empty name
485
+ def on_stag_end_empty(name, *a)
486
+ @visitor.on_stag_end_empty name, *a
477
487
  end
478
488
 
479
- def on_stag_end(name)
480
- @visitor.on_stag_end name
489
+ def on_stag_end(name, *a)
490
+ #STDERR << "ose #{name}, #{a.inspect}\n"
491
+ @visitor.on_stag_end name, *a
481
492
  end
482
493
 
483
494
 
495
+ S_OPT_EXAMPLE = "".encode(::Encoding::WINDOWS_31J)
496
+ E_OPT_EXAMPLE = "".encode(::Encoding::EUCJP)
484
497
 
485
498
  private
486
499
 
487
500
  module OptRegexp
488
501
  UTFSTR = "é"
489
- S_OPT_EXAMPLE = "".encode Encoding.find('Windows-31J')
490
- E_OPT_EXAMPLE = "".encode Encoding.find('EUC-JP')
491
502
 
492
503
  RE_ENCODINGS = {
493
504
  :n=>/e/n.encoding,
@@ -525,6 +536,7 @@ module XMLScan
525
536
  else
526
537
  s = $`
527
538
  on_chardata s unless s.empty?
539
+ #orig = $'.sub(/(?=;).*$/,'')
528
540
  ref = nil
529
541
  $'.split('&', -1).each { |s|
530
542
  unless /(?!\A);|(?=[ \t\r\n])/ =~ s and not $&.empty? then
@@ -533,18 +545,18 @@ module XMLScan
533
545
  parse_error "reference to `#{ref}' doesn't end with `;'"
534
546
  else
535
547
  parse_error "`&' is not used for entity/character references"
536
- on_chardata('&' << s)
548
+ on_chardata '&'+s
537
549
  next
538
550
  end
539
551
  end
540
- ref = $`
552
+ orig = ?& + (ref = $`) + ?;
541
553
  s = $'
542
554
  if /\A[^#]/ =~ ref then
543
- on_entityref ref
555
+ on_entityref ref, orig
544
556
  elsif /\A#(\d+)\z/ =~ ref then
545
- on_charref $1.to_i
557
+ on_charref $1.to_i, orig
546
558
  elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
547
- on_charref_hex $1.hex
559
+ on_charref_hex $1.hex, orig
548
560
  else
549
561
  parse_error "invalid character reference `#{ref}'"
550
562
  end
@@ -558,8 +570,9 @@ module XMLScan
558
570
  end
559
571
 
560
572
 
561
- def scan_attvalue(s) # almostly copy & paste from scan_chardata
573
+ def scan_attr_value(s) # almostly copy & paste from scan_chardata
562
574
  unless /&/ =~ s then
575
+ #STDERR << "no& attr_val #{s.inspect}, #{caller*"\n"}\n" if s == ?>
563
576
  on_attr_value s
564
577
  else
565
578
  s = $`
@@ -576,14 +589,14 @@ module XMLScan
576
589
  next
577
590
  end
578
591
  end
579
- ref = $`
592
+ orig = ?& + (ref = $`) + ?;
580
593
  s = $'
581
594
  if /\A[^#]/ =~ ref then
582
- on_attr_entityref ref
595
+ on_attr_entityref ref, orig
583
596
  elsif /\A#(\d+)\z/ =~ ref then
584
- on_attr_charref $1.to_i
597
+ on_attr_charref $1.to_i, orig
585
598
  elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
586
- on_attr_charref_hex $1.hex
599
+ on_attr_charref_hex $1.hex, orig
587
600
  else
588
601
  parse_error "invalid character reference `#{ref}'"
589
602
  end
@@ -682,6 +695,7 @@ module XMLScan
682
695
 
683
696
 
684
697
  def scan_etag(s)
698
+ orig="#{s}>"
685
699
  s[0,2] = '' # remove '</'
686
700
  if s.empty? then
687
701
  if @src.close_tag then # </>
@@ -689,14 +703,14 @@ module XMLScan
689
703
  else # </< or </[EOF]
690
704
  parse_error "parse error at `</'"
691
705
  s << '>' if @src.close_tag
692
- return on_chardata('</' << s)
706
+ return on_chardata '</' << s
693
707
  end
694
708
  elsif /[ \t\n\r]+/ =~ s then
695
709
  s1, s2 = $`, $'
696
710
  if s1.empty? then # </ tag
697
711
  parse_error "parse error at `</'"
698
712
  s << '>' if @src.close_tag
699
- return on_chardata('</' + s)
713
+ return on_chardata '</' + s
700
714
  elsif not s2.empty? then # </ta g
701
715
  parse_error "illegal whitespace is found within end tag `#{s1}'"
702
716
  while @src.get_tag
@@ -705,7 +719,7 @@ module XMLScan
705
719
  s = s1
706
720
  end
707
721
  found_unclosed_etag s unless @src.close_tag # </tag< or </tag[EOF]
708
- on_etag s
722
+ on_etag s, orig
709
723
  end
710
724
 
711
725
 
@@ -745,6 +759,8 @@ module XMLScan
745
759
 
746
760
 
747
761
  def scan_stag(s)
762
+ hash = {}
763
+ orig = [s.dup]
748
764
  unless /(?=[\/ \t\n\r='"])/ =~ s then
749
765
  name = s
750
766
  name[0,1] = '' # remove `<'
@@ -753,54 +769,65 @@ module XMLScan
753
769
  return found_empty_stag
754
770
  else # << or <[EOF]
755
771
  parse_error "parse error at `<'"
756
- return on_chardata('<')
772
+ return on_chardata '<'
757
773
  end
758
774
  end
759
775
  on_stag name
760
776
  found_unclosed_stag name unless @src.close_tag
761
- on_stag_end name
777
+ on_stag_end name, orig*''+?>, {}
762
778
  else
779
+ k = nil
763
780
  name = $`
764
781
  s = $'
765
782
  name[0,1] = '' # remove `<'
766
783
  if name.empty? then # `< tag' or `<=`
767
784
  parse_error "parse error at `<'"
768
785
  s << '>' if @src.close_tag
769
- return on_chardata('<' << s)
786
+ return on_chardata '<' << s
770
787
  end
771
788
  on_stag name
772
789
  emptyelem = false
773
- key,val,error,qmark,c = nil
774
790
  begin
775
791
  continue = false
776
792
  s.scan(/[ \t\n\r]([^= \t\n\r\/'"]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|\/\z|([^ \t\n\r][\S\s]*)/
777
793
  ) { |key,val,error|
778
- if key then # key="value"
794
+ orig_val = []
795
+ if key then
779
796
  on_attribute key
797
+ k=key
798
+ orig_val << val
780
799
  qmark = val.slice!(0,1)
781
800
  if val[-1] == qmark[0] then
782
801
  val.chop!
783
- scan_attvalue val unless val.empty?
802
+ scan_attr_value val unless val.empty?
784
803
  else
785
- scan_attvalue val unless val.empty?
804
+ scan_attr_value val unless val.empty?
786
805
  begin
787
806
  s = @src.get
807
+ #STDERR << "get some more? #{s.inspect}, #{orig.inspect}\n"
788
808
  unless s then
789
809
  parse_error "unterminated attribute `#{key}' meets EOF"
790
810
  break
791
811
  end
812
+ orig << s.dup
792
813
  c = s[0]
793
814
  val, s = s.split(qmark, 2)
815
+ orig_val << val
794
816
  if c == ?< then
795
817
  wellformed_error "`<' is found in attribute `#{key}'"
796
818
  elsif c != ?> then
797
- scan_attvalue '>'
819
+ #STDERR << "close in quote? #{c.inspect}, #{@src.tag_start?}, #{@src.tag_end?}, #{s.inspect}, #{val.inspect}, #{orig.inspect}, #{orig_val.inspect}\n"
820
+ orig_val[-1,0] = orig[-1,0] = ?> # if @src.tag_start?
821
+ scan_attr_value ?>
798
822
  end
799
- scan_attvalue val if c
823
+ scan_attr_value val if c
800
824
  end until s
801
825
  continue = s # if eof then continue is false, else true.
802
826
  end
803
- on_attribute_end key
827
+ #STDERR << "attr:#{k}, #{orig_val}\n"
828
+ hash[k] = orig_val*''
829
+ #STDERR << "attr end #{hash.inspect}, #{k}, #{orig_val}\n"
830
+ on_attribute_end key #, orig_val*''
804
831
  elsif error then
805
832
  continue = s = found_stag_error(error)
806
833
  else
@@ -816,9 +843,11 @@ module XMLScan
816
843
  end
817
844
  end
818
845
  if emptyelem then
819
- on_stag_end_empty name
846
+ on_stag_end_empty name, orig*''+?>, hash
820
847
  else
821
- on_stag_end name
848
+ #STDERR << "on stag end #{ name}, \"<#{name}#{s}>\", #{hash.inspect}\n"
849
+ on_stag_end name, orig*''+?>, hash
850
+ #on_stag_end name, "<#{name}#{s}>", hash
822
851
  end
823
852
  end
824
853
  end
@@ -1067,10 +1096,10 @@ module XMLScan
1067
1096
 
1068
1097
 
1069
1098
  def scan_document
1070
- on_start_document
1099
+ on_start_document ''
1071
1100
  @src.prepare
1072
1101
  scan_prolog @src.get
1073
- on_end_document
1102
+ on_end_document ''
1074
1103
  end
1075
1104
 
1076
1105
 
@@ -9,15 +9,9 @@
9
9
 
10
10
  module XMLScan
11
11
 
12
- # The version like 'X.X.0' (TENNY is 0) means that this is an unstable
13
- # release. Incompatible changes will be applied to this version
14
- # without special notice. This version should be distributed as a
15
- # snapshot only.
16
- #
17
- # TENNY which is larger than 1 (e.g. 'X.X.1' or 'X.X.2') means this
18
- # release is a stable release.
19
-
20
- VERSION = '0.2.3'
21
- RELEASE_DATE = '2003-05-02'
12
+ GEMNAME = 'xmlscan'
13
+ VERSION_FILE = File.expand_path('../../VERSION', File.dirname(__FILE__))
14
+ VERSION = open(VERSION_FILE).to_a*''.chop
15
+ RELEASE_DATE = open(VERSION_FILE).mtime.strftime('%Y-%m-%d')
22
16
 
23
17
  end
@@ -54,88 +54,88 @@ module XMLScan
54
54
  def warning(msg)
55
55
  end
56
56
 
57
- def on_xmldecl
57
+ def on_xmldecl(*a)
58
58
  end
59
59
 
60
- def on_xmldecl_key(key, str)
60
+ def on_xmldecl_key(key, str, *a)
61
61
  end
62
62
 
63
- def on_xmldecl_version(str)
63
+ def on_xmldecl_version(str, *a)
64
64
  end
65
65
 
66
- def on_xmldecl_encoding(str)
66
+ def on_xmldecl_encoding(str, *a)
67
67
  end
68
68
 
69
- def on_xmldecl_standalone(str)
69
+ def on_xmldecl_standalone(str, *a)
70
70
  end
71
71
 
72
- def on_xmldecl_other(name, value)
72
+ def on_xmldecl_other(name, value, *a)
73
73
  end
74
74
 
75
- def on_xmldecl_end
75
+ def on_xmldecl_end(*a)
76
76
  end
77
77
 
78
- def on_doctype(root, pubid, sysid)
78
+ def on_doctype(root, pubid, sysid, *a)
79
79
  end
80
80
 
81
- def on_prolog_space(str)
81
+ def on_prolog_space(str, *a)
82
82
  end
83
83
 
84
- def on_comment(str)
84
+ def on_comment(str, *a)
85
85
  end
86
86
 
87
- def on_pi(target, pi)
87
+ def on_pi(target, pi, *a)
88
88
  end
89
89
 
90
- def on_chardata(str)
90
+ def on_chardata(str, *a)
91
91
  end
92
92
 
93
- def on_cdata(str)
93
+ def on_cdata(str, *a)
94
94
  end
95
95
 
96
- def on_etag(name)
96
+ def on_etag(name, *a)
97
97
  end
98
98
 
99
- def on_entityref(ref)
99
+ def on_entityref(ref, *a)
100
100
  end
101
101
 
102
- def on_charref(code)
102
+ def on_charref(code, *a)
103
103
  end
104
104
 
105
- def on_charref_hex(code)
105
+ def on_charref_hex(code, *a)
106
106
  end
107
107
 
108
- def on_start_document
108
+ def on_start_document(*a)
109
109
  end
110
110
 
111
- def on_end_document
111
+ def on_end_document(*a)
112
112
  end
113
113
 
114
- def on_stag(name)
114
+ def on_stag(name, *a)
115
115
  end
116
116
 
117
- def on_attribute(name)
117
+ def on_attribute(name, *a)
118
118
  end
119
119
 
120
- def on_attr_value(str)
120
+ def on_attr_value(str, *a)
121
121
  end
122
122
 
123
- def on_attr_entityref(ref)
123
+ def on_attr_entityref(ref, *a)
124
124
  end
125
125
 
126
- def on_attr_charref(code)
126
+ def on_attr_charref(code, *a)
127
127
  end
128
128
 
129
- def on_attr_charref_hex(code)
129
+ def on_attr_charref_hex(code, *a)
130
130
  end
131
131
 
132
- def on_attribute_end(name)
132
+ def on_attribute_end(name, *a)
133
133
  end
134
134
 
135
- def on_stag_end_empty(name)
135
+ def on_stag_end_empty(name, *a)
136
136
  end
137
137
 
138
- def on_stag_end(name)
138
+ def on_stag_end(name, *a)
139
139
  end
140
140
 
141
141
  end
@@ -146,13 +146,15 @@ module XMLScan
146
146
  include Visitor
147
147
 
148
148
  def initialize(visitor)
149
+ #STDERR << "new Decoration #{visitor}\n"
149
150
  @visitor = visitor
150
151
  end
151
152
 
152
153
  Visitor.instance_methods.each { |i|
154
+ #STDERR << "#{i} \#{args.inspect}\\n"
153
155
  module_eval <<-END, __FILE__, __LINE__ + 1
154
156
  def #{i}(*args)
155
- @visitor.#{i}(*args)
157
+ @visitor&&@visitor.#{i}(*args)
156
158
  end
157
159
  END
158
160
  }
@@ -115,95 +115,95 @@ module XMLScan
115
115
  end
116
116
 
117
117
 
118
- def on_xmldecl_version(str)
118
+ def on_xmldecl_version(str, *a)
119
119
  check_valid_version str
120
120
  super
121
121
  end
122
122
 
123
- def on_xmldecl_encoding(str)
123
+ def on_xmldecl_encoding(str, *a)
124
124
  check_valid_encoding str
125
125
  super
126
126
  end
127
127
 
128
- def on_xmldecl_standalone(str)
128
+ def on_xmldecl_standalone(str, *a)
129
129
  check_valid_chardata str
130
130
  super
131
131
  end
132
132
 
133
- def on_doctype(root, pubid, sysid)
133
+ def on_doctype(root, pubid, sysid, *a)
134
134
  check_valid_name root
135
135
  check_valid_pubid pubid if pubid
136
136
  check_valid_chardata sysid if sysid
137
137
  super
138
138
  end
139
139
 
140
- def on_comment(str)
140
+ def on_comment(str, *a)
141
141
  check_valid_chardata str
142
142
  super
143
143
  end
144
144
 
145
- def on_pi(target, pi)
145
+ def on_pi(target, pi, *a)
146
146
  check_valid_name target
147
147
  check_valid_chardata pi
148
148
  super
149
149
  end
150
150
 
151
- def on_chardata(str)
151
+ def on_chardata(str, *a)
152
152
  check_valid_chardata str
153
153
  super
154
154
  end
155
155
 
156
- def on_cdata(str)
156
+ def on_cdata(str, *a)
157
157
  check_valid_chardata str
158
158
  super
159
159
  end
160
160
 
161
- def on_etag(name)
161
+ def on_etag(name, *a)
162
162
  check_valid_name name
163
163
  super
164
164
  end
165
165
 
166
- def on_entityref(ref)
166
+ def on_entityref(ref, *a)
167
167
  check_valid_name ref
168
168
  super
169
169
  end
170
170
 
171
- def on_charref(code)
171
+ def on_charref(code, *a)
172
172
  check_valid_char code
173
173
  super
174
174
  end
175
175
 
176
- def on_charref_hex(code)
176
+ def on_charref_hex(code, *a)
177
177
  check_valid_char code
178
178
  super
179
179
  end
180
180
 
181
- def on_stag(name)
181
+ def on_stag(name, *a)
182
182
  check_valid_name name
183
183
  super
184
184
  end
185
185
 
186
- def on_attribute(name)
186
+ def on_attribute(name, *a)
187
187
  check_valid_name name
188
188
  super
189
189
  end
190
190
 
191
- def on_attr_value(str)
191
+ def on_attr_value(str, *a)
192
192
  check_valid_chardata str
193
193
  super
194
194
  end
195
195
 
196
- def on_attr_entityref(ref)
196
+ def on_attr_entityref(ref, *a)
197
197
  check_valid_name ref
198
198
  super
199
199
  end
200
200
 
201
- def on_attr_charref(code)
201
+ def on_attr_charref(code, *a)
202
202
  check_valid_char code
203
203
  super
204
204
  end
205
205
 
206
- def on_attr_charref_hex(code)
206
+ def on_attr_charref_hex(code, *a)
207
207
  check_valid_char code
208
208
  super
209
209
  end
metadata CHANGED
@@ -1,19 +1,19 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xmlscan
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
5
- prerelease:
4
+ version: 0.3.0prec
5
+ prerelease: 5
6
6
  platform: ruby
7
7
  authors:
8
8
  - UENO Katsuhiro <katsu@blue.sky.or.jp>
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-13 00:00:00.000000000 Z
12
+ date: 2012-02-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &8077620 !ruby/object:Gem::Requirement
16
+ requirement: &9706320 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.8.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *8077620
24
+ version_requirements: *9706320
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rdoc
27
- requirement: &8076660 !ruby/object:Gem::Requirement
27
+ requirement: &9705800 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '3.12'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *8076660
35
+ version_requirements: *9705800
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: bundler
38
- requirement: &8075620 !ruby/object:Gem::Requirement
38
+ requirement: &9705220 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.0.0
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *8075620
46
+ version_requirements: *9705220
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: jeweler
49
- requirement: &8074720 !ruby/object:Gem::Requirement
49
+ requirement: &9704580 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: 1.8.3
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *8074720
57
+ version_requirements: *9704580
58
58
  description: The fastest XML parser written in 100% pure Ruby.
59
59
  email: gerryg@inbox.com
60
60
  executables: []
@@ -69,15 +69,14 @@ files:
69
69
  - Rakefile
70
70
  - THANKS
71
71
  - VERSION
72
- - install.rb
73
72
  - lib/xmlscan/htmlscan.rb
74
73
  - lib/xmlscan/namespace.rb
75
74
  - lib/xmlscan/parser.rb
75
+ - lib/xmlscan/processor.rb
76
76
  - lib/xmlscan/scanner.rb
77
77
  - lib/xmlscan/version.rb
78
78
  - lib/xmlscan/visitor.rb
79
79
  - lib/xmlscan/xmlchar.rb
80
- - test.rb
81
80
  homepage: http://github.com/GerryG/xmlformat/
82
81
  licenses:
83
82
  - MIT
@@ -97,13 +96,13 @@ required_ruby_version: !ruby/object:Gem::Requirement
97
96
  version: '0'
98
97
  segments:
99
98
  - 0
100
- hash: 3268123461909302440
99
+ hash: 4206592949743860129
101
100
  required_rubygems_version: !ruby/object:Gem::Requirement
102
101
  none: false
103
102
  requirements:
104
- - - ! '>='
103
+ - - ! '>'
105
104
  - !ruby/object:Gem::Version
106
- version: '0'
105
+ version: 1.3.1
107
106
  requirements: []
108
107
  rubyforge_project:
109
108
  rubygems_version: 1.8.15
data/install.rb DELETED
@@ -1,41 +0,0 @@
1
- #!/usr/bin/ruby
2
- #
3
- # install.rb
4
- #
5
- # $Id: install.rb,v 1.2 2002/12/26 21:09:38 katsu Exp $
6
-
7
- require 'rbconfig'
8
- require 'ftools'
9
- require 'find'
10
- require 'getoptlong'
11
-
12
- DEFAULT_DESTDIR = Config::CONFIG['sitelibdir'] || Config::CONFIG['sitedir']
13
- SRCDIR = File.dirname(__FILE__)
14
-
15
-
16
- def install_rb(from, to)
17
- from = SRCDIR + '/' + from
18
- Find.find(from) { |src|
19
- next unless File.file? src
20
- next unless /\.rb\z/ =~ src
21
- dst = src.sub(/\A#{Regexp.escape(from)}/, to)
22
- File.makedirs File.dirname(dst), true
23
- File.install src, dst, 0644, true
24
- }
25
- end
26
-
27
-
28
- destdir = DEFAULT_DESTDIR
29
- begin
30
- GetoptLong.new([ "-d", "--destdir", GetoptLong::REQUIRED_ARGUMENT ]
31
- ).each_option { |opt, arg|
32
- case opt
33
- when '-d' then
34
- destdir = arg
35
- end
36
- }
37
- rescue
38
- exit 2
39
- end
40
-
41
- install_rb "lib", destdir
data/test.rb DELETED
@@ -1,7 +0,0 @@
1
- #!/usr/bin/ruby
2
- $-w = true
3
- $LOAD_PATH.unshift 'lib'
4
- $LOAD_PATH.unshift 'tests'
5
- Dir.chdir File.dirname($0)
6
- require 'testall'
7
- load 'runtest.rb'