xmlscan 0.2.3 → 0.3.0prea

Sign up to get free protection for your applications and to get access to all the features.
data/README.processor ADDED
@@ -0,0 +1,33 @@
1
+
2
+ processor = XMLScan::Processor.hash(:element=>:card, :key=>:name) do |k,h,s,|
3
+ h[:transclude] || "{{#{h[:name]}}}"
4
+ end
5
+
6
+ test_cases [
7
+ [ '<card name="foo" transclude="{{foo|titled}}">Some
8
+ <card name="name">Name data</card> and &lt; &gt;
9
+ <p>para data<b>bold</b>
10
+ </p><br/>
11
+ more<card
12
+ name="+hello" attr="&quote;foo&quote;"> and <card name="+nested">nested twice data</card>
13
+ </card>
14
+ </card>
15
+ ', {
16
+ 'foo' => 'Some
17
+ {{name}} and &lt; &gt;
18
+ <p>para data<b>bold</b>
19
+ </p><br/>
20
+ more{{+hello}}
21
+ ',
22
+ 'name' => 'Name data',
23
+ 'foo+hello' => ' and {{+nested}}
24
+ ',
25
+ 'foo+hello+nested' => 'nested twice data' } ],
26
+ ]
27
+
28
+ test_cases.each { |p|
29
+ assert processor.call(p[0]) == p[1]
30
+ }
31
+
32
+
33
+
data/Rakefile CHANGED
@@ -3,6 +3,9 @@
3
3
 
4
4
  require 'rubygems'
5
5
  require 'bundler'
6
+ require 'xmlscan/version'
7
+
8
+ VERSION = XMLScan::VERSION # File.exist?('VERSION') ? File.read('VERSION') : ""
6
9
 
7
10
  begin
8
11
  Bundler.setup(:default, :development)
@@ -15,10 +18,11 @@ end
15
18
  require 'rake'
16
19
 
17
20
  begin
21
+ include XMLScan
18
22
  require 'jeweler'
19
23
  Jeweler::Tasks.new do |gem|
20
- gem.name = "xmlscan"
21
- gem.version = '0.2.3'
24
+ gem.name = 'xmlscan'
25
+ gem.version = XMLScan::VERSION
22
26
  gem.license = "MIT"
23
27
  gem.summary = "The fastest XML parser written in 100% pure Ruby."
24
28
  gem.email = "gerryg@inbox.com"
@@ -56,10 +60,9 @@ task :default => :spec
56
60
 
57
61
  require 'rdoc/task'
58
62
  Rake::RDocTask.new do |rdoc|
59
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
60
63
 
61
64
  rdoc.rdoc_dir = 'rdoc'
62
- rdoc.title = "xmlscan #{version}"
65
+ rdoc.title = "xmlscan #{VERSION}"
63
66
  rdoc.rdoc_files.include('README*')
64
67
  rdoc.rdoc_files.include('lib/**/*.rb')
65
68
  end
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.3
1
+ 0.3.0prea
@@ -47,7 +47,7 @@ module XMLScan
47
47
  raise "[BUG] this method must be never called"
48
48
  end
49
49
 
50
- def on_stag_end_empty(name)
50
+ def on_stag_end_empty(name, *a)
51
51
  raise "[BUG] this method must be never called"
52
52
  end
53
53
 
@@ -127,7 +127,7 @@ module XMLScan
127
127
  return found_empty_stag
128
128
  else
129
129
  parse_error "parse error at `<'"
130
- return on_chardata('<')
130
+ return on_chardata '<'
131
131
  end
132
132
  end
133
133
  on_stag name
@@ -142,7 +142,7 @@ module XMLScan
142
142
  if @src.close_tag then
143
143
  s << '>'
144
144
  end
145
- return on_chardata('<' << s)
145
+ return on_chardata '<'+s
146
146
  end
147
147
  on_stag name
148
148
  begin
@@ -156,9 +156,9 @@ module XMLScan
156
156
  qmark = val.slice!(0,1)
157
157
  if val[-1] == qmark[0] then
158
158
  val.chop!
159
- scan_attvalue val unless val.empty?
159
+ scan_attr_value val unless val.empty?
160
160
  else
161
- scan_attvalue val unless val.empty?
161
+ scan_attr_value val unless val.empty?
162
162
  begin
163
163
  s = @src.get
164
164
  unless s then
@@ -167,8 +167,8 @@ module XMLScan
167
167
  end
168
168
  c = s[0]
169
169
  val, s = s.split(qmark, 2)
170
- scan_attvalue '>' unless c == ?< or c == ?>
171
- scan_attvalue val if c
170
+ scan_attr_value '>' unless c == ?< or c == ?>
171
+ scan_attr_value val if c
172
172
  end until s
173
173
  continue = s
174
174
  end
@@ -54,16 +54,16 @@ module XMLScan
54
54
  # on_stag_end_empty_ns ('foo:bar', { 'foo' => '', ... })
55
55
  #
56
56
 
57
- def on_stag_ns(qname, prefix, localpart)
57
+ def on_stag_ns(qname, prefix, localpart, *a)
58
58
  end
59
59
 
60
- def on_attribute_ns(qname, prefix, localpart)
60
+ def on_attribute_ns(qname, prefix, localpart, *a)
61
61
  end
62
62
 
63
- def on_stag_end_ns(qname, namespaces)
63
+ def on_stag_end_ns(qname, namespaces, *a)
64
64
  end
65
65
 
66
- def on_stag_end_empty_ns(qname, namespaces)
66
+ def on_stag_end_empty_ns(qname, namespaces, *a)
67
67
  end
68
68
 
69
69
  end
@@ -99,7 +99,7 @@ module XMLScan
99
99
  end
100
100
 
101
101
 
102
- def on_start_document
102
+ def on_start_document(*a)
103
103
  @namespace = {} #PredefinedNamespace.dup
104
104
  @ns_hist = []
105
105
  @ns_undeclared = {} # for checking undeclared namespace prefixes.
@@ -107,14 +107,14 @@ module XMLScan
107
107
  @dont_same = [] # ditto.
108
108
  @xmlns = NamespaceDeclaration.new(self)
109
109
  @orig_visitor = @visitor
110
- @visitor.on_start_document
110
+ @visitor.on_start_document *a
111
111
  end
112
112
 
113
113
 
114
- def on_stag(name)
114
+ def on_stag(name, *a)
115
115
  @ns_hist.push nil
116
116
  unless /:/n =~ name then
117
- @visitor.on_stag_ns name, '', name
117
+ @visitor.on_stag_ns name, '', name, *a
118
118
  else
119
119
  prefix, localpart = $`, $'
120
120
  if localpart.include? ?: then
@@ -131,12 +131,12 @@ module XMLScan
131
131
  @ns_undeclared[prefix] = true
132
132
  end
133
133
  end
134
- @visitor.on_stag_ns name, prefix, localpart
134
+ @visitor.on_stag_ns name, prefix, localpart, *a
135
135
  end
136
136
  end
137
137
 
138
138
 
139
- def on_attribute(name)
139
+ def on_attribute(name, *a)
140
140
  if /:/n =~ name then
141
141
  prefix, localpart = $`, $'
142
142
  if localpart.include? ?: then
@@ -157,13 +157,13 @@ module XMLScan
157
157
  @dont_same.push [ prev, prefix, localpart ]
158
158
  end
159
159
  @prev_prefix[localpart] = prefix
160
- @visitor.on_attribute_ns name, prefix, localpart
160
+ @visitor.on_attribute_ns name, prefix, localpart, *a
161
161
  end
162
162
  elsif name == 'xmlns' then
163
163
  @visitor = @xmlns
164
164
  @xmlns.on_xmlns_start ''
165
165
  else
166
- @visitor.on_attribute_ns name, nil, name
166
+ @visitor.on_attribute_ns name, nil, name, *a
167
167
  end
168
168
  end
169
169
 
@@ -176,36 +176,36 @@ module XMLScan
176
176
  @parent = parent
177
177
  end
178
178
 
179
- def on_xmlns_start(prefix)
179
+ def on_xmlns_start(prefix, *a)
180
180
  @prefix = prefix
181
181
  @nsdecl = ''
182
182
  end
183
183
 
184
- def on_attr_value(str)
184
+ def on_attr_value(str, *a)
185
185
  @nsdecl << str
186
186
  end
187
187
 
188
- def on_attr_entityref(ref)
188
+ def on_attr_entityref(ref, *a)
189
189
  @parent.ns_wellformed_error \
190
190
  "xmlns includes undeclared entity reference"
191
191
  end
192
192
 
193
- def on_attr_charref(code)
193
+ def on_attr_charref(code, *a)
194
194
  @nsdecl << [code].pack('U')
195
195
  end
196
196
 
197
- def on_attr_charref_hex(code)
197
+ def on_attr_charref_hex(code, *a)
198
198
  @nsdecl << [code].pack('U')
199
199
  end
200
200
 
201
- def on_attribute_end(name)
201
+ def on_attribute_end(name, *a)
202
202
  @parent.on_xmlns_end @prefix, @nsdecl
203
203
  end
204
204
 
205
205
  end
206
206
 
207
207
 
208
- def on_xmlns_end(prefix, uri)
208
+ def on_xmlns_end(prefix, uri, *a)
209
209
  @visitor = @orig_visitor
210
210
  if PredefinedNamespace.key? prefix then
211
211
  if prefix == 'xmlns' then
@@ -254,54 +254,54 @@ module XMLScan
254
254
  end
255
255
 
256
256
 
257
- def on_stag_end(name)
257
+ def on_stag_end(name, *a)
258
258
  fix_namespace
259
- @visitor.on_stag_end_ns name, @namespace
259
+ @visitor.on_stag_end_ns name, @namespace, *a
260
260
  end
261
261
 
262
262
 
263
- def on_etag(name)
263
+ def on_etag(name, *a)
264
264
  h = @ns_hist.pop and @namespace.update h
265
- @visitor.on_etag name
265
+ @visitor.on_etag name, *a
266
266
  end
267
267
 
268
268
 
269
- def on_stag_end_empty(name)
269
+ def on_stag_end_empty(name, *a)
270
270
  fix_namespace
271
- @visitor.on_stag_end_empty_ns name, @namespace
271
+ @visitor.on_stag_end_empty_ns name, @namespace, *a
272
272
  h = @ns_hist.pop and @namespace.update h
273
273
  end
274
274
 
275
275
 
276
- def on_doctype(root, pubid, sysid)
276
+ def on_doctype(root, pubid, sysid, *a)
277
277
  if root.count(':') > 1 then
278
278
  ns_parse_error "qualified name `#{root}' includes `:'"
279
279
  end
280
- @visitor.on_doctype root, pubid, sysid
280
+ @visitor.on_doctype root, pubid, sysid, *a
281
281
  end
282
282
 
283
283
 
284
- def on_pi(target, pi)
284
+ def on_pi(target, pi, *a)
285
285
  if target.include? ?: then
286
286
  ns_parse_error "PI target `#{target}' includes `:'"
287
287
  end
288
- @visitor.on_pi target, pi
288
+ @visitor.on_pi target, pi, *a
289
289
  end
290
290
 
291
291
 
292
- def on_entityref(ref)
292
+ def on_entityref(ref, *a)
293
293
  if ref.include? ?: then
294
294
  ns_parse_error "entity reference `#{ref}' includes `:'"
295
295
  end
296
- @visitor.on_entityref ref
296
+ @visitor.on_entityref ref, *a
297
297
  end
298
298
 
299
299
 
300
- def on_attr_entityref(ref)
300
+ def on_attr_entityref(ref, *a)
301
301
  if ref.include? ?: then
302
302
  ns_parse_error "entity reference `#{ref}' includes `:'"
303
303
  end
304
- @visitor.on_attr_entityref ref
304
+ @visitor.on_attr_entityref ref, *a
305
305
  end
306
306
 
307
307
  end
@@ -43,7 +43,7 @@ module XMLScan
43
43
 
44
44
  private
45
45
 
46
- def on_xmldecl_version(str)
46
+ def on_xmldecl_version(str, *a)
47
47
  unless str == '1.0' then
48
48
  warning "unsupported XML version `#{str}'"
49
49
  end
@@ -51,7 +51,7 @@ module XMLScan
51
51
  end
52
52
 
53
53
 
54
- def on_xmldecl_standalone(str)
54
+ def on_xmldecl_standalone(str, *a)
55
55
  if str == 'yes' then
56
56
  @standalone = true
57
57
  elsif str == 'no' then
@@ -63,7 +63,7 @@ module XMLScan
63
63
  end
64
64
 
65
65
 
66
- def on_doctype(name, pubid, sysid)
66
+ def on_doctype(name, pubid, sysid, *a)
67
67
  if pubid and not sysid then
68
68
  parse_error "public external ID must have both public ID and system ID"
69
69
  end
@@ -71,12 +71,12 @@ module XMLScan
71
71
  end
72
72
 
73
73
 
74
- def on_prolog_space(s)
74
+ def on_prolog_space(s, *a)
75
75
  # just ignore it.
76
76
  end
77
77
 
78
78
 
79
- def on_pi(target, pi)
79
+ def on_pi(target, pi, *a)
80
80
  if target.downcase == 'xml' then
81
81
  parse_error "reserved PI target `#{target}'"
82
82
  end
@@ -114,39 +114,43 @@ module XMLScan
114
114
  #end
115
115
 
116
116
 
117
- def on_stag(name)
117
+ def on_stag(name, *a)
118
118
  @elem.push name
119
119
  @visitor.on_stag name
120
120
  @attr.clear
121
121
  end
122
122
 
123
- def on_attribute(name)
123
+ def on_attribute(name, *a)
124
124
  unless @attr.check_unique name then
125
125
  wellformed_error "doubled attribute `#{name}'"
126
126
  end
127
127
  @visitor.on_attribute name
128
128
  end
129
129
 
130
- def on_attr_value(str)
130
+ def on_attr_value(str, *a)
131
131
  str.tr! "\t\r\n", ' ' # normalize
132
132
  @visitor.on_attr_value str
133
133
  end
134
134
 
135
- def on_stag_end_empty(name)
135
+ def on_stag_end(name, *a)
136
+ @visitor.on_stag_end name, *a
137
+ end
138
+
139
+ def on_stag_end_empty(name, *a)
136
140
  # @visitor.on_stag_end name
137
141
  # @elem.pop
138
142
  # @visitor.on_etag name
139
- @visitor.on_stag_end_empty name
143
+ @visitor.on_stag_end_empty name, *a
140
144
  @elem.pop
141
145
  end
142
146
 
143
- def on_etag(name)
147
+ def on_etag(name, *a)
144
148
  last = @elem.pop
145
149
  if last == name then
146
- @visitor.on_etag name
150
+ @visitor.on_etag name, *a
147
151
  elsif last then
148
152
  wellformed_error "element type `#{name}' is not matched"
149
- @visitor.on_etag last
153
+ @visitor.on_etag last, *a
150
154
  else
151
155
  parse_error "end tag `#{name}' appears alone"
152
156
  end
@@ -0,0 +1,47 @@
1
+ # encoding: UTF-8
2
+ require 'xmlscan/parser'
3
+ require 'xmlscan/visitor'
4
+
5
+ module XMLScan
6
+ module ElementProcessor
7
+ include XMLScan::Visitor
8
+
9
+ SKIP = [:on_chardata, :on_stag, :on_etag, :on_attribute, :on_attr_entityref,
10
+ :on_attr_value, :on_start_document, :on_end_document, :on_attribute_end,
11
+ :on_stag_end, :on_stag_end_empty, :on_attr_charref, :on_attr_charref_hex]
12
+
13
+ MY_METHODS = XMLScan::Visitor.instance_methods.to_a - SKIP
14
+
15
+ def initialize(opts={}, mod=nil)
16
+ (mod ? MY_METHODS - mod.instance_methods : MY_METHODS).each do |i|
17
+ self.class.class_eval %{def #{i}(d, *a) d&&(@out << d) end}, __FILE__, __LINE__
18
+ end
19
+ self.class.send :include, mod
20
+
21
+ @element = opts[:element] || raise("need an element")
22
+ @key = opts[:key] || raise("need a key")
23
+ @extras = (ex = opts[:extras]) ? ex.map(&:to_sym) : []
24
+
25
+ @pairs = [] # output [name, content, value] * 1 or more
26
+ @context = '' # current key(name) of the element (card)
27
+ @stack = [] # stack of containing context cards
28
+ @out = [] # current output for name(card)
29
+ @parser = XMLScan::XMLParser.new(self)
30
+ self
31
+ end
32
+
33
+ end
34
+
35
+ class XMLProcessor
36
+ include ElementProcessor
37
+
38
+ def self.process(file, opts={}, mod=nil)
39
+ raise "Not readable #{file.inspect}" unless IO===( io =
40
+ IO===file ? file : open(file) )
41
+ visitor = new(opts, mod)
42
+ visitor.parser.parse(io)
43
+ visitor.pairs
44
+ end
45
+ end
46
+
47
+ end
@@ -122,20 +122,29 @@ module XMLScan
122
122
  self
123
123
  end
124
124
 
125
-
125
+ =begin
126
+ Managing source in a private array.
127
+ * tag oriented (?< and ?> are the key tokens)
128
+ * ?> that aren't followed by another ?< or ?> are stripped in splitting
129
+ =end
126
130
  def get
127
131
  pop or
128
132
  unless @eof then
129
133
  last = @last
130
134
  begin
131
- src = @src.gets
132
- unless src then
135
+ unless chunk = @src.gets then
133
136
  @eof = true
134
- unshift last
135
- last = nil
136
- break
137
+ @last = nil
138
+ return last
139
+ #unshift last # to be popped after reverse!
140
+ #last = nil
141
+ #break
137
142
  end
138
- a = src.split(/(?=<|>[<>])|>/, -1)
143
+ # zero-width (positive) lookahead: < or >< or >>
144
+ # so don't consume those (but split leaving them always at the
145
+ # end of chunks)
146
+ # consume (>) and split on >
147
+ a = chunk.split(/(?=<|>[<>])|>/, -1)
139
148
  if last then
140
149
  unless /\A[<>]/ =~ a.first then
141
150
  a[0] = last << (a.first || '')
@@ -143,6 +152,7 @@ module XMLScan
143
152
  push last
144
153
  end
145
154
  end
155
+ raise "size #{size}" if size > 1
146
156
  concat a
147
157
  last = pop
148
158
  end while empty?
@@ -223,7 +233,7 @@ module XMLScan
223
233
  last.push @last.inspect
224
234
  end
225
235
  a.push '#eof' if @eof
226
- "((#{a.join(' ')}) (#{last.join(' ')}) . #{source.inspect})"
236
+ "((#{a*' '}) l(#{last*' '}) . #{source.inspect})"
227
237
  end
228
238
 
229
239
  def each
@@ -354,72 +364,72 @@ module XMLScan
354
364
  end
355
365
  end
356
366
 
357
- def on_xmldecl_version(str)
358
- @visitor.on_xmldecl_version str
367
+ def on_xmldecl_version(str, *a)
368
+ @visitor.on_xmldecl_version str, *a
359
369
  end
360
370
 
361
- def on_xmldecl_encoding(str)
362
- @visitor.on_xmldecl_encoding str
371
+ def on_xmldecl_encoding(str, *a)
372
+ @visitor.on_xmldecl_encoding str, *a
363
373
  end
364
374
 
365
- def on_xmldecl_standalone(str)
366
- @visitor.on_xmldecl_standalone str
375
+ def on_xmldecl_standalone(str, *a)
376
+ @visitor.on_xmldecl_standalone str, *a
367
377
  end
368
378
 
369
- def on_xmldecl_other(name, value)
370
- @visitor.on_xmldecl_other name, value
379
+ def on_xmldecl_other(name, value, *a)
380
+ @visitor.on_xmldecl_other name, value, *a
371
381
  end
372
382
 
373
- def on_xmldecl_end
374
- @visitor.on_xmldecl_end
383
+ def on_xmldecl_end(*a)
384
+ @visitor.on_xmldecl_end *a
375
385
  end
376
386
 
377
- def on_doctype(root, pubid, sysid)
378
- @visitor.on_doctype root, pubid, sysid
387
+ def on_doctype(root, pubid, sysid, *a)
388
+ @visitor.on_doctype root, pubid, sysid, *a
379
389
  end
380
390
 
381
- def on_prolog_space(str)
382
- @visitor.on_prolog_space str
391
+ def on_prolog_space(str, *a)
392
+ @visitor.on_prolog_space str, *a
383
393
  end
384
394
 
385
- def on_comment(str)
386
- @visitor.on_comment str
395
+ def on_comment(str, *a)
396
+ @visitor.on_comment str, *a
387
397
  end
388
398
 
389
- def on_pi(target, pi)
390
- @visitor.on_pi target, pi
399
+ def on_pi(target, pi, *a)
400
+ @visitor.on_pi target, pi, *a
391
401
  end
392
402
 
393
- def on_chardata(str)
394
- @visitor.on_chardata str
403
+ def on_chardata(str, *a)
404
+ @visitor.on_chardata str, *a
395
405
  end
396
406
 
397
- def on_cdata(str)
398
- @visitor.on_cdata str
407
+ def on_cdata(str, *a)
408
+ @visitor.on_cdata str, *a
399
409
  end
400
410
 
401
- def on_etag(name)
402
- @visitor.on_etag name
411
+ def on_etag(name, *a)
412
+ @visitor.on_etag name, *a
403
413
  end
404
414
 
405
- def on_entityref(ref)
406
- @visitor.on_entityref ref
415
+ def on_entityref(ref, *a)
416
+ @visitor.on_entityref ref, *a
407
417
  end
408
418
 
409
- def on_charref(code)
410
- @visitor.on_charref code
419
+ def on_charref(code, *a)
420
+ @visitor.on_charref code, *a
411
421
  end
412
422
 
413
- def on_charref_hex(code)
414
- @visitor.on_charref_hex code
423
+ def on_charref_hex(code, *a)
424
+ @visitor.on_charref_hex code, *a
415
425
  end
416
426
 
417
- def on_start_document
418
- @visitor.on_start_document
427
+ def on_start_document(*a)
428
+ @visitor.on_start_document *a
419
429
  end
420
430
 
421
- def on_end_document
422
- @visitor.on_end_document
431
+ def on_end_document(*a)
432
+ @visitor.on_end_document *a
423
433
  end
424
434
 
425
435
 
@@ -444,50 +454,51 @@ module XMLScan
444
454
  #
445
455
  # A: on_chardata ('HOGE')
446
456
 
447
- def on_stag(name)
448
- @visitor.on_stag name
457
+ def on_stag(name, *a)
458
+ @visitor.on_stag name, *a
449
459
  end
450
460
 
451
- def on_attribute(name)
452
- @visitor.on_attribute name
461
+ def on_attribute(name, *a)
462
+ @visitor.on_attribute name, *a
453
463
  end
454
464
 
455
- def on_attr_value(str)
456
- @visitor.on_attr_value str
465
+ def on_attr_value(str, *a)
466
+ @visitor.on_attr_value str, *a
457
467
  end
458
468
 
459
- def on_attr_entityref(ref)
460
- @visitor.on_attr_entityref ref
469
+ def on_attr_entityref(ref, *a)
470
+ @visitor.on_attr_entityref ref, *a
461
471
  end
462
472
 
463
- def on_attr_charref(code)
464
- @visitor.on_attr_charref code
473
+ def on_attr_charref(code, *a)
474
+ @visitor.on_attr_charref code, *a
465
475
  end
466
476
 
467
- def on_attr_charref_hex(code)
468
- @visitor.on_attr_charref_hex code
477
+ def on_attr_charref_hex(code, *a)
478
+ @visitor.on_attr_charref_hex code, *a
469
479
  end
470
480
 
471
- def on_attribute_end(name)
472
- @visitor.on_attribute_end name
481
+ def on_attribute_end(name, *a)
482
+ @visitor.on_attribute_end name, *a, *a
473
483
  end
474
484
 
475
- def on_stag_end_empty(name)
476
- @visitor.on_stag_end_empty name
485
+ def on_stag_end_empty(name, *a)
486
+ @visitor.on_stag_end_empty name, *a
477
487
  end
478
488
 
479
- def on_stag_end(name)
480
- @visitor.on_stag_end name
489
+ def on_stag_end(name, *a)
490
+ #STDERR << "ose #{name}, #{a.inspect}\n"
491
+ @visitor.on_stag_end name, *a
481
492
  end
482
493
 
483
494
 
495
+ S_OPT_EXAMPLE = "".encode(::Encoding::WINDOWS_31J)
496
+ E_OPT_EXAMPLE = "".encode(::Encoding::EUCJP)
484
497
 
485
498
  private
486
499
 
487
500
  module OptRegexp
488
501
  UTFSTR = "é"
489
- S_OPT_EXAMPLE = "".encode Encoding.find('Windows-31J')
490
- E_OPT_EXAMPLE = "".encode Encoding.find('EUC-JP')
491
502
 
492
503
  RE_ENCODINGS = {
493
504
  :n=>/e/n.encoding,
@@ -525,6 +536,7 @@ module XMLScan
525
536
  else
526
537
  s = $`
527
538
  on_chardata s unless s.empty?
539
+ #orig = $'.sub(/(?=;).*$/,'')
528
540
  ref = nil
529
541
  $'.split('&', -1).each { |s|
530
542
  unless /(?!\A);|(?=[ \t\r\n])/ =~ s and not $&.empty? then
@@ -533,18 +545,18 @@ module XMLScan
533
545
  parse_error "reference to `#{ref}' doesn't end with `;'"
534
546
  else
535
547
  parse_error "`&' is not used for entity/character references"
536
- on_chardata('&' << s)
548
+ on_chardata '&'+s
537
549
  next
538
550
  end
539
551
  end
540
- ref = $`
552
+ orig = ?& + (ref = $`) + ?;
541
553
  s = $'
542
554
  if /\A[^#]/ =~ ref then
543
- on_entityref ref
555
+ on_entityref ref, orig
544
556
  elsif /\A#(\d+)\z/ =~ ref then
545
- on_charref $1.to_i
557
+ on_charref $1.to_i, orig
546
558
  elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
547
- on_charref_hex $1.hex
559
+ on_charref_hex $1.hex, orig
548
560
  else
549
561
  parse_error "invalid character reference `#{ref}'"
550
562
  end
@@ -558,8 +570,9 @@ module XMLScan
558
570
  end
559
571
 
560
572
 
561
- def scan_attvalue(s) # almostly copy & paste from scan_chardata
573
+ def scan_attr_value(s) # almostly copy & paste from scan_chardata
562
574
  unless /&/ =~ s then
575
+ #STDERR << "no& attr_val #{s.inspect}, #{caller*"\n"}\n" if s == ?>
563
576
  on_attr_value s
564
577
  else
565
578
  s = $`
@@ -576,14 +589,14 @@ module XMLScan
576
589
  next
577
590
  end
578
591
  end
579
- ref = $`
592
+ orig = ?& + (ref = $`) + ?;
580
593
  s = $'
581
594
  if /\A[^#]/ =~ ref then
582
- on_attr_entityref ref
595
+ on_attr_entityref ref, orig
583
596
  elsif /\A#(\d+)\z/ =~ ref then
584
- on_attr_charref $1.to_i
597
+ on_attr_charref $1.to_i, orig
585
598
  elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then
586
- on_attr_charref_hex $1.hex
599
+ on_attr_charref_hex $1.hex, orig
587
600
  else
588
601
  parse_error "invalid character reference `#{ref}'"
589
602
  end
@@ -682,6 +695,7 @@ module XMLScan
682
695
 
683
696
 
684
697
  def scan_etag(s)
698
+ orig="#{s}>"
685
699
  s[0,2] = '' # remove '</'
686
700
  if s.empty? then
687
701
  if @src.close_tag then # </>
@@ -689,14 +703,14 @@ module XMLScan
689
703
  else # </< or </[EOF]
690
704
  parse_error "parse error at `</'"
691
705
  s << '>' if @src.close_tag
692
- return on_chardata('</' << s)
706
+ return on_chardata '</' << s
693
707
  end
694
708
  elsif /[ \t\n\r]+/ =~ s then
695
709
  s1, s2 = $`, $'
696
710
  if s1.empty? then # </ tag
697
711
  parse_error "parse error at `</'"
698
712
  s << '>' if @src.close_tag
699
- return on_chardata('</' + s)
713
+ return on_chardata '</' + s
700
714
  elsif not s2.empty? then # </ta g
701
715
  parse_error "illegal whitespace is found within end tag `#{s1}'"
702
716
  while @src.get_tag
@@ -705,7 +719,7 @@ module XMLScan
705
719
  s = s1
706
720
  end
707
721
  found_unclosed_etag s unless @src.close_tag # </tag< or </tag[EOF]
708
- on_etag s
722
+ on_etag s, orig
709
723
  end
710
724
 
711
725
 
@@ -745,6 +759,8 @@ module XMLScan
745
759
 
746
760
 
747
761
  def scan_stag(s)
762
+ hash = {}
763
+ orig = [s.dup]
748
764
  unless /(?=[\/ \t\n\r='"])/ =~ s then
749
765
  name = s
750
766
  name[0,1] = '' # remove `<'
@@ -753,54 +769,65 @@ module XMLScan
753
769
  return found_empty_stag
754
770
  else # << or <[EOF]
755
771
  parse_error "parse error at `<'"
756
- return on_chardata('<')
772
+ return on_chardata '<'
757
773
  end
758
774
  end
759
775
  on_stag name
760
776
  found_unclosed_stag name unless @src.close_tag
761
- on_stag_end name
777
+ on_stag_end name, orig*''+?>, {}
762
778
  else
779
+ k = nil
763
780
  name = $`
764
781
  s = $'
765
782
  name[0,1] = '' # remove `<'
766
783
  if name.empty? then # `< tag' or `<=`
767
784
  parse_error "parse error at `<'"
768
785
  s << '>' if @src.close_tag
769
- return on_chardata('<' << s)
786
+ return on_chardata '<' << s
770
787
  end
771
788
  on_stag name
772
789
  emptyelem = false
773
- key,val,error,qmark,c = nil
774
790
  begin
775
791
  continue = false
776
792
  s.scan(/[ \t\n\r]([^= \t\n\r\/'"]+)[ \t\n\r]*=[ \t\n\r]*('[^']*'?|"[^"]*"?)|\/\z|([^ \t\n\r][\S\s]*)/
777
793
  ) { |key,val,error|
778
- if key then # key="value"
794
+ orig_val = []
795
+ if key then
779
796
  on_attribute key
797
+ k=key
798
+ orig_val << val
780
799
  qmark = val.slice!(0,1)
781
800
  if val[-1] == qmark[0] then
782
801
  val.chop!
783
- scan_attvalue val unless val.empty?
802
+ scan_attr_value val unless val.empty?
784
803
  else
785
- scan_attvalue val unless val.empty?
804
+ scan_attr_value val unless val.empty?
786
805
  begin
787
806
  s = @src.get
807
+ #STDERR << "get some more? #{s.inspect}, #{orig.inspect}\n"
788
808
  unless s then
789
809
  parse_error "unterminated attribute `#{key}' meets EOF"
790
810
  break
791
811
  end
812
+ orig << s.dup
792
813
  c = s[0]
793
814
  val, s = s.split(qmark, 2)
815
+ orig_val << val
794
816
  if c == ?< then
795
817
  wellformed_error "`<' is found in attribute `#{key}'"
796
818
  elsif c != ?> then
797
- scan_attvalue '>'
819
+ #STDERR << "close in quote? #{c.inspect}, #{@src.tag_start?}, #{@src.tag_end?}, #{s.inspect}, #{val.inspect}, #{orig.inspect}, #{orig_val.inspect}\n"
820
+ orig_val[-1,0] = orig[-1,0] = ?> # if @src.tag_start?
821
+ scan_attr_value ?>
798
822
  end
799
- scan_attvalue val if c
823
+ scan_attr_value val if c
800
824
  end until s
801
825
  continue = s # if eof then continue is false, else true.
802
826
  end
803
- on_attribute_end key
827
+ #STDERR << "attr:#{k}, #{orig_val}\n"
828
+ hash[k] = orig_val*''
829
+ #STDERR << "attr end #{hash.inspect}, #{k}, #{orig_val}\n"
830
+ on_attribute_end key #, orig_val*''
804
831
  elsif error then
805
832
  continue = s = found_stag_error(error)
806
833
  else
@@ -816,9 +843,11 @@ module XMLScan
816
843
  end
817
844
  end
818
845
  if emptyelem then
819
- on_stag_end_empty name
846
+ on_stag_end_empty name, orig*''+?>, hash
820
847
  else
821
- on_stag_end name
848
+ #STDERR << "on stag end #{ name}, \"<#{name}#{s}>\", #{hash.inspect}\n"
849
+ on_stag_end name, orig*''+?>, hash
850
+ #on_stag_end name, "<#{name}#{s}>", hash
822
851
  end
823
852
  end
824
853
  end
@@ -1067,10 +1096,10 @@ module XMLScan
1067
1096
 
1068
1097
 
1069
1098
  def scan_document
1070
- on_start_document
1099
+ on_start_document ''
1071
1100
  @src.prepare
1072
1101
  scan_prolog @src.get
1073
- on_end_document
1102
+ on_end_document ''
1074
1103
  end
1075
1104
 
1076
1105
 
@@ -9,15 +9,9 @@
9
9
 
10
10
  module XMLScan
11
11
 
12
- # The version like 'X.X.0' (TENNY is 0) means that this is an unstable
13
- # release. Incompatible changes will be applied to this version
14
- # without special notice. This version should be distributed as a
15
- # snapshot only.
16
- #
17
- # TENNY which is larger than 1 (e.g. 'X.X.1' or 'X.X.2') means this
18
- # release is a stable release.
19
-
20
- VERSION = '0.2.3'
21
- RELEASE_DATE = '2003-05-02'
12
+ GEMNAME = 'xmlscan'
13
+ VERSION_FILE = File.expand_path('../../VERSION', File.dirname(__FILE__))
14
+ VERSION = open(VERSION_FILE).to_a*''.chop
15
+ RELEASE_DATE = open(VERSION_FILE).mtime.strftime('%Y-%m-%d')
22
16
 
23
17
  end
@@ -54,88 +54,88 @@ module XMLScan
54
54
  def warning(msg)
55
55
  end
56
56
 
57
- def on_xmldecl
57
+ def on_xmldecl(*a)
58
58
  end
59
59
 
60
- def on_xmldecl_key(key, str)
60
+ def on_xmldecl_key(key, str, *a)
61
61
  end
62
62
 
63
- def on_xmldecl_version(str)
63
+ def on_xmldecl_version(str, *a)
64
64
  end
65
65
 
66
- def on_xmldecl_encoding(str)
66
+ def on_xmldecl_encoding(str, *a)
67
67
  end
68
68
 
69
- def on_xmldecl_standalone(str)
69
+ def on_xmldecl_standalone(str, *a)
70
70
  end
71
71
 
72
- def on_xmldecl_other(name, value)
72
+ def on_xmldecl_other(name, value, *a)
73
73
  end
74
74
 
75
- def on_xmldecl_end
75
+ def on_xmldecl_end(*a)
76
76
  end
77
77
 
78
- def on_doctype(root, pubid, sysid)
78
+ def on_doctype(root, pubid, sysid, *a)
79
79
  end
80
80
 
81
- def on_prolog_space(str)
81
+ def on_prolog_space(str, *a)
82
82
  end
83
83
 
84
- def on_comment(str)
84
+ def on_comment(str, *a)
85
85
  end
86
86
 
87
- def on_pi(target, pi)
87
+ def on_pi(target, pi, *a)
88
88
  end
89
89
 
90
- def on_chardata(str)
90
+ def on_chardata(str, *a)
91
91
  end
92
92
 
93
- def on_cdata(str)
93
+ def on_cdata(str, *a)
94
94
  end
95
95
 
96
- def on_etag(name)
96
+ def on_etag(name, *a)
97
97
  end
98
98
 
99
- def on_entityref(ref)
99
+ def on_entityref(ref, *a)
100
100
  end
101
101
 
102
- def on_charref(code)
102
+ def on_charref(code, *a)
103
103
  end
104
104
 
105
- def on_charref_hex(code)
105
+ def on_charref_hex(code, *a)
106
106
  end
107
107
 
108
- def on_start_document
108
+ def on_start_document(*a)
109
109
  end
110
110
 
111
- def on_end_document
111
+ def on_end_document(*a)
112
112
  end
113
113
 
114
- def on_stag(name)
114
+ def on_stag(name, *a)
115
115
  end
116
116
 
117
- def on_attribute(name)
117
+ def on_attribute(name, *a)
118
118
  end
119
119
 
120
- def on_attr_value(str)
120
+ def on_attr_value(str, *a)
121
121
  end
122
122
 
123
- def on_attr_entityref(ref)
123
+ def on_attr_entityref(ref, *a)
124
124
  end
125
125
 
126
- def on_attr_charref(code)
126
+ def on_attr_charref(code, *a)
127
127
  end
128
128
 
129
- def on_attr_charref_hex(code)
129
+ def on_attr_charref_hex(code, *a)
130
130
  end
131
131
 
132
- def on_attribute_end(name)
132
+ def on_attribute_end(name, *a)
133
133
  end
134
134
 
135
- def on_stag_end_empty(name)
135
+ def on_stag_end_empty(name, *a)
136
136
  end
137
137
 
138
- def on_stag_end(name)
138
+ def on_stag_end(name, *a)
139
139
  end
140
140
 
141
141
  end
@@ -146,13 +146,15 @@ module XMLScan
146
146
  include Visitor
147
147
 
148
148
  def initialize(visitor)
149
+ #STDERR << "new Decoration #{visitor}\n"
149
150
  @visitor = visitor
150
151
  end
151
152
 
152
153
  Visitor.instance_methods.each { |i|
154
+ #STDERR << "#{i} \#{args.inspect}\\n"
153
155
  module_eval <<-END, __FILE__, __LINE__ + 1
154
156
  def #{i}(*args)
155
- @visitor.#{i}(*args)
157
+ @visitor&&@visitor.#{i}(*args)
156
158
  end
157
159
  END
158
160
  }
@@ -115,95 +115,95 @@ module XMLScan
115
115
  end
116
116
 
117
117
 
118
- def on_xmldecl_version(str)
118
+ def on_xmldecl_version(str, *a)
119
119
  check_valid_version str
120
120
  super
121
121
  end
122
122
 
123
- def on_xmldecl_encoding(str)
123
+ def on_xmldecl_encoding(str, *a)
124
124
  check_valid_encoding str
125
125
  super
126
126
  end
127
127
 
128
- def on_xmldecl_standalone(str)
128
+ def on_xmldecl_standalone(str, *a)
129
129
  check_valid_chardata str
130
130
  super
131
131
  end
132
132
 
133
- def on_doctype(root, pubid, sysid)
133
+ def on_doctype(root, pubid, sysid, *a)
134
134
  check_valid_name root
135
135
  check_valid_pubid pubid if pubid
136
136
  check_valid_chardata sysid if sysid
137
137
  super
138
138
  end
139
139
 
140
- def on_comment(str)
140
+ def on_comment(str, *a)
141
141
  check_valid_chardata str
142
142
  super
143
143
  end
144
144
 
145
- def on_pi(target, pi)
145
+ def on_pi(target, pi, *a)
146
146
  check_valid_name target
147
147
  check_valid_chardata pi
148
148
  super
149
149
  end
150
150
 
151
- def on_chardata(str)
151
+ def on_chardata(str, *a)
152
152
  check_valid_chardata str
153
153
  super
154
154
  end
155
155
 
156
- def on_cdata(str)
156
+ def on_cdata(str, *a)
157
157
  check_valid_chardata str
158
158
  super
159
159
  end
160
160
 
161
- def on_etag(name)
161
+ def on_etag(name, *a)
162
162
  check_valid_name name
163
163
  super
164
164
  end
165
165
 
166
- def on_entityref(ref)
166
+ def on_entityref(ref, *a)
167
167
  check_valid_name ref
168
168
  super
169
169
  end
170
170
 
171
- def on_charref(code)
171
+ def on_charref(code, *a)
172
172
  check_valid_char code
173
173
  super
174
174
  end
175
175
 
176
- def on_charref_hex(code)
176
+ def on_charref_hex(code, *a)
177
177
  check_valid_char code
178
178
  super
179
179
  end
180
180
 
181
- def on_stag(name)
181
+ def on_stag(name, *a)
182
182
  check_valid_name name
183
183
  super
184
184
  end
185
185
 
186
- def on_attribute(name)
186
+ def on_attribute(name, *a)
187
187
  check_valid_name name
188
188
  super
189
189
  end
190
190
 
191
- def on_attr_value(str)
191
+ def on_attr_value(str, *a)
192
192
  check_valid_chardata str
193
193
  super
194
194
  end
195
195
 
196
- def on_attr_entityref(ref)
196
+ def on_attr_entityref(ref, *a)
197
197
  check_valid_name ref
198
198
  super
199
199
  end
200
200
 
201
- def on_attr_charref(code)
201
+ def on_attr_charref(code, *a)
202
202
  check_valid_char code
203
203
  super
204
204
  end
205
205
 
206
- def on_attr_charref_hex(code)
206
+ def on_attr_charref_hex(code, *a)
207
207
  check_valid_char code
208
208
  super
209
209
  end
data/xmlcard.rb ADDED
@@ -0,0 +1,48 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'xmlscan/processor'
4
+
5
+ # need to make these into supplied blocks somehow
6
+ module CustomProcessing
7
+ def on_chardata(s) @out << s end
8
+ def on_stag_end(name, s, h, *a)
9
+ if name.to_sym == @element
10
+ # starting a new context, first output our substitute string
11
+ key= h&&h[@key.to_s]||'*no-name*'
12
+ sub = h['transclude'] || "{{#{key}}}"
13
+ @out << sub
14
+ # then push the current context and initialize this one
15
+ @stack.push([@context, @out, *@ex])
16
+ @context = key; @out = []
17
+ @ex = @extras.map {|e| h[e]}
18
+ else @out << s end # pass through tags we aren't processing
19
+ end
20
+
21
+ def on_etag(name, s=nil)
22
+ if name.to_sym == @element
23
+ # output a card (name, content, type)
24
+ @pairs << [@context, @out, @stack[-1][0], *@ex]
25
+ # restore previous context from stack
26
+ last = @stack.pop
27
+ @context, @out, @ex = last.shift, last.shift, *last
28
+ else @out << s end
29
+ end
30
+
31
+ def on_stag_empty_end(name, s=nil, h={}, *a)
32
+ if name.to_sym == @element
33
+ # I don't think we have this case, but it is simple to add later
34
+ STDERR << "empty card ???: #{name}, #{s}, #{h.inspect}\n"
35
+ else @out << s end
36
+ end
37
+
38
+ attr_reader :pairs, :parser
39
+ end
40
+
41
+ ARGV.each do |a|
42
+ pairs = XMLScan::XMLProcessor.process(a, {:key=>:name, :element=>:card, :extras=>[:type]}, CustomProcessing)
43
+ STDOUT << "Result\n"
44
+ STDOUT << pairs.map do |p| n,o,c,t = p
45
+ "#{c&&c.size>0&&"#{c}::"||''}#{n}#{t&&"[#{t}]"}=>#{o*''}"
46
+ end * "\n"
47
+ STDOUT << "\nDone\n"
48
+ end
metadata CHANGED
@@ -1,19 +1,19 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xmlscan
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
5
- prerelease:
4
+ version: 0.3.0prea
5
+ prerelease: 5
6
6
  platform: ruby
7
7
  authors:
8
8
  - UENO Katsuhiro <katsu@blue.sky.or.jp>
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-13 00:00:00.000000000 Z
12
+ date: 2012-02-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &8077620 !ruby/object:Gem::Requirement
16
+ requirement: &9220620 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 2.8.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *8077620
24
+ version_requirements: *9220620
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rdoc
27
- requirement: &8076660 !ruby/object:Gem::Requirement
27
+ requirement: &9220120 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '3.12'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *8076660
35
+ version_requirements: *9220120
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: bundler
38
- requirement: &8075620 !ruby/object:Gem::Requirement
38
+ requirement: &9219620 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 1.0.0
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *8075620
46
+ version_requirements: *9219620
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: jeweler
49
- requirement: &8074720 !ruby/object:Gem::Requirement
49
+ requirement: &9219060 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: 1.8.3
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *8074720
57
+ version_requirements: *9219060
58
58
  description: The fastest XML parser written in 100% pure Ruby.
59
59
  email: gerryg@inbox.com
60
60
  executables: []
@@ -65,6 +65,7 @@ files:
65
65
  - ChangeLog
66
66
  - Gemfile
67
67
  - Gemfile.lock
68
+ - README.processor
68
69
  - README.rdoc
69
70
  - Rakefile
70
71
  - THANKS
@@ -73,11 +74,13 @@ files:
73
74
  - lib/xmlscan/htmlscan.rb
74
75
  - lib/xmlscan/namespace.rb
75
76
  - lib/xmlscan/parser.rb
77
+ - lib/xmlscan/processor.rb
76
78
  - lib/xmlscan/scanner.rb
77
79
  - lib/xmlscan/version.rb
78
80
  - lib/xmlscan/visitor.rb
79
81
  - lib/xmlscan/xmlchar.rb
80
82
  - test.rb
83
+ - xmlcard.rb
81
84
  homepage: http://github.com/GerryG/xmlformat/
82
85
  licenses:
83
86
  - MIT
@@ -97,13 +100,13 @@ required_ruby_version: !ruby/object:Gem::Requirement
97
100
  version: '0'
98
101
  segments:
99
102
  - 0
100
- hash: 3268123461909302440
103
+ hash: -1617079850723236327
101
104
  required_rubygems_version: !ruby/object:Gem::Requirement
102
105
  none: false
103
106
  requirements:
104
- - - ! '>='
107
+ - - ! '>'
105
108
  - !ruby/object:Gem::Version
106
- version: '0'
109
+ version: 1.3.1
107
110
  requirements: []
108
111
  rubyforge_project:
109
112
  rubygems_version: 1.8.15