hpricot 0.6.164 → 0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -0
- data/Rakefile +31 -33
- data/ext/fast_xs/fast_xs.c +11 -5
- data/ext/hpricot_scan/HpricotScanService.java +10 -6
- data/ext/hpricot_scan/hpricot_css.c +3502 -0
- data/ext/hpricot_scan/hpricot_css.rl +115 -0
- data/ext/hpricot_scan/hpricot_scan.c +1032 -589
- data/ext/hpricot_scan/hpricot_scan.java.rl +5 -1
- data/ext/hpricot_scan/hpricot_scan.rl +493 -50
- data/ext/hpricot_scan/test.rb +1 -2
- data/lib/hpricot/builder.rb +21 -20
- data/lib/hpricot/elements.rb +12 -12
- data/lib/hpricot/htmlinfo.rb +19 -0
- data/lib/hpricot/inspect.rb +27 -31
- data/lib/hpricot/modules.rb +2 -1
- data/lib/hpricot/parse.rb +8 -268
- data/lib/hpricot/tag.rb +65 -99
- data/lib/hpricot/traverse.rb +20 -14
- data/test/files/boingboing.html +1 -1
- data/test/nokogiri-bench.rb +64 -0
- data/test/test_builder.rb +4 -4
- data/test/test_parser.rb +36 -13
- data/test/test_preserved.rb +6 -2
- metadata +51 -51
- data/ext/hpricot_scan/hpricot_gram.c +0 -882
- data/ext/hpricot_scan/hpricot_gram.h +0 -9
data/ext/hpricot_scan/test.rb
CHANGED
data/lib/hpricot/builder.rb
CHANGED
@@ -1,20 +1,13 @@
|
|
1
1
|
require 'hpricot/tags'
|
2
2
|
require 'fast_xs'
|
3
3
|
require 'hpricot/blankslate'
|
4
|
+
require 'hpricot/htmlinfo'
|
4
5
|
|
5
6
|
module Hpricot
|
6
|
-
PREDEFINED = {
|
7
|
-
34 => '"', # quotation mark
|
8
|
-
38 => '&', # ampersand
|
9
|
-
60 => '<', # left angle bracket
|
10
|
-
62 => '>' # right angle bracket
|
11
|
-
}
|
12
|
-
PREDEFINED_U = PREDEFINED.inject({}) { |hsh, (k, v)| hsh[v] = k; hsh }
|
13
|
-
|
14
7
|
# XML unescape
|
15
8
|
def self.uxs(str)
|
16
9
|
str.to_s.
|
17
|
-
gsub(
|
10
|
+
gsub(/\&(\w+);/) { [NamedCharacters[$1] || ??].pack("U*") }.
|
18
11
|
gsub(/\&\#(\d+);/) { [$1.to_i].pack("U*") }
|
19
12
|
end
|
20
13
|
|
@@ -23,7 +16,7 @@ module Hpricot
|
|
23
16
|
assigns.each do |k, v|
|
24
17
|
ele.instance_variable_set("@#{k}", v)
|
25
18
|
end
|
26
|
-
ele.instance_eval
|
19
|
+
ele.instance_eval(&blk)
|
27
20
|
ele
|
28
21
|
end
|
29
22
|
|
@@ -45,14 +38,21 @@ module Hpricot
|
|
45
38
|
@@default[option] = value
|
46
39
|
end
|
47
40
|
|
41
|
+
def add_child ele
|
42
|
+
ele.parent = self
|
43
|
+
self.children ||= []
|
44
|
+
self.children << ele
|
45
|
+
ele
|
46
|
+
end
|
47
|
+
|
48
48
|
# Write a +string+ to the HTML stream, making sure to escape it.
|
49
49
|
def text!(string)
|
50
|
-
|
50
|
+
add_child Text.new(string.fast_xs)
|
51
51
|
end
|
52
52
|
|
53
53
|
# Write a +string+ to the HTML stream without escaping it.
|
54
54
|
def text(string)
|
55
|
-
|
55
|
+
add_child Text.new(string)
|
56
56
|
nil
|
57
57
|
end
|
58
58
|
alias_method :<<, :text
|
@@ -67,11 +67,11 @@ module Hpricot
|
|
67
67
|
raise InvalidXhtmlError, "no element `#{tag}' for #{tagset.doctype}"
|
68
68
|
elsif args.last.respond_to?(:to_hash)
|
69
69
|
attrs = args.last.to_hash
|
70
|
-
|
70
|
+
|
71
71
|
if @tagset.forms.include?(tag) and attrs[:id]
|
72
72
|
attrs[:name] ||= attrs[:id]
|
73
73
|
end
|
74
|
-
|
74
|
+
|
75
75
|
attrs.each do |k, v|
|
76
76
|
atname = k.to_s.downcase.intern
|
77
77
|
unless k =~ /:/ or @tagset.tagset[tag].include? atname
|
@@ -105,14 +105,15 @@ module Hpricot
|
|
105
105
|
end
|
106
106
|
|
107
107
|
# create the element itself
|
108
|
-
|
108
|
+
tag = tag.to_s
|
109
|
+
f = Elem.new(tag, attrs, childs, ETag.new(tag))
|
109
110
|
|
110
111
|
# build children from the block
|
111
112
|
if block
|
112
113
|
build(f, &block)
|
113
114
|
end
|
114
115
|
|
115
|
-
|
116
|
+
add_child f
|
116
117
|
f
|
117
118
|
end
|
118
119
|
|
@@ -145,11 +146,11 @@ module Hpricot
|
|
145
146
|
end
|
146
147
|
|
147
148
|
def doctype(target, pub, sys)
|
148
|
-
|
149
|
+
add_child DocType.new(target, pub, sys)
|
149
150
|
end
|
150
151
|
|
151
152
|
remove_method :head
|
152
|
-
|
153
|
+
|
153
154
|
# Builds a head tag. Adds a <tt>meta</tt> tag inside with Content-Type
|
154
155
|
# set to <tt>text/html; charset=utf-8</tt>.
|
155
156
|
def head(*args, &block)
|
@@ -193,7 +194,7 @@ module Hpricot
|
|
193
194
|
def initialize(builder, sym)
|
194
195
|
@builder, @sym, @attrs = builder, sym, {}
|
195
196
|
end
|
196
|
-
|
197
|
+
|
197
198
|
# Adds attributes to an element. Bang methods set the :id attribute.
|
198
199
|
# Other methods add to the :class attribute.
|
199
200
|
def method_missing(id_or_class, *args, &block)
|
@@ -207,7 +208,7 @@ module Hpricot
|
|
207
208
|
args.push(@attrs)
|
208
209
|
return @builder.tag!(@sym, *args, &block)
|
209
210
|
end
|
210
|
-
|
211
|
+
|
211
212
|
return self
|
212
213
|
end
|
213
214
|
|
data/lib/hpricot/elements.rb
CHANGED
@@ -168,7 +168,7 @@ module Hpricot
|
|
168
168
|
end
|
169
169
|
x.parent.replace_child(x, wrap)
|
170
170
|
nest = nest.children.first until nest.empty?
|
171
|
-
nest.html(
|
171
|
+
nest.html([x])
|
172
172
|
end
|
173
173
|
end
|
174
174
|
|
@@ -275,7 +275,7 @@ module Hpricot
|
|
275
275
|
expr = $'
|
276
276
|
m.compact!
|
277
277
|
if m[0] == '@'
|
278
|
-
m[0] = "@#{m.slice!(2,1)}"
|
278
|
+
m[0] = "@#{m.slice!(2,1).join}"
|
279
279
|
end
|
280
280
|
|
281
281
|
if m[0] == '[' && m[1] =~ /^\d+$/
|
@@ -300,10 +300,10 @@ module Hpricot
|
|
300
300
|
args = m[1..-1]
|
301
301
|
end
|
302
302
|
end
|
303
|
-
|
303
|
+
args << -1
|
304
304
|
nodes = Elements[*nodes.find_all do |x|
|
305
|
-
|
306
|
-
x.send(meth, *
|
305
|
+
args[-1] += 1
|
306
|
+
x.send(meth, *args) ? truth : !truth
|
307
307
|
end]
|
308
308
|
end
|
309
309
|
end
|
@@ -422,7 +422,7 @@ module Hpricot
|
|
422
422
|
case arg
|
423
423
|
when 'even'; (parent.containers.index(self) + 1) % 2 == 0
|
424
424
|
when 'odd'; (parent.containers.index(self) + 1) % 2 == 1
|
425
|
-
else self == (parent.containers[arg.to_i
|
425
|
+
else self == (parent.containers[arg.to_i - 1])
|
426
426
|
end
|
427
427
|
end
|
428
428
|
|
@@ -446,23 +446,23 @@ module Hpricot
|
|
446
446
|
parent.containers.length == 1
|
447
447
|
end
|
448
448
|
|
449
|
-
filter :parent do
|
449
|
+
filter :parent do |*a|
|
450
450
|
containers.length > 0
|
451
451
|
end
|
452
452
|
|
453
|
-
filter :empty do
|
453
|
+
filter :empty do |*a|
|
454
454
|
containers.length == 0
|
455
455
|
end
|
456
456
|
|
457
|
-
filter :root do
|
457
|
+
filter :root do |*a|
|
458
458
|
self.is_a? Hpricot::Doc
|
459
459
|
end
|
460
460
|
|
461
|
-
filter 'text' do
|
461
|
+
filter 'text' do |*a|
|
462
462
|
self.text?
|
463
463
|
end
|
464
464
|
|
465
|
-
filter 'comment' do
|
465
|
+
filter 'comment' do |*a|
|
466
466
|
self.comment?
|
467
467
|
end
|
468
468
|
|
@@ -495,7 +495,7 @@ module Hpricot
|
|
495
495
|
end
|
496
496
|
|
497
497
|
filter 'text()' do |val,i|
|
498
|
-
|
498
|
+
self.children.grep(Hpricot::Text).detect { |x| x.content =~ /\S/ } if self.children
|
499
499
|
end
|
500
500
|
|
501
501
|
filter '@' do |attr,val,i|
|
data/lib/hpricot/htmlinfo.rb
CHANGED
@@ -473,9 +473,23 @@ module Hpricot
|
|
473
473
|
"menu", "noframes", "noscript", "object", "ol", "p", "pre", "q", "s",
|
474
474
|
"samp", "script", "select", "small", "span", "strike", "strong", "sub",
|
475
475
|
"sup", "table", "textarea", "tt", "u", "ul", "var"]}
|
476
|
+
ElementContent.keys.each do |k|
|
477
|
+
v = ElementContent[k]
|
478
|
+
if v.is_a? Array
|
479
|
+
ElementContent[k] = v.inject({}) do |h, name|
|
480
|
+
h[name.hash] = true
|
481
|
+
h
|
482
|
+
end
|
483
|
+
end
|
484
|
+
end
|
476
485
|
|
477
486
|
ElementInclusions =
|
478
487
|
{"head"=>["link", "meta", "object", "script", "style"], "body"=>["del", "ins"]}
|
488
|
+
ElementInclusions.each do |k, v|
|
489
|
+
v.each do |name|
|
490
|
+
ElementContent[k][name.hash] = :allow
|
491
|
+
end
|
492
|
+
end
|
479
493
|
|
480
494
|
ElementExclusions =
|
481
495
|
{"button"=>
|
@@ -496,6 +510,11 @@ module Hpricot
|
|
496
510
|
"h1", "h2", "h3", "h4", "h5", "h6", "hr", "isindex", "menu", "noframes",
|
497
511
|
"noscript", "ol", "p", "pre", "table", "ul"],
|
498
512
|
"label"=>["label"]}
|
513
|
+
ElementExclusions.each do |k, v|
|
514
|
+
v.each do |name|
|
515
|
+
ElementContent[k][name.hash] = :deny
|
516
|
+
end
|
517
|
+
end
|
499
518
|
|
500
519
|
OmittedAttrName =
|
501
520
|
{"h6"=>
|
data/lib/hpricot/inspect.rb
CHANGED
@@ -11,28 +11,7 @@ module Hpricot
|
|
11
11
|
|
12
12
|
class Doc
|
13
13
|
def pretty_print(q)
|
14
|
-
q.object_group(self) {
|
15
|
-
end
|
16
|
-
alias inspect pretty_print_inspect
|
17
|
-
end
|
18
|
-
|
19
|
-
class Elem
|
20
|
-
def pretty_print(q)
|
21
|
-
if empty?
|
22
|
-
q.group(1, '{emptyelem', '}') {
|
23
|
-
q.breakable; q.pp @stag
|
24
|
-
}
|
25
|
-
else
|
26
|
-
q.group(1, "{elem", "}") {
|
27
|
-
q.breakable; q.pp @stag
|
28
|
-
if @children
|
29
|
-
@children.each {|elt| q.breakable; q.pp elt }
|
30
|
-
end
|
31
|
-
if @etag
|
32
|
-
q.breakable; q.pp @etag
|
33
|
-
end
|
34
|
-
}
|
35
|
-
end
|
14
|
+
q.object_group(self) { children.each {|elt| q.breakable; q.pp elt } }
|
36
15
|
end
|
37
16
|
alias inspect pretty_print_inspect
|
38
17
|
end
|
@@ -41,7 +20,7 @@ module Hpricot
|
|
41
20
|
def pretty_print(q)
|
42
21
|
q.group(1, '{', '}') {
|
43
22
|
q.text self.class.name.sub(/.*::/,'').downcase
|
44
|
-
if rs =
|
23
|
+
if rs = raw_string
|
45
24
|
rs.scan(/[^\r\n]*(?:\r\n?|\n|[^\r\n]\z)/) {|line|
|
46
25
|
q.breakable
|
47
26
|
q.pp line
|
@@ -55,13 +34,30 @@ module Hpricot
|
|
55
34
|
alias inspect pretty_print_inspect
|
56
35
|
end
|
57
36
|
|
58
|
-
class
|
37
|
+
class Elem
|
59
38
|
def pretty_print(q)
|
39
|
+
if empty?
|
40
|
+
q.group(1, '{emptyelem', '}') {
|
41
|
+
q.breakable; pretty_print_stag q
|
42
|
+
}
|
43
|
+
else
|
44
|
+
q.group(1, "{elem", "}") {
|
45
|
+
q.breakable; pretty_print_stag q
|
46
|
+
if children
|
47
|
+
children.each {|elt| q.breakable; q.pp elt }
|
48
|
+
end
|
49
|
+
if etag
|
50
|
+
q.breakable; q.pp etag
|
51
|
+
end
|
52
|
+
}
|
53
|
+
end
|
54
|
+
end
|
55
|
+
def pretty_print_stag(q)
|
60
56
|
q.group(1, '<', '>') {
|
61
|
-
q.text
|
57
|
+
q.text name
|
62
58
|
|
63
|
-
if
|
64
|
-
|
59
|
+
if raw_attributes
|
60
|
+
raw_attributes.each {|n, t|
|
65
61
|
q.breakable
|
66
62
|
if t
|
67
63
|
q.text "#{n}=\"#{Hpricot.uxs(t)}\""
|
@@ -78,7 +74,7 @@ module Hpricot
|
|
78
74
|
class ETag
|
79
75
|
def pretty_print(q)
|
80
76
|
q.group(1, '</', '>') {
|
81
|
-
q.text
|
77
|
+
q.text name
|
82
78
|
}
|
83
79
|
end
|
84
80
|
alias inspect pretty_print_inspect
|
@@ -86,7 +82,7 @@ module Hpricot
|
|
86
82
|
|
87
83
|
class Text
|
88
84
|
def pretty_print(q)
|
89
|
-
q.text
|
85
|
+
q.text content.dump
|
90
86
|
end
|
91
87
|
end
|
92
88
|
|
@@ -94,11 +90,11 @@ module Hpricot
|
|
94
90
|
def pretty_print(q)
|
95
91
|
q.group(1, '{', '}') {
|
96
92
|
q.text self.class.name.sub(/.*::/,'').downcase
|
97
|
-
if rs =
|
93
|
+
if rs = raw_string
|
98
94
|
q.breakable
|
99
95
|
q.text rs
|
100
96
|
else
|
101
|
-
q.text "</#{
|
97
|
+
q.text "</#{name}>"
|
102
98
|
end
|
103
99
|
}
|
104
100
|
end
|
data/lib/hpricot/modules.rb
CHANGED
@@ -4,7 +4,6 @@ module Hpricot
|
|
4
4
|
|
5
5
|
# :stopdoc:
|
6
6
|
module Tag; include Hpricot end
|
7
|
-
class STag; include Tag end
|
8
7
|
class ETag; include Tag end
|
9
8
|
# :startdoc:
|
10
9
|
|
@@ -12,6 +11,7 @@ module Hpricot
|
|
12
11
|
module Container; include Node end
|
13
12
|
class Doc; include Container end
|
14
13
|
class Elem; include Container end
|
14
|
+
|
15
15
|
module Leaf; include Node end
|
16
16
|
class Text; include Leaf end
|
17
17
|
class XMLDecl; include Leaf end
|
@@ -25,6 +25,7 @@ module Hpricot
|
|
25
25
|
module Leaf::Trav; include Traverse end
|
26
26
|
class Doc; module Trav; include Container::Trav end; include Trav end
|
27
27
|
class Elem; module Trav; include Container::Trav end; include Trav end
|
28
|
+
class CData; module Trav; include Leaf::Trav end; include Trav end
|
28
29
|
class Text; module Trav; include Leaf::Trav end; include Trav end
|
29
30
|
class XMLDecl; module Trav; include Leaf::Trav end; include Trav end
|
30
31
|
class DocType; module Trav; include Leaf::Trav end; include Trav end
|
data/lib/hpricot/parse.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'hpricot/htmlinfo'
|
2
2
|
|
3
3
|
def Hpricot(input = nil, opts = {}, &blk)
|
4
|
-
Hpricot.
|
4
|
+
Hpricot.make(input, opts, &blk)
|
5
5
|
end
|
6
6
|
|
7
7
|
module Hpricot
|
@@ -12,287 +12,27 @@ module Hpricot
|
|
12
12
|
# Hpricot.parse parses <i>input</i> and return a document tree.
|
13
13
|
# represented by Hpricot::Doc.
|
14
14
|
def Hpricot.parse(input = nil, opts = {}, &blk)
|
15
|
-
|
15
|
+
make(input, opts, &blk)
|
16
16
|
end
|
17
17
|
|
18
18
|
# Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
|
19
19
|
# and returning a document tree.
|
20
20
|
def Hpricot.XML(input = nil, opts = {}, &blk)
|
21
21
|
opts.merge! :xml => true
|
22
|
-
|
22
|
+
make(input, opts, &blk)
|
23
23
|
end
|
24
24
|
|
25
25
|
# :stopdoc:
|
26
26
|
|
27
27
|
def Hpricot.make(input = nil, opts = {}, &blk)
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
conv = opts[:xml] ? :to_s : :downcase
|
34
|
-
|
35
|
-
fragment =
|
36
|
-
if input
|
37
|
-
case opts[:encoding]
|
38
|
-
when nil
|
39
|
-
when 'utf-8'
|
40
|
-
unless defined? Encoding::Character::UTF8
|
41
|
-
raise EncodingError, "The ruby-character-encodings library could not be found for utf-8 mode."
|
42
|
-
end
|
43
|
-
else
|
44
|
-
raise EncodingError, "No encoding option `#{opts[:encoding]}' is available."
|
45
|
-
end
|
46
|
-
|
47
|
-
if opts[:xhtml_strict]
|
48
|
-
opts[:fixup_tags] = true
|
49
|
-
end
|
50
|
-
|
51
|
-
stack = [[nil, nil, [], [], [], []]]
|
52
|
-
Hpricot.scan(input) do |token|
|
53
|
-
if stack.last[5] == :CDATA and ![:procins, :comment, :cdata].include?(token[0]) and
|
54
|
-
!(token[0] == :etag and token[1].casecmp(stack.last[0]).zero?)
|
55
|
-
token[0] = :text
|
56
|
-
token[1] = token[3] if token[3]
|
57
|
-
end
|
58
|
-
|
59
|
-
if !opts[:xml] and token[0] == :emptytag
|
60
|
-
token[1] = token[1].send(conv)
|
61
|
-
if ElementContent[token[1].downcase] != :EMPTY
|
62
|
-
token[0] = :stag
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
# TODO: downcase instead when parsing attributes?
|
67
|
-
if !opts[:xml] and token[2].is_a?(Hash)
|
68
|
-
token[2] = token[2].inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
|
69
|
-
end
|
70
|
-
|
71
|
-
case token[0]
|
72
|
-
when :stag
|
73
|
-
case opts[:encoding] when 'utf-8'
|
74
|
-
token.map! { |str| u(str) if str.is_a? String }
|
75
|
-
end
|
76
|
-
|
77
|
-
stagname = token[0] = token[1] = token[1].send(conv)
|
78
|
-
if ElementContent[stagname] == :EMPTY and !opts[:xml]
|
79
|
-
token[0] = :emptytag
|
80
|
-
stack.last[2] << token
|
81
|
-
else
|
82
|
-
unless opts[:xml]
|
83
|
-
if opts[:fixup_tags]
|
84
|
-
# obey the tag rules set up by the current element
|
85
|
-
if ElementContent.has_key? stagname
|
86
|
-
trans = nil
|
87
|
-
(stack.length-1).downto(0) do |i|
|
88
|
-
untags = stack[i][5]
|
89
|
-
break unless untags.include? stagname
|
90
|
-
# puts "** ILLEGAL #{stagname} IN #{stack[i][0]}"
|
91
|
-
trans = i
|
92
|
-
end
|
93
|
-
if trans.to_i > 1
|
94
|
-
eles = stack.slice!(trans..-1)
|
95
|
-
stack.last[2] += eles
|
96
|
-
# puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}"
|
97
|
-
end
|
98
|
-
elsif opts[:xhtml_strict]
|
99
|
-
token[2] = {'class' => stagname}
|
100
|
-
stagname = token[0] = "div"
|
101
|
-
end
|
102
|
-
end
|
103
|
-
|
104
|
-
# setup tag rules for inside this element
|
105
|
-
if ElementContent[stagname] == :CDATA
|
106
|
-
uncontainable_tags = :CDATA
|
107
|
-
elsif opts[:fixup_tags]
|
108
|
-
possible_tags = ElementContent[stagname]
|
109
|
-
excluded_tags, included_tags = stack.last[3..4]
|
110
|
-
if possible_tags
|
111
|
-
excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
|
112
|
-
included_tags = included_tags | (ElementInclusions[stagname] || [])
|
113
|
-
containable_tags = (possible_tags | included_tags) - excluded_tags
|
114
|
-
uncontainable_tags = ElementContent.keys - containable_tags
|
115
|
-
else
|
116
|
-
# If the tagname is unknown, it is assumed that any element
|
117
|
-
# except excluded can be contained.
|
118
|
-
uncontainable_tags = excluded_tags
|
119
|
-
end
|
120
|
-
end
|
121
|
-
end
|
122
|
-
unless opts[:xml]
|
123
|
-
case token[2] when Hash
|
124
|
-
token[2] = token[2].inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
|
125
|
-
end
|
126
|
-
end
|
127
|
-
stack << [stagname, token, [], excluded_tags, included_tags, uncontainable_tags]
|
128
|
-
end
|
129
|
-
when :etag
|
130
|
-
etagname = token[0] = token[1].send(conv)
|
131
|
-
if opts[:xhtml_strict] and not ElementContent.has_key? etagname
|
132
|
-
etagname = token[0] = "div"
|
133
|
-
end
|
134
|
-
matched_elem = nil
|
135
|
-
(stack.length-1).downto(0) do |i|
|
136
|
-
stagname, = stack[i]
|
137
|
-
if stagname == etagname
|
138
|
-
matched_elem = stack[i]
|
139
|
-
stack[i][1] += token
|
140
|
-
eles = stack.slice!((i+1)..-1)
|
141
|
-
stack.last[2] += eles if eles
|
142
|
-
break
|
143
|
-
end
|
144
|
-
end
|
145
|
-
unless matched_elem
|
146
|
-
stack.last[2] << [:bogus_etag, token.first, token.last]
|
147
|
-
else
|
148
|
-
ele = stack.pop
|
149
|
-
stack.last[2] << ele
|
150
|
-
end
|
151
|
-
when :text
|
152
|
-
l = stack.last[2].last
|
153
|
-
if l and l[0] == :text
|
154
|
-
l[1] += token[1]
|
155
|
-
else
|
156
|
-
stack.last[2] << token
|
157
|
-
end
|
158
|
-
else
|
159
|
-
stack.last[2] << token
|
160
|
-
end
|
161
|
-
end
|
162
|
-
|
163
|
-
while 1 < stack.length
|
164
|
-
ele = stack.pop
|
165
|
-
stack.last[2] << ele
|
166
|
-
end
|
167
|
-
|
168
|
-
structure_list = stack[0][2]
|
169
|
-
structure_list.map {|s| build_node(s, opts) }
|
170
|
-
elsif blk
|
171
|
-
Hpricot.build(&blk).children
|
172
|
-
end
|
173
|
-
end
|
174
|
-
|
175
|
-
def Hpricot.build_node(structure, opts = {})
|
176
|
-
case structure[0]
|
177
|
-
when String
|
178
|
-
tagname, _, attrs, sraw, _, _, _, eraw = structure[1]
|
179
|
-
children = structure[2]
|
180
|
-
etag = eraw && ETag.parse(tagname, eraw)
|
181
|
-
stag = STag.parse(tagname, attrs, sraw, true)
|
182
|
-
if !children.empty? || etag
|
183
|
-
Elem.new(stag,
|
184
|
-
children.map {|c| build_node(c, opts) },
|
185
|
-
etag)
|
186
|
-
else
|
187
|
-
Elem.new(stag)
|
188
|
-
end
|
189
|
-
when :text
|
190
|
-
Text.parse_pcdata(structure[1])
|
191
|
-
when :emptytag
|
192
|
-
Elem.new(STag.parse(structure[1], structure[2], structure[3], false))
|
193
|
-
when :bogus_etag
|
194
|
-
BogusETag.parse(structure[1], structure[2])
|
195
|
-
when :xmldecl
|
196
|
-
XMLDecl.parse(structure[2], structure[3])
|
197
|
-
when :doctype
|
198
|
-
if opts[:xhtml_strict]
|
199
|
-
structure[2]['system_id'] = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
|
200
|
-
structure[2]['public_id'] = "-//W3C//DTD XHTML 1.0 Strict//EN"
|
201
|
-
end
|
202
|
-
DocType.parse(structure[1], structure[2], structure[3])
|
203
|
-
when :procins
|
204
|
-
ProcIns.parse(structure[1])
|
205
|
-
when :comment
|
206
|
-
Comment.parse(structure[1])
|
207
|
-
when :cdata_content
|
208
|
-
Text.parse_cdata_content(structure[1])
|
209
|
-
when :cdata
|
210
|
-
Text.parse_cdata_section(structure[1])
|
28
|
+
if blk
|
29
|
+
doc = Hpricot.build(&blk)
|
30
|
+
doc.instance_variable_set("@options", opts)
|
31
|
+
doc
|
211
32
|
else
|
212
|
-
|
33
|
+
Hpricot.scan(input, opts)
|
213
34
|
end
|
214
35
|
end
|
215
36
|
|
216
|
-
def STag.parse(qname, attrs, raw_string, is_stag)
|
217
|
-
result = STag.new(qname, attrs)
|
218
|
-
result.raw_string = raw_string
|
219
|
-
result
|
220
|
-
end
|
221
|
-
|
222
|
-
def ETag.parse(qname, raw_string)
|
223
|
-
result = self.new(qname)
|
224
|
-
result.raw_string = raw_string
|
225
|
-
result
|
226
|
-
end
|
227
|
-
|
228
|
-
def BogusETag.parse(qname, raw_string)
|
229
|
-
result = self.new(qname)
|
230
|
-
result.raw_string = raw_string
|
231
|
-
result
|
232
|
-
end
|
233
|
-
|
234
|
-
def Text.parse_pcdata(raw_string)
|
235
|
-
result = Text.new(raw_string)
|
236
|
-
result
|
237
|
-
end
|
238
|
-
|
239
|
-
def Text.parse_cdata_content(raw_string)
|
240
|
-
result = CData.new(raw_string)
|
241
|
-
result
|
242
|
-
end
|
243
|
-
|
244
|
-
def Text.parse_cdata_section(content)
|
245
|
-
result = CData.new(content)
|
246
|
-
result
|
247
|
-
end
|
248
|
-
|
249
|
-
def XMLDecl.parse(attrs, raw_string)
|
250
|
-
attrs ||= {}
|
251
|
-
version = attrs['version']
|
252
|
-
encoding = attrs['encoding']
|
253
|
-
case attrs['standalone']
|
254
|
-
when 'yes'
|
255
|
-
standalone = true
|
256
|
-
when 'no'
|
257
|
-
standalone = false
|
258
|
-
else
|
259
|
-
standalone = nil
|
260
|
-
end
|
261
|
-
|
262
|
-
result = XMLDecl.new(version, encoding, standalone)
|
263
|
-
result.raw_string = raw_string
|
264
|
-
result
|
265
|
-
end
|
266
|
-
|
267
|
-
def DocType.parse(root_element_name, attrs, raw_string)
|
268
|
-
if attrs
|
269
|
-
public_identifier = attrs['public_id']
|
270
|
-
system_identifier = attrs['system_id']
|
271
|
-
end
|
272
|
-
|
273
|
-
root_element_name = root_element_name.downcase
|
274
|
-
|
275
|
-
result = DocType.new(root_element_name, public_identifier, system_identifier)
|
276
|
-
result.raw_string = raw_string
|
277
|
-
result
|
278
|
-
end
|
279
|
-
|
280
|
-
def ProcIns.parse(raw_string)
|
281
|
-
_, target, content = *raw_string.match(/\A<\?(\S+)\s+(.+)/m)
|
282
|
-
result = ProcIns.new(target, content)
|
283
|
-
result
|
284
|
-
end
|
285
|
-
|
286
|
-
def Comment.parse(content)
|
287
|
-
result = Comment.new(content)
|
288
|
-
result
|
289
|
-
end
|
290
|
-
|
291
|
-
module Pat
|
292
|
-
NameChar = /[-A-Za-z0-9._:]/
|
293
|
-
Name = /[A-Za-z_:]#{NameChar}*/
|
294
|
-
Nmtoken = /#{NameChar}+/
|
295
|
-
end
|
296
|
-
|
297
37
|
# :startdoc:
|
298
38
|
end
|