hpricot 0.6.164 → 0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +6 -0
- data/Rakefile +31 -33
- data/ext/fast_xs/fast_xs.c +11 -5
- data/ext/hpricot_scan/HpricotScanService.java +10 -6
- data/ext/hpricot_scan/hpricot_css.c +3502 -0
- data/ext/hpricot_scan/hpricot_css.rl +115 -0
- data/ext/hpricot_scan/hpricot_scan.c +1032 -589
- data/ext/hpricot_scan/hpricot_scan.java.rl +5 -1
- data/ext/hpricot_scan/hpricot_scan.rl +493 -50
- data/ext/hpricot_scan/test.rb +1 -2
- data/lib/hpricot/builder.rb +21 -20
- data/lib/hpricot/elements.rb +12 -12
- data/lib/hpricot/htmlinfo.rb +19 -0
- data/lib/hpricot/inspect.rb +27 -31
- data/lib/hpricot/modules.rb +2 -1
- data/lib/hpricot/parse.rb +8 -268
- data/lib/hpricot/tag.rb +65 -99
- data/lib/hpricot/traverse.rb +20 -14
- data/test/files/boingboing.html +1 -1
- data/test/nokogiri-bench.rb +64 -0
- data/test/test_builder.rb +4 -4
- data/test/test_parser.rb +36 -13
- data/test/test_preserved.rb +6 -2
- metadata +51 -51
- data/ext/hpricot_scan/hpricot_gram.c +0 -882
- data/ext/hpricot_scan/hpricot_gram.h +0 -9
data/ext/hpricot_scan/test.rb
CHANGED
data/lib/hpricot/builder.rb
CHANGED
@@ -1,20 +1,13 @@
|
|
1
1
|
require 'hpricot/tags'
|
2
2
|
require 'fast_xs'
|
3
3
|
require 'hpricot/blankslate'
|
4
|
+
require 'hpricot/htmlinfo'
|
4
5
|
|
5
6
|
module Hpricot
|
6
|
-
PREDEFINED = {
|
7
|
-
34 => '"', # quotation mark
|
8
|
-
38 => '&', # ampersand
|
9
|
-
60 => '<', # left angle bracket
|
10
|
-
62 => '>' # right angle bracket
|
11
|
-
}
|
12
|
-
PREDEFINED_U = PREDEFINED.inject({}) { |hsh, (k, v)| hsh[v] = k; hsh }
|
13
|
-
|
14
7
|
# XML unescape
|
15
8
|
def self.uxs(str)
|
16
9
|
str.to_s.
|
17
|
-
gsub(
|
10
|
+
gsub(/\&(\w+);/) { [NamedCharacters[$1] || ??].pack("U*") }.
|
18
11
|
gsub(/\&\#(\d+);/) { [$1.to_i].pack("U*") }
|
19
12
|
end
|
20
13
|
|
@@ -23,7 +16,7 @@ module Hpricot
|
|
23
16
|
assigns.each do |k, v|
|
24
17
|
ele.instance_variable_set("@#{k}", v)
|
25
18
|
end
|
26
|
-
ele.instance_eval
|
19
|
+
ele.instance_eval(&blk)
|
27
20
|
ele
|
28
21
|
end
|
29
22
|
|
@@ -45,14 +38,21 @@ module Hpricot
|
|
45
38
|
@@default[option] = value
|
46
39
|
end
|
47
40
|
|
41
|
+
def add_child ele
|
42
|
+
ele.parent = self
|
43
|
+
self.children ||= []
|
44
|
+
self.children << ele
|
45
|
+
ele
|
46
|
+
end
|
47
|
+
|
48
48
|
# Write a +string+ to the HTML stream, making sure to escape it.
|
49
49
|
def text!(string)
|
50
|
-
|
50
|
+
add_child Text.new(string.fast_xs)
|
51
51
|
end
|
52
52
|
|
53
53
|
# Write a +string+ to the HTML stream without escaping it.
|
54
54
|
def text(string)
|
55
|
-
|
55
|
+
add_child Text.new(string)
|
56
56
|
nil
|
57
57
|
end
|
58
58
|
alias_method :<<, :text
|
@@ -67,11 +67,11 @@ module Hpricot
|
|
67
67
|
raise InvalidXhtmlError, "no element `#{tag}' for #{tagset.doctype}"
|
68
68
|
elsif args.last.respond_to?(:to_hash)
|
69
69
|
attrs = args.last.to_hash
|
70
|
-
|
70
|
+
|
71
71
|
if @tagset.forms.include?(tag) and attrs[:id]
|
72
72
|
attrs[:name] ||= attrs[:id]
|
73
73
|
end
|
74
|
-
|
74
|
+
|
75
75
|
attrs.each do |k, v|
|
76
76
|
atname = k.to_s.downcase.intern
|
77
77
|
unless k =~ /:/ or @tagset.tagset[tag].include? atname
|
@@ -105,14 +105,15 @@ module Hpricot
|
|
105
105
|
end
|
106
106
|
|
107
107
|
# create the element itself
|
108
|
-
|
108
|
+
tag = tag.to_s
|
109
|
+
f = Elem.new(tag, attrs, childs, ETag.new(tag))
|
109
110
|
|
110
111
|
# build children from the block
|
111
112
|
if block
|
112
113
|
build(f, &block)
|
113
114
|
end
|
114
115
|
|
115
|
-
|
116
|
+
add_child f
|
116
117
|
f
|
117
118
|
end
|
118
119
|
|
@@ -145,11 +146,11 @@ module Hpricot
|
|
145
146
|
end
|
146
147
|
|
147
148
|
def doctype(target, pub, sys)
|
148
|
-
|
149
|
+
add_child DocType.new(target, pub, sys)
|
149
150
|
end
|
150
151
|
|
151
152
|
remove_method :head
|
152
|
-
|
153
|
+
|
153
154
|
# Builds a head tag. Adds a <tt>meta</tt> tag inside with Content-Type
|
154
155
|
# set to <tt>text/html; charset=utf-8</tt>.
|
155
156
|
def head(*args, &block)
|
@@ -193,7 +194,7 @@ module Hpricot
|
|
193
194
|
def initialize(builder, sym)
|
194
195
|
@builder, @sym, @attrs = builder, sym, {}
|
195
196
|
end
|
196
|
-
|
197
|
+
|
197
198
|
# Adds attributes to an element. Bang methods set the :id attribute.
|
198
199
|
# Other methods add to the :class attribute.
|
199
200
|
def method_missing(id_or_class, *args, &block)
|
@@ -207,7 +208,7 @@ module Hpricot
|
|
207
208
|
args.push(@attrs)
|
208
209
|
return @builder.tag!(@sym, *args, &block)
|
209
210
|
end
|
210
|
-
|
211
|
+
|
211
212
|
return self
|
212
213
|
end
|
213
214
|
|
data/lib/hpricot/elements.rb
CHANGED
@@ -168,7 +168,7 @@ module Hpricot
|
|
168
168
|
end
|
169
169
|
x.parent.replace_child(x, wrap)
|
170
170
|
nest = nest.children.first until nest.empty?
|
171
|
-
nest.html(
|
171
|
+
nest.html([x])
|
172
172
|
end
|
173
173
|
end
|
174
174
|
|
@@ -275,7 +275,7 @@ module Hpricot
|
|
275
275
|
expr = $'
|
276
276
|
m.compact!
|
277
277
|
if m[0] == '@'
|
278
|
-
m[0] = "@#{m.slice!(2,1)}"
|
278
|
+
m[0] = "@#{m.slice!(2,1).join}"
|
279
279
|
end
|
280
280
|
|
281
281
|
if m[0] == '[' && m[1] =~ /^\d+$/
|
@@ -300,10 +300,10 @@ module Hpricot
|
|
300
300
|
args = m[1..-1]
|
301
301
|
end
|
302
302
|
end
|
303
|
-
|
303
|
+
args << -1
|
304
304
|
nodes = Elements[*nodes.find_all do |x|
|
305
|
-
|
306
|
-
x.send(meth, *
|
305
|
+
args[-1] += 1
|
306
|
+
x.send(meth, *args) ? truth : !truth
|
307
307
|
end]
|
308
308
|
end
|
309
309
|
end
|
@@ -422,7 +422,7 @@ module Hpricot
|
|
422
422
|
case arg
|
423
423
|
when 'even'; (parent.containers.index(self) + 1) % 2 == 0
|
424
424
|
when 'odd'; (parent.containers.index(self) + 1) % 2 == 1
|
425
|
-
else self == (parent.containers[arg.to_i
|
425
|
+
else self == (parent.containers[arg.to_i - 1])
|
426
426
|
end
|
427
427
|
end
|
428
428
|
|
@@ -446,23 +446,23 @@ module Hpricot
|
|
446
446
|
parent.containers.length == 1
|
447
447
|
end
|
448
448
|
|
449
|
-
filter :parent do
|
449
|
+
filter :parent do |*a|
|
450
450
|
containers.length > 0
|
451
451
|
end
|
452
452
|
|
453
|
-
filter :empty do
|
453
|
+
filter :empty do |*a|
|
454
454
|
containers.length == 0
|
455
455
|
end
|
456
456
|
|
457
|
-
filter :root do
|
457
|
+
filter :root do |*a|
|
458
458
|
self.is_a? Hpricot::Doc
|
459
459
|
end
|
460
460
|
|
461
|
-
filter 'text' do
|
461
|
+
filter 'text' do |*a|
|
462
462
|
self.text?
|
463
463
|
end
|
464
464
|
|
465
|
-
filter 'comment' do
|
465
|
+
filter 'comment' do |*a|
|
466
466
|
self.comment?
|
467
467
|
end
|
468
468
|
|
@@ -495,7 +495,7 @@ module Hpricot
|
|
495
495
|
end
|
496
496
|
|
497
497
|
filter 'text()' do |val,i|
|
498
|
-
|
498
|
+
self.children.grep(Hpricot::Text).detect { |x| x.content =~ /\S/ } if self.children
|
499
499
|
end
|
500
500
|
|
501
501
|
filter '@' do |attr,val,i|
|
data/lib/hpricot/htmlinfo.rb
CHANGED
@@ -473,9 +473,23 @@ module Hpricot
|
|
473
473
|
"menu", "noframes", "noscript", "object", "ol", "p", "pre", "q", "s",
|
474
474
|
"samp", "script", "select", "small", "span", "strike", "strong", "sub",
|
475
475
|
"sup", "table", "textarea", "tt", "u", "ul", "var"]}
|
476
|
+
ElementContent.keys.each do |k|
|
477
|
+
v = ElementContent[k]
|
478
|
+
if v.is_a? Array
|
479
|
+
ElementContent[k] = v.inject({}) do |h, name|
|
480
|
+
h[name.hash] = true
|
481
|
+
h
|
482
|
+
end
|
483
|
+
end
|
484
|
+
end
|
476
485
|
|
477
486
|
ElementInclusions =
|
478
487
|
{"head"=>["link", "meta", "object", "script", "style"], "body"=>["del", "ins"]}
|
488
|
+
ElementInclusions.each do |k, v|
|
489
|
+
v.each do |name|
|
490
|
+
ElementContent[k][name.hash] = :allow
|
491
|
+
end
|
492
|
+
end
|
479
493
|
|
480
494
|
ElementExclusions =
|
481
495
|
{"button"=>
|
@@ -496,6 +510,11 @@ module Hpricot
|
|
496
510
|
"h1", "h2", "h3", "h4", "h5", "h6", "hr", "isindex", "menu", "noframes",
|
497
511
|
"noscript", "ol", "p", "pre", "table", "ul"],
|
498
512
|
"label"=>["label"]}
|
513
|
+
ElementExclusions.each do |k, v|
|
514
|
+
v.each do |name|
|
515
|
+
ElementContent[k][name.hash] = :deny
|
516
|
+
end
|
517
|
+
end
|
499
518
|
|
500
519
|
OmittedAttrName =
|
501
520
|
{"h6"=>
|
data/lib/hpricot/inspect.rb
CHANGED
@@ -11,28 +11,7 @@ module Hpricot
|
|
11
11
|
|
12
12
|
class Doc
|
13
13
|
def pretty_print(q)
|
14
|
-
q.object_group(self) {
|
15
|
-
end
|
16
|
-
alias inspect pretty_print_inspect
|
17
|
-
end
|
18
|
-
|
19
|
-
class Elem
|
20
|
-
def pretty_print(q)
|
21
|
-
if empty?
|
22
|
-
q.group(1, '{emptyelem', '}') {
|
23
|
-
q.breakable; q.pp @stag
|
24
|
-
}
|
25
|
-
else
|
26
|
-
q.group(1, "{elem", "}") {
|
27
|
-
q.breakable; q.pp @stag
|
28
|
-
if @children
|
29
|
-
@children.each {|elt| q.breakable; q.pp elt }
|
30
|
-
end
|
31
|
-
if @etag
|
32
|
-
q.breakable; q.pp @etag
|
33
|
-
end
|
34
|
-
}
|
35
|
-
end
|
14
|
+
q.object_group(self) { children.each {|elt| q.breakable; q.pp elt } }
|
36
15
|
end
|
37
16
|
alias inspect pretty_print_inspect
|
38
17
|
end
|
@@ -41,7 +20,7 @@ module Hpricot
|
|
41
20
|
def pretty_print(q)
|
42
21
|
q.group(1, '{', '}') {
|
43
22
|
q.text self.class.name.sub(/.*::/,'').downcase
|
44
|
-
if rs =
|
23
|
+
if rs = raw_string
|
45
24
|
rs.scan(/[^\r\n]*(?:\r\n?|\n|[^\r\n]\z)/) {|line|
|
46
25
|
q.breakable
|
47
26
|
q.pp line
|
@@ -55,13 +34,30 @@ module Hpricot
|
|
55
34
|
alias inspect pretty_print_inspect
|
56
35
|
end
|
57
36
|
|
58
|
-
class
|
37
|
+
class Elem
|
59
38
|
def pretty_print(q)
|
39
|
+
if empty?
|
40
|
+
q.group(1, '{emptyelem', '}') {
|
41
|
+
q.breakable; pretty_print_stag q
|
42
|
+
}
|
43
|
+
else
|
44
|
+
q.group(1, "{elem", "}") {
|
45
|
+
q.breakable; pretty_print_stag q
|
46
|
+
if children
|
47
|
+
children.each {|elt| q.breakable; q.pp elt }
|
48
|
+
end
|
49
|
+
if etag
|
50
|
+
q.breakable; q.pp etag
|
51
|
+
end
|
52
|
+
}
|
53
|
+
end
|
54
|
+
end
|
55
|
+
def pretty_print_stag(q)
|
60
56
|
q.group(1, '<', '>') {
|
61
|
-
q.text
|
57
|
+
q.text name
|
62
58
|
|
63
|
-
if
|
64
|
-
|
59
|
+
if raw_attributes
|
60
|
+
raw_attributes.each {|n, t|
|
65
61
|
q.breakable
|
66
62
|
if t
|
67
63
|
q.text "#{n}=\"#{Hpricot.uxs(t)}\""
|
@@ -78,7 +74,7 @@ module Hpricot
|
|
78
74
|
class ETag
|
79
75
|
def pretty_print(q)
|
80
76
|
q.group(1, '</', '>') {
|
81
|
-
q.text
|
77
|
+
q.text name
|
82
78
|
}
|
83
79
|
end
|
84
80
|
alias inspect pretty_print_inspect
|
@@ -86,7 +82,7 @@ module Hpricot
|
|
86
82
|
|
87
83
|
class Text
|
88
84
|
def pretty_print(q)
|
89
|
-
q.text
|
85
|
+
q.text content.dump
|
90
86
|
end
|
91
87
|
end
|
92
88
|
|
@@ -94,11 +90,11 @@ module Hpricot
|
|
94
90
|
def pretty_print(q)
|
95
91
|
q.group(1, '{', '}') {
|
96
92
|
q.text self.class.name.sub(/.*::/,'').downcase
|
97
|
-
if rs =
|
93
|
+
if rs = raw_string
|
98
94
|
q.breakable
|
99
95
|
q.text rs
|
100
96
|
else
|
101
|
-
q.text "</#{
|
97
|
+
q.text "</#{name}>"
|
102
98
|
end
|
103
99
|
}
|
104
100
|
end
|
data/lib/hpricot/modules.rb
CHANGED
@@ -4,7 +4,6 @@ module Hpricot
|
|
4
4
|
|
5
5
|
# :stopdoc:
|
6
6
|
module Tag; include Hpricot end
|
7
|
-
class STag; include Tag end
|
8
7
|
class ETag; include Tag end
|
9
8
|
# :startdoc:
|
10
9
|
|
@@ -12,6 +11,7 @@ module Hpricot
|
|
12
11
|
module Container; include Node end
|
13
12
|
class Doc; include Container end
|
14
13
|
class Elem; include Container end
|
14
|
+
|
15
15
|
module Leaf; include Node end
|
16
16
|
class Text; include Leaf end
|
17
17
|
class XMLDecl; include Leaf end
|
@@ -25,6 +25,7 @@ module Hpricot
|
|
25
25
|
module Leaf::Trav; include Traverse end
|
26
26
|
class Doc; module Trav; include Container::Trav end; include Trav end
|
27
27
|
class Elem; module Trav; include Container::Trav end; include Trav end
|
28
|
+
class CData; module Trav; include Leaf::Trav end; include Trav end
|
28
29
|
class Text; module Trav; include Leaf::Trav end; include Trav end
|
29
30
|
class XMLDecl; module Trav; include Leaf::Trav end; include Trav end
|
30
31
|
class DocType; module Trav; include Leaf::Trav end; include Trav end
|
data/lib/hpricot/parse.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'hpricot/htmlinfo'
|
2
2
|
|
3
3
|
def Hpricot(input = nil, opts = {}, &blk)
|
4
|
-
Hpricot.
|
4
|
+
Hpricot.make(input, opts, &blk)
|
5
5
|
end
|
6
6
|
|
7
7
|
module Hpricot
|
@@ -12,287 +12,27 @@ module Hpricot
|
|
12
12
|
# Hpricot.parse parses <i>input</i> and return a document tree.
|
13
13
|
# represented by Hpricot::Doc.
|
14
14
|
def Hpricot.parse(input = nil, opts = {}, &blk)
|
15
|
-
|
15
|
+
make(input, opts, &blk)
|
16
16
|
end
|
17
17
|
|
18
18
|
# Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
|
19
19
|
# and returning a document tree.
|
20
20
|
def Hpricot.XML(input = nil, opts = {}, &blk)
|
21
21
|
opts.merge! :xml => true
|
22
|
-
|
22
|
+
make(input, opts, &blk)
|
23
23
|
end
|
24
24
|
|
25
25
|
# :stopdoc:
|
26
26
|
|
27
27
|
def Hpricot.make(input = nil, opts = {}, &blk)
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
conv = opts[:xml] ? :to_s : :downcase
|
34
|
-
|
35
|
-
fragment =
|
36
|
-
if input
|
37
|
-
case opts[:encoding]
|
38
|
-
when nil
|
39
|
-
when 'utf-8'
|
40
|
-
unless defined? Encoding::Character::UTF8
|
41
|
-
raise EncodingError, "The ruby-character-encodings library could not be found for utf-8 mode."
|
42
|
-
end
|
43
|
-
else
|
44
|
-
raise EncodingError, "No encoding option `#{opts[:encoding]}' is available."
|
45
|
-
end
|
46
|
-
|
47
|
-
if opts[:xhtml_strict]
|
48
|
-
opts[:fixup_tags] = true
|
49
|
-
end
|
50
|
-
|
51
|
-
stack = [[nil, nil, [], [], [], []]]
|
52
|
-
Hpricot.scan(input) do |token|
|
53
|
-
if stack.last[5] == :CDATA and ![:procins, :comment, :cdata].include?(token[0]) and
|
54
|
-
!(token[0] == :etag and token[1].casecmp(stack.last[0]).zero?)
|
55
|
-
token[0] = :text
|
56
|
-
token[1] = token[3] if token[3]
|
57
|
-
end
|
58
|
-
|
59
|
-
if !opts[:xml] and token[0] == :emptytag
|
60
|
-
token[1] = token[1].send(conv)
|
61
|
-
if ElementContent[token[1].downcase] != :EMPTY
|
62
|
-
token[0] = :stag
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
# TODO: downcase instead when parsing attributes?
|
67
|
-
if !opts[:xml] and token[2].is_a?(Hash)
|
68
|
-
token[2] = token[2].inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
|
69
|
-
end
|
70
|
-
|
71
|
-
case token[0]
|
72
|
-
when :stag
|
73
|
-
case opts[:encoding] when 'utf-8'
|
74
|
-
token.map! { |str| u(str) if str.is_a? String }
|
75
|
-
end
|
76
|
-
|
77
|
-
stagname = token[0] = token[1] = token[1].send(conv)
|
78
|
-
if ElementContent[stagname] == :EMPTY and !opts[:xml]
|
79
|
-
token[0] = :emptytag
|
80
|
-
stack.last[2] << token
|
81
|
-
else
|
82
|
-
unless opts[:xml]
|
83
|
-
if opts[:fixup_tags]
|
84
|
-
# obey the tag rules set up by the current element
|
85
|
-
if ElementContent.has_key? stagname
|
86
|
-
trans = nil
|
87
|
-
(stack.length-1).downto(0) do |i|
|
88
|
-
untags = stack[i][5]
|
89
|
-
break unless untags.include? stagname
|
90
|
-
# puts "** ILLEGAL #{stagname} IN #{stack[i][0]}"
|
91
|
-
trans = i
|
92
|
-
end
|
93
|
-
if trans.to_i > 1
|
94
|
-
eles = stack.slice!(trans..-1)
|
95
|
-
stack.last[2] += eles
|
96
|
-
# puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}"
|
97
|
-
end
|
98
|
-
elsif opts[:xhtml_strict]
|
99
|
-
token[2] = {'class' => stagname}
|
100
|
-
stagname = token[0] = "div"
|
101
|
-
end
|
102
|
-
end
|
103
|
-
|
104
|
-
# setup tag rules for inside this element
|
105
|
-
if ElementContent[stagname] == :CDATA
|
106
|
-
uncontainable_tags = :CDATA
|
107
|
-
elsif opts[:fixup_tags]
|
108
|
-
possible_tags = ElementContent[stagname]
|
109
|
-
excluded_tags, included_tags = stack.last[3..4]
|
110
|
-
if possible_tags
|
111
|
-
excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
|
112
|
-
included_tags = included_tags | (ElementInclusions[stagname] || [])
|
113
|
-
containable_tags = (possible_tags | included_tags) - excluded_tags
|
114
|
-
uncontainable_tags = ElementContent.keys - containable_tags
|
115
|
-
else
|
116
|
-
# If the tagname is unknown, it is assumed that any element
|
117
|
-
# except excluded can be contained.
|
118
|
-
uncontainable_tags = excluded_tags
|
119
|
-
end
|
120
|
-
end
|
121
|
-
end
|
122
|
-
unless opts[:xml]
|
123
|
-
case token[2] when Hash
|
124
|
-
token[2] = token[2].inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
|
125
|
-
end
|
126
|
-
end
|
127
|
-
stack << [stagname, token, [], excluded_tags, included_tags, uncontainable_tags]
|
128
|
-
end
|
129
|
-
when :etag
|
130
|
-
etagname = token[0] = token[1].send(conv)
|
131
|
-
if opts[:xhtml_strict] and not ElementContent.has_key? etagname
|
132
|
-
etagname = token[0] = "div"
|
133
|
-
end
|
134
|
-
matched_elem = nil
|
135
|
-
(stack.length-1).downto(0) do |i|
|
136
|
-
stagname, = stack[i]
|
137
|
-
if stagname == etagname
|
138
|
-
matched_elem = stack[i]
|
139
|
-
stack[i][1] += token
|
140
|
-
eles = stack.slice!((i+1)..-1)
|
141
|
-
stack.last[2] += eles if eles
|
142
|
-
break
|
143
|
-
end
|
144
|
-
end
|
145
|
-
unless matched_elem
|
146
|
-
stack.last[2] << [:bogus_etag, token.first, token.last]
|
147
|
-
else
|
148
|
-
ele = stack.pop
|
149
|
-
stack.last[2] << ele
|
150
|
-
end
|
151
|
-
when :text
|
152
|
-
l = stack.last[2].last
|
153
|
-
if l and l[0] == :text
|
154
|
-
l[1] += token[1]
|
155
|
-
else
|
156
|
-
stack.last[2] << token
|
157
|
-
end
|
158
|
-
else
|
159
|
-
stack.last[2] << token
|
160
|
-
end
|
161
|
-
end
|
162
|
-
|
163
|
-
while 1 < stack.length
|
164
|
-
ele = stack.pop
|
165
|
-
stack.last[2] << ele
|
166
|
-
end
|
167
|
-
|
168
|
-
structure_list = stack[0][2]
|
169
|
-
structure_list.map {|s| build_node(s, opts) }
|
170
|
-
elsif blk
|
171
|
-
Hpricot.build(&blk).children
|
172
|
-
end
|
173
|
-
end
|
174
|
-
|
175
|
-
def Hpricot.build_node(structure, opts = {})
|
176
|
-
case structure[0]
|
177
|
-
when String
|
178
|
-
tagname, _, attrs, sraw, _, _, _, eraw = structure[1]
|
179
|
-
children = structure[2]
|
180
|
-
etag = eraw && ETag.parse(tagname, eraw)
|
181
|
-
stag = STag.parse(tagname, attrs, sraw, true)
|
182
|
-
if !children.empty? || etag
|
183
|
-
Elem.new(stag,
|
184
|
-
children.map {|c| build_node(c, opts) },
|
185
|
-
etag)
|
186
|
-
else
|
187
|
-
Elem.new(stag)
|
188
|
-
end
|
189
|
-
when :text
|
190
|
-
Text.parse_pcdata(structure[1])
|
191
|
-
when :emptytag
|
192
|
-
Elem.new(STag.parse(structure[1], structure[2], structure[3], false))
|
193
|
-
when :bogus_etag
|
194
|
-
BogusETag.parse(structure[1], structure[2])
|
195
|
-
when :xmldecl
|
196
|
-
XMLDecl.parse(structure[2], structure[3])
|
197
|
-
when :doctype
|
198
|
-
if opts[:xhtml_strict]
|
199
|
-
structure[2]['system_id'] = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
|
200
|
-
structure[2]['public_id'] = "-//W3C//DTD XHTML 1.0 Strict//EN"
|
201
|
-
end
|
202
|
-
DocType.parse(structure[1], structure[2], structure[3])
|
203
|
-
when :procins
|
204
|
-
ProcIns.parse(structure[1])
|
205
|
-
when :comment
|
206
|
-
Comment.parse(structure[1])
|
207
|
-
when :cdata_content
|
208
|
-
Text.parse_cdata_content(structure[1])
|
209
|
-
when :cdata
|
210
|
-
Text.parse_cdata_section(structure[1])
|
28
|
+
if blk
|
29
|
+
doc = Hpricot.build(&blk)
|
30
|
+
doc.instance_variable_set("@options", opts)
|
31
|
+
doc
|
211
32
|
else
|
212
|
-
|
33
|
+
Hpricot.scan(input, opts)
|
213
34
|
end
|
214
35
|
end
|
215
36
|
|
216
|
-
def STag.parse(qname, attrs, raw_string, is_stag)
|
217
|
-
result = STag.new(qname, attrs)
|
218
|
-
result.raw_string = raw_string
|
219
|
-
result
|
220
|
-
end
|
221
|
-
|
222
|
-
def ETag.parse(qname, raw_string)
|
223
|
-
result = self.new(qname)
|
224
|
-
result.raw_string = raw_string
|
225
|
-
result
|
226
|
-
end
|
227
|
-
|
228
|
-
def BogusETag.parse(qname, raw_string)
|
229
|
-
result = self.new(qname)
|
230
|
-
result.raw_string = raw_string
|
231
|
-
result
|
232
|
-
end
|
233
|
-
|
234
|
-
def Text.parse_pcdata(raw_string)
|
235
|
-
result = Text.new(raw_string)
|
236
|
-
result
|
237
|
-
end
|
238
|
-
|
239
|
-
def Text.parse_cdata_content(raw_string)
|
240
|
-
result = CData.new(raw_string)
|
241
|
-
result
|
242
|
-
end
|
243
|
-
|
244
|
-
def Text.parse_cdata_section(content)
|
245
|
-
result = CData.new(content)
|
246
|
-
result
|
247
|
-
end
|
248
|
-
|
249
|
-
def XMLDecl.parse(attrs, raw_string)
|
250
|
-
attrs ||= {}
|
251
|
-
version = attrs['version']
|
252
|
-
encoding = attrs['encoding']
|
253
|
-
case attrs['standalone']
|
254
|
-
when 'yes'
|
255
|
-
standalone = true
|
256
|
-
when 'no'
|
257
|
-
standalone = false
|
258
|
-
else
|
259
|
-
standalone = nil
|
260
|
-
end
|
261
|
-
|
262
|
-
result = XMLDecl.new(version, encoding, standalone)
|
263
|
-
result.raw_string = raw_string
|
264
|
-
result
|
265
|
-
end
|
266
|
-
|
267
|
-
def DocType.parse(root_element_name, attrs, raw_string)
|
268
|
-
if attrs
|
269
|
-
public_identifier = attrs['public_id']
|
270
|
-
system_identifier = attrs['system_id']
|
271
|
-
end
|
272
|
-
|
273
|
-
root_element_name = root_element_name.downcase
|
274
|
-
|
275
|
-
result = DocType.new(root_element_name, public_identifier, system_identifier)
|
276
|
-
result.raw_string = raw_string
|
277
|
-
result
|
278
|
-
end
|
279
|
-
|
280
|
-
def ProcIns.parse(raw_string)
|
281
|
-
_, target, content = *raw_string.match(/\A<\?(\S+)\s+(.+)/m)
|
282
|
-
result = ProcIns.new(target, content)
|
283
|
-
result
|
284
|
-
end
|
285
|
-
|
286
|
-
def Comment.parse(content)
|
287
|
-
result = Comment.new(content)
|
288
|
-
result
|
289
|
-
end
|
290
|
-
|
291
|
-
module Pat
|
292
|
-
NameChar = /[-A-Za-z0-9._:]/
|
293
|
-
Name = /[A-Za-z_:]#{NameChar}*/
|
294
|
-
Nmtoken = /#{NameChar}+/
|
295
|
-
end
|
296
|
-
|
297
37
|
# :startdoc:
|
298
38
|
end
|