hpricot 0.4-mswin32 → 0.5-mswin32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +16 -0
- data/README +279 -4
- data/Rakefile +12 -3
- data/ext/hpricot_scan/hpricot_scan.c +3106 -3348
- data/ext/hpricot_scan/hpricot_scan.rl +78 -38
- data/lib/hpricot.rb +19 -0
- data/lib/hpricot/elements.rb +194 -87
- data/lib/hpricot/inspect.rb +13 -0
- data/lib/hpricot/parse.rb +83 -99
- data/lib/hpricot/tag.rb +114 -40
- data/lib/hpricot/traverse.rb +311 -61
- data/lib/hpricot_scan.so +0 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/test_parser.rb +160 -10
- data/test/test_paths.rb +16 -0
- data/test/test_preserved.rb +46 -0
- data/test/test_xml.rb +15 -0
- metadata +41 -35
data/lib/hpricot/inspect.rb
CHANGED
@@ -2,6 +2,13 @@ require 'pp'
|
|
2
2
|
|
3
3
|
module Hpricot
|
4
4
|
# :stopdoc:
|
5
|
+
class Elements
|
6
|
+
def pretty_print(q)
|
7
|
+
q.object_group(self) { super }
|
8
|
+
end
|
9
|
+
alias inspect pretty_print_inspect
|
10
|
+
end
|
11
|
+
|
5
12
|
class Doc
|
6
13
|
def pretty_print(q)
|
7
14
|
q.object_group(self) { @children.each {|elt| q.breakable; q.pp elt } }
|
@@ -73,6 +80,12 @@ module Hpricot
|
|
73
80
|
alias inspect pretty_print_inspect
|
74
81
|
end
|
75
82
|
|
83
|
+
class Text
|
84
|
+
def pretty_print(q)
|
85
|
+
q.text @content.dump
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
76
89
|
class BogusETag
|
77
90
|
def pretty_print(q)
|
78
91
|
q.group(1, '{', '}') {
|
data/lib/hpricot/parse.rb
CHANGED
@@ -5,69 +5,107 @@ def Hpricot(input, opts = {})
|
|
5
5
|
end
|
6
6
|
|
7
7
|
module Hpricot
|
8
|
+
# Exception class used for any errors related to deficiencies in the system when
|
9
|
+
# handling the character encodings of a document.
|
10
|
+
class EncodingError < StandardError; end
|
11
|
+
|
8
12
|
# Hpricot.parse parses <i>input</i> and return a document tree.
|
9
13
|
# represented by Hpricot::Doc.
|
10
14
|
def Hpricot.parse(input, opts = {})
|
11
15
|
Doc.new(make(input, opts))
|
12
16
|
end
|
13
17
|
|
18
|
+
# Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
|
19
|
+
# and returning a document tree.
|
20
|
+
def Hpricot.XML(input, opts = {})
|
21
|
+
Doc.new(make(input, opts.merge(:xml => true)))
|
22
|
+
end
|
23
|
+
|
14
24
|
# :stopdoc:
|
15
25
|
|
16
26
|
def Hpricot.make(input, opts = {})
|
17
27
|
opts = {:fixup_tags => false}.merge(opts)
|
28
|
+
|
29
|
+
case opts[:encoding]
|
30
|
+
when nil
|
31
|
+
when 'utf-8'
|
32
|
+
unless defined? Encoding::Character::UTF8
|
33
|
+
raise EncodingError, "The ruby-character-encodings library could not be found for utf-8 mode."
|
34
|
+
end
|
35
|
+
else
|
36
|
+
raise EncodingError, "No encoding option `#{opts[:encoding]}' is available."
|
37
|
+
end
|
38
|
+
|
39
|
+
if opts[:xhtml_strict]
|
40
|
+
opts[:fixup_tags] = true
|
41
|
+
end
|
42
|
+
|
18
43
|
stack = [[nil, nil, [], [], [], []]]
|
19
44
|
Hpricot.scan(input) do |token|
|
20
|
-
if stack.last[5] == :CDATA and !
|
45
|
+
if stack.last[5] == :CDATA and ![:procins, :comment, :cdata].include?(token[0]) and
|
46
|
+
!(token[0] == :etag and token[1].downcase == stack.last[0])
|
21
47
|
token[0] = :text
|
22
48
|
token[1] = token[3] if token[3]
|
23
49
|
end
|
24
50
|
|
25
51
|
case token[0]
|
26
52
|
when :stag
|
53
|
+
case opts[:encoding] when 'utf-8'
|
54
|
+
token.map! { |str| u(str) if str.is_a? String }
|
55
|
+
end
|
56
|
+
|
27
57
|
stagname = token[0] = token[1].downcase
|
28
|
-
if ElementContent[stagname] == :EMPTY
|
58
|
+
if ElementContent[stagname] == :EMPTY and !opts[:xml]
|
29
59
|
token[0] = :emptytag
|
30
60
|
stack.last[2] << token
|
31
61
|
else
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
62
|
+
unless opts[:xml]
|
63
|
+
if opts[:fixup_tags]
|
64
|
+
# obey the tag rules set up by the current element
|
65
|
+
if ElementContent.has_key? stagname
|
66
|
+
trans = nil
|
67
|
+
(stack.length-1).downto(0) do |i|
|
68
|
+
untags = stack[i][5]
|
69
|
+
break unless untags.include? stagname
|
70
|
+
# puts "** ILLEGAL #{stagname} IN #{stack[i][0]}"
|
71
|
+
trans = i
|
72
|
+
end
|
73
|
+
if trans.to_i > 1
|
74
|
+
eles = stack.slice!(trans..-1)
|
75
|
+
stack.last[2] += eles
|
76
|
+
# puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}"
|
77
|
+
end
|
78
|
+
elsif opts[:xhtml_strict]
|
79
|
+
token[2] = {'class' => stagname}
|
80
|
+
stagname = token[0] = "div"
|
46
81
|
end
|
47
82
|
end
|
48
|
-
end
|
49
83
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
84
|
+
# setup tag rules for inside this element
|
85
|
+
if ElementContent[stagname] == :CDATA
|
86
|
+
uncontainable_tags = :CDATA
|
87
|
+
elsif opts[:fixup_tags]
|
88
|
+
possible_tags = ElementContent[stagname]
|
89
|
+
excluded_tags, included_tags = stack.last[3..4]
|
90
|
+
if possible_tags
|
91
|
+
excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
|
92
|
+
included_tags = included_tags | (ElementInclusions[stagname] || [])
|
93
|
+
containable_tags = (possible_tags | included_tags) - excluded_tags
|
94
|
+
uncontainable_tags = ElementContent.keys - containable_tags
|
95
|
+
else
|
96
|
+
# If the tagname is unknown, it is assumed that any element
|
97
|
+
# except excluded can be contained.
|
98
|
+
uncontainable_tags = excluded_tags
|
99
|
+
end
|
65
100
|
end
|
66
101
|
end
|
67
102
|
stack << [stagname, token, [], excluded_tags, included_tags, uncontainable_tags]
|
68
103
|
end
|
69
104
|
when :etag
|
70
105
|
etagname = token[0] = token[1].downcase
|
106
|
+
if opts[:xhtml_strict] and not ElementContent.has_key? etagname
|
107
|
+
etagname = token[0] = "div"
|
108
|
+
end
|
71
109
|
matched_elem = nil
|
72
110
|
(stack.length-1).downto(0) do |i|
|
73
111
|
stagname, = stack[i]
|
@@ -80,7 +118,7 @@ module Hpricot
|
|
80
118
|
end
|
81
119
|
end
|
82
120
|
unless matched_elem
|
83
|
-
stack.last[2] << [:bogus_etag, token]
|
121
|
+
stack.last[2] << [:bogus_etag, token.first, token.last]
|
84
122
|
else
|
85
123
|
ele = stack.pop
|
86
124
|
stack.last[2] << ele
|
@@ -103,63 +141,10 @@ module Hpricot
|
|
103
141
|
end
|
104
142
|
|
105
143
|
structure_list = stack[0][2]
|
106
|
-
structure_list.map {|s| build_node(s) }
|
107
|
-
end
|
108
|
-
|
109
|
-
def Hpricot.fix_element(elem, excluded_tags, included_tags)
|
110
|
-
tagname, _, attrs, sraw, _, _, _, eraw = elem[1]
|
111
|
-
children = elem[2]
|
112
|
-
if eraw
|
113
|
-
elem[2] = fix_structure_list(children)
|
114
|
-
return elem, []
|
115
|
-
else
|
116
|
-
if ElementContent[tagname] == :EMPTY
|
117
|
-
elem[2] = []
|
118
|
-
return elem, children
|
119
|
-
else
|
120
|
-
if ElementContent[tagname] == :CDATA
|
121
|
-
possible_tags = []
|
122
|
-
else
|
123
|
-
possible_tags = ElementContent[tagname]
|
124
|
-
end
|
125
|
-
if possible_tags
|
126
|
-
excluded_tags2 = ElementExclusions[tagname]
|
127
|
-
included_tags2 = ElementInclusions[tagname]
|
128
|
-
excluded_tags |= excluded_tags2 if excluded_tags2
|
129
|
-
included_tags |= included_tags2 if included_tags2
|
130
|
-
containable_tags = (possible_tags | included_tags) - excluded_tags
|
131
|
-
uncontainable_tags = ElementContent.keys - containable_tags
|
132
|
-
else
|
133
|
-
# If the tagname is unknown, it is assumed that any element
|
134
|
-
# except excluded can be contained.
|
135
|
-
uncontainable_tags = excluded_tags
|
136
|
-
end
|
137
|
-
fixed_children = []
|
138
|
-
rest = children
|
139
|
-
until rest.empty?
|
140
|
-
if String === rest[0][0]
|
141
|
-
elem = rest.shift
|
142
|
-
elem_tagname = elem[0]
|
143
|
-
elem_tagname = elem_tagname.downcase
|
144
|
-
if uncontainable_tags.include? elem_tagname
|
145
|
-
rest.unshift elem
|
146
|
-
break
|
147
|
-
else
|
148
|
-
fixed_elem, rest2 = fix_element(elem, excluded_tags, included_tags)
|
149
|
-
fixed_children << fixed_elem
|
150
|
-
rest = rest2 + rest
|
151
|
-
end
|
152
|
-
else
|
153
|
-
fixed_children << rest.shift
|
154
|
-
end
|
155
|
-
end
|
156
|
-
elem[2] = fixed_children
|
157
|
-
return elem, rest
|
158
|
-
end
|
159
|
-
end
|
144
|
+
structure_list.map {|s| build_node(s, opts) }
|
160
145
|
end
|
161
146
|
|
162
|
-
def Hpricot.build_node(structure)
|
147
|
+
def Hpricot.build_node(structure, opts = {})
|
163
148
|
case structure[0]
|
164
149
|
when String
|
165
150
|
tagname, _, attrs, sraw, _, _, _, eraw = structure[1]
|
@@ -168,7 +153,7 @@ module Hpricot
|
|
168
153
|
stag = STag.parse(tagname, attrs, sraw, true)
|
169
154
|
if !children.empty? || etag
|
170
155
|
Elem.new(stag,
|
171
|
-
children.map {|c| build_node(c) },
|
156
|
+
children.map {|c| build_node(c, opts) },
|
172
157
|
etag)
|
173
158
|
else
|
174
159
|
Elem.new(stag)
|
@@ -182,9 +167,13 @@ module Hpricot
|
|
182
167
|
when :xmldecl
|
183
168
|
XMLDecl.parse(structure[2], structure[3])
|
184
169
|
when :doctype
|
170
|
+
if opts[:xhtml_strict]
|
171
|
+
structure[2]['system_id'] = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
|
172
|
+
structure[2]['public_id'] = "-//W3C//DTD XHTML 1.0 Strict//EN"
|
173
|
+
end
|
185
174
|
DocType.parse(structure[1], structure[2], structure[3])
|
186
175
|
when :procins
|
187
|
-
ProcIns.parse(structure[1]
|
176
|
+
ProcIns.parse(structure[1])
|
188
177
|
when :comment
|
189
178
|
Comment.parse(structure[1])
|
190
179
|
when :cdata_content
|
@@ -216,20 +205,16 @@ module Hpricot
|
|
216
205
|
|
217
206
|
def Text.parse_pcdata(raw_string)
|
218
207
|
result = Text.new(raw_string)
|
219
|
-
result.raw_string = raw_string
|
220
208
|
result
|
221
209
|
end
|
222
210
|
|
223
211
|
def Text.parse_cdata_content(raw_string)
|
224
|
-
result =
|
225
|
-
result.raw_string = raw_string
|
226
|
-
result.instance_variable_set( "@cdata", true )
|
212
|
+
result = CData.new(raw_string)
|
227
213
|
result
|
228
214
|
end
|
229
215
|
|
230
216
|
def Text.parse_cdata_section(content)
|
231
|
-
result =
|
232
|
-
result.raw_string = "<![CDATA[" + content + "]]>"
|
217
|
+
result = CData.new(content)
|
233
218
|
result
|
234
219
|
end
|
235
220
|
|
@@ -264,15 +249,14 @@ module Hpricot
|
|
264
249
|
result
|
265
250
|
end
|
266
251
|
|
267
|
-
def ProcIns.parse(
|
252
|
+
def ProcIns.parse(raw_string)
|
253
|
+
_, target, content = *raw_string.match(/\A<\?(\S+)\s+(.+)/m)
|
268
254
|
result = ProcIns.new(target, content)
|
269
|
-
result.raw_string = raw_string
|
270
255
|
result
|
271
256
|
end
|
272
257
|
|
273
258
|
def Comment.parse(content)
|
274
259
|
result = Comment.new(content)
|
275
|
-
result.raw_string = "<!--" + content + "-->"
|
276
260
|
result
|
277
261
|
end
|
278
262
|
|
data/lib/hpricot/tag.rb
CHANGED
@@ -6,12 +6,13 @@ module Hpricot
|
|
6
6
|
def initialize(children)
|
7
7
|
@children = children ? children.each { |c| c.parent = self } : []
|
8
8
|
end
|
9
|
-
def output(out)
|
9
|
+
def output(out, opts = {})
|
10
10
|
@children.each do |n|
|
11
|
-
n.output(out)
|
11
|
+
n.output(out, opts)
|
12
12
|
end
|
13
13
|
out
|
14
14
|
end
|
15
|
+
def altered!; end
|
15
16
|
end
|
16
17
|
|
17
18
|
class BaseEle
|
@@ -19,6 +20,26 @@ module Hpricot
|
|
19
20
|
def html_quote(str)
|
20
21
|
"\"" + str.gsub('"', '\\"') + "\""
|
21
22
|
end
|
23
|
+
def if_output(opts)
|
24
|
+
if opts[:preserve] and not @raw_string.nil?
|
25
|
+
@raw_string
|
26
|
+
else
|
27
|
+
yield opts
|
28
|
+
end
|
29
|
+
end
|
30
|
+
def pathname; self.name end
|
31
|
+
def altered!
|
32
|
+
@raw_string = nil
|
33
|
+
end
|
34
|
+
def self.alterable(*fields)
|
35
|
+
attr_accessor(*fields)
|
36
|
+
fields.each do |f|
|
37
|
+
define_method("#{f}=") do |v|
|
38
|
+
altered!
|
39
|
+
instance_variable_set("@#{f}", v)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
22
43
|
end
|
23
44
|
|
24
45
|
class Elem
|
@@ -28,16 +49,34 @@ module Hpricot
|
|
28
49
|
@children = children ? children.each { |c| c.parent = self } : []
|
29
50
|
end
|
30
51
|
def empty?; @children.empty? end
|
31
|
-
[:name, :attributes, :parent].each do |m|
|
32
|
-
[m, "#{m}="].each { |m2| define_method(m2) { |*a| @stag.send(m2, *a) } }
|
52
|
+
[:name, :attributes, :parent, :altered!].each do |m|
|
53
|
+
[m, "#{m}="].each { |m2| define_method(m2) { |*a| [@etag, @stag].inject { |_,t| t.send(m2, *a) if t and t.respond_to?(m2) } } }
|
54
|
+
end
|
55
|
+
def to_plain_text
|
56
|
+
if self.name == 'br'
|
57
|
+
"\n"
|
58
|
+
elsif self.name == 'p'
|
59
|
+
"\n\n" + super + "\n\n"
|
60
|
+
elsif self.name == 'a' and self.has_attribute?('href')
|
61
|
+
"#{super} [#{self['href']}]"
|
62
|
+
elsif self.name == 'img' and self.has_attribute?('src')
|
63
|
+
"[img:#{self['src']}]"
|
64
|
+
else
|
65
|
+
super
|
66
|
+
end
|
33
67
|
end
|
34
|
-
def
|
68
|
+
def pathname; self.name end
|
69
|
+
def output(out, opts = {})
|
35
70
|
if empty? and ElementContent[@stag.name] == :EMPTY
|
36
|
-
@stag.output(out, :style => :empty)
|
71
|
+
@stag.output(out, opts.merge(:style => :empty))
|
37
72
|
else
|
38
|
-
@stag.output(out)
|
39
|
-
@children.each { |n| n.output(out) }
|
40
|
-
@
|
73
|
+
@stag.output(out, opts)
|
74
|
+
@children.each { |n| n.output(out, opts) }
|
75
|
+
if @etag
|
76
|
+
@etag.output(out, opts)
|
77
|
+
elsif !opts[:preserve]
|
78
|
+
ETag.new(@stag.name).output(out, opts)
|
79
|
+
end
|
41
80
|
end
|
42
81
|
out
|
43
82
|
end
|
@@ -46,11 +85,12 @@ module Hpricot
|
|
46
85
|
class STag < BaseEle
|
47
86
|
def initialize(name, attributes=nil)
|
48
87
|
@name = name.downcase
|
88
|
+
@attributes = {}
|
49
89
|
if attributes
|
50
90
|
@attributes = attributes.inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
|
51
91
|
end
|
52
92
|
end
|
53
|
-
|
93
|
+
alterable :name, :attributes
|
54
94
|
def attributes_as_html
|
55
95
|
if @attributes
|
56
96
|
@attributes.map do |aname, aval|
|
@@ -61,10 +101,7 @@ module Hpricot
|
|
61
101
|
end
|
62
102
|
def output(out, opts = {})
|
63
103
|
out <<
|
64
|
-
|
65
|
-
when :end
|
66
|
-
"</#{@name}>"
|
67
|
-
else
|
104
|
+
if_output(opts) do
|
68
105
|
"<#{@name}#{attributes_as_html}" +
|
69
106
|
(opts[:style] == :empty ? " /" : "") +
|
70
107
|
">"
|
@@ -76,20 +113,43 @@ module Hpricot
|
|
76
113
|
def initialize(qualified_name)
|
77
114
|
@name = qualified_name
|
78
115
|
end
|
79
|
-
|
116
|
+
alterable :name
|
117
|
+
def output(out, opts = {})
|
118
|
+
out <<
|
119
|
+
if_output(opts) do
|
120
|
+
"</#{@name}>"
|
121
|
+
end
|
122
|
+
end
|
80
123
|
end
|
81
124
|
|
82
125
|
class BogusETag < ETag
|
83
|
-
def output(out); end
|
126
|
+
def output(out, opts = {}); out << if_output(opts) { '' }; end
|
84
127
|
end
|
85
128
|
|
86
129
|
class Text < BaseEle
|
87
130
|
def initialize(text)
|
88
131
|
@content = text
|
89
132
|
end
|
90
|
-
|
91
|
-
def
|
92
|
-
|
133
|
+
alterable :content
|
134
|
+
def pathname; "text()" end
|
135
|
+
alias_method :inner_text, :content
|
136
|
+
alias_method :to_plain_text, :content
|
137
|
+
def output(out, opts = {})
|
138
|
+
out <<
|
139
|
+
if_output(opts) do
|
140
|
+
@content
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
class CData < Text
|
146
|
+
alias_method :inner_text, :content
|
147
|
+
alias_method :to_plain_text, :content
|
148
|
+
def output(out, opts = {})
|
149
|
+
out <<
|
150
|
+
if_output(opts) do
|
151
|
+
"<![CDATA[#@content]]>"
|
152
|
+
end
|
93
153
|
end
|
94
154
|
end
|
95
155
|
|
@@ -97,26 +157,32 @@ module Hpricot
|
|
97
157
|
def initialize(version, encoding, standalone)
|
98
158
|
@version, @encoding, @standalone = version, encoding, standalone
|
99
159
|
end
|
100
|
-
|
101
|
-
def
|
160
|
+
alterable :version, :encoding, :standalone
|
161
|
+
def pathname; "xmldecl()" end
|
162
|
+
def output(out, opts = {})
|
102
163
|
out <<
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
164
|
+
if_output(opts) do
|
165
|
+
"<?xml version=\"#{@version}\"" +
|
166
|
+
(@encoding ? " encoding=\"#{encoding}\"" : "") +
|
167
|
+
(@standalone != nil ? " standalone=\"#{standalone ? 'yes' : 'no'}\"" : "") +
|
168
|
+
"?>"
|
169
|
+
end
|
107
170
|
end
|
108
171
|
end
|
109
172
|
|
110
173
|
class DocType < BaseEle
|
111
|
-
def initialize(
|
112
|
-
@
|
174
|
+
def initialize(target, pubid, sysid)
|
175
|
+
@target, @public_id, @system_id = target, pubid, sysid
|
113
176
|
end
|
114
|
-
|
115
|
-
def
|
177
|
+
alterable :target, :public_id, :system_id
|
178
|
+
def pathname; "doctype()" end
|
179
|
+
def output(out, opts = {})
|
116
180
|
out <<
|
117
|
-
|
118
|
-
|
119
|
-
|
181
|
+
if_output(opts) do
|
182
|
+
"<!DOCTYPE #{@target} " +
|
183
|
+
(@public_id ? "PUBLIC \"#{@public_id}\"" : "SYSTEM") +
|
184
|
+
(@system_id ? " #{html_quote(@system_id)}" : "") + ">"
|
185
|
+
end
|
120
186
|
end
|
121
187
|
end
|
122
188
|
|
@@ -124,11 +190,15 @@ module Hpricot
|
|
124
190
|
def initialize(target, content)
|
125
191
|
@target, @content = target, content
|
126
192
|
end
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
193
|
+
def pathname; "procins()" end
|
194
|
+
alterable :target, :content
|
195
|
+
def output(out, opts = {})
|
196
|
+
out <<
|
197
|
+
if_output(opts) do
|
198
|
+
"<?#{@target}" +
|
199
|
+
(@content ? " #{@content}" : "") +
|
200
|
+
"?>"
|
201
|
+
end
|
132
202
|
end
|
133
203
|
end
|
134
204
|
|
@@ -136,9 +206,13 @@ module Hpricot
|
|
136
206
|
def initialize(content)
|
137
207
|
@content = content
|
138
208
|
end
|
139
|
-
|
140
|
-
|
141
|
-
|
209
|
+
def pathname; "comment()" end
|
210
|
+
alterable :content
|
211
|
+
def output(out, opts = {})
|
212
|
+
out <<
|
213
|
+
if_output(opts) do
|
214
|
+
"<!--#{@content}-->"
|
215
|
+
end
|
142
216
|
end
|
143
217
|
end
|
144
218
|
|