hpricot 0.4-mswin32 → 0.5-mswin32
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +16 -0
- data/README +279 -4
- data/Rakefile +12 -3
- data/ext/hpricot_scan/hpricot_scan.c +3106 -3348
- data/ext/hpricot_scan/hpricot_scan.rl +78 -38
- data/lib/hpricot.rb +19 -0
- data/lib/hpricot/elements.rb +194 -87
- data/lib/hpricot/inspect.rb +13 -0
- data/lib/hpricot/parse.rb +83 -99
- data/lib/hpricot/tag.rb +114 -40
- data/lib/hpricot/traverse.rb +311 -61
- data/lib/hpricot_scan.so +0 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/test_parser.rb +160 -10
- data/test/test_paths.rb +16 -0
- data/test/test_preserved.rb +46 -0
- data/test/test_xml.rb +15 -0
- metadata +41 -35
data/lib/hpricot/inspect.rb
CHANGED
@@ -2,6 +2,13 @@ require 'pp'
|
|
2
2
|
|
3
3
|
module Hpricot
|
4
4
|
# :stopdoc:
|
5
|
+
class Elements
|
6
|
+
def pretty_print(q)
|
7
|
+
q.object_group(self) { super }
|
8
|
+
end
|
9
|
+
alias inspect pretty_print_inspect
|
10
|
+
end
|
11
|
+
|
5
12
|
class Doc
|
6
13
|
def pretty_print(q)
|
7
14
|
q.object_group(self) { @children.each {|elt| q.breakable; q.pp elt } }
|
@@ -73,6 +80,12 @@ module Hpricot
|
|
73
80
|
alias inspect pretty_print_inspect
|
74
81
|
end
|
75
82
|
|
83
|
+
class Text
|
84
|
+
def pretty_print(q)
|
85
|
+
q.text @content.dump
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
76
89
|
class BogusETag
|
77
90
|
def pretty_print(q)
|
78
91
|
q.group(1, '{', '}') {
|
data/lib/hpricot/parse.rb
CHANGED
@@ -5,69 +5,107 @@ def Hpricot(input, opts = {})
|
|
5
5
|
end
|
6
6
|
|
7
7
|
module Hpricot
|
8
|
+
# Exception class used for any errors related to deficiencies in the system when
|
9
|
+
# handling the character encodings of a document.
|
10
|
+
class EncodingError < StandardError; end
|
11
|
+
|
8
12
|
# Hpricot.parse parses <i>input</i> and returns a document tree
|
9
13
|
# represented by Hpricot::Doc.
|
10
14
|
def Hpricot.parse(input, opts = {})
|
11
15
|
Doc.new(make(input, opts))
|
12
16
|
end
|
13
17
|
|
18
|
+
# Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
|
19
|
+
# and returning a document tree.
|
20
|
+
def Hpricot.XML(input, opts = {})
|
21
|
+
Doc.new(make(input, opts.merge(:xml => true)))
|
22
|
+
end
|
23
|
+
|
14
24
|
# :stopdoc:
|
15
25
|
|
16
26
|
def Hpricot.make(input, opts = {})
|
17
27
|
opts = {:fixup_tags => false}.merge(opts)
|
28
|
+
|
29
|
+
case opts[:encoding]
|
30
|
+
when nil
|
31
|
+
when 'utf-8'
|
32
|
+
unless defined? Encoding::Character::UTF8
|
33
|
+
raise EncodingError, "The ruby-character-encodings library could not be found for utf-8 mode."
|
34
|
+
end
|
35
|
+
else
|
36
|
+
raise EncodingError, "No encoding option `#{opts[:encoding]}' is available."
|
37
|
+
end
|
38
|
+
|
39
|
+
if opts[:xhtml_strict]
|
40
|
+
opts[:fixup_tags] = true
|
41
|
+
end
|
42
|
+
|
18
43
|
stack = [[nil, nil, [], [], [], []]]
|
19
44
|
Hpricot.scan(input) do |token|
|
20
|
-
if stack.last[5] == :CDATA and !
|
45
|
+
if stack.last[5] == :CDATA and ![:procins, :comment, :cdata].include?(token[0]) and
|
46
|
+
!(token[0] == :etag and token[1].downcase == stack.last[0])
|
21
47
|
token[0] = :text
|
22
48
|
token[1] = token[3] if token[3]
|
23
49
|
end
|
24
50
|
|
25
51
|
case token[0]
|
26
52
|
when :stag
|
53
|
+
case opts[:encoding] when 'utf-8'
|
54
|
+
token.map! { |str| u(str) if str.is_a? String }
|
55
|
+
end
|
56
|
+
|
27
57
|
stagname = token[0] = token[1].downcase
|
28
|
-
if ElementContent[stagname] == :EMPTY
|
58
|
+
if ElementContent[stagname] == :EMPTY and !opts[:xml]
|
29
59
|
token[0] = :emptytag
|
30
60
|
stack.last[2] << token
|
31
61
|
else
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
62
|
+
unless opts[:xml]
|
63
|
+
if opts[:fixup_tags]
|
64
|
+
# obey the tag rules set up by the current element
|
65
|
+
if ElementContent.has_key? stagname
|
66
|
+
trans = nil
|
67
|
+
(stack.length-1).downto(0) do |i|
|
68
|
+
untags = stack[i][5]
|
69
|
+
break unless untags.include? stagname
|
70
|
+
# puts "** ILLEGAL #{stagname} IN #{stack[i][0]}"
|
71
|
+
trans = i
|
72
|
+
end
|
73
|
+
if trans.to_i > 1
|
74
|
+
eles = stack.slice!(trans..-1)
|
75
|
+
stack.last[2] += eles
|
76
|
+
# puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}"
|
77
|
+
end
|
78
|
+
elsif opts[:xhtml_strict]
|
79
|
+
token[2] = {'class' => stagname}
|
80
|
+
stagname = token[0] = "div"
|
46
81
|
end
|
47
82
|
end
|
48
|
-
end
|
49
83
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
84
|
+
# setup tag rules for inside this element
|
85
|
+
if ElementContent[stagname] == :CDATA
|
86
|
+
uncontainable_tags = :CDATA
|
87
|
+
elsif opts[:fixup_tags]
|
88
|
+
possible_tags = ElementContent[stagname]
|
89
|
+
excluded_tags, included_tags = stack.last[3..4]
|
90
|
+
if possible_tags
|
91
|
+
excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
|
92
|
+
included_tags = included_tags | (ElementInclusions[stagname] || [])
|
93
|
+
containable_tags = (possible_tags | included_tags) - excluded_tags
|
94
|
+
uncontainable_tags = ElementContent.keys - containable_tags
|
95
|
+
else
|
96
|
+
# If the tagname is unknown, it is assumed that any element
|
97
|
+
# except excluded can be contained.
|
98
|
+
uncontainable_tags = excluded_tags
|
99
|
+
end
|
65
100
|
end
|
66
101
|
end
|
67
102
|
stack << [stagname, token, [], excluded_tags, included_tags, uncontainable_tags]
|
68
103
|
end
|
69
104
|
when :etag
|
70
105
|
etagname = token[0] = token[1].downcase
|
106
|
+
if opts[:xhtml_strict] and not ElementContent.has_key? etagname
|
107
|
+
etagname = token[0] = "div"
|
108
|
+
end
|
71
109
|
matched_elem = nil
|
72
110
|
(stack.length-1).downto(0) do |i|
|
73
111
|
stagname, = stack[i]
|
@@ -80,7 +118,7 @@ module Hpricot
|
|
80
118
|
end
|
81
119
|
end
|
82
120
|
unless matched_elem
|
83
|
-
stack.last[2] << [:bogus_etag, token]
|
121
|
+
stack.last[2] << [:bogus_etag, token.first, token.last]
|
84
122
|
else
|
85
123
|
ele = stack.pop
|
86
124
|
stack.last[2] << ele
|
@@ -103,63 +141,10 @@ module Hpricot
|
|
103
141
|
end
|
104
142
|
|
105
143
|
structure_list = stack[0][2]
|
106
|
-
structure_list.map {|s| build_node(s) }
|
107
|
-
end
|
108
|
-
|
109
|
-
def Hpricot.fix_element(elem, excluded_tags, included_tags)
|
110
|
-
tagname, _, attrs, sraw, _, _, _, eraw = elem[1]
|
111
|
-
children = elem[2]
|
112
|
-
if eraw
|
113
|
-
elem[2] = fix_structure_list(children)
|
114
|
-
return elem, []
|
115
|
-
else
|
116
|
-
if ElementContent[tagname] == :EMPTY
|
117
|
-
elem[2] = []
|
118
|
-
return elem, children
|
119
|
-
else
|
120
|
-
if ElementContent[tagname] == :CDATA
|
121
|
-
possible_tags = []
|
122
|
-
else
|
123
|
-
possible_tags = ElementContent[tagname]
|
124
|
-
end
|
125
|
-
if possible_tags
|
126
|
-
excluded_tags2 = ElementExclusions[tagname]
|
127
|
-
included_tags2 = ElementInclusions[tagname]
|
128
|
-
excluded_tags |= excluded_tags2 if excluded_tags2
|
129
|
-
included_tags |= included_tags2 if included_tags2
|
130
|
-
containable_tags = (possible_tags | included_tags) - excluded_tags
|
131
|
-
uncontainable_tags = ElementContent.keys - containable_tags
|
132
|
-
else
|
133
|
-
# If the tagname is unknown, it is assumed that any element
|
134
|
-
# except excluded can be contained.
|
135
|
-
uncontainable_tags = excluded_tags
|
136
|
-
end
|
137
|
-
fixed_children = []
|
138
|
-
rest = children
|
139
|
-
until rest.empty?
|
140
|
-
if String === rest[0][0]
|
141
|
-
elem = rest.shift
|
142
|
-
elem_tagname = elem[0]
|
143
|
-
elem_tagname = elem_tagname.downcase
|
144
|
-
if uncontainable_tags.include? elem_tagname
|
145
|
-
rest.unshift elem
|
146
|
-
break
|
147
|
-
else
|
148
|
-
fixed_elem, rest2 = fix_element(elem, excluded_tags, included_tags)
|
149
|
-
fixed_children << fixed_elem
|
150
|
-
rest = rest2 + rest
|
151
|
-
end
|
152
|
-
else
|
153
|
-
fixed_children << rest.shift
|
154
|
-
end
|
155
|
-
end
|
156
|
-
elem[2] = fixed_children
|
157
|
-
return elem, rest
|
158
|
-
end
|
159
|
-
end
|
144
|
+
structure_list.map {|s| build_node(s, opts) }
|
160
145
|
end
|
161
146
|
|
162
|
-
def Hpricot.build_node(structure)
|
147
|
+
def Hpricot.build_node(structure, opts = {})
|
163
148
|
case structure[0]
|
164
149
|
when String
|
165
150
|
tagname, _, attrs, sraw, _, _, _, eraw = structure[1]
|
@@ -168,7 +153,7 @@ module Hpricot
|
|
168
153
|
stag = STag.parse(tagname, attrs, sraw, true)
|
169
154
|
if !children.empty? || etag
|
170
155
|
Elem.new(stag,
|
171
|
-
children.map {|c| build_node(c) },
|
156
|
+
children.map {|c| build_node(c, opts) },
|
172
157
|
etag)
|
173
158
|
else
|
174
159
|
Elem.new(stag)
|
@@ -182,9 +167,13 @@ module Hpricot
|
|
182
167
|
when :xmldecl
|
183
168
|
XMLDecl.parse(structure[2], structure[3])
|
184
169
|
when :doctype
|
170
|
+
if opts[:xhtml_strict]
|
171
|
+
structure[2]['system_id'] = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
|
172
|
+
structure[2]['public_id'] = "-//W3C//DTD XHTML 1.0 Strict//EN"
|
173
|
+
end
|
185
174
|
DocType.parse(structure[1], structure[2], structure[3])
|
186
175
|
when :procins
|
187
|
-
ProcIns.parse(structure[1]
|
176
|
+
ProcIns.parse(structure[1])
|
188
177
|
when :comment
|
189
178
|
Comment.parse(structure[1])
|
190
179
|
when :cdata_content
|
@@ -216,20 +205,16 @@ module Hpricot
|
|
216
205
|
|
217
206
|
def Text.parse_pcdata(raw_string)
|
218
207
|
result = Text.new(raw_string)
|
219
|
-
result.raw_string = raw_string
|
220
208
|
result
|
221
209
|
end
|
222
210
|
|
223
211
|
def Text.parse_cdata_content(raw_string)
|
224
|
-
result =
|
225
|
-
result.raw_string = raw_string
|
226
|
-
result.instance_variable_set( "@cdata", true )
|
212
|
+
result = CData.new(raw_string)
|
227
213
|
result
|
228
214
|
end
|
229
215
|
|
230
216
|
def Text.parse_cdata_section(content)
|
231
|
-
result =
|
232
|
-
result.raw_string = "<![CDATA[" + content + "]]>"
|
217
|
+
result = CData.new(content)
|
233
218
|
result
|
234
219
|
end
|
235
220
|
|
@@ -264,15 +249,14 @@ module Hpricot
|
|
264
249
|
result
|
265
250
|
end
|
266
251
|
|
267
|
-
def ProcIns.parse(
|
252
|
+
def ProcIns.parse(raw_string)
|
253
|
+
_, target, content = *raw_string.match(/\A<\?(\S+)\s+(.+)/m)
|
268
254
|
result = ProcIns.new(target, content)
|
269
|
-
result.raw_string = raw_string
|
270
255
|
result
|
271
256
|
end
|
272
257
|
|
273
258
|
def Comment.parse(content)
|
274
259
|
result = Comment.new(content)
|
275
|
-
result.raw_string = "<!--" + content + "-->"
|
276
260
|
result
|
277
261
|
end
|
278
262
|
|
data/lib/hpricot/tag.rb
CHANGED
@@ -6,12 +6,13 @@ module Hpricot
|
|
6
6
|
def initialize(children)
|
7
7
|
@children = children ? children.each { |c| c.parent = self } : []
|
8
8
|
end
|
9
|
-
def output(out)
|
9
|
+
def output(out, opts = {})
|
10
10
|
@children.each do |n|
|
11
|
-
n.output(out)
|
11
|
+
n.output(out, opts)
|
12
12
|
end
|
13
13
|
out
|
14
14
|
end
|
15
|
+
def altered!; end
|
15
16
|
end
|
16
17
|
|
17
18
|
class BaseEle
|
@@ -19,6 +20,26 @@ module Hpricot
|
|
19
20
|
def html_quote(str)
|
20
21
|
"\"" + str.gsub('"', '\\"') + "\""
|
21
22
|
end
|
23
|
+
def if_output(opts)
|
24
|
+
if opts[:preserve] and not @raw_string.nil?
|
25
|
+
@raw_string
|
26
|
+
else
|
27
|
+
yield opts
|
28
|
+
end
|
29
|
+
end
|
30
|
+
def pathname; self.name end
|
31
|
+
def altered!
|
32
|
+
@raw_string = nil
|
33
|
+
end
|
34
|
+
def self.alterable(*fields)
|
35
|
+
attr_accessor(*fields)
|
36
|
+
fields.each do |f|
|
37
|
+
define_method("#{f}=") do |v|
|
38
|
+
altered!
|
39
|
+
instance_variable_set("@#{f}", v)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
22
43
|
end
|
23
44
|
|
24
45
|
class Elem
|
@@ -28,16 +49,34 @@ module Hpricot
|
|
28
49
|
@children = children ? children.each { |c| c.parent = self } : []
|
29
50
|
end
|
30
51
|
def empty?; @children.empty? end
|
31
|
-
[:name, :attributes, :parent].each do |m|
|
32
|
-
[m, "#{m}="].each { |m2| define_method(m2) { |*a| @stag.send(m2, *a) } }
|
52
|
+
[:name, :attributes, :parent, :altered!].each do |m|
|
53
|
+
[m, "#{m}="].each { |m2| define_method(m2) { |*a| [@etag, @stag].inject { |_,t| t.send(m2, *a) if t and t.respond_to?(m2) } } }
|
54
|
+
end
|
55
|
+
def to_plain_text
|
56
|
+
if self.name == 'br'
|
57
|
+
"\n"
|
58
|
+
elsif self.name == 'p'
|
59
|
+
"\n\n" + super + "\n\n"
|
60
|
+
elsif self.name == 'a' and self.has_attribute?('href')
|
61
|
+
"#{super} [#{self['href']}]"
|
62
|
+
elsif self.name == 'img' and self.has_attribute?('src')
|
63
|
+
"[img:#{self['src']}]"
|
64
|
+
else
|
65
|
+
super
|
66
|
+
end
|
33
67
|
end
|
34
|
-
def
|
68
|
+
def pathname; self.name end
|
69
|
+
def output(out, opts = {})
|
35
70
|
if empty? and ElementContent[@stag.name] == :EMPTY
|
36
|
-
@stag.output(out, :style => :empty)
|
71
|
+
@stag.output(out, opts.merge(:style => :empty))
|
37
72
|
else
|
38
|
-
@stag.output(out)
|
39
|
-
@children.each { |n| n.output(out) }
|
40
|
-
@
|
73
|
+
@stag.output(out, opts)
|
74
|
+
@children.each { |n| n.output(out, opts) }
|
75
|
+
if @etag
|
76
|
+
@etag.output(out, opts)
|
77
|
+
elsif !opts[:preserve]
|
78
|
+
ETag.new(@stag.name).output(out, opts)
|
79
|
+
end
|
41
80
|
end
|
42
81
|
out
|
43
82
|
end
|
@@ -46,11 +85,12 @@ module Hpricot
|
|
46
85
|
class STag < BaseEle
|
47
86
|
def initialize(name, attributes=nil)
|
48
87
|
@name = name.downcase
|
88
|
+
@attributes = {}
|
49
89
|
if attributes
|
50
90
|
@attributes = attributes.inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
|
51
91
|
end
|
52
92
|
end
|
53
|
-
|
93
|
+
alterable :name, :attributes
|
54
94
|
def attributes_as_html
|
55
95
|
if @attributes
|
56
96
|
@attributes.map do |aname, aval|
|
@@ -61,10 +101,7 @@ module Hpricot
|
|
61
101
|
end
|
62
102
|
def output(out, opts = {})
|
63
103
|
out <<
|
64
|
-
|
65
|
-
when :end
|
66
|
-
"</#{@name}>"
|
67
|
-
else
|
104
|
+
if_output(opts) do
|
68
105
|
"<#{@name}#{attributes_as_html}" +
|
69
106
|
(opts[:style] == :empty ? " /" : "") +
|
70
107
|
">"
|
@@ -76,20 +113,43 @@ module Hpricot
|
|
76
113
|
def initialize(qualified_name)
|
77
114
|
@name = qualified_name
|
78
115
|
end
|
79
|
-
|
116
|
+
alterable :name
|
117
|
+
def output(out, opts = {})
|
118
|
+
out <<
|
119
|
+
if_output(opts) do
|
120
|
+
"</#{@name}>"
|
121
|
+
end
|
122
|
+
end
|
80
123
|
end
|
81
124
|
|
82
125
|
class BogusETag < ETag
|
83
|
-
def output(out); end
|
126
|
+
def output(out, opts = {}); out << if_output(opts) { '' }; end
|
84
127
|
end
|
85
128
|
|
86
129
|
class Text < BaseEle
|
87
130
|
def initialize(text)
|
88
131
|
@content = text
|
89
132
|
end
|
90
|
-
|
91
|
-
def
|
92
|
-
|
133
|
+
alterable :content
|
134
|
+
def pathname; "text()" end
|
135
|
+
alias_method :inner_text, :content
|
136
|
+
alias_method :to_plain_text, :content
|
137
|
+
def output(out, opts = {})
|
138
|
+
out <<
|
139
|
+
if_output(opts) do
|
140
|
+
@content
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
class CData < Text
|
146
|
+
alias_method :inner_text, :content
|
147
|
+
alias_method :to_plain_text, :content
|
148
|
+
def output(out, opts = {})
|
149
|
+
out <<
|
150
|
+
if_output(opts) do
|
151
|
+
"<![CDATA[#@content]]>"
|
152
|
+
end
|
93
153
|
end
|
94
154
|
end
|
95
155
|
|
@@ -97,26 +157,32 @@ module Hpricot
|
|
97
157
|
def initialize(version, encoding, standalone)
|
98
158
|
@version, @encoding, @standalone = version, encoding, standalone
|
99
159
|
end
|
100
|
-
|
101
|
-
def
|
160
|
+
alterable :version, :encoding, :standalone
|
161
|
+
def pathname; "xmldecl()" end
|
162
|
+
def output(out, opts = {})
|
102
163
|
out <<
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
164
|
+
if_output(opts) do
|
165
|
+
"<?xml version=\"#{@version}\"" +
|
166
|
+
(@encoding ? " encoding=\"#{encoding}\"" : "") +
|
167
|
+
(@standalone != nil ? " standalone=\"#{standalone ? 'yes' : 'no'}\"" : "") +
|
168
|
+
"?>"
|
169
|
+
end
|
107
170
|
end
|
108
171
|
end
|
109
172
|
|
110
173
|
class DocType < BaseEle
|
111
|
-
def initialize(
|
112
|
-
@
|
174
|
+
def initialize(target, pubid, sysid)
|
175
|
+
@target, @public_id, @system_id = target, pubid, sysid
|
113
176
|
end
|
114
|
-
|
115
|
-
def
|
177
|
+
alterable :target, :public_id, :system_id
|
178
|
+
def pathname; "doctype()" end
|
179
|
+
def output(out, opts = {})
|
116
180
|
out <<
|
117
|
-
|
118
|
-
|
119
|
-
|
181
|
+
if_output(opts) do
|
182
|
+
"<!DOCTYPE #{@target} " +
|
183
|
+
(@public_id ? "PUBLIC \"#{@public_id}\"" : "SYSTEM") +
|
184
|
+
(@system_id ? " #{html_quote(@system_id)}" : "") + ">"
|
185
|
+
end
|
120
186
|
end
|
121
187
|
end
|
122
188
|
|
@@ -124,11 +190,15 @@ module Hpricot
|
|
124
190
|
def initialize(target, content)
|
125
191
|
@target, @content = target, content
|
126
192
|
end
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
193
|
+
def pathname; "procins()" end
|
194
|
+
alterable :target, :content
|
195
|
+
def output(out, opts = {})
|
196
|
+
out <<
|
197
|
+
if_output(opts) do
|
198
|
+
"<?#{@target}" +
|
199
|
+
(@content ? " #{@content}" : "") +
|
200
|
+
"?>"
|
201
|
+
end
|
132
202
|
end
|
133
203
|
end
|
134
204
|
|
@@ -136,9 +206,13 @@ module Hpricot
|
|
136
206
|
def initialize(content)
|
137
207
|
@content = content
|
138
208
|
end
|
139
|
-
|
140
|
-
|
141
|
-
|
209
|
+
def pathname; "comment()" end
|
210
|
+
alterable :content
|
211
|
+
def output(out, opts = {})
|
212
|
+
out <<
|
213
|
+
if_output(opts) do
|
214
|
+
"<!--#{@content}-->"
|
215
|
+
end
|
142
216
|
end
|
143
217
|
end
|
144
218
|
|