hpricot 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,6 +2,13 @@ require 'pp'
2
2
 
3
3
  module Hpricot
4
4
  # :stopdoc:
5
+ class Elements
6
+ def pretty_print(q)
7
+ q.object_group(self) { super }
8
+ end
9
+ alias inspect pretty_print_inspect
10
+ end
11
+
5
12
  class Doc
6
13
  def pretty_print(q)
7
14
  q.object_group(self) { @children.each {|elt| q.breakable; q.pp elt } }
@@ -73,6 +80,12 @@ module Hpricot
73
80
  alias inspect pretty_print_inspect
74
81
  end
75
82
 
83
+ class Text
84
+ def pretty_print(q)
85
+ q.text @content.dump
86
+ end
87
+ end
88
+
76
89
  class BogusETag
77
90
  def pretty_print(q)
78
91
  q.group(1, '{', '}') {
@@ -5,69 +5,107 @@ def Hpricot(input, opts = {})
5
5
  end
6
6
 
7
7
  module Hpricot
8
+ # Exception class used for any errors related to deficiencies in the system when
9
+ # handling the character encodings of a document.
10
+ class EncodingError < StandardError; end
11
+
8
12
  # Hpricot.parse parses <i>input</i> and return a document tree.
9
13
  # represented by Hpricot::Doc.
10
14
  def Hpricot.parse(input, opts = {})
11
15
  Doc.new(make(input, opts))
12
16
  end
13
17
 
18
+ # Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
19
+ # and returning a document tree.
20
+ def Hpricot.XML(input, opts = {})
21
+ Doc.new(make(input, opts.merge(:xml => true)))
22
+ end
23
+
14
24
  # :stopdoc:
15
25
 
16
26
  def Hpricot.make(input, opts = {})
17
27
  opts = {:fixup_tags => false}.merge(opts)
28
+
29
+ case opts[:encoding]
30
+ when nil
31
+ when 'utf-8'
32
+ unless defined? Encoding::Character::UTF8
33
+ raise EncodingError, "The ruby-character-encodings library could not be found for utf-8 mode."
34
+ end
35
+ else
36
+ raise EncodingError, "No encoding option `#{opts[:encoding]}' is available."
37
+ end
38
+
39
+ if opts[:xhtml_strict]
40
+ opts[:fixup_tags] = true
41
+ end
42
+
18
43
  stack = [[nil, nil, [], [], [], []]]
19
44
  Hpricot.scan(input) do |token|
20
- if stack.last[5] == :CDATA and !(token[0] == :etag and token[1].downcase == stack.last[0])
45
+ if stack.last[5] == :CDATA and ![:procins, :comment, :cdata].include?(token[0]) and
46
+ !(token[0] == :etag and token[1].downcase == stack.last[0])
21
47
  token[0] = :text
22
48
  token[1] = token[3] if token[3]
23
49
  end
24
50
 
25
51
  case token[0]
26
52
  when :stag
53
+ case opts[:encoding] when 'utf-8'
54
+ token.map! { |str| u(str) if str.is_a? String }
55
+ end
56
+
27
57
  stagname = token[0] = token[1].downcase
28
- if ElementContent[stagname] == :EMPTY
58
+ if ElementContent[stagname] == :EMPTY and !opts[:xml]
29
59
  token[0] = :emptytag
30
60
  stack.last[2] << token
31
61
  else
32
- if opts[:fixup_tags]
33
- # obey the tag rules set up by the current element
34
- if ElementContent.has_key? stagname
35
- trans = nil
36
- (stack.length-1).downto(0) do |i|
37
- untags = stack[i][5]
38
- break unless untags.include? stagname
39
- # puts "** ILLEGAL #{stagname} IN #{stack[i][0]}"
40
- trans = i
41
- end
42
- if trans.to_i > 1
43
- eles = stack.slice!(trans..-1)
44
- stack.last[2] += eles
45
- # puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}"
62
+ unless opts[:xml]
63
+ if opts[:fixup_tags]
64
+ # obey the tag rules set up by the current element
65
+ if ElementContent.has_key? stagname
66
+ trans = nil
67
+ (stack.length-1).downto(0) do |i|
68
+ untags = stack[i][5]
69
+ break unless untags.include? stagname
70
+ # puts "** ILLEGAL #{stagname} IN #{stack[i][0]}"
71
+ trans = i
72
+ end
73
+ if trans.to_i > 1
74
+ eles = stack.slice!(trans..-1)
75
+ stack.last[2] += eles
76
+ # puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}"
77
+ end
78
+ elsif opts[:xhtml_strict]
79
+ token[2] = {'class' => stagname}
80
+ stagname = token[0] = "div"
46
81
  end
47
82
  end
48
- end
49
83
 
50
- # setup tag rules for inside this element
51
- if ElementContent[stagname] == :CDATA
52
- uncontainable_tags = :CDATA
53
- elsif opts[:fixup_tags]
54
- possible_tags = ElementContent[stagname]
55
- excluded_tags, included_tags = stack.last[3..4]
56
- if possible_tags
57
- excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
58
- included_tags = included_tags | (ElementInclusions[stagname] || [])
59
- containable_tags = (possible_tags | included_tags) - excluded_tags
60
- uncontainable_tags = ElementContent.keys - containable_tags
61
- else
62
- # If the tagname is unknown, it is assumed that any element
63
- # except excluded can be contained.
64
- uncontainable_tags = excluded_tags
84
+ # setup tag rules for inside this element
85
+ if ElementContent[stagname] == :CDATA
86
+ uncontainable_tags = :CDATA
87
+ elsif opts[:fixup_tags]
88
+ possible_tags = ElementContent[stagname]
89
+ excluded_tags, included_tags = stack.last[3..4]
90
+ if possible_tags
91
+ excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
92
+ included_tags = included_tags | (ElementInclusions[stagname] || [])
93
+ containable_tags = (possible_tags | included_tags) - excluded_tags
94
+ uncontainable_tags = ElementContent.keys - containable_tags
95
+ else
96
+ # If the tagname is unknown, it is assumed that any element
97
+ # except excluded can be contained.
98
+ uncontainable_tags = excluded_tags
99
+ end
65
100
  end
66
101
  end
67
102
  stack << [stagname, token, [], excluded_tags, included_tags, uncontainable_tags]
68
103
  end
69
104
  when :etag
70
105
  etagname = token[0] = token[1].downcase
106
+ if opts[:xhtml_strict] and not ElementContent.has_key? etagname
107
+ etagname = token[0] = "div"
108
+ end
71
109
  matched_elem = nil
72
110
  (stack.length-1).downto(0) do |i|
73
111
  stagname, = stack[i]
@@ -80,7 +118,7 @@ module Hpricot
80
118
  end
81
119
  end
82
120
  unless matched_elem
83
- stack.last[2] << [:bogus_etag, token]
121
+ stack.last[2] << [:bogus_etag, token.first, token.last]
84
122
  else
85
123
  ele = stack.pop
86
124
  stack.last[2] << ele
@@ -103,63 +141,10 @@ module Hpricot
103
141
  end
104
142
 
105
143
  structure_list = stack[0][2]
106
- structure_list.map {|s| build_node(s) }
107
- end
108
-
109
- def Hpricot.fix_element(elem, excluded_tags, included_tags)
110
- tagname, _, attrs, sraw, _, _, _, eraw = elem[1]
111
- children = elem[2]
112
- if eraw
113
- elem[2] = fix_structure_list(children)
114
- return elem, []
115
- else
116
- if ElementContent[tagname] == :EMPTY
117
- elem[2] = []
118
- return elem, children
119
- else
120
- if ElementContent[tagname] == :CDATA
121
- possible_tags = []
122
- else
123
- possible_tags = ElementContent[tagname]
124
- end
125
- if possible_tags
126
- excluded_tags2 = ElementExclusions[tagname]
127
- included_tags2 = ElementInclusions[tagname]
128
- excluded_tags |= excluded_tags2 if excluded_tags2
129
- included_tags |= included_tags2 if included_tags2
130
- containable_tags = (possible_tags | included_tags) - excluded_tags
131
- uncontainable_tags = ElementContent.keys - containable_tags
132
- else
133
- # If the tagname is unknown, it is assumed that any element
134
- # except excluded can be contained.
135
- uncontainable_tags = excluded_tags
136
- end
137
- fixed_children = []
138
- rest = children
139
- until rest.empty?
140
- if String === rest[0][0]
141
- elem = rest.shift
142
- elem_tagname = elem[0]
143
- elem_tagname = elem_tagname.downcase
144
- if uncontainable_tags.include? elem_tagname
145
- rest.unshift elem
146
- break
147
- else
148
- fixed_elem, rest2 = fix_element(elem, excluded_tags, included_tags)
149
- fixed_children << fixed_elem
150
- rest = rest2 + rest
151
- end
152
- else
153
- fixed_children << rest.shift
154
- end
155
- end
156
- elem[2] = fixed_children
157
- return elem, rest
158
- end
159
- end
144
+ structure_list.map {|s| build_node(s, opts) }
160
145
  end
161
146
 
162
- def Hpricot.build_node(structure)
147
+ def Hpricot.build_node(structure, opts = {})
163
148
  case structure[0]
164
149
  when String
165
150
  tagname, _, attrs, sraw, _, _, _, eraw = structure[1]
@@ -168,7 +153,7 @@ module Hpricot
168
153
  stag = STag.parse(tagname, attrs, sraw, true)
169
154
  if !children.empty? || etag
170
155
  Elem.new(stag,
171
- children.map {|c| build_node(c) },
156
+ children.map {|c| build_node(c, opts) },
172
157
  etag)
173
158
  else
174
159
  Elem.new(stag)
@@ -182,9 +167,13 @@ module Hpricot
182
167
  when :xmldecl
183
168
  XMLDecl.parse(structure[2], structure[3])
184
169
  when :doctype
170
+ if opts[:xhtml_strict]
171
+ structure[2]['system_id'] = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
172
+ structure[2]['public_id'] = "-//W3C//DTD XHTML 1.0 Strict//EN"
173
+ end
185
174
  DocType.parse(structure[1], structure[2], structure[3])
186
175
  when :procins
187
- ProcIns.parse(structure[1], structure[2], structure[3])
176
+ ProcIns.parse(structure[1])
188
177
  when :comment
189
178
  Comment.parse(structure[1])
190
179
  when :cdata_content
@@ -216,20 +205,16 @@ module Hpricot
216
205
 
217
206
  def Text.parse_pcdata(raw_string)
218
207
  result = Text.new(raw_string)
219
- result.raw_string = raw_string
220
208
  result
221
209
  end
222
210
 
223
211
  def Text.parse_cdata_content(raw_string)
224
- result = Text.new(raw_string)
225
- result.raw_string = raw_string
226
- result.instance_variable_set( "@cdata", true )
212
+ result = CData.new(raw_string)
227
213
  result
228
214
  end
229
215
 
230
216
  def Text.parse_cdata_section(content)
231
- result = Text.new(content)
232
- result.raw_string = "<![CDATA[" + content + "]]>"
217
+ result = CData.new(content)
233
218
  result
234
219
  end
235
220
 
@@ -264,15 +249,14 @@ module Hpricot
264
249
  result
265
250
  end
266
251
 
267
- def ProcIns.parse(target, content, raw_string)
252
+ def ProcIns.parse(raw_string)
253
+ _, target, content = *raw_string.match(/\A<\?(\S+)\s+(.+)/m)
268
254
  result = ProcIns.new(target, content)
269
- result.raw_string = raw_string
270
255
  result
271
256
  end
272
257
 
273
258
  def Comment.parse(content)
274
259
  result = Comment.new(content)
275
- result.raw_string = "<!--" + content + "-->"
276
260
  result
277
261
  end
278
262
 
@@ -6,12 +6,13 @@ module Hpricot
6
6
  def initialize(children)
7
7
  @children = children ? children.each { |c| c.parent = self } : []
8
8
  end
9
- def output(out)
9
+ def output(out, opts = {})
10
10
  @children.each do |n|
11
- n.output(out)
11
+ n.output(out, opts)
12
12
  end
13
13
  out
14
14
  end
15
+ def altered!; end
15
16
  end
16
17
 
17
18
  class BaseEle
@@ -19,6 +20,26 @@ module Hpricot
19
20
  def html_quote(str)
20
21
  "\"" + str.gsub('"', '\\"') + "\""
21
22
  end
23
+ def if_output(opts)
24
+ if opts[:preserve] and not @raw_string.nil?
25
+ @raw_string
26
+ else
27
+ yield opts
28
+ end
29
+ end
30
+ def pathname; self.name end
31
+ def altered!
32
+ @raw_string = nil
33
+ end
34
+ def self.alterable(*fields)
35
+ attr_accessor(*fields)
36
+ fields.each do |f|
37
+ define_method("#{f}=") do |v|
38
+ altered!
39
+ instance_variable_set("@#{f}", v)
40
+ end
41
+ end
42
+ end
22
43
  end
23
44
 
24
45
  class Elem
@@ -28,16 +49,34 @@ module Hpricot
28
49
  @children = children ? children.each { |c| c.parent = self } : []
29
50
  end
30
51
  def empty?; @children.empty? end
31
- [:name, :attributes, :parent].each do |m|
32
- [m, "#{m}="].each { |m2| define_method(m2) { |*a| @stag.send(m2, *a) } }
52
+ [:name, :attributes, :parent, :altered!].each do |m|
53
+ [m, "#{m}="].each { |m2| define_method(m2) { |*a| [@etag, @stag].inject { |_,t| t.send(m2, *a) if t and t.respond_to?(m2) } } }
54
+ end
55
+ def to_plain_text
56
+ if self.name == 'br'
57
+ "\n"
58
+ elsif self.name == 'p'
59
+ "\n\n" + super + "\n\n"
60
+ elsif self.name == 'a' and self.has_attribute?('href')
61
+ "#{super} [#{self['href']}]"
62
+ elsif self.name == 'img' and self.has_attribute?('src')
63
+ "[img:#{self['src']}]"
64
+ else
65
+ super
66
+ end
33
67
  end
34
- def output(out)
68
+ def pathname; self.name end
69
+ def output(out, opts = {})
35
70
  if empty? and ElementContent[@stag.name] == :EMPTY
36
- @stag.output(out, :style => :empty)
71
+ @stag.output(out, opts.merge(:style => :empty))
37
72
  else
38
- @stag.output(out)
39
- @children.each { |n| n.output(out) }
40
- @stag.output(out, :style => :end)
73
+ @stag.output(out, opts)
74
+ @children.each { |n| n.output(out, opts) }
75
+ if @etag
76
+ @etag.output(out, opts)
77
+ elsif !opts[:preserve]
78
+ ETag.new(@stag.name).output(out, opts)
79
+ end
41
80
  end
42
81
  out
43
82
  end
@@ -46,11 +85,12 @@ module Hpricot
46
85
  class STag < BaseEle
47
86
  def initialize(name, attributes=nil)
48
87
  @name = name.downcase
88
+ @attributes = {}
49
89
  if attributes
50
90
  @attributes = attributes.inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
51
91
  end
52
92
  end
53
- attr_accessor :name, :attributes
93
+ alterable :name, :attributes
54
94
  def attributes_as_html
55
95
  if @attributes
56
96
  @attributes.map do |aname, aval|
@@ -61,10 +101,7 @@ module Hpricot
61
101
  end
62
102
  def output(out, opts = {})
63
103
  out <<
64
- case opts[:style]
65
- when :end
66
- "</#{@name}>"
67
- else
104
+ if_output(opts) do
68
105
  "<#{@name}#{attributes_as_html}" +
69
106
  (opts[:style] == :empty ? " /" : "") +
70
107
  ">"
@@ -76,20 +113,43 @@ module Hpricot
76
113
  def initialize(qualified_name)
77
114
  @name = qualified_name
78
115
  end
79
- attr_reader :name
116
+ alterable :name
117
+ def output(out, opts = {})
118
+ out <<
119
+ if_output(opts) do
120
+ "</#{@name}>"
121
+ end
122
+ end
80
123
  end
81
124
 
82
125
  class BogusETag < ETag
83
- def output(out); end
126
+ def output(out, opts = {}); out << if_output(opts) { '' }; end
84
127
  end
85
128
 
86
129
  class Text < BaseEle
87
130
  def initialize(text)
88
131
  @content = text
89
132
  end
90
- attr_reader :content
91
- def output(out)
92
- out << @content
133
+ alterable :content
134
+ def pathname; "text()" end
135
+ alias_method :inner_text, :content
136
+ alias_method :to_plain_text, :content
137
+ def output(out, opts = {})
138
+ out <<
139
+ if_output(opts) do
140
+ @content
141
+ end
142
+ end
143
+ end
144
+
145
+ class CData < Text
146
+ alias_method :inner_text, :content
147
+ alias_method :to_plain_text, :content
148
+ def output(out, opts = {})
149
+ out <<
150
+ if_output(opts) do
151
+ "<![CDATA[#@content]]>"
152
+ end
93
153
  end
94
154
  end
95
155
 
@@ -97,26 +157,32 @@ module Hpricot
97
157
  def initialize(version, encoding, standalone)
98
158
  @version, @encoding, @standalone = version, encoding, standalone
99
159
  end
100
- attr_reader :version, :encoding, :standalone
101
- def output(out)
160
+ alterable :version, :encoding, :standalone
161
+ def pathname; "xmldecl()" end
162
+ def output(out, opts = {})
102
163
  out <<
103
- "<?xml version=\"#{@version}\"" +
104
- (@encoding ? " encoding=\"#{encoding}\"" : "") +
105
- (@standalone != nil ? " standalone=\"#{standalone ? 'yes' : 'no'}\"" : "") +
106
- "?>"
164
+ if_output(opts) do
165
+ "<?xml version=\"#{@version}\"" +
166
+ (@encoding ? " encoding=\"#{encoding}\"" : "") +
167
+ (@standalone != nil ? " standalone=\"#{standalone ? 'yes' : 'no'}\"" : "") +
168
+ "?>"
169
+ end
107
170
  end
108
171
  end
109
172
 
110
173
  class DocType < BaseEle
111
- def initialize(name, pubid, sysid)
112
- @name, @public_id, @system_id = name, pubid, sysid
174
+ def initialize(target, pubid, sysid)
175
+ @target, @public_id, @system_id = target, pubid, sysid
113
176
  end
114
- attr_reader :name, :public_id, :system_id
115
- def output(out)
177
+ alterable :target, :public_id, :system_id
178
+ def pathname; "doctype()" end
179
+ def output(out, opts = {})
116
180
  out <<
117
- "<!DOCTYPE #{@name} " +
118
- (@public_id ? "PUBLIC \"#{@public_id}\"" : "SYSTEM") +
119
- (@system_id ? " #{html_quote(@system_id)}" : "") + ">"
181
+ if_output(opts) do
182
+ "<!DOCTYPE #{@target} " +
183
+ (@public_id ? "PUBLIC \"#{@public_id}\"" : "SYSTEM") +
184
+ (@system_id ? " #{html_quote(@system_id)}" : "") + ">"
185
+ end
120
186
  end
121
187
  end
122
188
 
@@ -124,11 +190,15 @@ module Hpricot
124
190
  def initialize(target, content)
125
191
  @target, @content = target, content
126
192
  end
127
- attr_reader :target, :content
128
- def output(out)
129
- out << "<?#{@target}" +
130
- (@content ? " #{@content}" : "") +
131
- "?>"
193
+ def pathname; "procins()" end
194
+ alterable :target, :content
195
+ def output(out, opts = {})
196
+ out <<
197
+ if_output(opts) do
198
+ "<?#{@target}" +
199
+ (@content ? " #{@content}" : "") +
200
+ "?>"
201
+ end
132
202
  end
133
203
  end
134
204
 
@@ -136,9 +206,13 @@ module Hpricot
136
206
  def initialize(content)
137
207
  @content = content
138
208
  end
139
- attr_reader :content
140
- def output(out)
141
- out << "<!--#{@content}-->"
209
+ def pathname; "comment()" end
210
+ alterable :content
211
+ def output(out, opts = {})
212
+ out <<
213
+ if_output(opts) do
214
+ "<!--#{@content}-->"
215
+ end
142
216
  end
143
217
  end
144
218