hpricot 0.4-mswin32 → 0.5-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,13 @@ require 'pp'
2
2
 
3
3
  module Hpricot
4
4
  # :stopdoc:
5
+ class Elements
6
+ def pretty_print(q)
7
+ q.object_group(self) { super }
8
+ end
9
+ alias inspect pretty_print_inspect
10
+ end
11
+
5
12
  class Doc
6
13
  def pretty_print(q)
7
14
  q.object_group(self) { @children.each {|elt| q.breakable; q.pp elt } }
@@ -73,6 +80,12 @@ module Hpricot
73
80
  alias inspect pretty_print_inspect
74
81
  end
75
82
 
83
+ class Text
84
+ def pretty_print(q)
85
+ q.text @content.dump
86
+ end
87
+ end
88
+
76
89
  class BogusETag
77
90
  def pretty_print(q)
78
91
  q.group(1, '{', '}') {
@@ -5,69 +5,107 @@ def Hpricot(input, opts = {})
5
5
  end
6
6
 
7
7
  module Hpricot
8
+ # Exception class used for any errors related to deficiencies in the system when
9
+ # handling the character encodings of a document.
10
+ class EncodingError < StandardError; end
11
+
8
12
  # Hpricot.parse parses <i>input</i> and return a document tree.
9
13
  # represented by Hpricot::Doc.
10
14
  def Hpricot.parse(input, opts = {})
11
15
  Doc.new(make(input, opts))
12
16
  end
13
17
 
18
+ # Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
19
+ # and returning a document tree.
20
+ def Hpricot.XML(input, opts = {})
21
+ Doc.new(make(input, opts.merge(:xml => true)))
22
+ end
23
+
14
24
  # :stopdoc:
15
25
 
16
26
  def Hpricot.make(input, opts = {})
17
27
  opts = {:fixup_tags => false}.merge(opts)
28
+
29
+ case opts[:encoding]
30
+ when nil
31
+ when 'utf-8'
32
+ unless defined? Encoding::Character::UTF8
33
+ raise EncodingError, "The ruby-character-encodings library could not be found for utf-8 mode."
34
+ end
35
+ else
36
+ raise EncodingError, "No encoding option `#{opts[:encoding]}' is available."
37
+ end
38
+
39
+ if opts[:xhtml_strict]
40
+ opts[:fixup_tags] = true
41
+ end
42
+
18
43
  stack = [[nil, nil, [], [], [], []]]
19
44
  Hpricot.scan(input) do |token|
20
- if stack.last[5] == :CDATA and !(token[0] == :etag and token[1].downcase == stack.last[0])
45
+ if stack.last[5] == :CDATA and ![:procins, :comment, :cdata].include?(token[0]) and
46
+ !(token[0] == :etag and token[1].downcase == stack.last[0])
21
47
  token[0] = :text
22
48
  token[1] = token[3] if token[3]
23
49
  end
24
50
 
25
51
  case token[0]
26
52
  when :stag
53
+ case opts[:encoding] when 'utf-8'
54
+ token.map! { |str| u(str) if str.is_a? String }
55
+ end
56
+
27
57
  stagname = token[0] = token[1].downcase
28
- if ElementContent[stagname] == :EMPTY
58
+ if ElementContent[stagname] == :EMPTY and !opts[:xml]
29
59
  token[0] = :emptytag
30
60
  stack.last[2] << token
31
61
  else
32
- if opts[:fixup_tags]
33
- # obey the tag rules set up by the current element
34
- if ElementContent.has_key? stagname
35
- trans = nil
36
- (stack.length-1).downto(0) do |i|
37
- untags = stack[i][5]
38
- break unless untags.include? stagname
39
- # puts "** ILLEGAL #{stagname} IN #{stack[i][0]}"
40
- trans = i
41
- end
42
- if trans.to_i > 1
43
- eles = stack.slice!(trans..-1)
44
- stack.last[2] += eles
45
- # puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}"
62
+ unless opts[:xml]
63
+ if opts[:fixup_tags]
64
+ # obey the tag rules set up by the current element
65
+ if ElementContent.has_key? stagname
66
+ trans = nil
67
+ (stack.length-1).downto(0) do |i|
68
+ untags = stack[i][5]
69
+ break unless untags.include? stagname
70
+ # puts "** ILLEGAL #{stagname} IN #{stack[i][0]}"
71
+ trans = i
72
+ end
73
+ if trans.to_i > 1
74
+ eles = stack.slice!(trans..-1)
75
+ stack.last[2] += eles
76
+ # puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}"
77
+ end
78
+ elsif opts[:xhtml_strict]
79
+ token[2] = {'class' => stagname}
80
+ stagname = token[0] = "div"
46
81
  end
47
82
  end
48
- end
49
83
 
50
- # setup tag rules for inside this element
51
- if ElementContent[stagname] == :CDATA
52
- uncontainable_tags = :CDATA
53
- elsif opts[:fixup_tags]
54
- possible_tags = ElementContent[stagname]
55
- excluded_tags, included_tags = stack.last[3..4]
56
- if possible_tags
57
- excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
58
- included_tags = included_tags | (ElementInclusions[stagname] || [])
59
- containable_tags = (possible_tags | included_tags) - excluded_tags
60
- uncontainable_tags = ElementContent.keys - containable_tags
61
- else
62
- # If the tagname is unknown, it is assumed that any element
63
- # except excluded can be contained.
64
- uncontainable_tags = excluded_tags
84
+ # setup tag rules for inside this element
85
+ if ElementContent[stagname] == :CDATA
86
+ uncontainable_tags = :CDATA
87
+ elsif opts[:fixup_tags]
88
+ possible_tags = ElementContent[stagname]
89
+ excluded_tags, included_tags = stack.last[3..4]
90
+ if possible_tags
91
+ excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
92
+ included_tags = included_tags | (ElementInclusions[stagname] || [])
93
+ containable_tags = (possible_tags | included_tags) - excluded_tags
94
+ uncontainable_tags = ElementContent.keys - containable_tags
95
+ else
96
+ # If the tagname is unknown, it is assumed that any element
97
+ # except excluded can be contained.
98
+ uncontainable_tags = excluded_tags
99
+ end
65
100
  end
66
101
  end
67
102
  stack << [stagname, token, [], excluded_tags, included_tags, uncontainable_tags]
68
103
  end
69
104
  when :etag
70
105
  etagname = token[0] = token[1].downcase
106
+ if opts[:xhtml_strict] and not ElementContent.has_key? etagname
107
+ etagname = token[0] = "div"
108
+ end
71
109
  matched_elem = nil
72
110
  (stack.length-1).downto(0) do |i|
73
111
  stagname, = stack[i]
@@ -80,7 +118,7 @@ module Hpricot
80
118
  end
81
119
  end
82
120
  unless matched_elem
83
- stack.last[2] << [:bogus_etag, token]
121
+ stack.last[2] << [:bogus_etag, token.first, token.last]
84
122
  else
85
123
  ele = stack.pop
86
124
  stack.last[2] << ele
@@ -103,63 +141,10 @@ module Hpricot
103
141
  end
104
142
 
105
143
  structure_list = stack[0][2]
106
- structure_list.map {|s| build_node(s) }
107
- end
108
-
109
- def Hpricot.fix_element(elem, excluded_tags, included_tags)
110
- tagname, _, attrs, sraw, _, _, _, eraw = elem[1]
111
- children = elem[2]
112
- if eraw
113
- elem[2] = fix_structure_list(children)
114
- return elem, []
115
- else
116
- if ElementContent[tagname] == :EMPTY
117
- elem[2] = []
118
- return elem, children
119
- else
120
- if ElementContent[tagname] == :CDATA
121
- possible_tags = []
122
- else
123
- possible_tags = ElementContent[tagname]
124
- end
125
- if possible_tags
126
- excluded_tags2 = ElementExclusions[tagname]
127
- included_tags2 = ElementInclusions[tagname]
128
- excluded_tags |= excluded_tags2 if excluded_tags2
129
- included_tags |= included_tags2 if included_tags2
130
- containable_tags = (possible_tags | included_tags) - excluded_tags
131
- uncontainable_tags = ElementContent.keys - containable_tags
132
- else
133
- # If the tagname is unknown, it is assumed that any element
134
- # except excluded can be contained.
135
- uncontainable_tags = excluded_tags
136
- end
137
- fixed_children = []
138
- rest = children
139
- until rest.empty?
140
- if String === rest[0][0]
141
- elem = rest.shift
142
- elem_tagname = elem[0]
143
- elem_tagname = elem_tagname.downcase
144
- if uncontainable_tags.include? elem_tagname
145
- rest.unshift elem
146
- break
147
- else
148
- fixed_elem, rest2 = fix_element(elem, excluded_tags, included_tags)
149
- fixed_children << fixed_elem
150
- rest = rest2 + rest
151
- end
152
- else
153
- fixed_children << rest.shift
154
- end
155
- end
156
- elem[2] = fixed_children
157
- return elem, rest
158
- end
159
- end
144
+ structure_list.map {|s| build_node(s, opts) }
160
145
  end
161
146
 
162
- def Hpricot.build_node(structure)
147
+ def Hpricot.build_node(structure, opts = {})
163
148
  case structure[0]
164
149
  when String
165
150
  tagname, _, attrs, sraw, _, _, _, eraw = structure[1]
@@ -168,7 +153,7 @@ module Hpricot
168
153
  stag = STag.parse(tagname, attrs, sraw, true)
169
154
  if !children.empty? || etag
170
155
  Elem.new(stag,
171
- children.map {|c| build_node(c) },
156
+ children.map {|c| build_node(c, opts) },
172
157
  etag)
173
158
  else
174
159
  Elem.new(stag)
@@ -182,9 +167,13 @@ module Hpricot
182
167
  when :xmldecl
183
168
  XMLDecl.parse(structure[2], structure[3])
184
169
  when :doctype
170
+ if opts[:xhtml_strict]
171
+ structure[2]['system_id'] = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
172
+ structure[2]['public_id'] = "-//W3C//DTD XHTML 1.0 Strict//EN"
173
+ end
185
174
  DocType.parse(structure[1], structure[2], structure[3])
186
175
  when :procins
187
- ProcIns.parse(structure[1], structure[2], structure[3])
176
+ ProcIns.parse(structure[1])
188
177
  when :comment
189
178
  Comment.parse(structure[1])
190
179
  when :cdata_content
@@ -216,20 +205,16 @@ module Hpricot
216
205
 
217
206
  def Text.parse_pcdata(raw_string)
218
207
  result = Text.new(raw_string)
219
- result.raw_string = raw_string
220
208
  result
221
209
  end
222
210
 
223
211
  def Text.parse_cdata_content(raw_string)
224
- result = Text.new(raw_string)
225
- result.raw_string = raw_string
226
- result.instance_variable_set( "@cdata", true )
212
+ result = CData.new(raw_string)
227
213
  result
228
214
  end
229
215
 
230
216
  def Text.parse_cdata_section(content)
231
- result = Text.new(content)
232
- result.raw_string = "<![CDATA[" + content + "]]>"
217
+ result = CData.new(content)
233
218
  result
234
219
  end
235
220
 
@@ -264,15 +249,14 @@ module Hpricot
264
249
  result
265
250
  end
266
251
 
267
- def ProcIns.parse(target, content, raw_string)
252
+ def ProcIns.parse(raw_string)
253
+ _, target, content = *raw_string.match(/\A<\?(\S+)\s+(.+)/m)
268
254
  result = ProcIns.new(target, content)
269
- result.raw_string = raw_string
270
255
  result
271
256
  end
272
257
 
273
258
  def Comment.parse(content)
274
259
  result = Comment.new(content)
275
- result.raw_string = "<!--" + content + "-->"
276
260
  result
277
261
  end
278
262
 
@@ -6,12 +6,13 @@ module Hpricot
6
6
  def initialize(children)
7
7
  @children = children ? children.each { |c| c.parent = self } : []
8
8
  end
9
- def output(out)
9
+ def output(out, opts = {})
10
10
  @children.each do |n|
11
- n.output(out)
11
+ n.output(out, opts)
12
12
  end
13
13
  out
14
14
  end
15
+ def altered!; end
15
16
  end
16
17
 
17
18
  class BaseEle
@@ -19,6 +20,26 @@ module Hpricot
19
20
  def html_quote(str)
20
21
  "\"" + str.gsub('"', '\\"') + "\""
21
22
  end
23
+ def if_output(opts)
24
+ if opts[:preserve] and not @raw_string.nil?
25
+ @raw_string
26
+ else
27
+ yield opts
28
+ end
29
+ end
30
+ def pathname; self.name end
31
+ def altered!
32
+ @raw_string = nil
33
+ end
34
+ def self.alterable(*fields)
35
+ attr_accessor(*fields)
36
+ fields.each do |f|
37
+ define_method("#{f}=") do |v|
38
+ altered!
39
+ instance_variable_set("@#{f}", v)
40
+ end
41
+ end
42
+ end
22
43
  end
23
44
 
24
45
  class Elem
@@ -28,16 +49,34 @@ module Hpricot
28
49
  @children = children ? children.each { |c| c.parent = self } : []
29
50
  end
30
51
  def empty?; @children.empty? end
31
- [:name, :attributes, :parent].each do |m|
32
- [m, "#{m}="].each { |m2| define_method(m2) { |*a| @stag.send(m2, *a) } }
52
+ [:name, :attributes, :parent, :altered!].each do |m|
53
+ [m, "#{m}="].each { |m2| define_method(m2) { |*a| [@etag, @stag].inject { |_,t| t.send(m2, *a) if t and t.respond_to?(m2) } } }
54
+ end
55
+ def to_plain_text
56
+ if self.name == 'br'
57
+ "\n"
58
+ elsif self.name == 'p'
59
+ "\n\n" + super + "\n\n"
60
+ elsif self.name == 'a' and self.has_attribute?('href')
61
+ "#{super} [#{self['href']}]"
62
+ elsif self.name == 'img' and self.has_attribute?('src')
63
+ "[img:#{self['src']}]"
64
+ else
65
+ super
66
+ end
33
67
  end
34
- def output(out)
68
+ def pathname; self.name end
69
+ def output(out, opts = {})
35
70
  if empty? and ElementContent[@stag.name] == :EMPTY
36
- @stag.output(out, :style => :empty)
71
+ @stag.output(out, opts.merge(:style => :empty))
37
72
  else
38
- @stag.output(out)
39
- @children.each { |n| n.output(out) }
40
- @stag.output(out, :style => :end)
73
+ @stag.output(out, opts)
74
+ @children.each { |n| n.output(out, opts) }
75
+ if @etag
76
+ @etag.output(out, opts)
77
+ elsif !opts[:preserve]
78
+ ETag.new(@stag.name).output(out, opts)
79
+ end
41
80
  end
42
81
  out
43
82
  end
@@ -46,11 +85,12 @@ module Hpricot
46
85
  class STag < BaseEle
47
86
  def initialize(name, attributes=nil)
48
87
  @name = name.downcase
88
+ @attributes = {}
49
89
  if attributes
50
90
  @attributes = attributes.inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
51
91
  end
52
92
  end
53
- attr_accessor :name, :attributes
93
+ alterable :name, :attributes
54
94
  def attributes_as_html
55
95
  if @attributes
56
96
  @attributes.map do |aname, aval|
@@ -61,10 +101,7 @@ module Hpricot
61
101
  end
62
102
  def output(out, opts = {})
63
103
  out <<
64
- case opts[:style]
65
- when :end
66
- "</#{@name}>"
67
- else
104
+ if_output(opts) do
68
105
  "<#{@name}#{attributes_as_html}" +
69
106
  (opts[:style] == :empty ? " /" : "") +
70
107
  ">"
@@ -76,20 +113,43 @@ module Hpricot
76
113
  def initialize(qualified_name)
77
114
  @name = qualified_name
78
115
  end
79
- attr_reader :name
116
+ alterable :name
117
+ def output(out, opts = {})
118
+ out <<
119
+ if_output(opts) do
120
+ "</#{@name}>"
121
+ end
122
+ end
80
123
  end
81
124
 
82
125
  class BogusETag < ETag
83
- def output(out); end
126
+ def output(out, opts = {}); out << if_output(opts) { '' }; end
84
127
  end
85
128
 
86
129
  class Text < BaseEle
87
130
  def initialize(text)
88
131
  @content = text
89
132
  end
90
- attr_reader :content
91
- def output(out)
92
- out << @content
133
+ alterable :content
134
+ def pathname; "text()" end
135
+ alias_method :inner_text, :content
136
+ alias_method :to_plain_text, :content
137
+ def output(out, opts = {})
138
+ out <<
139
+ if_output(opts) do
140
+ @content
141
+ end
142
+ end
143
+ end
144
+
145
+ class CData < Text
146
+ alias_method :inner_text, :content
147
+ alias_method :to_plain_text, :content
148
+ def output(out, opts = {})
149
+ out <<
150
+ if_output(opts) do
151
+ "<![CDATA[#@content]]>"
152
+ end
93
153
  end
94
154
  end
95
155
 
@@ -97,26 +157,32 @@ module Hpricot
97
157
  def initialize(version, encoding, standalone)
98
158
  @version, @encoding, @standalone = version, encoding, standalone
99
159
  end
100
- attr_reader :version, :encoding, :standalone
101
- def output(out)
160
+ alterable :version, :encoding, :standalone
161
+ def pathname; "xmldecl()" end
162
+ def output(out, opts = {})
102
163
  out <<
103
- "<?xml version=\"#{@version}\"" +
104
- (@encoding ? " encoding=\"#{encoding}\"" : "") +
105
- (@standalone != nil ? " standalone=\"#{standalone ? 'yes' : 'no'}\"" : "") +
106
- "?>"
164
+ if_output(opts) do
165
+ "<?xml version=\"#{@version}\"" +
166
+ (@encoding ? " encoding=\"#{encoding}\"" : "") +
167
+ (@standalone != nil ? " standalone=\"#{standalone ? 'yes' : 'no'}\"" : "") +
168
+ "?>"
169
+ end
107
170
  end
108
171
  end
109
172
 
110
173
  class DocType < BaseEle
111
- def initialize(name, pubid, sysid)
112
- @name, @public_id, @system_id = name, pubid, sysid
174
+ def initialize(target, pubid, sysid)
175
+ @target, @public_id, @system_id = target, pubid, sysid
113
176
  end
114
- attr_reader :name, :public_id, :system_id
115
- def output(out)
177
+ alterable :target, :public_id, :system_id
178
+ def pathname; "doctype()" end
179
+ def output(out, opts = {})
116
180
  out <<
117
- "<!DOCTYPE #{@name} " +
118
- (@public_id ? "PUBLIC \"#{@public_id}\"" : "SYSTEM") +
119
- (@system_id ? " #{html_quote(@system_id)}" : "") + ">"
181
+ if_output(opts) do
182
+ "<!DOCTYPE #{@target} " +
183
+ (@public_id ? "PUBLIC \"#{@public_id}\"" : "SYSTEM") +
184
+ (@system_id ? " #{html_quote(@system_id)}" : "") + ">"
185
+ end
120
186
  end
121
187
  end
122
188
 
@@ -124,11 +190,15 @@ module Hpricot
124
190
  def initialize(target, content)
125
191
  @target, @content = target, content
126
192
  end
127
- attr_reader :target, :content
128
- def output(out)
129
- out << "<?#{@target}" +
130
- (@content ? " #{@content}" : "") +
131
- "?>"
193
+ def pathname; "procins()" end
194
+ alterable :target, :content
195
+ def output(out, opts = {})
196
+ out <<
197
+ if_output(opts) do
198
+ "<?#{@target}" +
199
+ (@content ? " #{@content}" : "") +
200
+ "?>"
201
+ end
132
202
  end
133
203
  end
134
204
 
@@ -136,9 +206,13 @@ module Hpricot
136
206
  def initialize(content)
137
207
  @content = content
138
208
  end
139
- attr_reader :content
140
- def output(out)
141
- out << "<!--#{@content}-->"
209
+ def pathname; "comment()" end
210
+ alterable :content
211
+ def output(out, opts = {})
212
+ out <<
213
+ if_output(opts) do
214
+ "<!--#{@content}-->"
215
+ end
142
216
  end
143
217
  end
144
218