hpricot 0.6.164 → 0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,4 @@
1
1
  require './hpricot_scan.so'
2
2
 
3
3
  doc = "<doc><person><test>YESSS</test></person><train>SET</train></doc>"
4
- Hpricot.scan(doc) { |x| p x }
5
- p Hpricot.lemon(doc)
4
+ p Hpricot.scan(doc)
@@ -1,20 +1,13 @@
1
1
  require 'hpricot/tags'
2
2
  require 'fast_xs'
3
3
  require 'hpricot/blankslate'
4
+ require 'hpricot/htmlinfo'
4
5
 
5
6
  module Hpricot
6
- PREDEFINED = {
7
- 34 => '&quot;', # quotation mark
8
- 38 => '&amp;', # ampersand
9
- 60 => '&lt;', # left angle bracket
10
- 62 => '&gt;' # right angle bracket
11
- }
12
- PREDEFINED_U = PREDEFINED.inject({}) { |hsh, (k, v)| hsh[v] = k; hsh }
13
-
14
7
  # XML unescape
15
8
  def self.uxs(str)
16
9
  str.to_s.
17
- gsub(/\&\w+;/) { |x| (PREDEFINED_U[x] || ??).chr }.
10
+ gsub(/\&(\w+);/) { [NamedCharacters[$1] || ??].pack("U*") }.
18
11
  gsub(/\&\#(\d+);/) { [$1.to_i].pack("U*") }
19
12
  end
20
13
 
@@ -23,7 +16,7 @@ module Hpricot
23
16
  assigns.each do |k, v|
24
17
  ele.instance_variable_set("@#{k}", v)
25
18
  end
26
- ele.instance_eval &blk
19
+ ele.instance_eval(&blk)
27
20
  ele
28
21
  end
29
22
 
@@ -45,14 +38,21 @@ module Hpricot
45
38
  @@default[option] = value
46
39
  end
47
40
 
41
+ def add_child ele
42
+ ele.parent = self
43
+ self.children ||= []
44
+ self.children << ele
45
+ ele
46
+ end
47
+
48
48
  # Write a +string+ to the HTML stream, making sure to escape it.
49
49
  def text!(string)
50
- @children << Text.new(string.fast_xs)
50
+ add_child Text.new(string.fast_xs)
51
51
  end
52
52
 
53
53
  # Write a +string+ to the HTML stream without escaping it.
54
54
  def text(string)
55
- @children << Text.new(string)
55
+ add_child Text.new(string)
56
56
  nil
57
57
  end
58
58
  alias_method :<<, :text
@@ -67,11 +67,11 @@ module Hpricot
67
67
  raise InvalidXhtmlError, "no element `#{tag}' for #{tagset.doctype}"
68
68
  elsif args.last.respond_to?(:to_hash)
69
69
  attrs = args.last.to_hash
70
-
70
+
71
71
  if @tagset.forms.include?(tag) and attrs[:id]
72
72
  attrs[:name] ||= attrs[:id]
73
73
  end
74
-
74
+
75
75
  attrs.each do |k, v|
76
76
  atname = k.to_s.downcase.intern
77
77
  unless k =~ /:/ or @tagset.tagset[tag].include? atname
@@ -105,14 +105,15 @@ module Hpricot
105
105
  end
106
106
 
107
107
  # create the element itself
108
- f = Elem.new(STag.new(tag, attrs), childs, ETag.new(tag))
108
+ tag = tag.to_s
109
+ f = Elem.new(tag, attrs, childs, ETag.new(tag))
109
110
 
110
111
  # build children from the block
111
112
  if block
112
113
  build(f, &block)
113
114
  end
114
115
 
115
- @children << f
116
+ add_child f
116
117
  f
117
118
  end
118
119
 
@@ -145,11 +146,11 @@ module Hpricot
145
146
  end
146
147
 
147
148
  def doctype(target, pub, sys)
148
- @children << DocType.new(target, pub, sys)
149
+ add_child DocType.new(target, pub, sys)
149
150
  end
150
151
 
151
152
  remove_method :head
152
-
153
+
153
154
  # Builds a head tag. Adds a <tt>meta</tt> tag inside with Content-Type
154
155
  # set to <tt>text/html; charset=utf-8</tt>.
155
156
  def head(*args, &block)
@@ -193,7 +194,7 @@ module Hpricot
193
194
  def initialize(builder, sym)
194
195
  @builder, @sym, @attrs = builder, sym, {}
195
196
  end
196
-
197
+
197
198
  # Adds attributes to an element. Bang methods set the :id attribute.
198
199
  # Other methods add to the :class attribute.
199
200
  def method_missing(id_or_class, *args, &block)
@@ -207,7 +208,7 @@ module Hpricot
207
208
  args.push(@attrs)
208
209
  return @builder.tag!(@sym, *args, &block)
209
210
  end
210
-
211
+
211
212
  return self
212
213
  end
213
214
 
@@ -168,7 +168,7 @@ module Hpricot
168
168
  end
169
169
  x.parent.replace_child(x, wrap)
170
170
  nest = nest.children.first until nest.empty?
171
- nest.html(nest.children + [x])
171
+ nest.html([x])
172
172
  end
173
173
  end
174
174
 
@@ -275,7 +275,7 @@ module Hpricot
275
275
  expr = $'
276
276
  m.compact!
277
277
  if m[0] == '@'
278
- m[0] = "@#{m.slice!(2,1)}"
278
+ m[0] = "@#{m.slice!(2,1).join}"
279
279
  end
280
280
 
281
281
  if m[0] == '[' && m[1] =~ /^\d+$/
@@ -300,10 +300,10 @@ module Hpricot
300
300
  args = m[1..-1]
301
301
  end
302
302
  end
303
- i = -1
303
+ args << -1
304
304
  nodes = Elements[*nodes.find_all do |x|
305
- i += 1
306
- x.send(meth, *([*args] + [i])) ? truth : !truth
305
+ args[-1] += 1
306
+ x.send(meth, *args) ? truth : !truth
307
307
  end]
308
308
  end
309
309
  end
@@ -422,7 +422,7 @@ module Hpricot
422
422
  case arg
423
423
  when 'even'; (parent.containers.index(self) + 1) % 2 == 0
424
424
  when 'odd'; (parent.containers.index(self) + 1) % 2 == 1
425
- else self == (parent.containers[arg.to_i + 1])
425
+ else self == (parent.containers[arg.to_i - 1])
426
426
  end
427
427
  end
428
428
 
@@ -446,23 +446,23 @@ module Hpricot
446
446
  parent.containers.length == 1
447
447
  end
448
448
 
449
- filter :parent do
449
+ filter :parent do |*a|
450
450
  containers.length > 0
451
451
  end
452
452
 
453
- filter :empty do
453
+ filter :empty do |*a|
454
454
  containers.length == 0
455
455
  end
456
456
 
457
- filter :root do
457
+ filter :root do |*a|
458
458
  self.is_a? Hpricot::Doc
459
459
  end
460
460
 
461
- filter 'text' do
461
+ filter 'text' do |*a|
462
462
  self.text?
463
463
  end
464
464
 
465
- filter 'comment' do
465
+ filter 'comment' do |*a|
466
466
  self.comment?
467
467
  end
468
468
 
@@ -495,7 +495,7 @@ module Hpricot
495
495
  end
496
496
 
497
497
  filter 'text()' do |val,i|
498
- !self.inner_text.strip.empty?
498
+ self.children.grep(Hpricot::Text).detect { |x| x.content =~ /\S/ } if self.children
499
499
  end
500
500
 
501
501
  filter '@' do |attr,val,i|
@@ -473,9 +473,23 @@ module Hpricot
473
473
  "menu", "noframes", "noscript", "object", "ol", "p", "pre", "q", "s",
474
474
  "samp", "script", "select", "small", "span", "strike", "strong", "sub",
475
475
  "sup", "table", "textarea", "tt", "u", "ul", "var"]}
476
+ ElementContent.keys.each do |k|
477
+ v = ElementContent[k]
478
+ if v.is_a? Array
479
+ ElementContent[k] = v.inject({}) do |h, name|
480
+ h[name.hash] = true
481
+ h
482
+ end
483
+ end
484
+ end
476
485
 
477
486
  ElementInclusions =
478
487
  {"head"=>["link", "meta", "object", "script", "style"], "body"=>["del", "ins"]}
488
+ ElementInclusions.each do |k, v|
489
+ v.each do |name|
490
+ ElementContent[k][name.hash] = :allow
491
+ end
492
+ end
479
493
 
480
494
  ElementExclusions =
481
495
  {"button"=>
@@ -496,6 +510,11 @@ module Hpricot
496
510
  "h1", "h2", "h3", "h4", "h5", "h6", "hr", "isindex", "menu", "noframes",
497
511
  "noscript", "ol", "p", "pre", "table", "ul"],
498
512
  "label"=>["label"]}
513
+ ElementExclusions.each do |k, v|
514
+ v.each do |name|
515
+ ElementContent[k][name.hash] = :deny
516
+ end
517
+ end
499
518
 
500
519
  OmittedAttrName =
501
520
  {"h6"=>
@@ -11,28 +11,7 @@ module Hpricot
11
11
 
12
12
  class Doc
13
13
  def pretty_print(q)
14
- q.object_group(self) { @children.each {|elt| q.breakable; q.pp elt } }
15
- end
16
- alias inspect pretty_print_inspect
17
- end
18
-
19
- class Elem
20
- def pretty_print(q)
21
- if empty?
22
- q.group(1, '{emptyelem', '}') {
23
- q.breakable; q.pp @stag
24
- }
25
- else
26
- q.group(1, "{elem", "}") {
27
- q.breakable; q.pp @stag
28
- if @children
29
- @children.each {|elt| q.breakable; q.pp elt }
30
- end
31
- if @etag
32
- q.breakable; q.pp @etag
33
- end
34
- }
35
- end
14
+ q.object_group(self) { children.each {|elt| q.breakable; q.pp elt } }
36
15
  end
37
16
  alias inspect pretty_print_inspect
38
17
  end
@@ -41,7 +20,7 @@ module Hpricot
41
20
  def pretty_print(q)
42
21
  q.group(1, '{', '}') {
43
22
  q.text self.class.name.sub(/.*::/,'').downcase
44
- if rs = @raw_string
23
+ if rs = raw_string
45
24
  rs.scan(/[^\r\n]*(?:\r\n?|\n|[^\r\n]\z)/) {|line|
46
25
  q.breakable
47
26
  q.pp line
@@ -55,13 +34,30 @@ module Hpricot
55
34
  alias inspect pretty_print_inspect
56
35
  end
57
36
 
58
- class STag
37
+ class Elem
59
38
  def pretty_print(q)
39
+ if empty?
40
+ q.group(1, '{emptyelem', '}') {
41
+ q.breakable; pretty_print_stag q
42
+ }
43
+ else
44
+ q.group(1, "{elem", "}") {
45
+ q.breakable; pretty_print_stag q
46
+ if children
47
+ children.each {|elt| q.breakable; q.pp elt }
48
+ end
49
+ if etag
50
+ q.breakable; q.pp etag
51
+ end
52
+ }
53
+ end
54
+ end
55
+ def pretty_print_stag(q)
60
56
  q.group(1, '<', '>') {
61
- q.text @name
57
+ q.text name
62
58
 
63
- if @raw_attributes
64
- @raw_attributes.each {|n, t|
59
+ if raw_attributes
60
+ raw_attributes.each {|n, t|
65
61
  q.breakable
66
62
  if t
67
63
  q.text "#{n}=\"#{Hpricot.uxs(t)}\""
@@ -78,7 +74,7 @@ module Hpricot
78
74
  class ETag
79
75
  def pretty_print(q)
80
76
  q.group(1, '</', '>') {
81
- q.text @name
77
+ q.text name
82
78
  }
83
79
  end
84
80
  alias inspect pretty_print_inspect
@@ -86,7 +82,7 @@ module Hpricot
86
82
 
87
83
  class Text
88
84
  def pretty_print(q)
89
- q.text @content.dump
85
+ q.text content.dump
90
86
  end
91
87
  end
92
88
 
@@ -94,11 +90,11 @@ module Hpricot
94
90
  def pretty_print(q)
95
91
  q.group(1, '{', '}') {
96
92
  q.text self.class.name.sub(/.*::/,'').downcase
97
- if rs = @raw_string
93
+ if rs = raw_string
98
94
  q.breakable
99
95
  q.text rs
100
96
  else
101
- q.text "</#{@name}>"
97
+ q.text "</#{name}>"
102
98
  end
103
99
  }
104
100
  end
@@ -4,7 +4,6 @@ module Hpricot
4
4
 
5
5
  # :stopdoc:
6
6
  module Tag; include Hpricot end
7
- class STag; include Tag end
8
7
  class ETag; include Tag end
9
8
  # :startdoc:
10
9
 
@@ -12,6 +11,7 @@ module Hpricot
12
11
  module Container; include Node end
13
12
  class Doc; include Container end
14
13
  class Elem; include Container end
14
+
15
15
  module Leaf; include Node end
16
16
  class Text; include Leaf end
17
17
  class XMLDecl; include Leaf end
@@ -25,6 +25,7 @@ module Hpricot
25
25
  module Leaf::Trav; include Traverse end
26
26
  class Doc; module Trav; include Container::Trav end; include Trav end
27
27
  class Elem; module Trav; include Container::Trav end; include Trav end
28
+ class CData; module Trav; include Leaf::Trav end; include Trav end
28
29
  class Text; module Trav; include Leaf::Trav end; include Trav end
29
30
  class XMLDecl; module Trav; include Leaf::Trav end; include Trav end
30
31
  class DocType; module Trav; include Leaf::Trav end; include Trav end
@@ -1,7 +1,7 @@
1
1
  require 'hpricot/htmlinfo'
2
2
 
3
3
  def Hpricot(input = nil, opts = {}, &blk)
4
- Hpricot.parse(input, opts, &blk)
4
+ Hpricot.make(input, opts, &blk)
5
5
  end
6
6
 
7
7
  module Hpricot
@@ -12,287 +12,27 @@ module Hpricot
12
12
  # Hpricot.parse parses <i>input</i> and return a document tree.
13
13
  # represented by Hpricot::Doc.
14
14
  def Hpricot.parse(input = nil, opts = {}, &blk)
15
- Doc.new(make(input, opts, &blk), opts)
15
+ make(input, opts, &blk)
16
16
  end
17
17
 
18
18
  # Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
19
19
  # and returning a document tree.
20
20
  def Hpricot.XML(input = nil, opts = {}, &blk)
21
21
  opts.merge! :xml => true
22
- Doc.new(make(input, opts, &blk), opts)
22
+ make(input, opts, &blk)
23
23
  end
24
24
 
25
25
  # :stopdoc:
26
26
 
27
27
  def Hpricot.make(input = nil, opts = {}, &blk)
28
- opts = {:fixup_tags => false}.merge(opts)
29
- unless input or blk
30
- raise ArgumentError, "An Hpricot document must be built from an input source (a String) or a block."
31
- end
32
-
33
- conv = opts[:xml] ? :to_s : :downcase
34
-
35
- fragment =
36
- if input
37
- case opts[:encoding]
38
- when nil
39
- when 'utf-8'
40
- unless defined? Encoding::Character::UTF8
41
- raise EncodingError, "The ruby-character-encodings library could not be found for utf-8 mode."
42
- end
43
- else
44
- raise EncodingError, "No encoding option `#{opts[:encoding]}' is available."
45
- end
46
-
47
- if opts[:xhtml_strict]
48
- opts[:fixup_tags] = true
49
- end
50
-
51
- stack = [[nil, nil, [], [], [], []]]
52
- Hpricot.scan(input) do |token|
53
- if stack.last[5] == :CDATA and ![:procins, :comment, :cdata].include?(token[0]) and
54
- !(token[0] == :etag and token[1].casecmp(stack.last[0]).zero?)
55
- token[0] = :text
56
- token[1] = token[3] if token[3]
57
- end
58
-
59
- if !opts[:xml] and token[0] == :emptytag
60
- token[1] = token[1].send(conv)
61
- if ElementContent[token[1].downcase] != :EMPTY
62
- token[0] = :stag
63
- end
64
- end
65
-
66
- # TODO: downcase instead when parsing attributes?
67
- if !opts[:xml] and token[2].is_a?(Hash)
68
- token[2] = token[2].inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
69
- end
70
-
71
- case token[0]
72
- when :stag
73
- case opts[:encoding] when 'utf-8'
74
- token.map! { |str| u(str) if str.is_a? String }
75
- end
76
-
77
- stagname = token[0] = token[1] = token[1].send(conv)
78
- if ElementContent[stagname] == :EMPTY and !opts[:xml]
79
- token[0] = :emptytag
80
- stack.last[2] << token
81
- else
82
- unless opts[:xml]
83
- if opts[:fixup_tags]
84
- # obey the tag rules set up by the current element
85
- if ElementContent.has_key? stagname
86
- trans = nil
87
- (stack.length-1).downto(0) do |i|
88
- untags = stack[i][5]
89
- break unless untags.include? stagname
90
- # puts "** ILLEGAL #{stagname} IN #{stack[i][0]}"
91
- trans = i
92
- end
93
- if trans.to_i > 1
94
- eles = stack.slice!(trans..-1)
95
- stack.last[2] += eles
96
- # puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}"
97
- end
98
- elsif opts[:xhtml_strict]
99
- token[2] = {'class' => stagname}
100
- stagname = token[0] = "div"
101
- end
102
- end
103
-
104
- # setup tag rules for inside this element
105
- if ElementContent[stagname] == :CDATA
106
- uncontainable_tags = :CDATA
107
- elsif opts[:fixup_tags]
108
- possible_tags = ElementContent[stagname]
109
- excluded_tags, included_tags = stack.last[3..4]
110
- if possible_tags
111
- excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
112
- included_tags = included_tags | (ElementInclusions[stagname] || [])
113
- containable_tags = (possible_tags | included_tags) - excluded_tags
114
- uncontainable_tags = ElementContent.keys - containable_tags
115
- else
116
- # If the tagname is unknown, it is assumed that any element
117
- # except excluded can be contained.
118
- uncontainable_tags = excluded_tags
119
- end
120
- end
121
- end
122
- unless opts[:xml]
123
- case token[2] when Hash
124
- token[2] = token[2].inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
125
- end
126
- end
127
- stack << [stagname, token, [], excluded_tags, included_tags, uncontainable_tags]
128
- end
129
- when :etag
130
- etagname = token[0] = token[1].send(conv)
131
- if opts[:xhtml_strict] and not ElementContent.has_key? etagname
132
- etagname = token[0] = "div"
133
- end
134
- matched_elem = nil
135
- (stack.length-1).downto(0) do |i|
136
- stagname, = stack[i]
137
- if stagname == etagname
138
- matched_elem = stack[i]
139
- stack[i][1] += token
140
- eles = stack.slice!((i+1)..-1)
141
- stack.last[2] += eles if eles
142
- break
143
- end
144
- end
145
- unless matched_elem
146
- stack.last[2] << [:bogus_etag, token.first, token.last]
147
- else
148
- ele = stack.pop
149
- stack.last[2] << ele
150
- end
151
- when :text
152
- l = stack.last[2].last
153
- if l and l[0] == :text
154
- l[1] += token[1]
155
- else
156
- stack.last[2] << token
157
- end
158
- else
159
- stack.last[2] << token
160
- end
161
- end
162
-
163
- while 1 < stack.length
164
- ele = stack.pop
165
- stack.last[2] << ele
166
- end
167
-
168
- structure_list = stack[0][2]
169
- structure_list.map {|s| build_node(s, opts) }
170
- elsif blk
171
- Hpricot.build(&blk).children
172
- end
173
- end
174
-
175
- def Hpricot.build_node(structure, opts = {})
176
- case structure[0]
177
- when String
178
- tagname, _, attrs, sraw, _, _, _, eraw = structure[1]
179
- children = structure[2]
180
- etag = eraw && ETag.parse(tagname, eraw)
181
- stag = STag.parse(tagname, attrs, sraw, true)
182
- if !children.empty? || etag
183
- Elem.new(stag,
184
- children.map {|c| build_node(c, opts) },
185
- etag)
186
- else
187
- Elem.new(stag)
188
- end
189
- when :text
190
- Text.parse_pcdata(structure[1])
191
- when :emptytag
192
- Elem.new(STag.parse(structure[1], structure[2], structure[3], false))
193
- when :bogus_etag
194
- BogusETag.parse(structure[1], structure[2])
195
- when :xmldecl
196
- XMLDecl.parse(structure[2], structure[3])
197
- when :doctype
198
- if opts[:xhtml_strict]
199
- structure[2]['system_id'] = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
200
- structure[2]['public_id'] = "-//W3C//DTD XHTML 1.0 Strict//EN"
201
- end
202
- DocType.parse(structure[1], structure[2], structure[3])
203
- when :procins
204
- ProcIns.parse(structure[1])
205
- when :comment
206
- Comment.parse(structure[1])
207
- when :cdata_content
208
- Text.parse_cdata_content(structure[1])
209
- when :cdata
210
- Text.parse_cdata_section(structure[1])
28
+ if blk
29
+ doc = Hpricot.build(&blk)
30
+ doc.instance_variable_set("@options", opts)
31
+ doc
211
32
  else
212
- raise "[bug] unknown structure: #{structure.inspect}"
33
+ Hpricot.scan(input, opts)
213
34
  end
214
35
  end
215
36
 
216
- def STag.parse(qname, attrs, raw_string, is_stag)
217
- result = STag.new(qname, attrs)
218
- result.raw_string = raw_string
219
- result
220
- end
221
-
222
- def ETag.parse(qname, raw_string)
223
- result = self.new(qname)
224
- result.raw_string = raw_string
225
- result
226
- end
227
-
228
- def BogusETag.parse(qname, raw_string)
229
- result = self.new(qname)
230
- result.raw_string = raw_string
231
- result
232
- end
233
-
234
- def Text.parse_pcdata(raw_string)
235
- result = Text.new(raw_string)
236
- result
237
- end
238
-
239
- def Text.parse_cdata_content(raw_string)
240
- result = CData.new(raw_string)
241
- result
242
- end
243
-
244
- def Text.parse_cdata_section(content)
245
- result = CData.new(content)
246
- result
247
- end
248
-
249
- def XMLDecl.parse(attrs, raw_string)
250
- attrs ||= {}
251
- version = attrs['version']
252
- encoding = attrs['encoding']
253
- case attrs['standalone']
254
- when 'yes'
255
- standalone = true
256
- when 'no'
257
- standalone = false
258
- else
259
- standalone = nil
260
- end
261
-
262
- result = XMLDecl.new(version, encoding, standalone)
263
- result.raw_string = raw_string
264
- result
265
- end
266
-
267
- def DocType.parse(root_element_name, attrs, raw_string)
268
- if attrs
269
- public_identifier = attrs['public_id']
270
- system_identifier = attrs['system_id']
271
- end
272
-
273
- root_element_name = root_element_name.downcase
274
-
275
- result = DocType.new(root_element_name, public_identifier, system_identifier)
276
- result.raw_string = raw_string
277
- result
278
- end
279
-
280
- def ProcIns.parse(raw_string)
281
- _, target, content = *raw_string.match(/\A<\?(\S+)\s+(.+)/m)
282
- result = ProcIns.new(target, content)
283
- result
284
- end
285
-
286
- def Comment.parse(content)
287
- result = Comment.new(content)
288
- result
289
- end
290
-
291
- module Pat
292
- NameChar = /[-A-Za-z0-9._:]/
293
- Name = /[A-Za-z_:]#{NameChar}*/
294
- Nmtoken = /#{NameChar}+/
295
- end
296
-
297
37
  # :startdoc:
298
38
  end