hpricot 0.5-mswin32 → 0.6-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -60,10 +60,14 @@ module Hpricot
60
60
  q.group(1, '<', '>') {
61
61
  q.text @name
62
62
 
63
- if @attributes
64
- @attributes.each {|n, t|
63
+ if @raw_attributes
64
+ @raw_attributes.each {|n, t|
65
65
  q.breakable
66
- q.text "#{n}=\"#{t}\""
66
+ if t
67
+ q.text "#{n}=\"#{Hpricot.uxs(t)}\""
68
+ else
69
+ q.text n
70
+ end
67
71
  }
68
72
  end
69
73
  }
@@ -1,7 +1,7 @@
1
1
  require 'hpricot/htmlinfo'
2
2
 
3
- def Hpricot(input, opts = {})
4
- Hpricot.parse(input, opts)
3
+ def Hpricot(input = nil, opts = {}, &blk)
4
+ Hpricot.parse(input, opts, &blk)
5
5
  end
6
6
 
7
7
  module Hpricot
@@ -11,8 +11,8 @@ module Hpricot
11
11
 
12
12
  # Hpricot.parse parses <i>input</i> and return a document tree.
13
13
  # represented by Hpricot::Doc.
14
- def Hpricot.parse(input, opts = {})
15
- Doc.new(make(input, opts))
14
+ def Hpricot.parse(input = nil, opts = {}, &blk)
15
+ Doc.new(make(input, opts, &blk))
16
16
  end
17
17
 
18
18
  # Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
@@ -23,125 +23,152 @@ module Hpricot
23
23
 
24
24
  # :stopdoc:
25
25
 
26
- def Hpricot.make(input, opts = {})
26
+ def Hpricot.make(input = nil, opts = {}, &blk)
27
27
  opts = {:fixup_tags => false}.merge(opts)
28
-
29
- case opts[:encoding]
30
- when nil
31
- when 'utf-8'
32
- unless defined? Encoding::Character::UTF8
33
- raise EncodingError, "The ruby-character-encodings library could not be found for utf-8 mode."
34
- end
35
- else
36
- raise EncodingError, "No encoding option `#{opts[:encoding]}' is available."
28
+ unless input or blk
29
+ raise ArgumentError, "An Hpricot document must be built from an input source (a String) or a block."
37
30
  end
38
31
 
39
- if opts[:xhtml_strict]
40
- opts[:fixup_tags] = true
41
- end
32
+ conv = opts[:xml] ? :to_s : :downcase
42
33
 
43
- stack = [[nil, nil, [], [], [], []]]
44
- Hpricot.scan(input) do |token|
45
- if stack.last[5] == :CDATA and ![:procins, :comment, :cdata].include?(token[0]) and
46
- !(token[0] == :etag and token[1].downcase == stack.last[0])
47
- token[0] = :text
48
- token[1] = token[3] if token[3]
34
+ fragment =
35
+ if input
36
+ case opts[:encoding]
37
+ when nil
38
+ when 'utf-8'
39
+ unless defined? Encoding::Character::UTF8
40
+ raise EncodingError, "The ruby-character-encodings library could not be found for utf-8 mode."
41
+ end
42
+ else
43
+ raise EncodingError, "No encoding option `#{opts[:encoding]}' is available."
44
+ end
45
+
46
+ if opts[:xhtml_strict]
47
+ opts[:fixup_tags] = true
49
48
  end
50
49
 
51
- case token[0]
52
- when :stag
53
- case opts[:encoding] when 'utf-8'
54
- token.map! { |str| u(str) if str.is_a? String }
50
+ stack = [[nil, nil, [], [], [], []]]
51
+ Hpricot.scan(input) do |token|
52
+ if stack.last[5] == :CDATA and ![:procins, :comment, :cdata].include?(token[0]) and
53
+ !(token[0] == :etag and token[1].casecmp(stack.last[0]).zero?)
54
+ token[0] = :text
55
+ token[1] = token[3] if token[3]
55
56
  end
56
57
 
57
- stagname = token[0] = token[1].downcase
58
- if ElementContent[stagname] == :EMPTY and !opts[:xml]
59
- token[0] = :emptytag
60
- stack.last[2] << token
61
- else
62
- unless opts[:xml]
63
- if opts[:fixup_tags]
64
- # obey the tag rules set up by the current element
65
- if ElementContent.has_key? stagname
66
- trans = nil
67
- (stack.length-1).downto(0) do |i|
68
- untags = stack[i][5]
69
- break unless untags.include? stagname
70
- # puts "** ILLEGAL #{stagname} IN #{stack[i][0]}"
71
- trans = i
58
+ if !opts[:xml] and token[0] == :emptytag
59
+ token[1] = token[1].send(conv)
60
+ if ElementContent[token[1].downcase] != :EMPTY
61
+ token[0] = :stag
62
+ end
63
+ end
64
+
65
+ # TODO: downcase instead when parsing attributes?
66
+ if !opts[:xml] and token[2].is_a?(Hash)
67
+ token[2] = token[2].inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
68
+ end
69
+
70
+ case token[0]
71
+ when :stag
72
+ case opts[:encoding] when 'utf-8'
73
+ token.map! { |str| u(str) if str.is_a? String }
74
+ end
75
+
76
+ stagname = token[0] = token[1] = token[1].send(conv)
77
+ if ElementContent[stagname] == :EMPTY and !opts[:xml]
78
+ token[0] = :emptytag
79
+ stack.last[2] << token
80
+ else
81
+ unless opts[:xml]
82
+ if opts[:fixup_tags]
83
+ # obey the tag rules set up by the current element
84
+ if ElementContent.has_key? stagname
85
+ trans = nil
86
+ (stack.length-1).downto(0) do |i|
87
+ untags = stack[i][5]
88
+ break unless untags.include? stagname
89
+ # puts "** ILLEGAL #{stagname} IN #{stack[i][0]}"
90
+ trans = i
91
+ end
92
+ if trans.to_i > 1
93
+ eles = stack.slice!(trans..-1)
94
+ stack.last[2] += eles
95
+ # puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}"
96
+ end
97
+ elsif opts[:xhtml_strict]
98
+ token[2] = {'class' => stagname}
99
+ stagname = token[0] = "div"
72
100
  end
73
- if trans.to_i > 1
74
- eles = stack.slice!(trans..-1)
75
- stack.last[2] += eles
76
- # puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}"
101
+ end
102
+
103
+ # setup tag rules for inside this element
104
+ if ElementContent[stagname] == :CDATA
105
+ uncontainable_tags = :CDATA
106
+ elsif opts[:fixup_tags]
107
+ possible_tags = ElementContent[stagname]
108
+ excluded_tags, included_tags = stack.last[3..4]
109
+ if possible_tags
110
+ excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
111
+ included_tags = included_tags | (ElementInclusions[stagname] || [])
112
+ containable_tags = (possible_tags | included_tags) - excluded_tags
113
+ uncontainable_tags = ElementContent.keys - containable_tags
114
+ else
115
+ # If the tagname is unknown, it is assumed that any element
116
+ # except excluded can be contained.
117
+ uncontainable_tags = excluded_tags
77
118
  end
78
- elsif opts[:xhtml_strict]
79
- token[2] = {'class' => stagname}
80
- stagname = token[0] = "div"
81
119
  end
82
120
  end
83
-
84
- # setup tag rules for inside this element
85
- if ElementContent[stagname] == :CDATA
86
- uncontainable_tags = :CDATA
87
- elsif opts[:fixup_tags]
88
- possible_tags = ElementContent[stagname]
89
- excluded_tags, included_tags = stack.last[3..4]
90
- if possible_tags
91
- excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
92
- included_tags = included_tags | (ElementInclusions[stagname] || [])
93
- containable_tags = (possible_tags | included_tags) - excluded_tags
94
- uncontainable_tags = ElementContent.keys - containable_tags
95
- else
96
- # If the tagname is unknown, it is assumed that any element
97
- # except excluded can be contained.
98
- uncontainable_tags = excluded_tags
121
+ unless opts[:xml]
122
+ case token[2] when Hash
123
+ token[2] = token[2].inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
99
124
  end
100
125
  end
126
+ stack << [stagname, token, [], excluded_tags, included_tags, uncontainable_tags]
101
127
  end
102
- stack << [stagname, token, [], excluded_tags, included_tags, uncontainable_tags]
103
- end
104
- when :etag
105
- etagname = token[0] = token[1].downcase
106
- if opts[:xhtml_strict] and not ElementContent.has_key? etagname
107
- etagname = token[0] = "div"
108
- end
109
- matched_elem = nil
110
- (stack.length-1).downto(0) do |i|
111
- stagname, = stack[i]
112
- if stagname == etagname
113
- matched_elem = stack[i]
114
- stack[i][1] += token
115
- eles = stack.slice!((i+1)..-1)
116
- stack.last[2] += eles
117
- break
128
+ when :etag
129
+ etagname = token[0] = token[1].send(conv)
130
+ if opts[:xhtml_strict] and not ElementContent.has_key? etagname
131
+ etagname = token[0] = "div"
132
+ end
133
+ matched_elem = nil
134
+ (stack.length-1).downto(0) do |i|
135
+ stagname, = stack[i]
136
+ if stagname == etagname
137
+ matched_elem = stack[i]
138
+ stack[i][1] += token
139
+ eles = stack.slice!((i+1)..-1)
140
+ stack.last[2] += eles
141
+ break
142
+ end
143
+ end
144
+ unless matched_elem
145
+ stack.last[2] << [:bogus_etag, token.first, token.last]
146
+ else
147
+ ele = stack.pop
148
+ stack.last[2] << ele
149
+ end
150
+ when :text
151
+ l = stack.last[2].last
152
+ if l and l[0] == :text
153
+ l[1] += token[1]
154
+ else
155
+ stack.last[2] << token
118
156
  end
119
- end
120
- unless matched_elem
121
- stack.last[2] << [:bogus_etag, token.first, token.last]
122
- else
123
- ele = stack.pop
124
- stack.last[2] << ele
125
- end
126
- when :text
127
- l = stack.last[2].last
128
- if l and l[0] == :text
129
- l[1] += token[1]
130
157
  else
131
158
  stack.last[2] << token
132
159
  end
133
- else
134
- stack.last[2] << token
135
160
  end
136
- end
137
161
 
138
- while 1 < stack.length
139
- ele = stack.pop
140
- stack.last[2] << ele
141
- end
162
+ while 1 < stack.length
163
+ ele = stack.pop
164
+ stack.last[2] << ele
165
+ end
142
166
 
143
- structure_list = stack[0][2]
144
- structure_list.map {|s| build_node(s, opts) }
167
+ structure_list = stack[0][2]
168
+ structure_list.map {|s| build_node(s, opts) }
169
+ elsif blk
170
+ Hpricot.build(&blk).children
171
+ end
145
172
  end
146
173
 
147
174
  def Hpricot.build_node(structure, opts = {})
@@ -3,7 +3,7 @@ module Hpricot
3
3
 
4
4
  class Doc
5
5
  attr_accessor :children
6
- def initialize(children)
6
+ def initialize(children = [])
7
7
  @children = children ? children.each { |c| c.parent = self } : []
8
8
  end
9
9
  def output(out, opts = {})
@@ -49,9 +49,17 @@ module Hpricot
49
49
  @children = children ? children.each { |c| c.parent = self } : []
50
50
  end
51
51
  def empty?; @children.empty? end
52
- [:name, :attributes, :parent, :altered!].each do |m|
52
+ [:name, :raw_attributes, :parent, :altered!].each do |m|
53
53
  [m, "#{m}="].each { |m2| define_method(m2) { |*a| [@etag, @stag].inject { |_,t| t.send(m2, *a) if t and t.respond_to?(m2) } } }
54
54
  end
55
+ def attributes
56
+ if raw_attributes
57
+ raw_attributes.inject({}) do |hsh, (k, v)|
58
+ hsh[k] = Hpricot.uxs(v)
59
+ hsh
60
+ end
61
+ end
62
+ end
55
63
  def to_plain_text
56
64
  if self.name == 'br'
57
65
  "\n"
@@ -84,18 +92,15 @@ module Hpricot
84
92
 
85
93
  class STag < BaseEle
86
94
  def initialize(name, attributes=nil)
87
- @name = name.downcase
88
- @attributes = {}
89
- if attributes
90
- @attributes = attributes.inject({}) { |hsh,(k,v)| hsh[k.downcase] = v; hsh }
91
- end
95
+ @name = name.to_s
96
+ @raw_attributes = attributes || {}
92
97
  end
93
- alterable :name, :attributes
98
+ alterable :name, :raw_attributes
94
99
  def attributes_as_html
95
- if @attributes
96
- @attributes.map do |aname, aval|
100
+ if @raw_attributes
101
+ @raw_attributes.map do |aname, aval|
97
102
  " #{aname}" +
98
- (aval ? "=#{html_quote(aval)}" : "")
103
+ (aval ? "=\"#{aval}\"" : "")
99
104
  end.join
100
105
  end
101
106
  end
@@ -111,7 +116,7 @@ module Hpricot
111
116
 
112
117
  class ETag < BaseEle
113
118
  def initialize(qualified_name)
114
- @name = qualified_name
119
+ @name = qualified_name.to_s
115
120
  end
116
121
  alterable :name
117
122
  def output(out, opts = {})
@@ -132,8 +137,11 @@ module Hpricot
132
137
  end
133
138
  alterable :content
134
139
  def pathname; "text()" end
135
- alias_method :inner_text, :content
136
- alias_method :to_plain_text, :content
140
+ def to_s
141
+ Hpricot.uxs(@content)
142
+ end
143
+ alias_method :inner_text, :to_s
144
+ alias_method :to_plain_text, :to_s
137
145
  def output(out, opts = {})
138
146
  out <<
139
147
  if_output(opts) do
@@ -143,7 +151,7 @@ module Hpricot
143
151
  end
144
152
 
145
153
  class CData < Text
146
- alias_method :inner_text, :content
154
+ alias_method :to_s, :content
147
155
  alias_method :to_plain_text, :content
148
156
  def output(out, opts = {})
149
157
  out <<
@@ -0,0 +1,164 @@
1
+ module Hpricot
2
+
3
+ FORM_TAGS = [ :form, :input, :select, :textarea ]
4
+ SELF_CLOSING_TAGS = [ :base, :meta, :link, :hr, :br, :param, :img, :area, :input, :col ]
5
+
6
+ # Common sets of attributes.
7
+ AttrCore = [:id, :class, :style, :title]
8
+ AttrI18n = [:lang, 'xml:lang'.intern, :dir]
9
+ AttrEvents = [:onclick, :ondblclick, :onmousedown, :onmouseup, :onmouseover, :onmousemove,
10
+ :onmouseout, :onkeypress, :onkeydown, :onkeyup]
11
+ AttrFocus = [:accesskey, :tabindex, :onfocus, :onblur]
12
+ AttrHAlign = [:align, :char, :charoff]
13
+ AttrVAlign = [:valign]
14
+ Attrs = AttrCore + AttrI18n + AttrEvents
15
+
16
+ # All the tags and attributes from XHTML 1.0 Strict
17
+ class XHTMLStrict
18
+ class << self
19
+ attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
20
+ end
21
+ @doctype = ["-//W3C//DTD XHTML 1.0 Strict//EN", "DTD/xhtml1-strict.dtd"]
22
+ @tagset = {
23
+ :html => AttrI18n + [:id, :xmlns],
24
+ :head => AttrI18n + [:id, :profile],
25
+ :title => AttrI18n + [:id],
26
+ :base => [:href, :id],
27
+ :meta => AttrI18n + [:id, :http, :name, :content, :scheme, 'http-equiv'.intern],
28
+ :link => Attrs + [:charset, :href, :hreflang, :type, :rel, :rev, :media],
29
+ :style => AttrI18n + [:id, :type, :media, :title, 'xml:space'.intern],
30
+ :script => [:id, :charset, :type, :src, :defer, 'xml:space'.intern],
31
+ :noscript => Attrs,
32
+ :body => Attrs + [:onload, :onunload],
33
+ :div => Attrs,
34
+ :p => Attrs,
35
+ :ul => Attrs,
36
+ :ol => Attrs,
37
+ :li => Attrs,
38
+ :dl => Attrs,
39
+ :dt => Attrs,
40
+ :dd => Attrs,
41
+ :address => Attrs,
42
+ :hr => Attrs,
43
+ :pre => Attrs + ['xml:space'.intern],
44
+ :blockquote => Attrs + [:cite],
45
+ :ins => Attrs + [:cite, :datetime],
46
+ :del => Attrs + [:cite, :datetime],
47
+ :a => Attrs + AttrFocus + [:charset, :type, :name, :href, :hreflang, :rel, :rev, :shape, :coords],
48
+ :span => Attrs,
49
+ :bdo => AttrCore + AttrEvents + [:lang, 'xml:lang'.intern, :dir],
50
+ :br => AttrCore,
51
+ :em => Attrs,
52
+ :strong => Attrs,
53
+ :dfn => Attrs,
54
+ :code => Attrs,
55
+ :samp => Attrs,
56
+ :kbd => Attrs,
57
+ :var => Attrs,
58
+ :cite => Attrs,
59
+ :abbr => Attrs,
60
+ :acronym => Attrs,
61
+ :q => Attrs + [:cite],
62
+ :sub => Attrs,
63
+ :sup => Attrs,
64
+ :tt => Attrs,
65
+ :i => Attrs,
66
+ :b => Attrs,
67
+ :big => Attrs,
68
+ :small => Attrs,
69
+ :object => Attrs + [:declare, :classid, :codebase, :data, :type, :codetype, :archive, :standby, :height, :width, :usemap, :name, :tabindex],
70
+ :param => [:id, :name, :value, :valuetype, :type],
71
+ :img => Attrs + [:src, :alt, :longdesc, :height, :width, :usemap, :ismap],
72
+ :map => AttrI18n + AttrEvents + [:id, :class, :style, :title, :name],
73
+ :area => Attrs + AttrFocus + [:shape, :coords, :href, :nohref, :alt],
74
+ :form => Attrs + [:action, :method, :enctype, :onsubmit, :onreset, :accept, :accept],
75
+ :label => Attrs + [:for, :accesskey, :onfocus, :onblur],
76
+ :input => Attrs + AttrFocus + [:type, :name, :value, :checked, :disabled, :readonly, :size, :maxlength, :src, :alt, :usemap, :onselect, :onchange, :accept],
77
+ :select => Attrs + [:name, :size, :multiple, :disabled, :tabindex, :onfocus, :onblur, :onchange],
78
+ :optgroup => Attrs + [:disabled, :label],
79
+ :option => Attrs + [:selected, :disabled, :label, :value],
80
+ :textarea => Attrs + AttrFocus + [:name, :rows, :cols, :disabled, :readonly, :onselect, :onchange],
81
+ :fieldset => Attrs,
82
+ :legend => Attrs + [:accesskey],
83
+ :button => Attrs + AttrFocus + [:name, :value, :type, :disabled],
84
+ :table => Attrs + [:summary, :width, :border, :frame, :rules, :cellspacing, :cellpadding],
85
+ :caption => Attrs,
86
+ :colgroup => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
87
+ :col => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
88
+ :thead => Attrs + AttrHAlign + AttrVAlign,
89
+ :tfoot => Attrs + AttrHAlign + AttrVAlign,
90
+ :tbody => Attrs + AttrHAlign + AttrVAlign,
91
+ :tr => Attrs + AttrHAlign + AttrVAlign,
92
+ :th => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
93
+ :td => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
94
+ :h1 => Attrs,
95
+ :h2 => Attrs,
96
+ :h3 => Attrs,
97
+ :h4 => Attrs,
98
+ :h5 => Attrs,
99
+ :h6 => Attrs
100
+ }
101
+
102
+ @tags = @tagset.keys
103
+ @forms = @tags & FORM_TAGS
104
+ @self_closing = @tags & SELF_CLOSING_TAGS
105
+ end
106
+
107
+ # Additional tags found in XHTML 1.0 Transitional
108
+ class XHTMLTransitional
109
+ class << self
110
+ attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
111
+ end
112
+ @doctype = ["-//W3C//DTD XHTML 1.0 Transitional//EN", "DTD/xhtml1-transitional.dtd"]
113
+ @tagset = XHTMLStrict.tagset.merge \
114
+ :strike => Attrs,
115
+ :center => Attrs,
116
+ :dir => Attrs + [:compact],
117
+ :noframes => Attrs,
118
+ :basefont => [:id, :size, :color, :face],
119
+ :u => Attrs,
120
+ :menu => Attrs + [:compact],
121
+ :iframe => AttrCore + [:longdesc, :name, :src, :frameborder, :marginwidth, :marginheight, :scrolling, :align, :height, :width],
122
+ :font => AttrCore + AttrI18n + [:size, :color, :face],
123
+ :s => Attrs,
124
+ :applet => AttrCore + [:codebase, :archive, :code, :object, :alt, :name, :width, :height, :align, :hspace, :vspace],
125
+ :isindex => AttrCore + AttrI18n + [:prompt]
126
+
127
+ # Additional attributes found in XHTML 1.0 Transitional
128
+ { :script => [:language],
129
+ :a => [:target],
130
+ :td => [:bgcolor, :nowrap, :width, :height],
131
+ :p => [:align],
132
+ :h5 => [:align],
133
+ :h3 => [:align],
134
+ :li => [:type, :value],
135
+ :div => [:align],
136
+ :pre => [:width],
137
+ :body => [:background, :bgcolor, :text, :link, :vlink, :alink],
138
+ :ol => [:type, :compact, :start],
139
+ :h4 => [:align],
140
+ :h2 => [:align],
141
+ :object => [:align, :border, :hspace, :vspace],
142
+ :img => [:name, :align, :border, :hspace, :vspace],
143
+ :link => [:target],
144
+ :legend => [:align],
145
+ :dl => [:compact],
146
+ :input => [:align],
147
+ :h6 => [:align],
148
+ :hr => [:align, :noshade, :size, :width],
149
+ :base => [:target],
150
+ :ul => [:type, :compact],
151
+ :br => [:clear],
152
+ :form => [:name, :target],
153
+ :area => [:target],
154
+ :h1 => [:align]
155
+ }.each do |k, v|
156
+ @tagset[k] += v
157
+ end
158
+
159
+ @tags = @tagset.keys
160
+ @forms = @tags & FORM_TAGS
161
+ @self_closing = @tags & SELF_CLOSING_TAGS
162
+ end
163
+
164
+ end