hpricot 0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,90 @@
1
+ require 'pp'
2
+
3
+ module Hpricot
4
+ # :stopdoc:
5
# Pretty-printing support for document nodes.
class Doc
  # Prints the document as an object group containing every child node,
  # each preceded by a breakable separator.
  def pretty_print(q)
    q.object_group(self) do
      @children.each do |child|
        q.breakable
        q.pp child
      end
    end
  end
  alias inspect pretty_print_inspect
end
11
+
12
# Pretty-printing support for elements.
class Elem
  # Prints "{emptyelem <stag>}" when the element reports itself empty,
  # otherwise "{elem <stag> children... <etag>}".
  def pretty_print(q)
    bare = empty?
    q.group(1, bare ? '{emptyelem' : '{elem', '}') do
      q.breakable
      q.pp @stag
      unless bare
        if @children
          @children.each do |child|
            q.breakable
            q.pp child
          end
        end
        if @etag
          q.breakable
          q.pp @etag
        end
      end
    end
  end
  alias inspect pretty_print_inspect
end
32
+
33
# Pretty-printing support shared by all leaf nodes.
module Leaf
  # Prints "{<lowercased class name> ...}". When a raw string is present it
  # is printed one source line at a time (line terminators kept); otherwise
  # the node's to_s is printed.
  def pretty_print(q)
    q.group(1, '{', '}') do
      q.text self.class.name.sub(/.*::/,'').downcase
      raw = @raw_string
      if raw
        raw.scan(/[^\r\n]*(?:\r\n?|\n|[^\r\n]\z)/) do |line|
          q.breakable
          q.pp line
        end
      elsif respond_to?(:to_s)
        q.breakable
        q.text to_s
      end
    end
  end
  alias inspect pretty_print_inspect
end
50
+
51
# Pretty-printing support for start tags.
class STag
  # Prints "<name attr="value" ...>".
  def pretty_print(q)
    q.group(1, '<', '>') do
      q.text @name

      (@attributes || []).each do |attr_name, attr_value|
        q.breakable
        q.text "#{attr_name}=\"#{attr_value}\""
      end
    end
  end
  alias inspect pretty_print_inspect
end
66
+
67
# Pretty-printing support for end tags.
class ETag
  # Prints "</name>".
  def pretty_print(q)
    q.group(1, '</', '>') do
      q.text @name
    end
  end
  alias inspect pretty_print_inspect
end
75
+
76
# Pretty-printing support for end tags that matched no open element.
class BogusETag
  # Prints "{bogusetag <raw source>}", or "{bogusetag </name>}" when no raw
  # string was recorded. Both branches emit a breakable separator so the
  # class label and the tag text never run together.
  def pretty_print(q)
    q.group(1, '{', '}') do
      q.text self.class.name.sub(/.*::/,'').downcase
      q.breakable
      if @raw_string
        q.text @raw_string
      else
        q.text "</#{@name}>"
      end
    end
  end
  # Matches the other node classes in this file, which all route inspect
  # through the pretty printer.
  alias inspect pretty_print_inspect
end
89
+ # :startdoc:
90
+ end
@@ -0,0 +1,37 @@
1
# Declares the Hpricot node classes and the marker modules mixed into them.
# Only the hierarchy is set up here; no methods are defined.
module Hpricot
  class Name; include Hpricot; end
  class Context; include Hpricot; end

  # :stopdoc:
  # Start and end tags share the Tag marker.
  module Tag; include Hpricot; end
  class STag; include Tag; end
  class ETag; include Tag; end
  # :startdoc:

  # Node is the root marker; Container and Leaf specialise it.
  module Node; include Hpricot; end
  module Container; include Node; end
  class Doc; include Container; end
  class Elem; include Container; end
  module Leaf; include Node; end
  class Text; include Leaf; end
  class XMLDecl; include Leaf; end
  class DocType; include Leaf; end
  class ProcIns; include Leaf; end
  class Comment; include Leaf; end
  class BogusETag; include Leaf; end

  # Traversal markers: each concrete class gets its own Trav module, which
  # in turn includes the Container or Leaf flavour.
  module Traverse; end
  module Container::Trav; include Traverse; end
  module Leaf::Trav; include Traverse; end

  class Doc
    module Trav; include Container::Trav; end
    include Trav
  end

  class Elem
    module Trav; include Container::Trav; end
    include Trav
  end

  class Text
    module Trav; include Leaf::Trav; end
    include Trav
  end

  class XMLDecl
    module Trav; include Leaf::Trav; end
    include Trav
  end

  class DocType
    module Trav; include Leaf::Trav; end
    include Trav
  end

  class ProcIns
    module Trav; include Leaf::Trav; end
    include Trav
  end

  class Comment
    module Trav; include Leaf::Trav; end
    include Trav
  end

  class BogusETag
    module Trav; include Leaf::Trav; end
    include Trav
  end

  class Error < StandardError; end
end
37
+
@@ -0,0 +1,286 @@
1
+ require 'hpricot/htmlinfo'
2
+
3
# Shorthand entry point: Hpricot(input, opts) forwards both arguments to
# Hpricot.parse and returns its result.
def Hpricot(input, opts = {})
  ::Hpricot.parse(input, opts)
end
6
+
7
+ module Hpricot
8
# Hpricot.parse parses <i>input</i> and returns a document tree
# represented by Hpricot::Doc.
def Hpricot.parse(input, opts = {})
  nodes = make(input, opts)
  Doc.new(nodes)
end
13
+
14
+ # :stopdoc:
15
+
16
# Scans +input+ with Hpricot.scan and assembles the yielded tokens into a
# list of node objects (built with build_node).
#
# A stack of frames tracks the currently open elements. Each frame is
#   [tagname, start-tag token, children, excluded_tags, included_tags,
#    uncontainable_tags]
# and the bottom frame (tagname nil) collects the top-level nodes.
#
# Tokens are arrays whose first slot is the token kind. For tags, slot 1 is
# used as the tag name, slot 2 as the attributes and slot 3 as the raw
# source text (this is how build_node later unpacks them).
#
# opts:
#   :fixup_tags - when true, start tags are moved up the stack according to
#                 the ElementContent / ElementExclusions / ElementInclusions
#                 tables. Defaults to false.
#
# Returns an array of nodes.
def Hpricot.make(input, opts = {})
  opts = {:fixup_tags => false}.merge(opts)
  stack = [[nil, nil, [], [], [], []]]
  Hpricot.scan(input) do |token|
    # Inside a CDATA-content element everything except the matching end
    # tag is treated as plain text.
    if stack.last[5] == :CDATA and !(token[0] == :etag and token[1].downcase == stack.last[0])
      token[0] = :text
      token[1] = token[3] if token[3]
    end

    case token[0]
    when :stag
      stagname = token[0] = token[1].downcase
      if ElementContent[stagname] == :EMPTY
        token[0] = :emptytag
        stack.last[2] << token
      else
        if opts[:fixup_tags]
          # obey the tag rules set up by the current element
          if ElementContent.has_key? stagname
            trans = nil
            (stack.length-1).downto(0) do |i|
              untags = stack[i][5]
              break unless untags.include? stagname
              # puts "** ILLEGAL #{stagname} IN #{stack[i][0]}"
              trans = i
            end
            if trans.to_i > 1
              eles = stack.slice!(trans..-1)
              stack.last[2] += eles
              # puts "** TRANSPLANTED #{stagname} TO #{stack.last[0]}"
            end
          end
        end

        # setup tag rules for inside this element
        if ElementContent[stagname] == :CDATA
          uncontainable_tags = :CDATA
        elsif opts[:fixup_tags]
          possible_tags = ElementContent[stagname]
          excluded_tags, included_tags = stack.last[3..4]
          if possible_tags
            excluded_tags = excluded_tags | (ElementExclusions[stagname] || [])
            included_tags = included_tags | (ElementInclusions[stagname] || [])
            containable_tags = (possible_tags | included_tags) - excluded_tags
            uncontainable_tags = ElementContent.keys - containable_tags
          else
            # If the tagname is unknown, it is assumed that any element
            # except excluded can be contained.
            uncontainable_tags = excluded_tags
          end
        end
        stack << [stagname, token, [], excluded_tags, included_tags, uncontainable_tags]
      end
    when :etag
      etagname = token[0] = token[1].downcase
      matched_elem = nil
      (stack.length-1).downto(0) do |i|
        stagname, = stack[i]
        if stagname == etagname
          matched_elem = stack[i]
          # Append the end-tag token to the start-tag token; build_node
          # reads the raw end tag from slot 7 of the combined array.
          stack[i][1] += token
          eles = stack.slice!((i+1)..-1)
          stack.last[2] += eles
          break
        end
      end
      if matched_elem
        ele = stack.pop
        stack.last[2] << ele
      else
        # build_node passes slots 1 and 2 to BogusETag.parse(qname, raw_string),
        # so store the tag name and raw text rather than the whole token.
        stack.last[2] << [:bogus_etag, etagname, token[3]]
      end
    when :text
      # Merge adjacent text tokens into one.
      l = stack.last[2].last
      if l and l[0] == :text
        l[1] += token[1]
      else
        stack.last[2] << token
      end
    else
      stack.last[2] << token
    end
  end

  # Close any elements still open at end of input.
  while 1 < stack.length
    ele = stack.pop
    stack.last[2] << ele
  end

  structure_list = stack[0][2]
  structure_list.map {|s| build_node(s) }
end
108
+
109
# Re-parents the children of +elem+ according to the ElementContent,
# ElementExclusions and ElementInclusions tables.
#
# elem          - an element structure: slot 1 is the combined tag token
#                 (tag name in slot 0, raw end tag in slot 7), slot 2 the
#                 list of child structures.
# excluded_tags - tag names inherited as forbidden.
# included_tags - tag names inherited as allowed.
#
# Returns a pair [elem, rest]: the element with its children list replaced,
# and the children that could not stay inside it.
#
# NOTE(review): fix_structure_list is not defined in the visible source;
# the explicit-end-tag branch relies on it existing elsewhere — confirm.
def Hpricot.fix_element(elem, excluded_tags, included_tags)
  tagname, _, _, _, _, _, _, eraw = elem[1]
  children = elem[2]
  if eraw
    # An explicit end tag fixes the element's extent.
    elem[2] = fix_structure_list(children)
    return elem, []
  else
    if ElementContent[tagname] == :EMPTY
      # Empty elements keep nothing; every child is handed back.
      elem[2] = []
      return elem, children
    else
      if ElementContent[tagname] == :CDATA
        possible_tags = []
      else
        possible_tags = ElementContent[tagname]
      end
      if possible_tags
        excluded_tags2 = ElementExclusions[tagname]
        included_tags2 = ElementInclusions[tagname]
        excluded_tags |= excluded_tags2 if excluded_tags2
        included_tags |= included_tags2 if included_tags2
        containable_tags = (possible_tags | included_tags) - excluded_tags
        uncontainable_tags = ElementContent.keys - containable_tags
      else
        # If the tagname is unknown, it is assumed that any element
        # except excluded can be contained.
        uncontainable_tags = excluded_tags
      end
      fixed_children = []
      rest = children
      until rest.empty?
        if String === rest[0][0]
          # Use a separate local for the child: reusing +elem+ here would
          # overwrite the element being fixed and corrupt the return value.
          child = rest.shift
          child_tagname = child[0]
          child_tagname = child_tagname.downcase
          if uncontainable_tags.include? child_tagname
            rest.unshift child
            break
          else
            fixed_child, rest2 = fix_element(child, excluded_tags, included_tags)
            fixed_children << fixed_child
            rest = rest2 + rest
          end
        else
          fixed_children << rest.shift
        end
      end
      elem[2] = fixed_children
      return elem, rest
    end
  end
end
161
+
162
# Converts one parsed structure (an array whose first slot identifies its
# kind) into the matching node object. Element structures carry the tag
# name as a String in slot 0 and are built recursively.
#
# Raises Exception for an unrecognised structure kind.
def Hpricot.build_node(structure)
  case structure[0]
  when String
    tagname, _, attrs, sraw, _, _, _, eraw = structure[1]
    kids = structure[2]
    closing = eraw && ETag.parse(tagname, eraw)
    opening = STag.parse(tagname, attrs, sraw, true)
    # A childless element without an end tag is built from its start tag alone.
    return Elem.new(opening) if kids.empty? && !closing
    Elem.new(opening, kids.map { |kid| build_node(kid) }, closing)
  when :text then Text.parse_pcdata(structure[1])
  when :emptytag then Elem.new(STag.parse(structure[1], structure[2], structure[3], false))
  when :bogus_etag then BogusETag.parse(structure[1], structure[2])
  when :xmldecl then XMLDecl.parse(structure[2], structure[3])
  when :doctype then DocType.parse(structure[1], structure[2], structure[3])
  when :procins then ProcIns.parse(structure[1], structure[2], structure[3])
  when :comment then Comment.parse(structure[1])
  when :cdata_content then Text.parse_cdata_content(structure[1])
  when :cdata then Text.parse_cdata_section(structure[1])
  else
    raise Exception, "[bug] unknown structure: #{structure.inspect}"
  end
end
198
+
199
# Builds an STag from a tag name and attributes and records its raw source.
# +is_stag+ is accepted but not used.
def STag.parse(qname, attrs, raw_string, is_stag)
  STag.new(qname, attrs).tap { |tag| tag.raw_string = raw_string }
end
204
+
205
# Builds an end tag named +qname+ and records its raw source.
def ETag.parse(qname, raw_string)
  new(qname).tap { |tag| tag.raw_string = raw_string }
end
210
+
211
# Builds a bogus end tag named +qname+ and records its raw source.
def BogusETag.parse(qname, raw_string)
  new(qname).tap { |tag| tag.raw_string = raw_string }
end
216
+
217
# Builds a Text node whose content and raw source are both +raw_string+.
def Text.parse_pcdata(raw_string)
  Text.new(raw_string).tap { |text| text.raw_string = raw_string }
end
222
+
223
# Builds a Text node from +raw_string+ and sets its @cdata instance
# variable to true.
def Text.parse_cdata_content(raw_string)
  Text.new(raw_string).tap do |text|
    text.raw_string = raw_string
    text.instance_variable_set( "@cdata", true )
  end
end
229
+
230
# Builds a Text node holding +content+; the raw source is reconstructed by
# wrapping the content in a CDATA section.
def Text.parse_cdata_section(content)
  Text.new(content).tap do |text|
    text.raw_string = "<![CDATA[" + content + "]]>"
  end
end
235
+
236
# Builds an XMLDecl from the declaration's attribute hash (nil is treated
# as empty) and records its raw source. The 'standalone' attribute maps
# 'yes' to true, 'no' to false and anything else to nil.
def XMLDecl.parse(attrs, raw_string)
  attrs ||= {}
  version = attrs['version']
  encoding = attrs['encoding']
  standalone = {'yes' => true, 'no' => false}[attrs['standalone']]

  XMLDecl.new(version, encoding, standalone).tap do |decl|
    decl.raw_string = raw_string
  end
end
253
+
254
# Builds a DocType with a lowercased root element name. The public and
# system identifiers are read from +attrs+ ('public_id' / 'system_id') and
# are nil when no attributes are given. The raw source is recorded.
def DocType.parse(root_element_name, attrs, raw_string)
  public_identifier = attrs ? attrs['public_id'] : nil
  system_identifier = attrs ? attrs['system_id'] : nil

  DocType.new(root_element_name.downcase, public_identifier, system_identifier).tap do |doctype|
    doctype.raw_string = raw_string
  end
end
266
+
267
# Builds a processing instruction from its target and content and records
# its raw source.
def ProcIns.parse(target, content, raw_string)
  ProcIns.new(target, content).tap { |pi| pi.raw_string = raw_string }
end
272
+
273
# Builds a Comment holding +content+; the raw source is reconstructed by
# wrapping the content in comment delimiters.
def Comment.parse(content)
  Comment.new(content).tap do |comment|
    comment.raw_string = "<!--" + content + "-->"
  end
end
278
+
279
# Regular-expression fragments for names and name tokens.
module Pat
  # A single character allowed inside a name.
  NameChar = /[-A-Za-z0-9._:]/
  # A name: a letter, underscore or colon followed by any name characters.
  Name = Regexp.new("[A-Za-z_:]#{NameChar}*")
  # One or more name characters.
  Nmtoken = Regexp.new("#{NameChar}+")
end
284
+
285
+ # :startdoc:
286
+ end
@@ -0,0 +1,146 @@
1
+ module Hpricot
2
+ # :stopdoc:
3
+
4
# The document root: an ordered list of child nodes.
class Doc
  attr_accessor :children

  # Stores +children+ (an empty list when nil) and makes this document the
  # parent of each one.
  def initialize(children)
    @children = children || []
    @children.each { |child| child.parent = self }
  end

  # Writes every child to +out+ in order and returns +out+.
  def output(out)
    @children.each { |node| node.output(out) }
    out
  end
end
16
+
17
# Common base for tag and leaf objects: stores the raw source text and the
# parent node, and provides attribute-value quoting.
class BaseEle
  attr_accessor :raw_string, :parent

  # Wraps +str+ in double quotes for use as an attribute value.
  #
  # Embedded double quotes are written as the &quot; entity. Markup has no
  # backslash escape, so a backslash-escaped quote would end the attribute
  # value early.
  def html_quote(str)
    "\"" + str.gsub('"', '&quot;') + "\""
  end
end
23
+
24
# An element: a start tag, an optional end tag and a list of children.
class Elem
  attr_accessor :stag, :etag, :children

  # Stores the tags and children (an empty list when nil) and makes this
  # element the parent of each child.
  def initialize(stag, children=nil, etag=nil)
    @stag = stag
    @etag = etag
    @children = children || []
    @children.each { |child| child.parent = self }
  end

  # True when the element has no children.
  def empty?
    @children.empty?
  end

  # name, attributes and parent (readers and writers) are forwarded to the
  # start tag.
  %w[name attributes parent].each do |attr|
    define_method(attr) { |*args| @stag.send(attr, *args) }
    define_method("#{attr}=") { |*args| @stag.send("#{attr}=", *args) }
  end

  # Writes the element to +out+ and returns +out+. A childless element whose
  # tag is marked :EMPTY in ElementContent is written as a single empty tag;
  # anything else gets a start tag, its children and an end tag.
  def output(out)
    if empty? && ElementContent[@stag.name] == :EMPTY
      @stag.output(out, :style => :empty)
      return out
    end
    @stag.output(out)
    @children.each { |node| node.output(out) }
    @stag.output(out, :style => :end)
    out
  end
end
45
+
46
# A start tag: a lowercased name plus an optional attribute hash.
class STag < BaseEle
  attr_accessor :name, :attributes

  # Lowercases the tag name and, when attributes are given, copies them
  # into a new hash with lowercased keys.
  def initialize(name, attributes=nil)
    @name = name.downcase
    return unless attributes
    @attributes = {}
    attributes.each { |key, value| @attributes[key.downcase] = value }
  end

  # Returns the attributes as markup, each preceded by a space; attributes
  # with no value are written as a bare name. Returns nil when the tag has
  # no attribute hash.
  def attributes_as_html
    return unless @attributes
    @attributes.map do |aname, aval|
      aval ? " #{aname}=#{html_quote(aval)}" : " #{aname}"
    end.join
  end

  # Appends the tag to +out+. opts[:style] selects the form: :end writes the
  # end tag, :empty writes a self-closed tag, anything else the start tag.
  def output(out, opts = {})
    style = opts[:style]
    markup =
      if :end == style
        "</#{@name}>"
      else
        "<#{@name}#{attributes_as_html}#{' /' if style == :empty}>"
      end
    out << markup
  end
end
74
+
75
# An end tag, identified by its name.
class ETag < BaseEle
  attr_reader :name

  # Stores +qualified_name+ unchanged as the tag name.
  def initialize(qualified_name)
    @name = qualified_name
  end
end
81
+
82
# An end tag that matched no open element.
class BogusETag < ETag
  # Writes nothing to +out+ and returns nil.
  def output(out)
    nil
  end
end
85
+
86
# A run of text.
class Text < BaseEle
  attr_reader :content

  # Stores +text+ as the node's content.
  def initialize(text)
    @content = text
  end

  # Appends the content to +out+.
  def output(out)
    out << @content
  end
end
95
+
96
# An XML declaration.
class XMLDecl < BaseEle
  attr_reader :version, :encoding, :standalone

  # Stores the version, encoding and standalone flag as given.
  def initialize(version, encoding, standalone)
    @version = version
    @encoding = encoding
    @standalone = standalone
  end

  # Appends the declaration to +out+. The encoding is written only when
  # set; standalone is written as yes/no unless it is nil.
  def output(out)
    decl = "<?xml version=\"#{@version}\""
    decl += " encoding=\"#{encoding}\"" if @encoding
    decl += " standalone=\"#{standalone ? 'yes' : 'no'}\"" unless @standalone.nil?
    out << decl + "?>"
  end
end
109
+
110
# A document type declaration.
class DocType < BaseEle
  attr_reader :name, :public_id, :system_id

  # Stores the root element name and the public and system identifiers
  # (either identifier may be nil).
  def initialize(name, pubid, sysid)
    @name, @public_id, @system_id = name, pubid, sysid
  end

  # Appends the declaration to +out+.
  #
  # With a public id:       <!DOCTYPE name PUBLIC "pubid" "sysid">
  #                         (the system id part is omitted when it is nil)
  # With only a system id:  <!DOCTYPE name SYSTEM "sysid">
  # With neither:           <!DOCTYPE name>
  #
  # The SYSTEM keyword is written only when a system id follows it, so a
  # bare doctype is not turned into the malformed "<!DOCTYPE name SYSTEM>".
  def output(out)
    external_id =
      if @public_id
        " PUBLIC \"#{@public_id}\""
      elsif @system_id
        " SYSTEM"
      else
        ""
      end
    system_part = @system_id ? " #{html_quote(@system_id)}" : ""
    out << "<!DOCTYPE #{@name}" + external_id + system_part + ">"
  end
end
122
+
123
# A processing instruction.
class ProcIns < BaseEle
  attr_reader :target, :content

  # Stores the target and content as given.
  def initialize(target, content)
    @target = target
    @content = content
  end

  # Appends "<?target content?>" to +out+; the content and its leading
  # space are left out when there is no content.
  def output(out)
    body = @content ? " #{@content}" : ""
    out << "<?#{@target}#{body}?>"
  end
end
134
+
135
# A comment.
class Comment < BaseEle
  attr_reader :content

  # Stores +content+ as the comment text.
  def initialize(content)
    @content = content
  end

  # Appends the comment, wrapped in its delimiters, to +out+.
  def output(out)
    out << "<!--" + @content.to_s + "-->"
  end
end
144
+
145
+ # :startdoc:
146
+ end