htmltools 1.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,323 @@
1
+ # This module is a mix-in that provides parent/child behavior to real
2
+ # Element classes. Because it defines <tt>each()</tt> and includes Enumerable,
3
+ # you can iterate through a tree using the usual Enumerable methods.
4
+ #
5
+ # Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
6
+ # Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
7
+ # License:: Same as Ruby's
8
+ # CVS ID: $Id: element.rb,v 1.4 2004/09/24 23:28:55 jhannes Exp $
9
+
10
+ require 'html/tags'
11
+
12
module HTMLTree
  # Mix-in that gives a class parent/child tree behavior. A node stores
  # either an Array of child nodes or opaque data in @_content, plus a link
  # to its parent in @_parent. Because it defines <tt>each()</tt> and
  # includes Enumerable, a tree can be walked with the usual Enumerable
  # methods.
  module TreeElement
    include Enumerable

    protected

    # Initialize the tree state of a node. Including classes call this from
    # their own constructors.
    #
    # parent_or_nil::   node to attach to, or nil for a detached node
    # contents_or_nil:: an Array (node may have children) or data
    def initialize_tree_element(parent_or_nil = nil, contents_or_nil = nil)
      @_content = contents_or_nil
      @_parent = parent_or_nil
      parent_or_nil.add_child(self) if parent_or_nil
    end

    # Raw parent link; only other tree nodes may set it.
    attr_accessor :_parent

    public

    # Add one or more children to this node.
    # Raises ArgumentError if this node cannot have children.
    def add_child(*children_to_add)
      raise ArgumentError, 'node cannot have children' unless can_have_children?
      children_to_add.each do |child|
        @_content << child
        child._parent = self
      end
    end

    alias_method :add_children, :add_child

    # Remove one or more children from this node. Nodes that are not
    # children of this node are ignored.
    # Raises ArgumentError if this node cannot have children.
    def remove_child(*children_to_remove)
      raise ArgumentError, 'node cannot have children' unless can_have_children?
      children_to_remove.each do |child|
        # Only clear the parent link of nodes that really were our children.
        child._parent = nil if @_content.delete(child)
      end
    end

    alias_method :remove_children, :remove_child

    # Change my parent. Disconnects from prior parent, if any.
    def parent=(parent_or_nil)
      @_parent.remove_child(self) if @_parent
      parent_or_nil.add_child(self) if parent_or_nil
    end

    # Return true if my content is a collection of Elements
    # rather than actual data.
    def can_have_children?
      @_content.kind_of?(Array)
    end

    # Return a collection of my children. Returns an empty Array if I am a
    # data element, just to keep other methods simple.
    def children
      can_have_children? ? @_content : []
    end

    # Return my content; either my children or my data.
    def content
      @_content
    end

    # Return my parent element.
    def parent
      @_parent
    end

    # Default path; Element overrides this with a tag-based path.
    def path
      "/"
    end

    # Return the ultimate parent: the node reached by following parent
    # links until a node without a parent is found. A detached node is its
    # own root.
    def root
      @_parent ? @_parent.root : self
    end

    # Return true if I have any children.
    def has_children?
      children.size > 0
    end

    # Depth-first (pre-order) iterator, required by Enumerable: yields this
    # node, then recursively each child's subtree in order.
    # Returns an Enumerator when called without a block.
    def each(&block)
      return enum_for(:each) unless block
      block.call(self)
      children.each { |ch| ch.each(&block) }
    end

    # Print out to $stdout (or given IO or String)
    # a formatted dump of my structure: one line per node, indented one
    # space per nesting level.
    def dump(indent=0, io=$stdout)
      io << " " * indent
      io << self.to_s
      io << "\n"
      children.each { |ea| ea.dump(indent+1, io) }
    end
  end

  # This is a Element that represents the whole document (and makes a
  # scope for the DTD declaration)
  class Document
    include TreeElement

    def initialize
      initialize_tree_element(nil, [])
    end

    # The document itself has no textual representation.
    def to_s
      ''
    end

    # Iterate over all descendants; unlike TreeElement#each, the document
    # node itself is not yielded.
    def each(&block)
      return enum_for(:each) unless block
      children.each { |ch| ch.each(&block) }
    end

    # Write every child (and its descendants) to the given IO stream.
    def write(io)
      children.each { |t| t.write(io) }
    end

    # The document has an empty tag.
    def tag
      ''
    end

    # Return my child <html> node, if any.
    def html_node
      children.detect { |ea| ea.tag == 'html' }
    end
  end

  # This is a TreeElement that represents tagged items in an HTML
  # document.
  class Element
    include TreeElement

    protected

    # parent_or_nil:: TreeElement or nil
    # tag_name::      String
    def initialize(parent_or_nil = nil, tag_name = nil)
      initialize_tree_element(parent_or_nil, [])
      @_tag = tag_name
      @_attributes = {}
      @_attribute_order = []
    end

    public

    def can_have_children?; true; end

    # Return true if I'm data instead of a tag
    def data?; false; end

    # Return my start tag, with attributes in attribute_order.
    def to_s
      attrs = @_attribute_order.collect { |k| " #{k}=\"#{@_attributes[k]}\"" }
      "<#{tag}#{attrs.join('')}>"
    end

    # Append an attribute. <tt>values</tt> are first flattened into an Array,
    # then converted into stripped strings. The name is downcased.
    #
    # If there is a single attribute value, it will appear as a String,
    # otherwise it will be an Array of Strings. Adding to an existing
    # attribute appends to its values.
    #
    # Example:
    #   element.add_attribute("width", "123")
    #   element.add_attribute("value", [ "a", "b" ])
    #
    # Returns self.
    def add_attribute(name, *values)
      values = values.flatten.collect { |ea| ea.to_s.strip }
      name = name.downcase
      if @_attributes.include?(name)
        # Array() wraps a single String value (String#to_a no longer exists).
        @_attributes[name] = Array(@_attributes[name]) + values
      else
        @_attributes[name] = values.size > 1 ? values : values[0]
      end
      # Keep each name in the order list exactly once.
      @_attribute_order.delete(name)
      @_attribute_order << name
      self
    end

    # Return my tag (should be a String)
    def tag; @_tag; end

    # Return an HTML::Tag for further information, or nil if this is an
    # unknown tag.
    def tag_info
      begin
        HTML::Tag.named(@_tag)
      rescue NoSuchHTMLTagError
        nil
      end
    end

    # Return the path to this element from the root: the tags of all
    # ancestors and of this element, joined with ".". A Document ancestor
    # contributes its empty tag.
    def path
      tags = []
      node = self
      while node
        tags.unshift node.tag
        node = node.parent
      end
      tags.join(".")
    end

    # Print the path of this element and, indented by two more columns per
    # level, of every descendant element to $stdout. Returns nil.
    def show_structure(indent = 0)
      puts((' ' * indent) + path)
      elements.each { |node| node.show_structure(indent + 2) }
      nil
    end

    # Return the children of this node that are elements (not data)
    def elements
      children.select { |node| node.is_a? Element }
    end

    # Return my attributes Hash.
    def attributes; @_attributes; end

    # Return the order of my attributes
    def attribute_order; @_attribute_order; end

    # Return the value of a single attribute (a String or Array).
    def attribute(name); @_attributes[name]; end

    # Return the value of a single attribute (a String or Array).
    def [](name); attribute(name); end

    # Replace an attribute. The attribute moves to the end of the
    # attribute order. The name is used as given (not downcased).
    def []=(name, *values)
      @_attributes[name] = values.size > 1 ? values : values[0]
      @_attribute_order.delete(name)
      @_attribute_order << name
      self
    end

    # Print me (and my descendents) on the given IO stream. A closing tag
    # is written unless tag_info reports an empty element; unknown tags
    # (tag_info is nil) always get a closing tag.
    def write(io)
      io << self
      children.each { |t| t.write(io) }
      info = tag_info
      unless info && info.is_empty_element
        io.puts( "</#{tag()}>" )
      end
    end
  end

  # This is a TreeElement that represents leaf data nodes (CDATA, scripts,
  # comments, processing directives). It forwards unknown messages to the
  # content element, so it otherwise behaves like a String.
  class Data
    include TreeElement

    protected

    # parent_or_nil:: parent, TreeElement or nil
    # str::           contents, String
    def initialize(parent_or_nil = nil, str = '')
      initialize_tree_element(parent_or_nil, str)
    end

    public

    # Return true because I am a data Element.
    def data?; true; end

    # Return false because I have no children.
    def can_have_children?; false; end

    # Return an empty collection because I have no children.
    def children; []; end

    # Return my (empty) tag String.
    def tag; ''; end

    # Return my (empty) attributes Hash.
    def attributes; {}; end

    # Return my content.
    def to_s
      @_content
    end

    # Print me on the given IO stream.
    def write(io)
      io << self
    end

    # Forward all other methods (including any block) to my content, so I
    # can otherwise behave like a String.
    def method_missing(sym, *args, &block)
      @_content.method(sym).call(*args, &block)
    end
  end

  # Data node rendered as an HTML comment.
  class Comment < Data
    def to_s
      '<!--' + @_content + '-->'
    end
  end

  # Data node rendered between angle brackets.
  class Special < Data
    def to_s
      '<' + @_content + '>'
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # This module adds method 'full_path' to REXML hierarchy
2
+ # This method will print out the path to the given node, for example
3
+ # /html/body/p[1]/@text (indexes are 1-based and only shown when several siblings share a name)
4
+ #
5
+ # Copyright:: Copyright (C) 2004, Johannes Brodwall <johannes@brodwall.com>
6
+ # License:: Same as Ruby's
7
+ # CVS ID: $Id: rexml-nodepath.rb,v 1.5 2005/05/25 17:38:45 jhannes Exp $
8
+
9
+
10
# Extends the REXML node classes with full_path, a slash-separated path
# from the document down to the node. REXML must already be loaded.
module REXML
  class Child
    # Path of my parent, or '' when I have no parent.
    def parent_path
      parent ? parent.full_path : ''
    end
  end

  class Document
    # The document is the origin of every path.
    def full_path
      ''
    end
  end

  class Element
    # Return "[n]" (1-based) when my parent holds several elements with my
    # expanded name, otherwise "". Also "" when I have no parent.
    def child_index
      return "" unless parent
      siblings = parent.to_a.select do |node|
        node.kind_of?(Element) && node.expanded_name == expanded_name
      end
      return "" if siblings.size < 2
      # Locate myself by identity so equal-looking siblings are not confused.
      position = siblings.index { |node| node.equal?(self) }
      "[" + (position + 1).to_s + "]"
    end

    # Path of my parent followed by my expanded name and sibling index.
    def full_path
      parent_path + '/' + expanded_name + child_index
    end
  end

  class Text
    # Path of my parent followed by the XPath text() node test.
    # NOTE(review): the previous code also appended expanded_name, which
    # REXML::Text does not appear to define (it comes from REXML::Namespace).
    def full_path
      parent_path + '/text()'
    end
  end

  class Attribute
    # Path of my owning element followed by "/@" and my name.
    def full_path
      element.full_path + '/@' + name
    end
  end
end
@@ -0,0 +1,372 @@
1
+ # A parser for SGML, using the derived class as static DTD.
2
+ #
3
+ # Taken from http://raa.ruby-lang.org/list.rhtml?name=html-parser-2
4
+ # This file seems to be included in the current install of Ruby,
5
+ # but with a bug related to attributes quoted with '"', so I have
6
+ # included it in the HTML package of this distribution
7
+ #
8
+ # Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
9
+ # Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
10
+ # License:: Same as Ruby's
11
+ # CVS ID: $Id: sgml-parser.rb,v 1.4 2004/09/24 23:28:55 jhannes Exp $
12
+
13
+
14
module HTML

  # Incremental SGML/HTML tokenizer. Text is supplied with #feed and
  # flushed with #close; the parser calls handler methods for data, tags,
  # comments, specials, character and entity references. Subclasses act as
  # a static DTD by defining start_TAG(attrs)/end_TAG or do_TAG(attrs)
  # methods and by overriding the handle_* / unknown_* hooks.
  class SGMLParser

    # Range (in overall input offsets) of the source text of the item
    # currently being reported to a handler.
    attr_reader :src_range

    # Regular expressions used for parsing:
    Interesting = /[&<]/
    Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                                '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                                '![^<>]*)?')

    Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
    Charref = /&#([0-9]+)[^0-9]/

    Starttagopen = /<[>a-zA-Z]/
    Endtagopen = /<\/[<>a-zA-Z]/
    Endbracket = /[<>]/
    Special = /<![^<>]*>/
    Commentopen = /<!--/
    Commentclose = /--[ \t\n]*>/
    Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
    Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                              '(\s*=\s*' +
                              "('[^']*'" +
                              '|"[^"]*"' +
                              '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')
    Endtagfind = /\s*\/\s*>/
    Entitydefs =
      {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

    # verbose:: when true, report_unbalanced prints diagnostics
    def initialize(verbose=false)
      @verbose = verbose
      reset
    end

    # Discard all buffered input and parser state.
    def reset
      @rawdata = ''.dup   # mutable buffer; #feed appends to it in place
      @stack = []
      @lasttag = '???'
      @nomoretags = false
      @literal = false
      @offset = 0         # overall input offset of @rawdata[0]
      @ranges = []
    end

    # Return the buffered source text for a range expressed in overall
    # input offsets (such as #src_range).
    def get_source(range)
      local = Range.new(range.first - @offset, range.end - @offset,
                        range.exclude_end?)
      @rawdata[local]
    end

    # Record the source range of the current item. start and end_index are
    # buffer positions; the stored range is end-exclusive and expressed in
    # overall input offsets.
    def set_range(start, end_index)
      @src_range = Range.new(start + @offset, end_index + @offset, true)
    end

    # Return true if tag name gi is on the stack of open elements.
    def has_context(gi)
      @stack.include? gi
    end

    # Treat all remaining input as literal data.
    def setnomoretags
      @nomoretags = true
      @literal = true
    end

    # Enter literal mode: start tags, comments and specials are reported
    # as data until the next end tag.
    def setliteral(*args)
      @literal = true
    end

    # Append data to the buffer and parse as much as is complete.
    def feed(data)
      @rawdata << data
      goahead(false)
    end

    # Parse whatever is left in the buffer, flushing trailing text as data.
    def close
      goahead(true)
    end

    # Report rawdata[start...end_index] via handle_data unless it is empty.
    # Returns end_index so callers can advance their position.
    def handle_data_range(rawdata, start, end_index)
      if end_index > start
        set_range(start, end_index)
        handle_data(rawdata[start...end_index])
      end
      end_index
    end

    # Main scanning loop. Consumes the buffer from the front, dispatching
    # to the parse_* / handle_* methods, and keeps any unconsumed tail for
    # the next call.
    #
    # _end:: when true, an incomplete tail is flushed as data
    def goahead(_end)
      rawdata = @rawdata
      i = 0
      n = rawdata.length
      while i < n
        if @nomoretags
          i = handle_data_range(rawdata, i, n)
          break
        end
        # Everything up to the next '&' or '<' is plain data.
        j = rawdata.index(Interesting, i) || n
        i = handle_data_range(rawdata, i, j)
        break if i == n
        if rawdata[i] == ?<
          if rawdata.index(Starttagopen, i) == i
            if @literal
              i = handle_data_range(rawdata, i, i + 1)
              next
            end
            k = parse_starttag(i)
            break unless k
            i = k
            next
          end
          if rawdata.index(Endtagopen, i) == i
            k = parse_endtag(i)
            break unless k
            i = k
            @literal = false
            next
          end
          if rawdata.index(Commentopen, i) == i
            if @literal
              i = handle_data_range(rawdata, i, i + 1)
              next
            end
            # parse_comment returns a length, not a position.
            k = parse_comment(i)
            break unless k
            i += k
            next
          end
          if rawdata.index(Special, i) == i
            if @literal
              i = handle_data_range(rawdata, i, i + 1)
              next
            end
            # parse_special returns a length, not a position.
            k = parse_special(i)
            break unless k
            i += k
            next
          end
        elsif rawdata[i] == ?&
          if rawdata.index(Charref, i) == i
            end_index = i + $&.length
            # The terminating character belongs to the reference only if
            # it is a semicolon.
            end_index -= 1 unless rawdata[end_index - 1] == ?;
            set_range(i, end_index)
            handle_charref($1)
            i = end_index
            next
          end
          if rawdata.index(Entityref, i) == i
            end_index = i + $&.length
            end_index -= 1 unless rawdata[end_index - 1] == ?;
            set_range(i, end_index)
            handle_entityref($1)
            i = end_index
            next
          end
        else
          raise RuntimeError, 'neither < nor & ??'
        end
        # We get here only if incomplete matches but
        # nothing else
        match = rawdata.index(Incomplete, i)
        unless match == i
          i = handle_data_range(rawdata, i, i + 1)
          next
        end
        j = match + $&.length
        break if j == n # Really incomplete
        i = handle_data_range(rawdata, i, j)
      end
      if _end and i < n
        i = handle_data_range(rawdata, i, n)
      end
      @rawdata = rawdata[i..-1]
      @offset += i
    end

    # Parse the comment starting at buffer position i and report its text
    # (without the delimiters) to handle_comment. Returns the length of
    # the consumed source, or nil if the comment is not yet complete.
    def parse_comment(i)
      rawdata = @rawdata
      if rawdata[i, 4] != '<!--'
        raise RuntimeError, 'unexpected call to handle_comment'
      end
      close_at = rawdata.index(Commentclose, i)
      return nil unless close_at
      src_length = close_at + $&.length - i
      set_range(i, i + src_length)
      handle_comment(rawdata[(i + 4)..(close_at - 1)])
      src_length
    end

    # Parse the start tag at buffer position i, collect its attributes as
    # [name, value] pairs (names downcased, surrounding quotes removed,
    # '' for attributes without a value) and pass them to finish_starttag.
    # Returns the position after the tag, or nil if no closing bracket has
    # been buffered yet.
    def parse_starttag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      attrs = []
      if rawdata[i + 1] == ?>
        # SGML shorthand: <> == <last open tag seen>
        k = j
        tag = @lasttag
      else
        unless rawdata.index(Tagfind, i + 1)
          raise RuntimeError, 'unexpected call to parse_starttag'
        end
        k = i + 1 + $&.length
        tag = $&.downcase
        @lasttag = tag
      end
      while k < j
        # Stop at the "/>" of this tag only, not one further along.
        break if rawdata.index(Endtagfind, k) == k
        attr_start = rawdata.index(Attrfind, k)
        # Ignore matches that begin beyond this tag's closing bracket.
        break unless attr_start && attr_start < j
        matched_length = $&.length
        attrname, rest, attrvalue = $1, $2, $3
        if not rest
          attrvalue = '' # was: = attrname
        elsif (attrvalue[0] == ?' && attrvalue[-1] == ?') or
              (attrvalue[0] == ?" && attrvalue[-1] == ?")
          attrvalue = attrvalue[1..-2]
        end
        attrs << [attrname.downcase, attrvalue]
        # Continue after the end of the match, wherever it started.
        k = attr_start + matched_length
      end
      j += 1 if rawdata[j] == ?>
      set_range(i, j)
      finish_starttag(tag, attrs)
      j
    end

    # Parse the end tag at buffer position i and pass its downcased name
    # to finish_endtag. Returns the position after the tag, or nil if no
    # closing bracket has been buffered yet.
    def parse_endtag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      tag = rawdata[(i + 2)..(j - 1)].strip.downcase
      j += 1 if rawdata[j] == ?>
      set_range(i, j)
      finish_endtag(tag)
      j
    end

    # Dispatch a start tag. Returns 1 if a start_TAG method handled it
    # (the tag is pushed on the stack), 0 if a do_TAG method handled it,
    # and -1 if it went to unknown_starttag.
    def finish_starttag(tag, attrs)
      method = 'start_' + tag
      if self.respond_to?(method)
        @stack << tag
        handle_starttag(tag, method, attrs)
        return 1
      end
      method = 'do_' + tag
      if self.respond_to?(method)
        handle_starttag(tag, method, attrs)
        return 0
      end
      unknown_starttag(tag, attrs)
      -1
    end

    # Dispatch an end tag. An empty tag name closes the innermost open
    # element. Otherwise every element from the top of the stack down to
    # the innermost open element with that name is closed. End tags for
    # elements that are not open are reported to unknown_endtag unless an
    # end_TAG method exists.
    def finish_endtag(tag)
      if tag == ''
        found = @stack.length - 1
        if found < 0
          unknown_endtag(tag)
          return
        end
      else
        unless @stack.include? tag
          unknown_endtag(tag) unless self.respond_to?('end_' + tag)
          return
        end
        # Innermost match: an end tag closes the most recently opened
        # element of that name, not the outermost one.
        found = @stack.rindex(tag)
      end
      while @stack.length > found
        open_tag = @stack[-1]
        method = 'end_' + open_tag
        if respond_to?(method)
          handle_endtag(open_tag, method)
        else
          unknown_endtag(open_tag)
        end
        @stack.pop
      end
    end

    # Parse the <!...> construct at buffer position i and report the text
    # between the brackets to handle_special. Returns the length of the
    # consumed source, or nil if no closing bracket has been buffered yet.
    def parse_special(i)
      rawdata = @rawdata
      close_at = rawdata.index(Endbracket, i + 1)
      return nil unless close_at
      src_length = close_at - i + $&.length
      set_range(i, i + src_length)
      handle_special(rawdata[(i + 1)..(close_at - 1)])
      src_length
    end

    # Call the start_TAG / do_TAG method with the attribute list.
    def handle_starttag(tag, method, attrs)
      self.send(method, attrs)
    end

    # Call the end_TAG method.
    def handle_endtag(tag, method)
      self.send(method)
    end

    # In verbose mode, print the unbalanced end tag and the current stack
    # of open elements.
    def report_unbalanced(tag)
      return unless @verbose
      print '*** Unbalanced </' + tag + '>', "\n"
      print '*** Stack:', @stack.inspect, "\n"
    end

    # Handle a numeric character reference. name is the string of decimal
    # digits; values 0..255 are reported as a one-character string via
    # handle_data, anything else goes to unknown_charref.
    def handle_charref(name)
      # Base 10 explicitly: a leading zero must not be read as octal.
      n = Integer(name, 10)
      if !(0 <= n && n <= 255)
        unknown_charref(name)
        return
      end
      handle_data(n.chr)
    end

    # Handle a named entity reference: entities listed in Entitydefs are
    # reported via handle_data, others go to unknown_entityref.
    def handle_entityref(name)
      if Entitydefs.include?(name)
        handle_data(Entitydefs[name])
      else
        unknown_entityref(name)
      end
    end

    # Hooks for subclasses; the defaults do nothing.

    def handle_data(data)
    end

    def handle_comment(data)
    end

    def handle_special(data)
    end

    def unknown_starttag(tag, attrs)
    end

    def unknown_endtag(tag)
    end

    def unknown_charref(ref)
    end

    def unknown_entityref(ref)
    end

  end
end