htmltools 1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +58 -0
- data/README +162 -0
- data/demo/degolive.rb +89 -0
- data/demo/ebaySearch.rb +93 -0
- data/demo/xpath.rb +62 -0
- data/lib/html/element.rb +323 -0
- data/lib/html/rexml-nodepath.rb +49 -0
- data/lib/html/sgml-parser.rb +372 -0
- data/lib/html/stparser.rb +280 -0
- data/lib/html/tags.rb +288 -0
- data/lib/html/tree.rb +140 -0
- data/lib/html/xmltree.rb +173 -0
- data/lib/html/xpath.rb +72 -0
- data/test/suite.rb +5 -0
- data/test/tc_html-element.rb +73 -0
- data/test/tc_html-tree.rb +201 -0
- data/test/tc_source-parser.rb +160 -0
- data/test/tc_stacking-parser.rb +196 -0
- data/test/tc_xpath.rb +87 -0
- metadata +58 -0
data/lib/html/element.rb
ADDED
@@ -0,0 +1,323 @@
|
|
1
|
+
# This module is a mix-in that provides parent/child behavior to real
|
2
|
+
# Element classes. Because it defines <tt>each()</tt> and includes Enumerable,
|
3
|
+
# you can iterate through a tree using the usual Enumerable methods.
|
4
|
+
#
|
5
|
+
# Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
|
6
|
+
# Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
7
|
+
# License:: Same as Ruby's
|
8
|
+
# CVS ID: $Id: element.rb,v 1.4 2004/09/24 23:28:55 jhannes Exp $
|
9
|
+
|
10
|
+
require 'html/tags'
|
11
|
+
|
12
|
+
module HTMLTree
  # Mix-in that provides parent/child behavior to the concrete node
  # classes below. It defines <tt>each()</tt> and includes Enumerable, so a
  # tree can be walked with the usual Enumerable methods.
  #
  # State kept on the including object:
  # <tt>@_content</tt>:: an Array of children, or the node's data
  # <tt>@_parent</tt>::  the parent node, or nil
  module TreeElement
    include Enumerable

    protected

    # Initialize the tree state; intended to be called from the including
    # class's +initialize+.
    #
    # parent_or_nil::   node to attach to (via its +add_child+), or nil
    # contents_or_nil:: an Array for nodes that hold children, otherwise
    #                   the node's data
    def initialize_tree_element(parent_or_nil = nil, contents_or_nil = nil)
      @_content = contents_or_nil
      @_parent = parent_or_nil
      parent_or_nil.add_child(self) if parent_or_nil
    end

    # Raw parent accessor used by add_child/remove_child on other nodes.
    attr_accessor :_parent

    public

    # Add one or more children to this node.
    # Raises ArgumentError if this node cannot have children.
    def add_child(*children_to_add)
      raise ArgumentError, 'node cannot have children' unless can_have_children?
      children_to_add.each do |child|
        @_content << child
        child._parent = self
      end
    end

    alias_method :add_children, :add_child

    # Remove one or more children from this node. Children that are not
    # actually present are ignored.
    # Raises ArgumentError if this node cannot have children.
    def remove_child(*children_to_remove)
      raise ArgumentError, 'node cannot have children' unless can_have_children?
      children_to_remove.each do |child|
        # Array#delete returns nil when nothing was removed.
        child._parent = nil if @_content.delete(child)
      end
    end

    alias_method :remove_children, :remove_child

    # Change my parent. Disconnects from prior parent, if any.
    def parent=(parent_or_nil)
      @_parent.remove_child(self) if @_parent
      parent_or_nil.add_child(self) if parent_or_nil
    end

    # Return true if my content is a collection of Elements
    # rather than actual data.
    def can_have_children?
      @_content.kind_of?(Array)
    end

    # Return a collection of my children. Returns an empty Array if I am a
    # data element, just to keep other methods simple.
    def children
      can_have_children? ? @_content : []
    end

    # Return my content; either my children or my data.
    def content
      @_content
    end

    # Return my parent element.
    def parent
      @_parent
    end

    # Default path; Element overrides this.
    def path
      "/"
    end

    # Return the ultimate parent: follow parent links until a node without
    # a parent is reached. A node with no parent is its own root.
    def root
      @_parent ? @_parent.root : self
    end

    # Return true if I have any children.
    def has_children?
      children.size > 0
    end

    # Depth-first (pre-order) iterator, required by Enumerable: yields this
    # node, then recurses into each child in order.
    def each(&block)
      block.call(self)
      children.each { |ch| ch.each(&block) }
    end

    # Print out to $stdout (or given IO or String)
    # a formatted dump of my structure, one node per line, indented by
    # depth.
    def dump(indent=0, io=$stdout)
      io << " " * indent
      io << self.to_s
      io << "\n"
      children.each { |ea| ea.dump(indent+1, io) }
    end

  end

  # This is a TreeElement that represents the whole document (and makes a
  # scope for the DTD declaration)
  class Document
    include TreeElement

    def initialize
      initialize_tree_element(nil, [])
    end

    def to_s
      ''
    end

    # Iterate over all descendants; unlike TreeElement#each the document
    # node itself is not yielded.
    def each(&block)
      children.each { |ch| ch.each(&block) }
    end

    # Write every child to the given IO stream.
    def write(io)
      children.each { |t| t.write(io) }
    end

    # The document has an empty tag name.
    def tag
      ''
    end

    # Return my child <html> node, if any.
    def html_node
      children.detect { |ea| ea.tag == 'html' }
    end
  end

  # This is a TreeElement that represents tagged items in an HTML
  # document.
  class Element
    include TreeElement

    protected

    # parent_or_nil:: TreeElement or nil
    # tag_name:: String
    def initialize(parent_or_nil = nil, tag_name = nil)
      initialize_tree_element(parent_or_nil, [])
      @_tag = tag_name
      @_attributes = {}
      @_attribute_order = []
    end

    public

    def can_have_children?; true; end

    # Return true if I'm data instead of a tag
    def data?; false; end

    # Return my start tag, with attributes in the order they were added.
    def to_s
      a = [ "<", tag ]
      @_attribute_order.each { |k|
        v = @_attributes[k]
        a << " #{k.to_s}=\"#{v.to_s}\""
      }
      a << ">"
      a.join('')
    end

    # Append an attribute. <tt>values</tt> are first flattened into an Array,
    # then converted into strings. The name is downcased.
    #
    # If there is a single attribute value, it will appear as a String,
    # otherwise it will be an Array of Strings. Adding to a name that
    # already exists appends to its values; the name keeps its original
    # position in the attribute order and is listed only once.
    #
    # Returns self.
    #
    # Example:
    #   element.add_attribute("width", "123")
    #   element.add_attribute("value", [ "a", "b" ])
    def add_attribute(name, *values)
      values = values.flatten.collect { |ea| ea.to_s.strip }
      name = name.downcase
      if @_attributes.include?(name)
        # Kernel#Array wraps a lone String (or nil) so the existing value
        # can be concatenated with the new ones.
        @_attributes[name] = Array(@_attributes[name]) + values
      else
        @_attributes[name] = values.size > 1 ? values : values[0]
      end
      @_attribute_order << name unless @_attribute_order.include?(name)
      self
    end

    # Return my tag (should be a String)
    def tag; @_tag; end

    # Return an HTML::Tag for further information, or nil if this is an
    # unknown tag.
    def tag_info
      begin
        HTML::Tag.named(@_tag)
      rescue NoSuchHTMLTagError
        nil
      end
    end

    # Return the path to this element from the root: the tags of all
    # ancestors and of this element, joined with ".".
    def path
      parts = []
      node = self
      while node do
        parts.unshift node.tag
        node = node.parent
      end
      parts.join(".")
    end

    # Print my path and, indented below it, the paths of my descendant
    # elements. Returns nil.
    def show_structure(indent = 0)
      puts((' ' * indent) + path)
      elements.each { |node| node.show_structure(indent + 2) }
      nil
    end

    # Return the children of this node that are elements (not data)
    def elements
      children.select { |node| node.is_a? Element }
    end

    # Return my attributes Hash.
    def attributes; @_attributes; end

    # Return the order of my attributes
    def attribute_order; @_attribute_order; end

    # Return the value of a single attribute (a String or Array).
    def attribute(name); @_attributes[name]; end

    # Return the value of a single attribute (a String or Array).
    def [](name); attribute(name); end

    # Replace an attribute. The name is moved to the end of the attribute
    # order so that it is still emitted by to_s. Returns self.
    def []=(name, *values)
      @_attributes[name] = values.size > 1 ? values : values[0]
      @_attribute_order.delete(name)
      @_attribute_order << name
      self
    end

    # Print me (and my descendents) on the given IO stream. A closing tag
    # is written unless tag_info reports an empty element; unknown tags
    # (tag_info is nil) always get a closing tag.
    def write(io)
      io << self
      children.each { |t| t.write(io) }
      info = tag_info
      io.puts( "</#{tag()}>" ) unless info && info.is_empty_element
    end

  end

  # This is a TreeElement that represents leaf data nodes (CDATA, scripts,
  # comments, processing directives). It forwards unknown messages to the
  # content element, so it otherwise behaves like a String.
  class Data
    include TreeElement

    protected

    # parent_or_nil:: parent, TreeElement or nil
    # str:: contents, String
    def initialize(parent_or_nil = nil, str = '')
      initialize_tree_element(parent_or_nil, str)
    end

    public

    # Return true because I am a data Element.
    def data?; true; end

    # Return false because I have no children.
    def can_have_children?; false; end

    # Return an empty collection because I have no children.
    def children; []; end

    # Return my (empty) tag String.
    def tag; ''; end

    # Return my (empty) attributes Hash.
    def attributes; {}; end

    def to_s
      @_content
    end

    # Print me on the given IO stream.
    def write(io)
      io << self
    end

    # Forward all other methods to my content, so I can otherwise behave
    # like a String.
    def method_missing(sym, *args)
      @_content.method(sym).call(*args)
    end
  end

  # Data node rendered as an SGML comment.
  class Comment < Data
    def to_s
      '<!--' + @_content + '-->'
    end
  end

  # Data node rendered between angle brackets.
  class Special < Data
    def to_s
      '<' + @_content + '>'
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# This module adds method 'full_path' to REXML hierarchy
|
2
|
+
# This method will print out the path to the given node, for example
|
3
|
+
# /html/body/p[1]/@class
|
4
|
+
#
|
5
|
+
# Copyright:: Copyright (C) 2004, Johannes Brodwall <johannes@brodwall.com>
|
6
|
+
# License:: Same as Ruby's
|
7
|
+
# CVS ID: $Id: rexml-nodepath.rb,v 1.5 2005/05/25 17:38:45 jhannes Exp $
|
8
|
+
|
9
|
+
|
10
|
+
# Reopens REXML node classes to add +full_path+, a String describing where
# a node sits in its document.
module REXML
  class Child
    # Path of my parent, or '' when I have no parent.
    def parent_path
      parent ? parent.full_path : ''
    end
  end

  class Document
    # The document itself contributes nothing to the path.
    def full_path
      ''
    end
  end

  class Element
    # Return "[n]" (1-based) giving my position among the sibling elements
    # that share my expanded name, or "" when I have no parent or am the
    # only such sibling.
    def child_index
      return "" unless parent
      siblings = parent.to_a.select do |node|
        node.kind_of?(Element) && node.expanded_name == self.expanded_name
      end
      return "" if siblings.size < 2
      # Locate myself by identity rather than by ==.
      position = siblings.index { |node| node.equal?(self) }
      "[" + (position + 1).to_s + "]"
    end

    # Parent path followed by "/name" and my sibling index, if any.
    def full_path
      parent_path + '/' + expanded_name + child_index
    end
  end

  class Text
    # Parent path followed by "/text()". Text nodes have no name of their
    # own, so nothing else is appended.
    def full_path
      parent_path + '/text()'
    end
  end

  class Attribute
    # Path of my owning element followed by "/@name".
    def full_path
      element.full_path + '/@' + name
    end
  end
end
|
@@ -0,0 +1,372 @@
|
|
1
|
+
# A parser for SGML, using the derived class as static DTD.
|
2
|
+
#
|
3
|
+
# Taken from http://raa.ruby-lang.org/list.rhtml?name=html-parser-2
|
4
|
+
# This file seems to be included in the current install of Ruby,
|
5
|
+
# but with a bug related to attributes quoted with '"', so I have
|
6
|
+
# included it in the HTML package of this distribution
|
7
|
+
#
|
8
|
+
# Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
|
9
|
+
# Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
10
|
+
# License:: Same as Ruby's
|
11
|
+
# CVS ID: $Id: sgml-parser.rb,v 1.4 2004/09/24 23:28:55 jhannes Exp $
|
12
|
+
|
13
|
+
|
14
|
+
module HTML

  # Incremental SGML/HTML tokenizer. Text is supplied with +feed+ and
  # flushed with +close+; the parser then invokes handler methods
  # (<tt>start_TAG</tt>, <tt>do_TAG</tt>, <tt>end_TAG</tt>, +handle_data+,
  # +handle_comment+, ...). The handlers defined here do nothing; a
  # subclass overrides the ones it cares about.
  #
  # After each token +src_range+ holds the exclusive Range of offsets,
  # counted from the start of all text fed so far, that the token covered.
  class SGMLParser

    attr_reader :src_range

    # Regular expressions used for parsing:
    Interesting = /[&<]/
    Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                                '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                                '![^<>]*)?')

    Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
    Charref = /&#([0-9]+)[^0-9]/

    Starttagopen = /<[>a-zA-Z]/
    Endtagopen = /<\/[<>a-zA-Z]/
    Endbracket = /[<>]/
    Special = /<![^<>]*>/
    Commentopen = /<!--/
    Commentclose = /--[ \t\n]*>/
    Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
    Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                              '(\s*=\s*' +
                              "('[^']*'" +
                              '|"[^"]*"' +
                              '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')
    Endtagfind = /\s*\/\s*>/
    Entitydefs =
      {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

    # verbose:: when true, report_unbalanced prints diagnostics
    def initialize(verbose=false)
      @verbose = verbose
      reset
    end

    # Discard all buffered text and parser state.
    def reset
      # dup: the buffer is appended to in place, so it must not be a
      # (possibly frozen) literal.
      @rawdata = ''.dup
      @stack = []
      @lasttag = '???'
      @nomoretags = false
      @literal = false
      @offset = 0
      @ranges = []
    end

    # Return the buffered text covered by +range+, where +range+ is
    # expressed in the same absolute offsets as +src_range+. Only text
    # still held in the buffer can be returned.
    def get_source(range)
      start = range.first
      end_index = range.end
      exclusive = range.exclude_end?
      offset_range = Range.new(start-@offset, end_index-@offset, exclusive)
      return @rawdata[offset_range]
    end

    # Record the token spanning buffer indexes start...end_index as
    # +src_range+, translated to absolute offsets.
    def set_range(start, end_index)
      @src_range = Range.new(start+@offset, end_index+@offset, true)
    end

    # Return true if an element named +gi+ is currently open.
    def has_context(gi)
      @stack.include? gi
    end

    # Treat all remaining input as plain data.
    def setnomoretags
      @nomoretags = true
      @literal = true
    end

    # Enter literal mode: start tags, comments and specials are passed
    # through as data until the next end tag is parsed.
    def setliteral(*args)
      @literal = true
    end

    # Append +data+ to the buffer and parse as much as is complete.
    def feed(data)
      @rawdata << data
      goahead(false)
    end

    # Parse whatever is left in the buffer, treating it as complete.
    def close
      goahead(true)
    end

    # Report rawdata[start...end_index] via handle_data (if non-empty) and
    # return end_index.
    def handle_data_range(rawdata, start, end_index)
      if end_index > start
        set_range(start, end_index)
        handle_data(rawdata[start...end_index])
      end
      return end_index
    end

    # Main loop: consume the buffer token by token. When +_end+ is false
    # an incomplete trailing construct is left in the buffer for the next
    # feed; when true it is flushed as data.
    def goahead(_end)
      rawdata = @rawdata
      i = 0
      n = rawdata.length
      while i < n
        if @nomoretags
          i = handle_data_range(rawdata, i, n)
          break
        end
        j = rawdata.index(Interesting, i)
        j = n unless j
        i = handle_data_range(rawdata, i, j)
        break if (i == n)
        if rawdata[i] == ?<
          if rawdata.index(Starttagopen, i) == i
            if @literal
              i = handle_data_range(rawdata, i, i+1)
              next
            end
            # parse_starttag returns the index just past the tag
            k = parse_starttag(i)
            break unless k
            i = k
            next
          end
          if rawdata.index(Endtagopen, i) == i
            # parse_endtag returns the index just past the tag
            k = parse_endtag(i)
            break unless k
            i = k
            @literal = false
            next
          end
          if rawdata.index(Commentopen, i) == i
            if @literal
              i = handle_data_range(rawdata, i, i+1)
              next
            end
            # parse_comment returns the number of characters consumed
            k = parse_comment(i)
            break unless k
            i += k
            next
          end
          if rawdata.index(Special, i) == i
            if @literal
              i = handle_data_range(rawdata, i, i+1)
              next
            end
            # parse_special returns the number of characters consumed
            k = parse_special(i)
            break unless k
            i += k
            next
          end
        elsif rawdata[i] == ?&
          if rawdata.index(Charref, i) == i
            end_index = i + $&.length
            # The regexp consumes one terminating character; keep it
            # only when it is the ';'.
            end_index -= 1 unless rawdata[end_index-1] == ?;
            set_range(i, end_index)
            handle_charref($1)
            i = end_index
            next
          end
          if rawdata.index(Entityref, i) == i
            end_index = i + $&.length
            end_index -= 1 unless rawdata[end_index-1] == ?;
            set_range(i, end_index)
            handle_entityref($1)
            i = end_index
            next
          end
        else
          raise RuntimeError, 'neither < nor & ??'
        end
        # We get here only if incomplete matches but
        # nothing else
        match = rawdata.index(Incomplete, i)
        unless match == i
          i = handle_data_range(rawdata, i, i+1)
          next
        end
        j = match + $&.length
        break if j == n # Really incomplete
        i = handle_data_range(rawdata, i, j)
      end
      # end while
      if _end and i < n
        i = handle_data_range(rawdata, i, n)
      end
      # Drop what was consumed and remember how far into the overall
      # input the buffer now starts.
      @rawdata = rawdata[i..-1]
      @offset += i
    end

    # Parse the comment starting at buffer index +i+. Returns the number
    # of characters consumed, or nil if the comment is not yet closed.
    def parse_comment(i)
      rawdata = @rawdata
      if rawdata[i, 4] != '<!--'
        raise RuntimeError, 'unexpected call to handle_comment'
      end
      match = rawdata.index(Commentclose, i)
      return nil unless match
      matched_length = $&.length
      j = match
      src_length = match + matched_length - i
      set_range(i, i + src_length)
      handle_comment(rawdata[i+4..(j-1)])
      return src_length
    end

    # Parse the start tag at buffer index +i+ and dispatch it through
    # finish_starttag. Returns the buffer index just past the tag, or nil
    # if the tag's closing bracket has not arrived yet.
    def parse_starttag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      attrs = []
      if rawdata[i+1] == ?>
        # SGML shorthand: <> == <last open tag seen>
        k = j
        tag = @lasttag
      else
        match = rawdata.index(Tagfind, i + 1)
        unless match
          raise RuntimeError, 'unexpected call to parse_starttag'
        end
        k = i + 1 + ($&.length)
        tag = $&.downcase
        @lasttag = tag
      end
      while k < j
        # Stop at a self-closing "/>" only when it sits exactly at the
        # current position; an unanchored search would be tripped by any
        # "/>" later in the buffer and drop this tag's attributes.
        break if rawdata.index(Endtagfind, k) == k
        break unless rawdata.index(Attrfind, k)
        matched_length = $&.length
        attrname, rest, attrvalue = $1, $2, $3
        if not rest
          attrvalue = '' # was: = attrname
        elsif (attrvalue[0] == ?' && attrvalue[-1] == ?') or
              (attrvalue[0] == ?" && attrvalue[-1] == ?")
          attrvalue = attrvalue[1..-2]
        end
        attrs << [attrname.downcase, attrvalue]
        k += matched_length
      end
      if rawdata[j] == ?>
        j += 1
      end
      set_range(i, j)
      finish_starttag(tag, attrs)
      return j
    end

    # Parse the end tag at buffer index +i+ and dispatch it through
    # finish_endtag. Returns the buffer index just past the tag, or nil
    # if the tag's closing bracket has not arrived yet.
    def parse_endtag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      tag = (rawdata[i+2..j-1].strip).downcase
      if rawdata[j] == ?>
        j += 1
      end
      set_range(i, j)
      finish_endtag(tag)
      return j
    end

    # Dispatch a start tag. Returns 1 if a <tt>start_TAG</tt> handler
    # exists (the tag is pushed on the stack), 0 if only a <tt>do_TAG</tt>
    # handler exists, and -1 after calling unknown_starttag.
    def finish_starttag(tag, attrs)
      method = 'start_' + tag
      if self.respond_to?(method)
        @stack << tag
        handle_starttag(tag, method, attrs)
        return 1
      else
        method = 'do_' + tag
        if self.respond_to?(method)
          handle_starttag(tag, method, attrs)
          return 0
        else
          unknown_starttag(tag, attrs)
          return -1
        end
      end
    end

    # Dispatch an end tag. An empty +tag+ closes the innermost open
    # element. Otherwise every element from the matching stack entry
    # upward is closed; a tag that is not on the stack is reported through
    # unknown_endtag unless an <tt>end_TAG</tt> handler exists.
    def finish_endtag(tag)
      if tag == ''
        found = @stack.length - 1
        if found < 0
          unknown_endtag(tag)
          return
        end
      else
        unless @stack.include? tag
          method = 'end_' + tag
          unless self.respond_to?(method)
            unknown_endtag(tag)
          end
          return
        end
        found = @stack.index(tag) #or @stack.length
      end
      while @stack.length > found
        tag = @stack[-1]
        method = 'end_' + tag
        if respond_to?(method)
          handle_endtag(tag, method)
        else
          unknown_endtag(tag)
        end
        @stack.pop
      end
    end

    # Parse the "<!...>" construct at buffer index +i+. Returns the number
    # of characters consumed, or nil if no closing bracket is buffered.
    def parse_special(i)
      rawdata = @rawdata
      match = rawdata.index(Endbracket, i+1)
      return nil unless match
      matched_length = $&.length
      src_length = match - i + matched_length
      set_range(i, i + src_length)
      handle_special(rawdata[i+1..(match-1)])
      return src_length
    end

    # Invoke the named start-tag handler with the attribute list.
    def handle_starttag(tag, method, attrs)
      self.send(method, attrs)
    end

    # Invoke the named end-tag handler.
    def handle_endtag(tag, method)
      self.send(method)
    end

    # In verbose mode, print a diagnostic for an unbalanced end tag
    # together with the current stack of open elements.
    def report_unbalanced(tag)
      if @verbose
        print '*** Unbalanced </' + tag + '>', "\n"
        print '*** Stack:', @stack.inspect, "\n"
      end
    end

    # Handle a numeric character reference. +name+ holds the decimal
    # digits. Codes 0..255 are passed to handle_data as a one-character
    # String; anything else goes to unknown_charref.
    def handle_charref(name)
      # Base 10 explicitly: without it a leading zero selects octal.
      n = Integer(name, 10)
      if !(0 <= n && n <= 255)
        unknown_charref(name)
        return
      end
      handle_data(n.chr)
    end

    # Handle a named entity reference: entities listed in Entitydefs are
    # passed to handle_data, others to unknown_entityref.
    def handle_entityref(name)
      table = Entitydefs
      if table.include?(name)
        handle_data(table[name])
      else
        unknown_entityref(name)
        return
      end
    end

    # The remaining handlers are no-op hooks for subclasses.

    def handle_data(data)
    end

    def handle_comment(data)
    end

    def handle_special(data)
    end

    def unknown_starttag(tag, attrs)
    end
    def unknown_endtag(tag)
    end
    def unknown_charref(ref)
    end
    def unknown_entityref(ref)
    end

  end
end
|