htmltools 1.10
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +58 -0
- data/README +162 -0
- data/demo/degolive.rb +89 -0
- data/demo/ebaySearch.rb +93 -0
- data/demo/xpath.rb +62 -0
- data/lib/html/element.rb +323 -0
- data/lib/html/rexml-nodepath.rb +49 -0
- data/lib/html/sgml-parser.rb +372 -0
- data/lib/html/stparser.rb +280 -0
- data/lib/html/tags.rb +288 -0
- data/lib/html/tree.rb +140 -0
- data/lib/html/xmltree.rb +173 -0
- data/lib/html/xpath.rb +72 -0
- data/test/suite.rb +5 -0
- data/test/tc_html-element.rb +73 -0
- data/test/tc_html-tree.rb +201 -0
- data/test/tc_source-parser.rb +160 -0
- data/test/tc_stacking-parser.rb +196 -0
- data/test/tc_xpath.rb +87 -0
- metadata +58 -0
@@ -0,0 +1,280 @@
|
|
1
|
+
# This is an SGMLParser subclass that knows about HTML 4.0 rules
|
2
|
+
# and can spot empty tags and deal with tags that may have omitted endtags.
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
|
5
|
+
# Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
6
|
+
# License:: Ruby's License
|
7
|
+
# CVS ID:: $Id: stparser.rb,v 1.4 2004/09/24 23:28:55 jhannes Exp $
|
8
|
+
|
9
|
+
require 'html/sgml-parser'
|
10
|
+
require 'html/tags'
|
11
|
+
|
12
|
+
module HTML
  # An SGMLParser subclass that keeps a stack of the currently open tags and
  # uses the HTML::Tag table to spot empty elements and elements whose end
  # tag was omitted.
  #
  # Subclasses receive the parse events through the +handle_*+ callbacks
  # defined at the bottom of this class.
  class StackingParser < SGMLParser
    # accessors

    # The stack of open tag names (stored lower-cased), outermost first.
    def stack; @tagStack; end

    # Name of the innermost open tag, or 'html' when the stack is empty.
    def last_tag; @tagStack[-1] || 'html'; end

    # Name of the tag enclosing the innermost open tag, or 'html' when there
    # is no such tag.
    def parent_tag; @tagStack[-2] || 'html'; end

    # Turn whitespace stripping of character data and comments on or off.
    def strip_whitespace=(flag); @stripWhitespace = flag; end

    # input methods

    # Open and parse the given file, feeding it in 64 KiB chunks.
    def parse_file_named(name)
      File.open(name) { |f|
        while bytes = f.read(65536)
          feed(bytes)
        end
      }
    end

    # Feed some more data to the parser.
    #
    # skip_script may push the text following a script body back into
    # @saved; keep re-feeding until nothing is left over.
    def feed(string)
      super
      until @saved.empty?
        pending = @saved
        @saved = ''
        super(pending)
      end
    end

    # available only to subclasses
    private

    # Debug trace, indented by the current stack depth. Only active when
    # $DEBUG was set at the time this file was loaded and the parser is
    # verbose; otherwise a no-op.
    if $DEBUG
      def dprint(*stuff)
        # Splat the arguments: printing the Array itself would emit its
        # inspect form (["..."]) on Ruby 1.9 and later.
        print((" " * @tagStack.size), *stuff) if @verbose
      end
    else
      def dprint(*stuff); end
    end

    # Write +msg+ to $stderr, but only when the parser is verbose.
    def warn(msg)
      $stderr.print(msg) if @verbose
    end

    # verbose::     passed on to SGMLParser; also gates warn and dprint.
    # strip_white:: initial value of the whitespace-stripping flag.
    def initialize(verbose=false, strip_white=false)
      super(verbose)
      @tagStack = []
      @saved = ''
      @stripWhitespace = strip_white
    end

    # handle_data will call this while inside a SCRIPT element.
    #
    # The script body ends at the first "</" followed by a letter. Everything
    # before that goes to handle_script; the remainder is stored in @saved so
    # that feed parses it as ordinary markup.
    def skip_script(data)
      # is the end of the script in this buffer?
      if m = data.index(%r{</[A-Za-z]})
        # NOTE(review): @nomoretags is presumably read by SGMLParser to
        # suspend tag recognition -- confirm in sgml-parser.rb.
        @nomoretags = false
        @saved = data[m..-1]
        handle_script(data[0,m])    # call user handler
      else
        handle_script(data)
      end
    end

    # Unfortunately, sgml-parser calls this and there's important work to do in
    # it. So the user handler has to be named something different
    # (handle_cdata, or handle_script for script bodies).
    def handle_data(data)
      # need to handle scripts
      if last_tag() == 'script' && @nomoretags
        skip_script(data)
      else
        if @stripWhitespace
          begin
            data.strip! if HTML::Tag.named(last_tag()).can_ignore_whitespace
          rescue NoSuchHTMLTagError
            # unknown enclosing tag: strip anyway
            data.strip!
          end
        end
        handle_cdata(data) if data.size > 0   # call user handler
      end
    end

    # Called for every start tag. Closes the innermost open element first if
    # it cannot contain +tag+, pushes +tag+, and dispatches to the matching
    # user handler.
    def finish_starttag(tag, attrs)
      dprint "*START* #{tag} #{attrs.inspect}\n"
      # dprint "-START- #{tag}\n"
      begin
        unless HTML::Tag.named(last_tag()).can_contain(tag, parent_tag())
          dprint "-INSERT-\n"
          finish_endtag(last_tag())
        end
      rescue NoSuchHTMLTagError
        # hmm.. last_tag was unknown.
        # Assume it doesn't have an optional endtag.
      end

      push(tag)

      begin
        if HTML::Tag.named(tag).is_empty_element
          dprint "-EMPTY-\n"
          handle_empty_tag(tag, attrs)    # call user handler
          drop_to_tag(tag)
        else
          handle_start_tag(tag, attrs)    # call user handler
        end

        if tag.downcase == 'script'
          @nomoretags = true
        end
      rescue NoSuchHTMLTagError
        # hmm... the start tag is unknown.
        # And we pushed it.
        # If it's empty, we'll get rid of it at the next end tag.
        handle_unknown_tag(tag, attrs)
      end
    end

    # Pop the stack down to and including the innermost open +tag+.
    #
    # Every other element popped on the way is reported through
    # handle_end_tag when its end tag may be omitted, or through
    # handle_missing_end_tag when it may not. Elements unknown to HTML::Tag
    # are popped silently.
    #
    # Returns true if +tag+ was open; otherwise calls handle_extra_end_tag
    # and returns false.
    def drop_to_tag(tag)
      # The stack holds lower-cased names, so compare against the lower-cased
      # form throughout (the lookup and the comparison used to disagree).
      name = tag.downcase
      index = @tagStack.rindex(name)
      if index.nil?   # got an end tag but we haven't seen start tag?
        handle_extra_end_tag(tag)   # call user handler
        return false
      end

      (@tagStack.size - index).times do
        begin
          # detect missing end tag
          if last_tag != name
            if HTML::Tag.named(last_tag).can_omit_end_tag
              handle_end_tag(last_tag)
            else
              handle_missing_end_tag(last_tag)  # call user handler
            end
          end
        rescue NoSuchHTMLTagError
          # oops, don't recognize last_tag.
        end
        pop
      end
      true
    end

    # Called for every end tag (and for implied ones from finish_starttag).
    # Unwinds the stack and reports the end tag unless it was extra.
    def finish_endtag(tag)
      dprint "*END* #{tag}\n"
      if drop_to_tag(tag)
        dprint "-END- #{tag} #{@tagStack.inspect}\n"
        handle_end_tag(tag)   # call user handler
      end
    end

    # Push the lower-cased tag name onto the stack.
    def push(tag)
      @tagStack.push(tag.downcase)
      dprint "*PUSH* #{tag} => #{@tagStack.inspect}\n"
    end

    # Pop and return the innermost tag name (nil if the stack is empty).
    def pop
      tag = @tagStack.pop
      dprint "*POP* #{tag} => #{@tagStack.inspect}\n"
      tag
    end

    # Forward unknown character references to the user handler.
    def unknown_charref(name)
      handle_unknown_character(name)
    end

    # Forward unknown entity references to the user handler.
    def unknown_entityref(name)
      handle_unknown_entity(name)
    end

    # callbacks: can be overridden in subclasses

    # Called for a start tag of a known, non-empty element.
    def handle_start_tag(tag, attrs)
    end

    # Called when an element is closed, explicitly or implicitly.
    def handle_end_tag(tag)
    end

    # by default, an empty tag is handled as a start tag
    # with an inserted end tag.
    def handle_empty_tag(tag, attrs)
      handle_start_tag(tag, attrs)
      handle_end_tag(tag)
    end

    # Called for a start tag that HTML::Tag does not know.
    def handle_unknown_tag(tag, attrs)
      warn("warning: unknown tag #{tag}\n")
    end

    # Called when an element whose end tag is required is closed implicitly.
    def handle_missing_end_tag(tag)
      warn("warning: missing end tag </#{tag}>\n")
    end

    # Called for an end tag that has no matching open element.
    def handle_extra_end_tag(tag)
      warn("warning: extra end tag </#{tag}>\n")
    end

    # Called with character data outside of scripts.
    def handle_cdata(data)
    end

    # Called with (a chunk of) the body of a SCRIPT element.
    def handle_script(data)
    end

    # Called with the name of an unknown character reference.
    def handle_unknown_character(name)
    end

    # Called with the name of an unknown entity reference.
    def handle_unknown_entity(name)
    end

    # call super if you want the data stripped
    def handle_comment(data)
      data.strip! if @stripWhitespace
    end

    def handle_special(data)
    end

  end
end
|
231
|
+
|
232
|
+
# test script
|
233
|
+
if $0 == __FILE__
  $stdout.sync = true

  # Demo subclass: prints start tags, non-blank text (prefixed with the open
  # tag stack), unknown references, comments and specials.
  class TestStackingParser < HTML::StackingParser
    # Print the open tag stack as "outer/inner/".
    def dump_stack
      stack.each { |ea| print ea, '/' }
    end

    def handle_start_tag(tag, attrs)
      print("START: #{tag} #{attrs.inspect}\n")
    end

    def handle_end_tag(tag)
      # print("END: #{tag}\n")
    end

    def handle_empty_tag(tag, attrs)
      # print("EMPTY: #{tag} #{attrs.inspect}\n")
    end

    # Print text content, skipping style sheets and blank runs.
    def handle_cdata(data)
      # print("DATA: #{data.size} chars\n")
      if last_tag() != 'style'
        str = data.strip
        if str.size > 0
          dump_stack
          print(str.inspect, "\n")
        end
      end
    end

    def handle_script(data)
      # print("SCRIPT: #{data.size} chars\n")
    end

    def handle_unknown_character(name)
      print("UNKC: #{name}\n")
    end

    def handle_unknown_entity(name)
      print("UNKE: #{name}\n")
    end

    def handle_comment(data)
      super   # strips the comment when whitespace stripping is enabled
      print("COMMENT: #{data}\n")
    end

    def handle_special(data)
      print("SPECIAL: #{data}\n")
    end
  end

  $DEBUG = false
  # Named `parser` rather than `p` so that Kernel#p is not shadowed.
  parser = TestStackingParser.new(true)
  parser.parse_file_named(ARGV[0] || 'ebay.html')
end
|
data/lib/html/tags.rb
ADDED
@@ -0,0 +1,288 @@
|
|
1
|
+
# This encodes the knowledge of HTML 4.0 tags for a parser.
|
2
|
+
# It knows about block vs. inline tags, empty tags, and optionally
|
3
|
+
# omitted end tags.
|
4
|
+
#
|
5
|
+
# Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
|
6
|
+
# Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
7
|
+
# License:: Ruby's license
|
8
|
+
# CVS ID:: $Id: tags.rb,v 1.4 2004/09/24 23:28:55 jhannes Exp $
|
9
|
+
|
10
|
+
# This is the error raised by <tt>HTML::Tag.named()</tt> when a tag doesn't
# exist in the tag table. The exception message is the tag name looked up.
class NoSuchHTMLTagError < RuntimeError
end
|
13
|
+
|
14
|
+
# This is the base class for all the HTML tag classes.
|
15
|
+
module HTML

  # Base class for all the HTML tag descriptors. One instance exists per
  # known tag; look them up with Tag.named.
  class Tag

    # tag_name:: a String, the name of the tag
    # can_omit:: a Boolean, true if end tag is optional
    def initialize(tag_name, can_omit)
      @name = tag_name.downcase
      @can_omit_end = can_omit
    end

    # Return my tag name (lower-cased).
    def name; @name; end

    # Return true if my end tag can be omitted.
    def can_omit_end_tag; @can_omit_end; end

    # Return true if I am a block element.
    def is_block_element; false; end

    # Return true if I am an inline element.
    def is_inline_element; false; end

    # Return true if I am an empty element.
    def is_empty_element; false; end

    # Return true if I can contain <tt>tag</tt> if my parent is of type <tt>parent</tt>.
    # tag:: tag name, a String
    # parent:: parent tag name, a String.
    def can_contain(tag, parent); false; end

    # Return true if whitespace within me can be omitted (ignoring browser
    # bugs)
    def can_ignore_whitespace; true; end
  end

  # This represents an HTML block element.
  class BlockTag < Tag
    def is_block_element; true; end

    # Blocks can contain anything, so return true.
    def can_contain(tag, parent); true; end
  end

  # This represents an HTML inline element.
  class InlineTag < Tag
    def is_inline_element; true; end

    # Inlines can only contain other inlines.
    # Raises NoSuchHTMLTagError if +tag+ is unknown.
    def can_contain(tag, parent)
      Tag.named(tag).is_inline_element
    end
  end

  # This represents an HTML element that can be regarded as either a block
  # or an inline element.
  class BlockOrInlineTag < InlineTag

    def is_block_element; true; end

    # If used as inline elements (e.g., within another inline element or a P),
    # these elements should not contain any block-level elements.
    # Raises NoSuchHTMLTagError if +parent+ or +tag+ has to be looked up and
    # is unknown.
    def can_contain(tag, parent)
      used_inline = parent.downcase == 'p' || Tag.named(parent).is_inline_element
      used_inline && !Tag.named(tag).is_block_element
    end
  end

  # This represents an HTML tag that never has an end tag.
  class EmptyTag < Tag
    def is_empty_element; true; end
    def is_inline_element; true; end
    def can_contain(tag, parent); false; end
  end

  # This block initializes the tag lookup table.
  class Tag
    @table = Hash.new

    # Add the given tag to the tag lookup table.
    #
    # This can be called by user code to add otherwise unknown tags to the
    # table. The tag is registered under both its upper-case and its
    # lower-case name.
    #
    # name:: the tag name, a String.
    # is_block:: true if I am a block element.
    # is_inline:: true if I am an inline element.
    # is_empty:: true if I am an empty element (takes precedence; the end
    #            tag of an empty element is always treated as omissible).
    # can_omit:: true if my end tag can be omitted.
    def self.add_tag(name, is_block, is_inline, is_empty, can_omit)
      tag_object =
        if is_empty
          EmptyTag.new(name, true)
        elsif is_block && is_inline
          BlockOrInlineTag.new(name, can_omit)
        elsif is_block
          BlockTag.new(name, can_omit)
        else
          InlineTag.new(name, can_omit)
        end
      @table[ name.upcase ] = @table[ name.downcase ] = tag_object
    end

    # Return a Tag with the given name, or raise a
    # NoSuchHTMLTagError.
    #
    # The lookup is case-insensitive: names that are not stored verbatim
    # (e.g. 'Div') are retried in lower case.
    def self.named(tagname)
      @table[ tagname ] || @table[ tagname.to_s.downcase ] ||
        raise(NoSuchHTMLTagError.exception(tagname))
    end

    #                 Block   Inline  Empty   can_omit_end
    [
      [ 'A',          false,  true,   false,  false ],  # Anchor
      [ 'ABBR',       false,  true,   false,  false ],  # Abbreviation
      [ 'ACRONYM',    false,  true,   false,  false ],  # Acronym
      [ 'ADDRESS',    true,   false,  false,  false ],  # Address
      [ 'APPLET',     true,   true,   false,  false ],  # Java applet
      [ 'AREA',       true,   false,  true,   true  ],  # Image map region
      [ 'B',          false,  true,   false,  false ],  # Bold text
      [ 'BASE',       false,  false,  true,   true  ],  # Document base URI
      [ 'BASEFONT',   false,  true,   true,   true  ],  # Base font change
      [ 'BDO',        false,  true,   false,  false ],  # Bi_di override
      [ 'BIG',        false,  true,   false,  false ],  # Large text
      [ 'BLOCKQUOTE', true,   false,  false,  false ],  # Block quotation
      [ 'BODY',       true,   false,  false,  false ],  # Document body
      [ 'BR',         false,  true,   true,   true  ],  # Line break
      [ 'BUTTON',     true,   true,   false,  false ],  # Button
      [ 'CAPTION',    false,  true,   false,  false ],  # Table caption
      # CENTER is block-level in HTML 4.01 (shorthand for DIV align=center),
      # so it must be able to hold tables, paragraphs, etc.
      [ 'CENTER',     true,   false,  false,  false ],  # Centered block
      [ 'CITE',       false,  true,   false,  false ],  # Citation
      [ 'CODE',       false,  true,   false,  false ],  # Computer code
      [ 'COL',        false,  false,  true,   true  ],  # Table column
      [ 'COLGROUP',   true,   false,  false,  true  ],  # Table column group
      [ 'DD',         true,   false,  false,  true  ],  # Definition description
      [ 'DEL',        true,   true,   false,  false ],  # Deleted text
      [ 'DFN',        false,  true,   false,  false ],  # Defined term
      [ 'DIR',        true,   false,  false,  false ],  # Directory list
      [ 'DIV',        true,   false,  false,  false ],  # Generic block-level container
      [ 'DL',         true,   false,  false,  false ],  # Definition list
      [ 'DT',         false,  true,   false,  true  ],  # Definition term
      [ 'EM',         false,  true,   false,  false ],  # Emphasis
      [ 'FIELDSET',   true,   false,  false,  false ],  # Form control group
      [ 'FONT',       false,  true,   false,  false ],  # Font change
      [ 'FORM',       true,   false,  false,  false ],  # Interactive form
      [ 'FRAME',      false,  false,  true,   true  ],  # Frame
      [ 'FRAMESET',   true,   false,  false,  false ],  # Frameset
      [ 'H1',         true,   false,  false,  false ],  # Level-one heading
      [ 'H2',         true,   false,  false,  false ],  # Level-two heading
      [ 'H3',         true,   false,  false,  false ],  # Level-three heading
      [ 'H4',         true,   false,  false,  false ],  # Level-four heading
      [ 'H5',         true,   false,  false,  false ],  # Level-five heading
      [ 'H6',         true,   false,  false,  false ],  # Level-six heading
      [ 'HEAD',       true,   false,  false,  false ],  # Document head
      [ 'HR',         false,  true,   true,   true  ],  # Horizontal rule
      [ 'HTML',       true,   false,  false,  false ],  # HTML document
      [ 'I',          false,  true,   false,  false ],  # Italic text
      [ 'IFRAME',     true,   true,   false,  false ],  # Inline frame
      [ 'IMG',        false,  true,   true,   true  ],  # Inline image
      [ 'INPUT',      false,  true,   true,   true  ],  # Form input
      [ 'INS',        true,   true,   false,  false ],  # Inserted text
      [ 'ISINDEX',    false,  true,   true,   true  ],  # Input prompt
      [ 'KBD',        false,  true,   false,  false ],  # Text to be input
      [ 'LABEL',      false,  true,   false,  false ],  # Form field label
      [ 'LEGEND',     false,  true,   false,  false ],  # Fieldset caption
      [ 'LI',         true,   false,  false,  true  ],  # List item
      # LINK is declared EMPTY in HTML 4.01 (end tag forbidden), like META.
      [ 'LINK',       false,  false,  true,   true  ],  # Document relationship
      [ 'MAP',        true,   true,   false,  false ],  # Image map
      [ 'MENU',       true,   false,  false,  false ],  # Menu list
      [ 'META',       false,  true,   true,   true  ],  # Metadata
      [ 'NOFRAMES',   true,   false,  false,  false ],  # Frames alternate content
      [ 'NOSCRIPT',   true,   false,  false,  false ],  # Alternate script content
      [ 'OBJECT',     true,   true,   false,  false ],  # Object
      [ 'OL',         true,   false,  false,  false ],  # Ordered list
      [ 'OPTGROUP',   true,   false,  false,  false ],  # Option group
      [ 'OPTION',     true,   false,  false,  false ],  # Menu option
      [ 'P',          true,   false,  false,  true  ],  # Paragraph
      [ 'PARAM',      false,  true,   true,   true  ],  # Object parameter
      [ 'PRE',        true,   false,  false,  false ],  # Preformatted text
      [ 'Q',          false,  true,   false,  false ],  # Short quotation
      [ 'S',          false,  true,   false,  false ],  # Strike-through text
      [ 'SAMP',       false,  true,   false,  false ],  # Sample output
      [ 'SCRIPT',     true,   true,   false,  false ],  # Client-side script
      [ 'SELECT',     true,   false,  false,  false ],  # Option selector
      [ 'SMALL',      false,  true,   false,  false ],  # Small text
      [ 'SPAN',       false,  true,   false,  false ],  # Generic inline container
      [ 'STRIKE',     false,  true,   false,  false ],  # Strike-through text
      [ 'STRONG',     false,  true,   false,  false ],  # Strong emphasis
      [ 'STYLE',      true,   false,  false,  false ],  # Embedded style sheet
      [ 'SUB',        false,  true,   false,  false ],  # Subscript
      [ 'SUP',        false,  true,   false,  false ],  # Superscript
      [ 'TABLE',      true,   false,  false,  false ],  # Table
      [ 'TBODY',      true,   false,  false,  false ],  # Table body
      [ 'TD',         true,   false,  false,  true  ],  # Table data cell
      [ 'TEXTAREA',   false,  true,   false,  false ],  # Multi-line text input
      [ 'TFOOT',      true,   false,  false,  true  ],  # Table foot
      [ 'TH',         true,   false,  false,  true  ],  # Table header cell
      [ 'THEAD',      true,   false,  false,  true  ],  # Table head
      [ 'TITLE',      true,   false,  false,  false ],  # Document title
      [ 'TR',         true,   false,  false,  true  ],  # Table row
      [ 'TT',         false,  true,   false,  false ],  # Teletype text
      [ 'U',          false,  true,   false,  false ],  # Underlined text
      [ 'UL',         true,   false,  false,  false ],  # Unordered list
      [ 'VAR',        false,  true,   false,  false ],  # Variable
    ].each { |a| add_tag(*a) }

    # EXCEPTIONS TODO
    # A, LABEL can't contain itself
    # several things (fonts, etc) can't be in PRE
    # SELECT can only have OPTGROUP or OPTION
    # TEXTAREA, OPTION only contains plain text
    # APPLET and OBJECT has PARAM+ followed by block and/or inline
    # BUTTON can't contain:
    #   A, INPUT, SELECT, TEXTAREA, LABEL, BUTTON, or IFRAME
    #   nor FORM, ISINDEX, and FIELDSET
    # IFRAME can only contain block elems if parent can
    # MAP can contain block+ *xor* AREA+
    # SCRIPT only contains a SCRIPT (that is, until /<\/[A-Za-z]/)
    # BODY must be in HTML or NOFRAMES
    # COL can only be in COLGROUP or TABLE
    # COLGROUP has only COL*, and can only be in TABLE
    # DIR, MENU can only contain LI+, none of which may contain block elems
    # DL must contain (DT|DD)+
    # DT and DD are only allowed in DL
    # FIELDSET contains LEGEND, (block|inline)*
    # FRAMESET contains (FRAMESET|FRAME), plus NOFRAMES and must be in HTML
    # H# can only be contained in block elems, but only contain inlines.
    # HEAD must only contain TITLE, BASE?, ISINDEX?, SCRIPT* STYLE* META* LINK*
    #   OBJECT* HEAD must be in HTML
    # HTML is top-level and can only contain HEAD, BODY, or HEAD, FRAMESET
    # LI can contain blocks except when inside DIR or MENU
    # LI can only be inside OL, UL, DIR, MENU
    # OL, UL can only contain LI+
    # OPTGROUP contains OPTION+
    # P can only contain inlines. However, it is a block-level elem.
    # PRE can only contain inlines except IMG, OBJECT, APPLET, BIG, SMALL, SUB,
    #   SUP, FONT, BASEFONT

    # tags with optional omitted endtags and their allowed contents:
    # anchor matches at beginning and end
    #
    # DD, LI, TD and TH hold flow content, which includes the headings
    # H1..H6, so their patterns accept digits after the first letter. P and
    # DT only hold inline content and keep the letters-only pattern, which
    # (as before) makes a heading close them.
    {
      'AREA'      => '(?!AREA)[A-Z]+',
      'COLGROUP'  => 'COL',
      'DD'        => '(?!D[DT]$)[A-Z][A-Z0-9]*',
      'DT'        => '(?!D[DT]$)[A-Z]+',
      'LI'        => '(?!LI$)[A-Z][A-Z0-9]*',
      'MAP'       => 'AREA',
      'P'         => '(?!P$)[A-Z]+',
      'TD'        => '(?!T[HDR]$)[A-Z][A-Z0-9]*',
      'TFOOT'     => 'TR',
      'TH'        => '(?!T[HDR]$)[A-Z][A-Z0-9]*',
      'THEAD'     => 'TR',
      'TR'        => 'T[HD]',
    }.each_pair { |tagname, pattern|
      tag_object = named(tagname)
      # Compile the pattern once and keep it on the tag object; the singleton
      # method below reads it, so no string eval is needed.
      tag_object.instance_variable_set(:@content_pattern, /\A#{pattern}\z/i)
      class << tag_object   # :nodoc:
        def can_contain(tag, parent)
          (@content_pattern =~ tag) == 0
        end
      end
    }

    # Whitespace is significant inside these elements.
    %w[TEXTAREA PRE OPTION].each { |tagname|
      class << named(tagname)   # :nodoc:
        def can_ignore_whitespace; false; end
      end
    }
  end
end
|