htmltools 1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,280 @@
1
+ # This is an SGMLParser subclass that knows about HTML 4.0 rules
2
+ # and can spot empty tags and deal with tags that may have omitted endtags.
3
+ #
4
+ # Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
5
+ # Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
6
+ # License:: Ruby's License
7
+ # CVS ID:: $Id: stparser.rb,v 1.4 2004/09/24 23:28:55 jhannes Exp $
8
+
9
+ require 'html/sgml-parser'
10
+ require 'html/tags'
11
+
12
+ module HTML
13
+ class StackingParser < SGMLParser
14
+ # accessors
15
+
16
+ def stack; @tagStack; end # the live Array of open tag names (push stores them lowercased), innermost last
17
+
18
+ def last_tag; @tagStack[-1] || 'html'; end # innermost open tag name; 'html' when the stack is empty
19
+
20
+ def parent_tag; @tagStack[-2] || 'html'; end # tag enclosing last_tag; 'html' when fewer than two tags are open
21
+
22
+ def strip_whitespace=(flag); @stripWhitespace = flag; end # switches the stripping done in handle_data and handle_comment
23
+
24
+ # input methods
25
+
26
+ # Open the file +name+ and pass its contents to feed, at most 65536 bytes per call.
27
+ def parse_file_named(name)
28
+ File.open(name) { |f| # block form: the file is closed when the block exits
29
+ while bytes = f.read(65536) # read returns nil at end of file, which ends the loop
30
+ feed(bytes)
31
+ end
32
+ }
33
+ end
34
+
35
+ # Feed some more data to the parser, then re-feed any text that skip_script parked in @saved.
36
+ def feed(string)
37
+ super # bare super passes +string+ on to the superclass feed
38
+ while @saved.size > 0 # a re-feed can park more text, so repeat until nothing is left
39
+ saved = @saved
40
+ @saved = '' # clear first so text parked during the re-feed is not lost or repeated
41
+ super(saved)
42
+ end
43
+ end
44
+
45
+ # available only to subclasses
46
+ private
47
+
48
+ if $DEBUG # tested once, while this class body is evaluated -- not on every call
49
+ def dprint(*stuff) # debug trace, indented one space per open tag
50
+ print((" " * @tagStack.size), stuff) if @verbose # silent unless @verbose is set
51
+ end
52
+ else
53
+ def dprint(*stuff); end # no-op variant defined when $DEBUG was off at load time
54
+ end
55
+
56
+ def warn(msg) # private (defined after `private`); takes the place of Kernel#warn inside this class
57
+ $stderr.print(msg) if @verbose # printed as given: callers supply the trailing newline
58
+ end
59
+
60
+ def initialize(verbose=false, strip_white=false) # verbose goes to the superclass; strip_white seeds @stripWhitespace
61
+ super(verbose)
62
+ @tagStack = [] # names of the currently open tags, innermost last
63
+ @saved = '' # text found after a script body, waiting for feed to re-parse it
64
+ @stripWhitespace = strip_white
65
+ end
66
+
67
+ # handle_data will call this for text received while inside a <script> element.
68
+ def skip_script(data)
69
+ # is the end of the script in this buffer? ("</" followed by a letter is taken as the end)
70
+ if m = data.index(%r{</[A-Za-z]})
71
+ @nomoretags = false # presumably lets SGMLParser recognise tags again -- confirm against sgml-parser
72
+ @saved = data[m..-1] # the end tag and everything after it; feed re-parses this
73
+ handle_script(data[0,m]) # call user handler
74
+ else
75
+ handle_script(data) # no end tag in this buffer: all of it is script text
76
+ end
77
+ end
78
+
79
+ # Unfortunately, sgml-parser calls this and there's important work to do in
80
+ # it. So the user handler has to be named something different: handle_cdata.
81
+ def handle_data(data)
82
+ # script text goes to skip_script (and on to handle_script) instead of handle_cdata
83
+ if last_tag() == 'script' && @nomoretags
84
+ skip_script(data)
85
+ else
86
+ if @stripWhitespace
87
+ begin
88
+ data.strip! if HTML::Tag.named(last_tag()).can_ignore_whitespace # strip! edits the caller's string in place
89
+ rescue NoSuchHTMLTagError
90
+ data.strip! # innermost tag is not in the table: strip anyway
91
+ end
92
+ end
93
+ handle_cdata(data) if data.size > 0 # call user handler
94
+ end
95
+ end
96
+
97
+ def finish_starttag(tag, attrs) # start-tag hook; presumably called by SGMLParser for each start tag -- confirm
98
+ dprint "*START* #{tag} #{attrs.inspect}\n"
99
+ # dprint "-START- #{tag}\n"
100
+ begin
101
+ unless HTML::Tag.named(last_tag()).can_contain(tag, parent_tag()) # implied end tag: innermost open tag cannot hold the new one
102
+ dprint "-INSERT-\n"
103
+ finish_endtag(last_tag()) # NOTE(review): only one enclosing tag is closed; the new innermost tag is not re-checked
104
+ end
105
+ rescue NoSuchHTMLTagError
106
+ # hmm.. last_tag was unknown (or can_contain looked up the new tag and that was unknown).
107
+ # Assume it doesn't have an optional endtag.
108
+ end
109
+
110
+ push(tag)
111
+
112
+ begin
113
+ if HTML::Tag.named(tag).is_empty_element
114
+ dprint "-EMPTY-\n"
115
+ handle_empty_tag(tag, attrs) # call user handler
116
+ drop_to_tag(tag) # pop it again at once; handle_end_tag is not called from here
117
+ else
118
+ handle_start_tag(tag, attrs) # call user handler
119
+ end
120
+
121
+ if tag.downcase == 'script'
122
+ @nomoretags = true # handle_data tests this flag to route script text to skip_script
123
+ end
124
+ rescue NoSuchHTMLTagError
125
+ # hmm... the start tag is unknown.
126
+ # And we pushed it.
127
+ # If it's empty, we'll get rid of it at the next end tag.
128
+ handle_unknown_tag(tag, attrs)
129
+ end
130
+ end
131
+
132
+ # Pop the stack down to and including the innermost +tag+. Returns false (after handle_extra_end_tag) if +tag+ is not open, else true.
133
+ def drop_to_tag(tag)
134
+ dropped = @tagStack.size - (@tagStack.rindex(tag.downcase) || @tagStack.size) # entries to pop, matching one included; 0 if not found
135
+ if dropped == 0 # got an end tag but we haven't seen start tag?
136
+ handle_extra_end_tag(tag) # call user handler
137
+ return false
138
+ end
139
+ dropped.times do
140
+ begin
141
+ # detect missing end tag. NOTE(review): compares against +tag+ as passed, while rindex above used tag.downcase -- confirm callers always pass lowercase
142
+ if last_tag != tag and ! HTML::Tag.named(last_tag).can_omit_end_tag
143
+ handle_missing_end_tag(last_tag) # call user handler
144
+ elsif last_tag != tag
145
+ handle_end_tag(last_tag) # end tag was optional: report it as a normal end
146
+ end
147
+ rescue NoSuchHTMLTagError
148
+ # oops, don't recognize last_tag. It is popped below without any handler being called.
149
+ end
150
+ pop
151
+ end
152
+ return true
153
+ end
154
+
155
+ def finish_endtag(tag) # end-tag hook; also called by finish_starttag to insert an implied end tag
156
+ dprint "*END* #{tag}\n"
157
+ if drop_to_tag(tag) # false means the tag was never opened; handle_extra_end_tag was already called
158
+ dprint "-END- #{tag} #{@tagStack.inspect}\n"
159
+ handle_end_tag(tag) # call user handler
160
+ end
161
+ end
162
+
163
+ def push(tag) # record +tag+ as the innermost open tag
164
+ @tagStack.push(tag.downcase) # stored lowercased
165
+ dprint "*PUSH* #{tag} => #{@tagStack.inspect}\n"
166
+ end
167
+
168
+ def pop # remove and return the innermost open tag name
169
+ tag = @tagStack.pop # Array#pop gives nil when the stack is already empty
170
+ dprint "*POP* #{tag} => #{@tagStack.inspect}\n"
171
+ tag
172
+ end
173
+
174
+ def unknown_charref(name) # forwards to the handle_unknown_character callback
175
+ handle_unknown_character(name)
176
+ end
177
+
178
+ def unknown_entityref(name) # forwards to the handle_unknown_entity callback
179
+ handle_unknown_entity(name)
180
+ end
181
+
182
+ # callbacks: can be overridden in subclasses (they follow `private`, so these defaults are private)
183
+
184
+ def handle_start_tag(tag, attrs) # called by finish_starttag for a known, non-empty tag
185
+ end
186
+
187
+ def handle_end_tag(tag) # called by finish_endtag, and by drop_to_tag for tags whose end tag may be omitted
188
+ end
189
+
190
+ # by default, an empty tag is handled as a start tag
191
+ # with an inserted end tag.
192
+ def handle_empty_tag(tag, attrs)
193
+ handle_start_tag(tag, attrs)
194
+ handle_end_tag(tag)
195
+ end
196
+
197
+ def handle_unknown_tag(tag, attrs) # called by finish_starttag when the tag is not in HTML::Tag's table
198
+ warn("warning: unknown tag #{tag}\n")
199
+ end
200
+
201
+ def handle_missing_end_tag(tag) # called by drop_to_tag for a tag closed without its required end tag
202
+ warn("warning: missing end tag </#{tag}>\n")
203
+ end
204
+
205
+ def handle_extra_end_tag(tag) # called by drop_to_tag for an end tag with no matching open tag
206
+ warn("warning: extra end tag </#{tag}>\n")
207
+ end
208
+
209
+ def handle_cdata(data) # text outside scripts; called by handle_data only when data is non-empty
210
+ end
211
+
212
+ def handle_script(data) # script text; called by skip_script
213
+ end
214
+
215
+ def handle_unknown_character(name) # called by unknown_charref
216
+ end
217
+
218
+ def handle_unknown_entity(name) # called by unknown_entityref
219
+ end
220
+
221
+ # call super if you want the data stripped
222
+ def handle_comment(data)
223
+ data.strip! if @stripWhitespace # in place, so an override sees the stripped string after calling super
224
+ end
225
+
226
+ def handle_special(data) # no-op default; not called from within this class
227
+ end
228
+
229
+ end
230
+ end
231
+
232
+ # test script: runs only when this file is executed directly
233
+ if $0 == __FILE__
234
+ $stdout.sync = true
235
+
236
+ class TestStackingParser < HTML::StackingParser
237
+ def dump_stack # print the open tags as a '/'-separated path
238
+ stack.each { |ea| print ea, '/' }
239
+ end
240
+ def handle_start_tag(tag, attrs)
241
+ print("START: #{tag} #{attrs.inspect}\n")
242
+ end
243
+ def handle_end_tag(tag)
244
+ # print("END: #{tag}\n")
245
+ end
246
+ def handle_empty_tag(tag, attrs)
247
+ # print("EMPTY: #{tag} #{attrs.inspect}\n")
248
+ end
249
+ def handle_cdata(data)
250
+ # print("DATA: #{data.size} chars\n")
251
+ if last_tag() != 'style' # text directly inside <style> is not printed
252
+ str = data.strip
253
+ if str.size > 0
254
+ dump_stack
255
+ print(str.inspect, "\n")
256
+ end
257
+ end
258
+ end
259
+ def handle_script(data)
260
+ # print("SCRIPT: #{data.size} chars\n")
261
+ end
262
+ def handle_unknown_character(name)
263
+ print("UNKC: #{name}\n")
264
+ end
265
+ def handle_unknown_entity(name)
266
+ print("UNKE: #{name}\n")
267
+ end
268
+ def handle_comment(data)
269
+ super # the base implementation strips data in place when stripping is enabled
270
+ print("COMMENT: #{data}\n")
271
+ end
272
+ def handle_special(data)
273
+ print("SPECIAL: #{data}\n")
274
+ end
275
+ end
276
+
277
+ $DEBUG = false # NOTE(review): HTML::StackingParser chose its dprint when its class body ran, so this cannot change it
278
+ p = TestStackingParser.new(true) # verbose on, whitespace stripping off; the local p hides Kernel#p in this scope
279
+ p.parse_file_named(ARGV[0] || 'ebay.html')
280
+ end
@@ -0,0 +1,288 @@
1
+ # This encodes the knowledge of HTML 4.0 tags for a parser.
2
+ # It knows about block vs. inline tags, empty tags, and optionally
3
+ # omitted end tags.
4
+ #
5
+ # Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
6
+ # Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
7
+ # License:: Ruby's license
8
+ # CVS ID:: $Id: tags.rb,v 1.4 2004/09/24 23:28:55 jhannes Exp $
9
+
10
+ # Raised by <tt>HTML::Tag.named()</tt> when a tag doesn't exist. Defined at top level, outside the HTML module.
11
+ class NoSuchHTMLTagError < RuntimeError
12
+ end
13
+
14
+ # Namespace for the HTML tag classes. HTML::Tag, defined first, is the base class for all of them.
15
+ module HTML
16
+
17
+ class Tag
18
+ # Base tag description. Every predicate below returns the most restrictive default; subclasses override them.
19
+ # tag_name:: a String, the name of the tag (stored lowercased)
20
+ # can_omit:: a Boolean, true if end tag is optional
21
+ def initialize(tag_name, can_omit)
22
+ @name = tag_name.downcase
23
+ @can_omit_end = can_omit
24
+ end
25
+
26
+ # Return my tag name (lowercase).
27
+ def name; @name; end
28
+
29
+ # Return true if my end tag can be omitted.
30
+ def can_omit_end_tag; @can_omit_end; end
31
+
32
+ # Return true if I am a block element.
33
+ def is_block_element; false; end
34
+
35
+ # Return true if I am an inline element.
36
+ def is_inline_element; false; end
37
+
38
+ # Return true if I am an empty element.
39
+ def is_empty_element; false; end
40
+
41
+ # Return true if I can contain <tt>tag</tt> if my parent is of type <tt>parent</tt>.
42
+ # tag:: tag name, a String
43
+ # parent:: parent tag name, a String.
44
+ def can_contain(tag, parent); false; end
45
+
46
+ # Return true if whitespace within me can be omitted (ignoring browser
47
+ # bugs)
48
+ def can_ignore_whitespace; true; end
49
+ end
50
+
51
+ # This represents an HTML block element.
52
+ class BlockTag < Tag
53
+ def is_block_element; true; end
54
+
55
+ # Blocks can contain anything, so return true (both arguments are ignored).
56
+ def can_contain(tag, parent); true; end
57
+ end
58
+
59
+ # This represents an HTML inline element.
60
+ class InlineTag < Tag
61
+ def is_inline_element; true; end
62
+
63
+ # Inlines can only contain other inlines. +parent+ is ignored; raises NoSuchHTMLTagError when +tag+ is unknown.
64
+ def can_contain(tag, parent)
65
+ Tag.named(tag).is_inline_element
66
+ end
67
+ end
68
+
69
+ # This represents an HTML element that can be regarded as either a block
70
+ # or an inline element.
71
+ class BlockOrInlineTag < InlineTag
72
+
73
+ def is_block_element; true; end # is_inline_element is inherited as true, so both predicates hold
74
+ # NOTE(review): as written this returns false whenever +parent+ is neither P nor inline, so in block context nothing can be contained -- confirm that is intended.
75
+ # If used as inline elements (e.g., within another inline element or a P),
76
+ # these elements should not contain any block-level elements.
77
+ def can_contain(tag, parent)
78
+ return ((parent.downcase == 'p' \
79
+ or Tag.named(parent).is_inline_element) \
80
+ and ! Tag.named(tag).is_block_element)
81
+ end
82
+ end
83
+
84
+ # This represents an HTML tag that never has an end tag.
85
+ class EmptyTag < Tag
86
+ def is_empty_element; true; end
87
+ def is_inline_element; true; end # every empty tag reports itself as inline, whatever the table's Inline column says
88
+ def can_contain(tag, parent); false; end # same as the Tag default, restated here
89
+ end
90
+
91
+ # This block initializes the tag lookup table.
92
+ class Tag
93
+ @table = Hash.new # class-level instance variable: tag name String (upper- and lower-case) => Tag object
94
+
95
+ # Add the given tag to the tag lookup table and return the new Tag object.
96
+ #
97
+ # This can be called by user code to add otherwise unknown tags to the
98
+ # table. The same object is stored under the all-upper and all-lower name.
99
+ #
100
+ # name:: the tag name, a String.
101
+ # is_block:: true if I am a block element.
102
+ # is_inline:: true if I am an inline element.
103
+ # is_empty:: true if I am an empty element (takes precedence over is_block and is_inline).
104
+ # can_omit:: true if my end tag can be omitted (ignored for empty elements, which always get true).
105
+ def Tag.add_tag(name, is_block, is_inline, is_empty, can_omit)
106
+ @table[ name.upcase ] = @table[ name.downcase ] = \
107
+ if is_empty
108
+ EmptyTag.new(name, true)
109
+ elsif is_block
110
+ if is_inline
111
+ BlockOrInlineTag.new(name, can_omit)
112
+ else
113
+ BlockTag.new(name, can_omit)
114
+ end
115
+ else
116
+ InlineTag.new(name, can_omit) # also the fallback when neither is_block nor is_inline is set
117
+ end
118
+ end
119
+
120
+ # Return the Tag registered under tagname, or raise a
121
+ # NoSuchHTMLTagError. add_tag registers only the all-upper and all-lower spellings, so 'Div' raises.
122
+ def Tag.named(tagname)
123
+ @table[ tagname ] || raise(NoSuchHTMLTagError.exception(tagname))
124
+ end
125
+
126
+ # Columns after the name: Block Inline Empty can_omit_end (the order of add_tag's parameters)
127
+ [
128
+ [ 'A', false, true, false, false ], # Anchor
129
+ [ 'ABBR', false, true, false, false ], # Abbreviation
130
+ [ 'ACRONYM', false, true, false, false ], # Acronym
131
+ [ 'ADDRESS', true, false, false, false ], # Address
132
+ [ 'APPLET', true, true, false, false ], # Java applet
133
+ [ 'AREA', true, false, true, true ], # Image map region
134
+ [ 'B', false, true, false, false ], # Bold text
135
+ [ 'BASE', false, false, true, true ], # Document base URI
136
+ [ 'BASEFONT', false, true, true, true ], # Base font change
137
+ [ 'BDO', false, true, false, false ], # Bi_di override
138
+ [ 'BIG', false, true, false, false ], # Large text
139
+ [ 'BLOCKQUOTE', true, false, false, false ], # Block quotation
140
+ [ 'BODY', true, false, false, false ], # Document body
141
+ [ 'BR', false, true, true, true ], # Line break
142
+ [ 'BUTTON', true, true, false, false ], # Button
143
+ [ 'CAPTION', false, true, false, false ], # Table caption
144
+ [ 'CENTER', false, true, false, false ], # Centered block -- NOTE(review): described as a block but flagged inline-only here; confirm
145
+ [ 'CITE', false, true, false, false ], # Citation
146
+ [ 'CODE', false, true, false, false ], # Computer code
147
+ [ 'COL', false, false, true, true ], # Table column
148
+ [ 'COLGROUP', true, false, false, true ], # Table column group
149
+ [ 'DD', true, false, false, true ], # Definition description
150
+ [ 'DEL', true, true, false, false ], # Deleted text
151
+ [ 'DFN', false, true, false, false ], # Defined term
152
+ [ 'DIR', true, false, false, false ], # Directory list
153
+ [ 'DIV', true, false, false, false ], # Generic block-level container
154
+ [ 'DL', true, false, false, false ], # Definition list
155
+ [ 'DT', false, true, false, true ], # Definition term
156
+ [ 'EM', false, true, false, false ], # Emphasis
157
+ [ 'FIELDSET', true, false, false, false ], # Form control group
158
+ [ 'FONT', false, true, false, false ], # Font change
159
+ [ 'FORM', true, false, false, false ], # Interactive form
160
+ [ 'FRAME', false, false, true, true ], # Frame
161
+ [ 'FRAMESET', true, false, false, false ], # Frameset
162
+ [ 'H1', true, false, false, false ], # Level-one heading
163
+ [ 'H2', true, false, false, false ], # Level-two heading
164
+ [ 'H3', true, false, false, false ], # Level-three heading
165
+ [ 'H4', true, false, false, false ], # Level-four heading
166
+ [ 'H5', true, false, false, false ], # Level-five heading
167
+ [ 'H6', true, false, false, false ], # Level-six heading
168
+ [ 'HEAD', true, false, false, false ], # Document head
169
+ [ 'HR', false, true, true, true ], # Horizontal rule
170
+ [ 'HTML', true, false, false, false ], # HTML document
171
+ [ 'I', false, true, false, false ], # Italic text
172
+ [ 'IFRAME', true, true, false, false ], # Inline frame
173
+ [ 'IMG', false, true, true, true ], # Inline image
174
+ [ 'INPUT', false, true, true, true ], # Form input
175
+ [ 'INS', true, true, false, false ], # Inserted text
176
+ [ 'ISINDEX', false, true, true, true ], # Input prompt
177
+ [ 'KBD', false, true, false, false ], # Text to be input
178
+ [ 'LABEL', false, true, false, false ], # Form field label
179
+ [ 'LEGEND', false, true, false, false ], # Fieldset caption
180
+ [ 'LI', true, false, false, true ], # List item
181
+ [ 'LINK', true, false, false, true ], # Document relationship -- NOTE(review): HTML 4.01 declares LINK as EMPTY, but Empty is false here; confirm
182
+ [ 'MAP', true, true, false, false ], # Image map
183
+ [ 'MENU', true, false, false, false ], # Menu list
184
+ [ 'META', false, true, true, true ], # Metadata
185
+ [ 'NOFRAMES', true, false, false, false ], # Frames alternate content
186
+ [ 'NOSCRIPT', true, false, false, false ], # Alternate script content
187
+ [ 'OBJECT', true, true, false, false ], # Object
188
+ [ 'OL', true, false, false, false ], # Ordered list
189
+ [ 'OPTGROUP', true, false, false, false ], # Option group
190
+ [ 'OPTION', true, false, false, false ], # Menu option
191
+ [ 'P', true, false, false, true ], # Paragraph
192
+ [ 'PARAM', false, true, true, true ], # Object parameter
193
+ [ 'PRE', true, false, false, false ], # Preformatted text
194
+ [ 'Q', false, true, false, false ], # Short quotation
195
+ [ 'S', false, true, false, false ], # Strike-through text
196
+ [ 'SAMP', false, true, false, false ], # Sample output
197
+ [ 'SCRIPT', true, true, false, false ], # Client-side script
198
+ [ 'SELECT', true, false, false, false ], # Option selector
199
+ [ 'SMALL', false, true, false, false ], # Small text
200
+ [ 'SPAN', false, true, false, false ], # Generic inline container
201
+ [ 'STRIKE', false, true, false, false ], # Strike-through text
202
+ [ 'STRONG', false, true, false, false ], # Strong emphasis
203
+ [ 'STYLE', true, false, false, false ], # Embedded style sheet
204
+ [ 'SUB', false, true, false, false ], # Subscript
205
+ [ 'SUP', false, true, false, false ], # Superscript
206
+ [ 'TABLE', true, false, false, false ], # Table
207
+ [ 'TBODY', true, false, false, false ], # Table body
208
+ [ 'TD', true, false, false, true ], # Table data cell
209
+ [ 'TEXTAREA', false, true, false, false ], # Multi-line text input
210
+ [ 'TFOOT', true, false, false, true ], # Table foot
211
+ [ 'TH', true, false, false, true ], # Table header cell
212
+ [ 'THEAD', true, false, false, true ], # Table head
213
+ [ 'TITLE', true, false, false, false ], # Document title
214
+ [ 'TR', true, false, false, true ], # Table row
215
+ [ 'TT', false, true, false, false ], # Teletype text
216
+ [ 'U', false, true, false, false ], # Underlined text
217
+ [ 'UL', true, false, false, false ], # Unordered list
218
+ [ 'VAR', false, true, false, false ], # Variable
219
+ ].each { |a| add_tag(*a) }
220
+
221
+ # EXCEPTIONS TODO
222
+ # A, LABEL can't contain itself
223
+ # several things (fonts, etc) can't be in PRE
224
+ # SELECT can only have OPTGROUP or OPTION
225
+ # TEXTAREA, OPTION only contains plain text
226
+ # APPLET and OBJECT have PARAM+ followed by block and/or inline
227
+ # BUTTON can't contain:
228
+ # A, INPUT, SELECT, TEXTAREA, LABEL, BUTTON, or IFRAME
229
+ # nor FORM, ISINDEX, and FIELDSET
230
+ # IFRAME can only contain block elems if parent can
231
+ # MAP can contain block+ *xor* AREA+
232
+ # SCRIPT only contains a SCRIPT (that is, until /<\/[A-Za-z]/)
233
+ # BODY must be in HTML or NOFRAMES
234
+ # COL can only be in COLGROUP or TABLE
235
+ # COLGROUP has only COL*, and can only be in TABLE
236
+ # DIR, MENU can only contain LI+, none of which may contain block elems
237
+ # DL must contain (DT|DD)+
238
+ # DT and DD are only allowed in DL
239
+ # FIELDSET contains LEGEND, (block|inline)*
240
+ # FRAMESET contains (FRAMESET|FRAME), plus NOFRAMES and must be in HTML
241
+ # H# can only be contained in block elems, but only contain inlines.
242
+ # HEAD must only contain TITLE, BASE?, ISINDEX?, SCRIPT* STYLE* META* LINK*
243
+ # OBJECT* HEAD must be in HTML
244
+ # HTML is top-level and can only contain HEAD, BODY, or HEAD, FRAMESET
245
+ # LI can contain blocks except when inside DIR or MENU
246
+ # LI can only be inside OL, UL, DIR, MENU
247
+ # OL, UL can only contain LI+
248
+ # OPTGROUP contains OPTION+
249
+ # P can only contain inlines. However, it is a block-level elem.
250
+ # PRE can only contain inlines except IMG, OBJECT, APPLET, BIG, SMALL, SUB,
251
+ # SUP, FONT, BASEFONT
252
+
253
+ # tags with optional omitted endtags and their allowed contents (each gets a singleton can_contain):
254
+ # the pattern is wrapped in \A...\z and matched case-insensitively, so it must match the whole tag name
255
+ {
256
+ 'AREA' => '(?!AREA)[A-Z]+',
257
+ 'COLGROUP' => 'COL',
258
+ 'DD' => '(?!D[DT]$)[A-Z]+', # NOTE(review): [A-Z]+ cannot match H1..H6, so a heading is treated as not containable here (also DT, LI, TD, TH) -- confirm
259
+ 'DT' => '(?!D[DT]$)[A-Z]+',
260
+ 'LI' => '(?!LI$)[A-Z]+',
261
+ 'MAP' => 'AREA',
262
+ 'P' => '(?!P$)[A-Z]+', # NOTE(review): accepts block tags such as DIV, so they do not implicitly close an open P -- confirm
263
+ 'TD' => '(?!T[HDR]$)[A-Z]+',
264
+ 'TFOOT' => 'TR',
265
+ 'TH' => '(?!T[HDR]$)[A-Z]+',
266
+ 'THEAD' => 'TR',
267
+ 'TR' => 'T[HD]',
268
+ }.each_pair { |tagname, pattern|
269
+ eval <<EOM
270
+ class << named(tagname) # :nodoc:
271
+ def can_contain(tag, parent)
272
+ (/\\A#{pattern}\\z/i =~ tag) == 0
273
+ end
274
+ end
275
+ EOM
276
+ }
277
+
278
+ class << named('TEXTAREA') # :nodoc:
279
+ def can_ignore_whitespace; false; end # singleton override: whitespace directly inside TEXTAREA is kept
280
+ end
281
+ class << named('PRE') # :nodoc:
282
+ def can_ignore_whitespace; false; end # singleton override; HTML::StackingParser#handle_data consults only the innermost open tag
283
+ end
284
+ class << named('OPTION') # :nodoc:
285
+ def can_ignore_whitespace; false; end # singleton override: whitespace directly inside OPTION is kept
286
+ end
287
+ end
288
+ end