htmltools 1.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,280 @@
1
+ # This is an SGMLParser subclass that knows about HTML 4.0 rules
2
+ # and can spot empty tags and deal with tags that may have omitted endtags.
3
+ #
4
+ # Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
5
+ # Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
6
+ # License:: Ruby's License
7
+ # CVS ID:: $Id: stparser.rb,v 1.4 2004/09/24 23:28:55 jhannes Exp $
8
+
9
+ require 'html/sgml-parser'
10
+ require 'html/tags'
11
+
12
module HTML
  # An SGMLParser subclass that keeps a stack of the currently open tags and
  # consults HTML::Tag to recognise empty elements and elements whose end tag
  # was omitted, synthesising the missing end-tag events.
  #
  # Subclasses receive events through the handle_* callbacks defined at the
  # bottom of this class; handle_data, finish_starttag and finish_endtag are
  # used internally and should not be overridden.
  #
  # Tag names are stored on the stack in lower case. The callbacks receive the
  # tag name exactly as it was handed to this class, except for synthesised
  # end-tag / missing-end-tag events, which carry the (lower-case) stack entry.
  class StackingParser < SGMLParser
    # accessors

    # The stack of open tag names (lower case), outermost element first.
    def stack
      @tagStack
    end

    # Name of the innermost open tag, or 'html' when the stack is empty.
    def last_tag
      @tagStack[-1] || 'html'
    end

    # Name of the tag enclosing last_tag, or 'html' when there is none.
    def parent_tag
      @tagStack[-2] || 'html'
    end

    # Enable or disable whitespace stripping of character data and comments.
    def strip_whitespace=(flag)
      @stripWhitespace = flag
    end

    # input methods

    # Open the file called +name+ and feed it to the parser in 64 KiB chunks.
    #
    # NOTE(review): nothing here flushes the parser after the last chunk; if
    # the superclass buffers trailing input, the caller may need to do so --
    # confirm against html/sgml-parser.
    def parse_file_named(name)
      File.open(name) do |f|
        while (bytes = f.read(65536))
          feed(bytes)
        end
      end
    end

    # Feed some more data to the parser.
    #
    # skip_script may put the remainder of a buffer back into @saved once it
    # has found the end of a script; that remainder is re-fed here until
    # nothing is left over.
    def feed(string)
      super
      until @saved.empty?
        pending = @saved
        @saved = ''
        super(pending)
      end
    end

    # available only to subclasses
    private

    # Debug tracing. Which variant gets defined is decided once, when this
    # file is loaded, from the value $DEBUG has at that moment.
    if $DEBUG
      # Print +stuff+ indented by the current stack depth (verbose mode only).
      def dprint(*stuff)
        # Splat the arguments: passing the array itself makes print emit its
        # inspect form on Ruby 1.9 and later.
        print(' ' * @tagStack.size, *stuff) if @verbose
      end
    else
      def dprint(*stuff); end
    end

    # Write +msg+ to $stderr, but only in verbose mode.
    # (Shadows Kernel#warn inside this class.)
    def warn(msg)
      $stderr.print(msg) if @verbose
    end

    # verbose::     passed on to the superclass; also gates warn and dprint.
    # strip_white:: initial value for the whitespace-stripping flag.
    def initialize(verbose=false, strip_white=false)
      super(verbose)
      @tagStack = []
      @saved = ''
      @stripWhitespace = strip_white
    end

    # handle_data will call this while inside a <script> element.
    #
    # Everything up to the first "</" followed by a letter is script text.
    # When that marker is found, tag recognition is switched back on and the
    # rest of the buffer is saved so that feed can parse it normally.
    def skip_script(data)
      end_tag_at = data.index(%r{</[A-Za-z]})
      if end_tag_at
        @nomoretags = false
        @saved = data[end_tag_at..-1]
        handle_script(data[0, end_tag_at]) # call user handler
      else
        # the end of the script is not in this buffer
        handle_script(data)
      end
    end

    # Unfortunately, sgml-parser calls this and there's important work to do in
    # it. So the user handler has to be named something different
    # (handle_cdata for ordinary text, handle_script for script text).
    def handle_data(data)
      if last_tag == 'script' && @nomoretags
        skip_script(data)
        return
      end

      if @stripWhitespace
        ignorable = begin
          HTML::Tag.named(last_tag).can_ignore_whitespace
        rescue NoSuchHTMLTagError
          true # unknown enclosing tag: strip anyway
        end
        # Work on a copy so the buffer passed in is not modified.
        data = data.strip if ignorable
      end

      handle_cdata(data) unless data.empty? # call user handler
    end

    # Process a start tag: close the current element first if it cannot
    # contain +tag+, push +tag+, and dispatch to handle_empty_tag,
    # handle_start_tag or handle_unknown_tag.
    def finish_starttag(tag, attrs)
      dprint "*START* #{tag} #{attrs.inspect}\n"
      # Tag-table lookups use the lower-case name so that mixed-case input is
      # recognised; the callbacks still get +tag+ as it was passed in.
      name = tag.downcase

      begin
        unless HTML::Tag.named(last_tag).can_contain(name, parent_tag)
          dprint "-INSERT-\n"
          finish_endtag(last_tag)
        end
      rescue NoSuchHTMLTagError
        # hmm.. a tag involved in the check was unknown.
        # Assume the open tag doesn't have an optional endtag.
      end

      push(tag)

      begin
        if HTML::Tag.named(name).is_empty_element
          dprint "-EMPTY-\n"
          handle_empty_tag(tag, attrs) # call user handler
          drop_to_tag(tag)
        else
          handle_start_tag(tag, attrs) # call user handler
        end

        # From here on, data is routed to skip_script until it ends.
        @nomoretags = true if name == 'script'
      rescue NoSuchHTMLTagError
        # hmm... the start tag is unknown.
        # And we pushed it.
        # If it's empty, we'll get rid of it at the next end tag.
        handle_unknown_tag(tag, attrs)
      end
    end

    # Pop the stack down to and including the innermost open +tag+.
    #
    # Each element popped on the way that is not +tag+ itself is reported via
    # handle_end_tag when its end tag is optional, or handle_missing_end_tag
    # when it is required; unknown elements are popped silently.
    #
    # Returns true if +tag+ was open, false (after calling
    # handle_extra_end_tag) if it was not.
    def drop_to_tag(tag)
      # Stack entries are lower case, so every comparison must use the
      # lower-cased name; comparing the raw tag reported the matched element
      # twice whenever the tag was not already lower case.
      target = tag.downcase
      index = @tagStack.rindex(target)

      if index.nil? # got an end tag but we haven't seen start tag?
        handle_extra_end_tag(tag) # call user handler
        return false
      end

      (@tagStack.size - index).times do
        open_tag = last_tag
        if open_tag != target
          begin
            if HTML::Tag.named(open_tag).can_omit_end_tag
              handle_end_tag(open_tag)
            else
              handle_missing_end_tag(open_tag) # call user handler
            end
          rescue NoSuchHTMLTagError
            # oops, don't recognize open_tag.
          end
        end
        pop
      end
      true
    end

    # Process an end tag; handle_end_tag is called only if +tag+ was open.
    def finish_endtag(tag)
      dprint "*END* #{tag}\n"
      if drop_to_tag(tag)
        dprint "-END- #{tag} #{@tagStack.inspect}\n"
        handle_end_tag(tag) # call user handler
      end
    end

    # Push the lower-cased +tag+ onto the stack.
    def push(tag)
      @tagStack.push(tag.downcase)
      dprint "*PUSH* #{tag} => #{@tagStack.inspect}\n"
    end

    # Remove and return the innermost open tag name.
    def pop
      tag = @tagStack.pop
      dprint "*POP* #{tag} => #{@tagStack.inspect}\n"
      tag
    end

    # Forwards to the user handler handle_unknown_character.
    def unknown_charref(name)
      handle_unknown_character(name)
    end

    # Forwards to the user handler handle_unknown_entity.
    def unknown_entityref(name)
      handle_unknown_entity(name)
    end

    # callbacks: can be overridden in subclasses

    # Called for the start tag of a known, non-empty element.
    def handle_start_tag(tag, attrs)
    end

    # Called for an end tag, whether it was present or synthesised.
    def handle_end_tag(tag)
    end

    # by default, an empty tag is handled as a start tag
    # with an inserted end tag.
    def handle_empty_tag(tag, attrs)
      handle_start_tag(tag, attrs)
      handle_end_tag(tag)
    end

    # Called for a start tag that HTML::Tag does not know.
    def handle_unknown_tag(tag, attrs)
      warn("warning: unknown tag #{tag}\n")
    end

    # Called when an element with a required end tag is closed implicitly.
    def handle_missing_end_tag(tag)
      warn("warning: missing end tag </#{tag}>\n")
    end

    # Called for an end tag that has no matching open element.
    def handle_extra_end_tag(tag)
      warn("warning: extra end tag </#{tag}>\n")
    end

    # Called with character data outside of scripts (never with an empty
    # string).
    def handle_cdata(data)
    end

    # Called with the text found inside a <script> element.
    def handle_script(data)
    end

    # Called by unknown_charref with the reference's name.
    def handle_unknown_character(name)
    end

    # Called by unknown_entityref with the reference's name.
    def handle_unknown_entity(name)
    end

    # call super if you want the data stripped
    # (the string is stripped in place, so the caller sees the result).
    def handle_comment(data)
      data.strip! if @stripWhitespace
    end

    # No-op by default; override to receive +data+.
    def handle_special(data)
    end

  end
end
231
+
232
+ # test script
233
if $0 == __FILE__
  $stdout.sync = true

  # Demonstration subclass: prints start tags, visible text (prefixed with the
  # current tag stack), comments, specials and unknown references. End tags,
  # empty tags and script text are received but not printed.
  class TestStackingParser < HTML::StackingParser
    # Print the open tags, outermost first, each followed by a slash.
    def dump_stack
      stack.each { |name| print(name, '/') }
    end

    def handle_start_tag(tag, attrs)
      print("START: #{tag} #{attrs.inspect}\n")
    end

    # End tags are deliberately silent.
    def handle_end_tag(tag)
    end

    # Empty tags are deliberately silent (this also suppresses the default
    # start/end pair).
    def handle_empty_tag(tag, attrs)
    end

    # Print non-blank text together with the stack it occurred in, skipping
    # anything directly inside a <style> element.
    def handle_cdata(data)
      return if last_tag == 'style'

      text = data.strip
      return unless text.size > 0

      dump_stack
      print(text.inspect, "\n")
    end

    # Script text is deliberately silent.
    def handle_script(data)
    end

    def handle_unknown_character(name)
      print("UNKC: #{name}\n")
    end

    def handle_unknown_entity(name)
      print("UNKE: #{name}\n")
    end

    # Let the base class process the comment first, then print it.
    def handle_comment(data)
      super
      print("COMMENT: #{data}\n")
    end

    def handle_special(data)
      print("SPECIAL: #{data}\n")
    end
  end

  $DEBUG = false
  parser = TestStackingParser.new(true)
  parser.parse_file_named(ARGV[0] || 'ebay.html')
end
@@ -0,0 +1,288 @@
1
+ # This encodes the knowledge of HTML 4.0 tags for a parser.
2
+ # It knows about block vs. inline tags, empty tags, and optionally
3
+ # omitted end tags.
4
+ #
5
+ # Copyright:: Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
6
+ # Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
7
+ # License:: Ruby's license
8
+ # CVS ID:: $Id: tags.rb,v 1.4 2004/09/24 23:28:55 jhannes Exp $
9
+
10
+ # This is an error raised by <tt>HTML::Tag.named()</tt> when a tag doesn't exist.
11
# Raised when a tag name is looked up that the tag table does not contain.
class NoSuchHTMLTagError < RuntimeError; end
13
+
14
+ # Namespace holding HTML::Tag, the base class for all the HTML tag classes, and its subclasses.
15
module HTML

  # Base class for the HTML tag descriptors and owner of the tag lookup table.
  #
  # An instance answers structural questions about one tag: whether it is a
  # block, inline or empty element, whether its end tag may be omitted, and
  # which tags it may contain. Instances are normally obtained with Tag.named.
  class Tag
    # Lookup table, keyed by lower-case tag name.
    @table = {}

    # Add the given tag to the tag lookup table, replacing any existing entry
    # of the same name, and return the new tag object.
    #
    # This can be called by user code to add otherwise unknown tags to the
    # table.
    #
    # name::      the tag name, a String (any letter case).
    # is_block::  true if I am a block element.
    # is_inline:: true if I am an inline element.
    # is_empty::  true if I am an empty element (takes precedence over the
    #             other flags; the end tag is then always omissible).
    # can_omit::  true if my end tag can be omitted.
    def self.add_tag(name, is_block, is_inline, is_empty, can_omit)
      @table[name.downcase] =
        if is_empty
          EmptyTag.new(name, true)
        elsif is_block
          (is_inline ? BlockOrInlineTag : BlockTag).new(name, can_omit)
        else
          InlineTag.new(name, can_omit)
        end
    end

    # Return the Tag with the given name, or raise a NoSuchHTMLTagError.
    # The lookup ignores letter case.
    def self.named(tagname)
      @table[tagname.to_s.downcase] || raise(NoSuchHTMLTagError, tagname)
    end

    # tag_name:: a String, the name of the tag
    # can_omit:: a Boolean, true if end tag is optional
    def initialize(tag_name, can_omit)
      @name = tag_name.downcase
      @can_omit_end = can_omit
    end

    # Return my tag name (lower case).
    def name
      @name
    end

    # Return true if my end tag can be omitted.
    def can_omit_end_tag
      @can_omit_end
    end

    # Return true if I am a block element.
    def is_block_element
      false
    end

    # Return true if I am an inline element.
    def is_inline_element
      false
    end

    # Return true if I am an empty element.
    def is_empty_element
      false
    end

    # Return true if I can contain <tt>tag</tt> if my parent is of type
    # <tt>parent</tt>. The base class contains nothing.
    # tag:: tag name, a String
    # parent:: parent tag name, a String.
    def can_contain(tag, parent)
      false
    end

    # Return true if whitespace within me can be omitted (ignoring browser
    # bugs)
    def can_ignore_whitespace
      true
    end
  end

  # This represents an HTML block element.
  class BlockTag < Tag
    def is_block_element
      true
    end

    # Blocks can contain anything, so return true.
    def can_contain(tag, parent)
      true
    end
  end

  # This represents an HTML inline element.
  class InlineTag < Tag
    def is_inline_element
      true
    end

    # Inlines can only contain other inlines.
    # Raises NoSuchHTMLTagError if <tt>tag</tt> is unknown.
    def can_contain(tag, parent)
      Tag.named(tag).is_inline_element
    end
  end

  # This represents an HTML element that can be regarded as either a block
  # or an inline element.
  class BlockOrInlineTag < InlineTag

    def is_block_element
      true
    end

    # If used as inline elements (e.g., within another inline element or a P),
    # these elements should not contain any block-level elements. Used in a
    # block context they may contain anything.
    #
    # Raises NoSuchHTMLTagError if <tt>parent</tt> is unknown, or if
    # <tt>tag</tt> is unknown and has to be examined.
    def can_contain(tag, parent)
      inline_context = parent.downcase == 'p' || Tag.named(parent).is_inline_element
      # The restriction applies only in inline context; answering false for
      # every child in block context would close e.g. OBJECT before its PARAM.
      !inline_context || !Tag.named(tag).is_block_element
    end
  end

  # This represents an HTML tag that never has an end tag.
  class EmptyTag < Tag
    def is_empty_element
      true
    end

    def is_inline_element
      true
    end

    def can_contain(tag, parent)
      false
    end
  end

  # This block initializes the tag lookup table.
  class Tag
    #                 Block  Inline Empty  can_omit_end
    [
      [ 'A',          false, true,  false, false ], # Anchor
      [ 'ABBR',       false, true,  false, false ], # Abbreviation
      [ 'ACRONYM',    false, true,  false, false ], # Acronym
      [ 'ADDRESS',    true,  false, false, false ], # Address
      [ 'APPLET',     true,  true,  false, false ], # Java applet
      [ 'AREA',       true,  false, true,  true  ], # Image map region
      [ 'B',          false, true,  false, false ], # Bold text
      [ 'BASE',       false, false, true,  true  ], # Document base URI
      [ 'BASEFONT',   false, true,  true,  true  ], # Base font change
      [ 'BDO',        false, true,  false, false ], # Bi_di override
      [ 'BIG',        false, true,  false, false ], # Large text
      [ 'BLOCKQUOTE', true,  false, false, false ], # Block quotation
      [ 'BODY',       true,  false, false, false ], # Document body
      [ 'BR',         false, true,  true,  true  ], # Line break
      [ 'BUTTON',     true,  true,  false, false ], # Button
      [ 'CAPTION',    false, true,  false, false ], # Table caption
      [ 'CENTER',     true,  false, false, false ], # Centered block (block-level, like DIV)
      [ 'CITE',       false, true,  false, false ], # Citation
      [ 'CODE',       false, true,  false, false ], # Computer code
      [ 'COL',        false, false, true,  true  ], # Table column
      [ 'COLGROUP',   true,  false, false, true  ], # Table column group
      [ 'DD',         true,  false, false, true  ], # Definition description
      [ 'DEL',        true,  true,  false, false ], # Deleted text
      [ 'DFN',        false, true,  false, false ], # Defined term
      [ 'DIR',        true,  false, false, false ], # Directory list
      [ 'DIV',        true,  false, false, false ], # Generic block-level container
      [ 'DL',         true,  false, false, false ], # Definition list
      [ 'DT',         false, true,  false, true  ], # Definition term
      [ 'EM',         false, true,  false, false ], # Emphasis
      [ 'FIELDSET',   true,  false, false, false ], # Form control group
      [ 'FONT',       false, true,  false, false ], # Font change
      [ 'FORM',       true,  false, false, false ], # Interactive form
      [ 'FRAME',      false, false, true,  true  ], # Frame
      [ 'FRAMESET',   true,  false, false, false ], # Frameset
      [ 'H1',         true,  false, false, false ], # Level-one heading
      [ 'H2',         true,  false, false, false ], # Level-two heading
      [ 'H3',         true,  false, false, false ], # Level-three heading
      [ 'H4',         true,  false, false, false ], # Level-four heading
      [ 'H5',         true,  false, false, false ], # Level-five heading
      [ 'H6',         true,  false, false, false ], # Level-six heading
      [ 'HEAD',       true,  false, false, false ], # Document head
      [ 'HR',         false, true,  true,  true  ], # Horizontal rule
      [ 'HTML',       true,  false, false, false ], # HTML document
      [ 'I',          false, true,  false, false ], # Italic text
      [ 'IFRAME',     true,  true,  false, false ], # Inline frame
      [ 'IMG',        false, true,  true,  true  ], # Inline image
      [ 'INPUT',      false, true,  true,  true  ], # Form input
      [ 'INS',        true,  true,  false, false ], # Inserted text
      [ 'ISINDEX',    false, true,  true,  true  ], # Input prompt
      [ 'KBD',        false, true,  false, false ], # Text to be input
      [ 'LABEL',      false, true,  false, false ], # Form field label
      [ 'LEGEND',     false, true,  false, false ], # Fieldset caption
      [ 'LI',         true,  false, false, true  ], # List item
      [ 'LINK',       false, false, true,  true  ], # Document relationship (empty element)
      [ 'MAP',        true,  true,  false, false ], # Image map
      [ 'MENU',       true,  false, false, false ], # Menu list
      [ 'META',       false, true,  true,  true  ], # Metadata
      [ 'NOFRAMES',   true,  false, false, false ], # Frames alternate content
      [ 'NOSCRIPT',   true,  false, false, false ], # Alternate script content
      [ 'OBJECT',     true,  true,  false, false ], # Object
      [ 'OL',         true,  false, false, false ], # Ordered list
      [ 'OPTGROUP',   true,  false, false, false ], # Option group
      [ 'OPTION',     true,  false, false, true  ], # Menu option (end tag optional)
      [ 'P',          true,  false, false, true  ], # Paragraph
      [ 'PARAM',      false, true,  true,  true  ], # Object parameter
      [ 'PRE',        true,  false, false, false ], # Preformatted text
      [ 'Q',          false, true,  false, false ], # Short quotation
      [ 'S',          false, true,  false, false ], # Strike-through text
      [ 'SAMP',       false, true,  false, false ], # Sample output
      [ 'SCRIPT',     true,  true,  false, false ], # Client-side script
      [ 'SELECT',     true,  false, false, false ], # Option selector
      [ 'SMALL',      false, true,  false, false ], # Small text
      [ 'SPAN',       false, true,  false, false ], # Generic inline container
      [ 'STRIKE',     false, true,  false, false ], # Strike-through text
      [ 'STRONG',     false, true,  false, false ], # Strong emphasis
      [ 'STYLE',      true,  false, false, false ], # Embedded style sheet
      [ 'SUB',        false, true,  false, false ], # Subscript
      [ 'SUP',        false, true,  false, false ], # Superscript
      [ 'TABLE',      true,  false, false, false ], # Table
      [ 'TBODY',      true,  false, false, true  ], # Table body (end tag optional)
      [ 'TD',         true,  false, false, true  ], # Table data cell
      [ 'TEXTAREA',   false, true,  false, false ], # Multi-line text input
      [ 'TFOOT',      true,  false, false, true  ], # Table foot
      [ 'TH',         true,  false, false, true  ], # Table header cell
      [ 'THEAD',      true,  false, false, true  ], # Table head
      [ 'TITLE',      true,  false, false, false ], # Document title
      [ 'TR',         true,  false, false, true  ], # Table row
      [ 'TT',         false, true,  false, false ], # Teletype text
      [ 'U',          false, true,  false, false ], # Underlined text
      [ 'UL',         true,  false, false, false ], # Unordered list
      [ 'VAR',        false, true,  false, false ], # Variable
    ].each { |row| add_tag(*row) }

    # EXCEPTIONS TODO
    # A, LABEL can't contain itself
    # several things (fonts, etc) can't be in PRE
    # SELECT can only have OPTGROUP or OPTION
    # TEXTAREA, OPTION only contains plain text
    # APPLET and OBJECT has PARAM+ followed by block and/or inline
    # BUTTON can't contain:
    #   A, INPUT, SELECT, TEXTAREA, LABEL, BUTTON, or IFRAME
    #   nor FORM, ISINDEX, and FIELDSET
    # IFRAME can only contain block elems if parent can
    # MAP can contain block+ *xor* AREA+
    # SCRIPT only contains a SCRIPT (that is, until /<\/[A-Za-z]/)
    # BODY must be in HTML or NOFRAMES
    # COL can only be in COLGROUP or TABLE
    # COLGROUP has only COL*, and can only be in TABLE
    # DIR, MENU can only contain LI+, none of which may contain block elems
    # DL must contain (DT|DD)+
    # DT and DD are only allowed in DL
    # FIELDSET contains LEGEND, (block|inline)*
    # FRAMESET contains (FRAMESET|FRAME), plus NOFRAMES and must be in HTML
    # H# can only be contained in block elems, but only contain inlines.
    # HEAD must only contain TITLE, BASE?, ISINDEX?, SCRIPT* STYLE* META* LINK*
    #   OBJECT* HEAD must be in HTML
    # HTML is top-level and can only contain HEAD, BODY, or HEAD, FRAMESET
    # LI can contain blocks except when inside DIR or MENU
    # LI can only be inside OL, UL, DIR, MENU
    # OL, UL can only contain LI+
    # OPTGROUP contains OPTION+
    # P can only contain inlines. However, it is a block-level elem.
    # PRE can only contain inlines except IMG, OBJECT, APPLET, BIG, SMALL, SUB,
    #   SUP, FONT, BASEFONT

    # tags with optional omitted endtags and their allowed contents:
    # each pattern is anchored at beginning and end and matched ignoring case.
    #
    # The patterns for DD, LI, TD, TH and OPTION accept digits after the first
    # letter so that H1..H6 are allowed as children. P and DT hold only inline
    # content, so they keep the letters-only class, which also rejects headings.
    {
      'AREA'     => '(?!AREA)[A-Z]+',
      'COLGROUP' => 'COL',
      'DD'       => '(?!D[DT]$)[A-Z][A-Z0-9]*',
      'DT'       => '(?!D[DT]$)[A-Z]+',
      'LI'       => '(?!LI$)[A-Z][A-Z0-9]*',
      'MAP'      => 'AREA',
      'OPTION'   => '(?!OPT(?:ION|GROUP)$)[A-Z][A-Z0-9]*',
      'P'        => '(?!P$)[A-Z]+',
      'TBODY'    => 'TR',
      'TD'       => '(?!T[HDR]$)[A-Z][A-Z0-9]*',
      'TFOOT'    => 'TR',
      'TH'       => '(?!T[HDR]$)[A-Z][A-Z0-9]*',
      'THEAD'    => 'TR',
      'TR'       => 'T[HD]',
    }.each_pair do |tagname, pattern|
      allowed = /\A#{pattern}\z/i
      # Override can_contain on this one tag object only.
      singleton = class << named(tagname); self; end
      singleton.send(:define_method, :can_contain) do |tag, _parent|
        !(allowed =~ tag).nil?
      end
    end

    # Whitespace is significant inside these elements.
    %w[TEXTAREA PRE OPTION].each do |tagname|
      singleton = class << named(tagname); self; end
      singleton.send(:define_method, :can_ignore_whitespace) { false }
    end
  end
end