ruby-web 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +474 -0
- data/INSTALL.txt +9 -0
- data/InstalledFiles +180 -0
- data/LICENSE.txt +74 -0
- data/Rakefile +529 -0
- data/TODO +65 -0
- data/doc/additional.xml +149 -0
- data/doc/core.xml +652 -0
- data/doc/credits/index.xml +52 -0
- data/doc/credits/php.contributors.xml +118 -0
- data/doc/credits/php.language-snippets.ent +622 -0
- data/doc/install/index.xml +136 -0
- data/doc/install/mac/index.xml +21 -0
- data/doc/install/ruby-web.install.rb.instructions.xml +7 -0
- data/doc/install/unix/index.xml +46 -0
- data/doc/install/win/apache1.xml +166 -0
- data/doc/install/win/apache2.xml +141 -0
- data/doc/install/win/iis.xml +162 -0
- data/doc/install/win/index.xml +24 -0
- data/doc/install/win/installer.xml +31 -0
- data/doc/install/win/manual.xml +43 -0
- data/doc/manual.xml +69 -0
- data/doc/old/apache_cgi.txt +23 -0
- data/doc/old/fastcgi.txt +23 -0
- data/doc/old/mod_ruby.txt +21 -0
- data/doc/old/snippets.rdoc +183 -0
- data/doc/old/webrick.txt +23 -0
- data/doc/old/windows_cgi.txt +9 -0
- data/doc/tutorial.xml +14 -0
- data/doc/xsl/manual-multi.xsl +10 -0
- data/doc/xsl/manual-pdf.xsl +6 -0
- data/doc/xsl/manual-single.xsl +6 -0
- data/doc/xsl/manual.css +22 -0
- data/install.rb +1022 -0
- data/lib/formatter.rb +314 -0
- data/lib/html-parser.rb +429 -0
- data/lib/htmlrepair.rb +113 -0
- data/lib/htmlsplit.rb +842 -0
- data/lib/sgml-parser.rb +332 -0
- data/lib/web.rb +68 -0
- data/lib/web/assertinclude.rb +129 -0
- data/lib/web/config.rb +50 -0
- data/lib/web/connection.rb +1070 -0
- data/lib/web/convenience.rb +154 -0
- data/lib/web/formreader.rb +318 -0
- data/lib/web/htmlparser/html-parser.rb +429 -0
- data/lib/web/htmlparser/sgml-parser.rb +332 -0
- data/lib/web/htmltools/element.rb +296 -0
- data/lib/web/htmltools/stparser.rb +276 -0
- data/lib/web/htmltools/tags.rb +286 -0
- data/lib/web/htmltools/tree.rb +139 -0
- data/lib/web/htmltools/xmltree.rb +160 -0
- data/lib/web/htmltools/xpath.rb +71 -0
- data/lib/web/info.rb +63 -0
- data/lib/web/load.rb +210 -0
- data/lib/web/mime.rb +87 -0
- data/lib/web/phprb.rb +340 -0
- data/lib/web/resources/test/cookie.rb +33 -0
- data/lib/web/resources/test/counter.rb +20 -0
- data/lib/web/resources/test/multipart.rb +14 -0
- data/lib/web/resources/test/redirect.rb +8 -0
- data/lib/web/resources/test/stock.rb +33 -0
- data/lib/web/sapi/apache.rb +129 -0
- data/lib/web/sapi/fastcgi.rb +22 -0
- data/lib/web/sapi/install/apache.rb +180 -0
- data/lib/web/sapi/install/iis.rb +93 -0
- data/lib/web/sapi/install/macosx.rb +90 -0
- data/lib/web/sapi/webrick.rb +86 -0
- data/lib/web/session.rb +83 -0
- data/lib/web/shim/cgi.rb +129 -0
- data/lib/web/shim/rails.rb +175 -0
- data/lib/web/stringio.rb +78 -0
- data/lib/web/strscanparser.rb +24 -0
- data/lib/web/tagparser.rb +96 -0
- data/lib/web/testing.rb +666 -0
- data/lib/web/traceoutput.rb +75 -0
- data/lib/web/unit.rb +56 -0
- data/lib/web/upload.rb +59 -0
- data/lib/web/validate.rb +52 -0
- data/lib/web/wiki.rb +557 -0
- data/lib/web/wiki/linker.rb +72 -0
- data/lib/web/wiki/page.rb +201 -0
- data/lib/webunit.rb +27 -0
- data/lib/webunit/assert.rb +152 -0
- data/lib/webunit/converter.rb +154 -0
- data/lib/webunit/cookie.rb +118 -0
- data/lib/webunit/domwalker.rb +185 -0
- data/lib/webunit/exception.rb +14 -0
- data/lib/webunit/form.rb +116 -0
- data/lib/webunit/frame.rb +37 -0
- data/lib/webunit/htmlelem.rb +122 -0
- data/lib/webunit/image.rb +26 -0
- data/lib/webunit/jscript.rb +31 -0
- data/lib/webunit/link.rb +33 -0
- data/lib/webunit/params.rb +321 -0
- data/lib/webunit/parser.rb +229 -0
- data/lib/webunit/response.rb +464 -0
- data/lib/webunit/runtest.rb +41 -0
- data/lib/webunit/table.rb +148 -0
- data/lib/webunit/testcase.rb +45 -0
- data/lib/webunit/ui/cui/testrunner.rb +50 -0
- data/lib/webunit/utils.rb +68 -0
- data/lib/webunit/webunit.rb +28 -0
- data/test/dev/action.rb +83 -0
- data/test/dev/forms.rb +104 -0
- data/test/dev/forms2.rb +104 -0
- data/test/dev/parser.rb +17 -0
- data/test/dev/scripts/dump.rb +24 -0
- data/test/dev/scripts/makedist.rb +62 -0
- data/test/dev/scripts/uri.rb +41 -0
- data/test/dev/scripts/uri/common.rb +432 -0
- data/test/dev/scripts/uri/ftp.rb +149 -0
- data/test/dev/scripts/uri/generic.rb +1106 -0
- data/test/dev/scripts/uri/http.rb +76 -0
- data/test/dev/scripts/uri/https.rb +26 -0
- data/test/dev/scripts/uri/ldap.rb +238 -0
- data/test/dev/scripts/uri/mailto.rb +260 -0
- data/test/dev/scripts/urireg.rb +174 -0
- data/test/dev/simpledispatcher.rb +156 -0
- data/test/dev/test.action.rb +146 -0
- data/test/dev/test.formreader.rb +463 -0
- data/test/dev/test.simpledispatcher.rb +186 -0
- data/test/dev/webunit/conv/digit-0.rb +21 -0
- data/test/dev/webunit/conv/digit-1.rb +17 -0
- data/test/dev/webunit/conv/digit.rb +23 -0
- data/test/dev/webunit/conv/test_digit-0.rb +16 -0
- data/test/dev/webunit/conv/test_digit-1.rb +19 -0
- data/test/dev/webunit/conv/test_digit.rb +26 -0
- data/test/dev/webunit/conv/test_digit_view-0.rb +76 -0
- data/test/dev/webunit/conv/test_digit_view-1.rb +102 -0
- data/test/dev/webunit/conv/test_digit_view.rb +134 -0
- data/test/installation/htdocs/cgi_test.rb +296 -0
- data/test/installation/htdocs/test_install.rb +4 -0
- data/test/installation/runwebtest.rb +5 -0
- data/test/installation/test_cookie.rb +128 -0
- data/test/installation/test_form.rb +47 -0
- data/test/installation/test_multipart.rb +51 -0
- data/test/installation/test_request.rb +24 -0
- data/test/installation/test_response.rb +35 -0
- data/test/unit/htdocs/cookie.rb +32 -0
- data/test/unit/htdocs/multipart.rb +28 -0
- data/test/unit/htdocs/redirect.rb +12 -0
- data/test/unit/htdocs/simple.rb +13 -0
- data/test/unit/htdocs/stock.rb +33 -0
- data/test/unit/test_assert.rb +162 -0
- data/test/unit/test_cookie.rb +114 -0
- data/test/unit/test_domwalker.rb +77 -0
- data/test/unit/test_form.rb +42 -0
- data/test/unit/test_frame.rb +40 -0
- data/test/unit/test_htmlelem.rb +74 -0
- data/test/unit/test_image.rb +45 -0
- data/test/unit/test_jscript.rb +57 -0
- data/test/unit/test_link.rb +85 -0
- data/test/unit/test_multipart.rb +51 -0
- data/test/unit/test_params.rb +210 -0
- data/test/unit/test_parser.rb +53 -0
- data/test/unit/test_response.rb +150 -0
- data/test/unit/test_table.rb +70 -0
- data/test/unit/test_utils.rb +106 -0
- data/test/unit/test_webunit.rb +28 -0
- data/test/web/mod_ruby_stub.rb +39 -0
- data/test/web/test.assertinclude.rb +109 -0
- data/test/web/test.buffer.rb +182 -0
- data/test/web/test.code.loader.rb +78 -0
- data/test/web/test.config.rb +31 -0
- data/test/web/test.error.handling.rb +91 -0
- data/test/web/test.formreader-2.0.rb +352 -0
- data/test/web/test.load.rb +125 -0
- data/test/web/test.mime-type.rb +23 -0
- data/test/web/test.narf.cgi.rb +106 -0
- data/test/web/test.phprb.rb +239 -0
- data/test/web/test.request.rb +368 -0
- data/test/web/test.response.rb +637 -0
- data/test/web/test.ruby-web.rb +10 -0
- data/test/web/test.session.rb +50 -0
- data/test/web/test.shim.cgi.rb +96 -0
- data/test/web/test.tagparser.rb +65 -0
- data/test/web/test.template2.rb +297 -0
- data/test/web/test.testing2.rb +318 -0
- data/test/web/test.upload.rb +45 -0
- data/test/web/test.validate.rb +46 -0
- data/test/web/test.web.test.rb +495 -0
- data/test/wiki/test.history.rb +297 -0
- data/test/wiki/test.illustration_page.rb +287 -0
- data/test/wiki/test.linker.rb +197 -0
- data/test/wiki/test.tarpit.rb +56 -0
- data/test/wiki/test.wiki.rb +300 -0
- data/test/wikitestroot/admin.rb +7 -0
- data/test/wikitestroot/wiki.rb +6 -0
- metadata +234 -0
@@ -0,0 +1,286 @@
|
|
1
|
+
# This encodes the knowledge of HTML 4.0 tags for a parser.
|
2
|
+
# It knows about block vs. inline tags, empty tags, and optionally
|
3
|
+
# omitted end tags.
|
4
|
+
#
|
5
|
+
# Copyright:: Copyright(C) 2002 Ned Konz <ned@bike-nomad.com>
|
6
|
+
# License:: Ruby's license
|
7
|
+
# CVS ID:: $Id: tags.rb,v 1.7 2002/06/04 01:55:59 ned Exp $
|
8
|
+
|
9
|
+
# This is an error raised by <tt>HTML::Tag.named()</tt> when a tag doesn't exist.
|
10
|
+
# Raised when a tag name is looked up that is not present in the tag table.
# The exception message is the tag name that was requested.
class NoSuchHTMLTagError < RuntimeError #:nodoc:
end
|
12
|
+
|
13
|
+
# Namespace for the HTML tag classes; HTML::Tag is the base class of them all.
|
14
|
+
module HTML #:nodoc: all

  # Describes one HTML tag: its (lowercased) name, whether its end tag may
  # be omitted, and a set of predicates that subclasses override.
  class Tag

    # tag_name:: a String, the name of the tag (stored lowercased)
    # can_omit:: a Boolean, true if end tag is optional
    def initialize(tag_name, can_omit)
      @name = tag_name.downcase
      @can_omit_end = can_omit
    end

    # Return my tag name.
    def name; @name; end

    # Return true if my end tag can be omitted.
    def can_omit_end_tag; @can_omit_end; end

    # Return true if I am a block element.
    def is_block_element; false; end

    # Return true if I am an inline element.
    def is_inline_element; false; end

    # Return true if I am an empty element.
    def is_empty_element; false; end

    # Return true if I can contain <tt>tag</tt> if my parent is of type
    # <tt>parent</tt>.
    # tag:: tag name, a String
    # parent:: parent tag name, a String.
    def can_contain(tag, parent); false; end

    # Return true if whitespace within me can be omitted (ignoring browser
    # bugs)
    def can_ignore_whitespace; true; end
  end

  # This represents an HTML block element.
  class BlockTag < Tag
    def is_block_element; true; end

    # Blocks can contain anything, so return true.
    def can_contain(tag, parent); true; end
  end

  # This represents an HTML inline element.
  class InlineTag < Tag
    def is_inline_element; true; end

    # Inlines can only contain other inlines.
    # Raises NoSuchHTMLTagError if <tt>tag</tt> is not in the tag table.
    def can_contain(tag, parent)
      Tag.named(tag).is_inline_element
    end
  end

  # This represents an HTML element that can be regarded as either a block
  # or an inline element.
  class BlockOrInlineTag < InlineTag

    def is_block_element; true; end

    # If used as inline elements (e.g., within another inline element or a P),
    # these elements should not contain any block-level elements.
    #
    # Raises NoSuchHTMLTagError if <tt>tag</tt> or <tt>parent</tt> is unknown.
    #
    # NOTE(review): when the parent is a block element other than P this
    # returns false for *every* tag, which does not match the comment above
    # (one would expect "anything goes" in a block context). Behaviour is
    # left as it was; confirm against the caller before changing it.
    def can_contain(tag, parent)
      used_inline = parent.downcase == 'p' || Tag.named(parent).is_inline_element
      used_inline && !Tag.named(tag).is_block_element
    end
  end

  # This represents an HTML tag that never has an end tag.
  class EmptyTag < Tag
    def is_empty_element; true; end
    def is_inline_element; true; end
    def can_contain(tag, parent); false; end
  end

  # This block initializes the tag lookup table.
  class Tag
    # Maps tag names (both the all-uppercase and all-lowercase spelling) to
    # the shared Tag instance describing them.
    @table = {}

    # Add the given tag to the tag lookup table.
    #
    # This can be called by user code to add otherwise unknown tags to the
    # table.
    #
    # name:: the tag name, a String.
    # is_block:: true if I am a block element.
    # is_inline:: true if I am an inline element.
    # is_empty:: true if I am an empty element.
    # can_omit:: true if my end tag can be omitted.
    #
    # Returns the newly registered Tag. <tt>is_empty</tt> wins over the
    # other flags; an empty tag is always registered with an omittable end.
    def Tag.add_tag(name, is_block, is_inline, is_empty, can_omit)
      tag =
        if is_empty
          EmptyTag.new(name, true)
        elsif is_block && is_inline
          BlockOrInlineTag.new(name, can_omit)
        elsif is_block
          BlockTag.new(name, can_omit)
        else
          InlineTag.new(name, can_omit)
        end
      @table[name.upcase] = @table[name.downcase] = tag
    end

    # Return a Tag with the given name, or raise a
    # NoSuchHTMLTagError.
    #
    # The lookup is case-insensitive: the table only stores the upper- and
    # lowercase spellings, so any other spelling (e.g. 'Div') is retried
    # lowercased before giving up.
    def Tag.named(tagname)
      @table[tagname] || @table[tagname.to_s.downcase] ||
        raise(NoSuchHTMLTagError.exception(tagname))
    end

    #                 Block  Inline Empty  can_omit_end
    [
      [ 'A',          false, true,  false, false ], # Anchor
      [ 'ABBR',       false, true,  false, false ], # Abbreviation
      [ 'ACRONYM',    false, true,  false, false ], # Acronym
      [ 'ADDRESS',    true,  false, false, false ], # Address
      [ 'APPLET',     true,  true,  false, false ], # Java applet
      [ 'AREA',       true,  false, true,  true  ], # Image map region
      [ 'B',          false, true,  false, false ], # Bold text
      [ 'BASE',       false, false, true,  true  ], # Document base URI
      [ 'BASEFONT',   false, true,  true,  true  ], # Base font change
      [ 'BDO',        false, true,  false, false ], # Bi_di override
      [ 'BIG',        false, true,  false, false ], # Large text
      [ 'BLOCKQUOTE', true,  false, false, false ], # Block quotation
      [ 'BODY',       true,  false, false, false ], # Document body
      [ 'BR',         false, true,  true,  true  ], # Line break
      [ 'BUTTON',     true,  true,  false, false ], # Button
      [ 'CAPTION',    false, true,  false, false ], # Table caption
      [ 'CENTER',     false, true,  false, false ], # Centered block
      [ 'CITE',       false, true,  false, false ], # Citation
      [ 'CODE',       false, true,  false, false ], # Computer code
      [ 'COL',        false, false, true,  true  ], # Table column
      [ 'COLGROUP',   true,  false, false, true  ], # Table column group
      [ 'DD',         true,  false, false, true  ], # Definition description
      [ 'DEL',        true,  true,  false, false ], # Deleted text
      [ 'DFN',        false, true,  false, false ], # Defined term
      [ 'DIR',        true,  false, false, false ], # Directory list
      [ 'DIV',        true,  false, false, false ], # Generic block-level container
      [ 'DL',         true,  false, false, false ], # Definition list
      [ 'DT',         false, true,  false, true  ], # Definition term
      [ 'EM',         false, true,  false, false ], # Emphasis
      [ 'FIELDSET',   true,  false, false, false ], # Form control group
      [ 'FONT',       false, true,  false, false ], # Font change
      [ 'FORM',       true,  false, false, false ], # Interactive form
      [ 'FRAME',      false, false, true,  true  ], # Frame
      [ 'FRAMESET',   true,  false, false, false ], # Frameset
      [ 'H1',         true,  false, false, false ], # Level-one heading
      [ 'H2',         true,  false, false, false ], # Level-two heading
      [ 'H3',         true,  false, false, false ], # Level-three heading
      [ 'H4',         true,  false, false, false ], # Level-four heading
      [ 'H5',         true,  false, false, false ], # Level-five heading
      [ 'H6',         true,  false, false, false ], # Level-six heading
      [ 'HEAD',       true,  false, false, false ], # Document head
      [ 'HR',         false, true,  true,  true  ], # Horizontal rule
      [ 'HTML',       true,  false, false, false ], # HTML document
      [ 'I',          false, true,  false, false ], # Italic text
      [ 'IFRAME',     true,  true,  false, false ], # Inline frame
      [ 'IMG',        false, true,  true,  true  ], # Inline image
      [ 'INPUT',      false, true,  true,  true  ], # Form input
      [ 'INS',        true,  true,  false, false ], # Inserted text
      [ 'ISINDEX',    false, true,  true,  true  ], # Input prompt
      [ 'KBD',        false, true,  false, false ], # Text to be input
      [ 'LABEL',      false, true,  false, false ], # Form field label
      [ 'LEGEND',     false, true,  false, false ], # Fieldset caption
      [ 'LI',         true,  false, false, true  ], # List item
      # LINK is declared EMPTY in HTML 4.01 (end tag forbidden), like BASE.
      [ 'LINK',       false, false, true,  true  ], # Document relationship
      [ 'MAP',        true,  true,  false, false ], # Image map
      [ 'MENU',       true,  false, false, false ], # Menu list
      [ 'META',       false, true,  true,  true  ], # Metadata
      [ 'NOFRAMES',   true,  false, false, false ], # Frames alternate content
      [ 'NOSCRIPT',   true,  false, false, false ], # Alternate script content
      [ 'OBJECT',     true,  true,  false, false ], # Object
      [ 'OL',         true,  false, false, false ], # Ordered list
      [ 'OPTGROUP',   true,  false, false, false ], # Option group
      [ 'OPTION',     true,  false, false, false ], # Menu option
      [ 'P',          true,  false, false, true  ], # Paragraph
      [ 'PARAM',      false, true,  true,  true  ], # Object parameter
      [ 'PRE',        true,  false, false, false ], # Preformatted text
      [ 'Q',          false, true,  false, false ], # Short quotation
      [ 'S',          false, true,  false, false ], # Strike-through text
      [ 'SAMP',       false, true,  false, false ], # Sample output
      [ 'SCRIPT',     true,  true,  false, false ], # Client-side script
      [ 'SELECT',     true,  false, false, false ], # Option selector
      [ 'SMALL',      false, true,  false, false ], # Small text
      [ 'SPAN',       false, true,  false, false ], # Generic inline container
      [ 'STRIKE',     false, true,  false, false ], # Strike-through text
      [ 'STRONG',     false, true,  false, false ], # Strong emphasis
      [ 'STYLE',      true,  false, false, false ], # Embedded style sheet
      [ 'SUB',        false, true,  false, false ], # Subscript
      [ 'SUP',        false, true,  false, false ], # Superscript
      [ 'TABLE',      true,  false, false, false ], # Table
      [ 'TBODY',      true,  false, false, false ], # Table body
      [ 'TD',         true,  false, false, true  ], # Table data cell
      [ 'TEXTAREA',   false, true,  false, false ], # Multi-line text input
      [ 'TFOOT',      true,  false, false, true  ], # Table foot
      [ 'TH',         true,  false, false, true  ], # Table header cell
      [ 'THEAD',      true,  false, false, true  ], # Table head
      [ 'TITLE',      true,  false, false, false ], # Document title
      [ 'TR',         true,  false, false, true  ], # Table row
      [ 'TT',         false, true,  false, false ], # Teletype text
      [ 'U',          false, true,  false, false ], # Underlined text
      [ 'UL',         true,  false, false, false ], # Unordered list
      [ 'VAR',        false, true,  false, false ], # Variable
    ].each { |a| add_tag(*a) }

    # EXCEPTIONS TODO
    # A, LABEL can't contain itself
    # several things (fonts, etc) can't be in PRE
    # SELECT can only have OPTGROUP or OPTION
    # TEXTAREA, OPTION only contains plain text
    # APPLET and OBJECT has PARAM+ followed by block and/or inline
    # BUTTON can't contain:
    #   A, INPUT, SELECT, TEXTAREA, LABEL, BUTTON, or IFRAME
    #   nor FORM, ISINDEX, and FIELDSET
    # IFRAME can only contain block elems if parent can
    # MAP can contain block+ *xor* AREA+
    # SCRIPT only contains a SCRIPT (that is, until /<\/[A-Za-z]/)
    # BODY must be in HTML or NOFRAMES
    # COL can only be in COLGROUP or TABLE
    # COLGROUP has only COL*, and can only be in TABLE
    # DIR, MENU can only contain LI+, none of which may contain block elems
    # DL must contain (DT|DD)+
    # DT and DD are only allowed in DL
    # FIELDSET contains LEGEND, (block|inline)*
    # FRAMESET contains (FRAMESET|FRAME), plus NOFRAMES and must be in HTML
    # H# can only be contained in block elems, but only contain inlines.
    # HEAD must only contain TITLE, BASE?, ISINDEX?, SCRIPT* STYLE* META* LINK*
    #   OBJECT* HEAD must be in HTML
    # HTML is top-level and can only contain HEAD, BODY, or HEAD, FRAMESET
    # LI can contain blocks except when inside DIR or MENU
    # LI can only be inside OL, UL, DIR, MENU
    # OL, UL can only contain LI+
    # OPTGROUP contains OPTION+
    # P can only contain inlines. However, it is a block-level elem.
    # PRE can only contain inlines except IMG, OBJECT, APPLET, BIG, SMALL, SUB,
    #   SUP, FONT, BASEFONT

    # tags with optional omitted endtags and their allowed contents:
    # anchor matches at beginning and end
    #
    # Each listed tag gets a per-object can_contain that answers true only
    # when the candidate tag name matches the pattern (case-insensitively,
    # anchored at both ends). The regexp is compiled once and captured by the
    # method body, so no code is built from strings.
    {
      'AREA'     => '(?!AREA)[A-Z]+',
      'COLGROUP' => 'COL',
      'DD'       => '(?!D[DT]$)[A-Z]+',
      'DT'       => '(?!D[DT]$)[A-Z]+',
      'MAP'      => 'AREA',
      'P'        => '(?!P$)[A-Z]+',
      'TD'       => '(?!T[HDR]$)[A-Z]+',
      'TFOOT'    => 'TR',
      'TH'       => '(?!T[HDR]$)[A-Z]+',
      'THEAD'    => 'TR',
      'TR'       => 'T[HD]',
    }.each_pair { |tagname, pattern|
      allowed = /\A#{pattern}\z/i
      singleton = class << named(tagname); self; end
      singleton.send(:define_method, :can_contain) { |tag, parent|
        (allowed =~ tag) == 0
      }
    }

    # Whitespace is significant inside these elements.
    %w[TEXTAREA PRE OPTION].each { |tagname|
      class << named(tagname) # :nodoc:
        def can_ignore_whitespace; false; end
      end
    }
  end
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# This is an HTML parser that builds an element tree for further
|
3
|
+
# processing. Attributes and data are also stored.
|
4
|
+
#
|
5
|
+
# Typical usage is:
|
6
|
+
# parser = HTMLTree::Parser.new(false, false)
|
7
|
+
# parser.parse_file_named('whatever.html')
|
8
|
+
# # then you have the tree built..
|
9
|
+
# parser.tree.dump
|
10
|
+
#
|
11
|
+
# Copyright:: Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
12
|
+
# License:: Ruby's
|
13
|
+
# CVS ID:: $Id: tree.rb,v 1.14 2002/06/04 01:55:59 ned Exp $
|
14
|
+
|
15
|
+
require 'web/htmltools/tags'
|
16
|
+
require 'web/htmltools/stparser'
|
17
|
+
require 'web/htmltools/element'
|
18
|
+
|
19
|
+
# This is a tree building HTML parser.
|
20
|
+
module HTMLTree #:nodoc: all
  # Tree-building parser: each callback received from HTML::StackingParser
  # is turned into a node attached to the node currently being filled.
  class Parser < HTML::StackingParser

    # verbose:: if true, will warn to $stderr on unknown
    #   tags/entities/characters, as well as missing end tags and extra end
    #   tags.
    # strip_white:: if true, remove all non-essential whitespace. Note
    #   that there are browser bugs that may cause this to change the
    #   appearance of HTML (even though it shouldn't by the standard).
    #
    # Both arguments are forwarded unchanged to the superclass.
    def initialize(verbose=false, strip_white=true)
      super
      reset
    end

    # Reset this parser so that it can parse a new document.
    # A fresh Document becomes both the root and the current node.
    def reset
      super
      @rootNode = @currentNode = Document.new
    end

    # Return the tree that was built. This will be an HTMLTree::Element that
    # represents the whole document. The \<html> node is a child of this.
    def tree
      @rootNode
    end

    # Return the <html> node, if any.
    def html
      @rootNode.html_node()
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create an Element for +tag+ under the current node and copy +attrs+
    # onto it. Each entry of +attrs+ is splatted into add_attribute.
    # Returns the new element; the current node is not changed.
    def add_child_to_current(tag, attrs)
      element = Element.new(@currentNode, tag)
      attrs.each { |attr| element.add_attribute(*attr) }
      element
    end

    # callbacks

    # add a child to the current node and descend
    def handle_start_tag(tag, attrs)
      element = add_child_to_current(tag, attrs)
      # Only matters if no root exists yet; reset normally installs one.
      @rootNode = element unless @rootNode
      @currentNode = element
    end

    # go up to parent
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # add a child to the current node
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Add a child to the current node and descend
    # Assume that the unknown tag has an end tag.
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # go up to parent
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # ignore
    def handle_extra_end_tag(tag)
      super
    end

    # Store character data as a Data node under the current node.
    def handle_cdata(data)
      Data.new(@currentNode, data)
    end

    # Script bodies are stored as plain Data nodes, like character data.
    def handle_script(data)
      Data.new(@currentNode, data)
    end

    def handle_unknown_character(name)
      super
    end

    def handle_unknown_entity(name)
      super
    end

    # Store a comment under the current node. The superclass hook runs
    # first (original note: "make sure and strip whitespace").
    def handle_comment(data)
      super
      Comment.new(@currentNode, data)
    end

    # Store a special (markup declaration) node under the current node and
    # warn on $stderr when there is no current node to attach it to.
    def handle_special(data)
      special = HTMLTree::Special.new(@currentNode, data)
      $stderr.puts("special #{special} discarded") unless @currentNode
    end

  end
end
|
126
|
+
|
127
|
+
# Manual smoke test, run only when this file is executed directly:
# parse the file named on the command line (default 'ebay.html'), write the
# resulting tree to 'xx.html' and dump it.
if $0 == __FILE__
  $stdout.sync = true
  $DEBUG = false

  class TestStackingParser < HTMLTree::Parser #:nodoc: all
  end

  parser = TestStackingParser.new(true, false)
  parser.parse_file_named(ARGV[0] || 'ebay.html')
  File.open('xx.html', 'w') { |of|
    parser.tree.write(of)
  }
  parser.tree.dump
end
|
@@ -0,0 +1,160 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# This is an HTML parser that builds an element tree for further
|
3
|
+
# processing. Attributes and data are also stored.
|
4
|
+
# The storage is that of REXML, which is required.
|
5
|
+
#
|
6
|
+
# Typical usage is:
|
7
|
+
# parser = HTMLTree::XMLParser.new(false, false)
|
8
|
+
# parser.parse_file_named('whatever.html')
|
9
|
+
# # then you have the tree built..
|
10
|
+
# parser.tree # is a REXML::Document
|
11
|
+
#
|
12
|
+
# Copyright:: Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
13
|
+
# License:: Ruby's
|
14
|
+
# CVS ID:: $Id: tree.rb,v 1.14 2002/06/04 01:55:59 ned Exp $
|
15
|
+
|
16
|
+
require 'web/htmltools/tags'
|
17
|
+
require 'web/htmltools/stparser'
|
18
|
+
require 'rexml/element'
|
19
|
+
require 'rexml/document'
|
20
|
+
|
21
|
+
# REXML::Child
|
22
|
+
# REXML::XMLDecl
|
23
|
+
# REXML::Instruction
|
24
|
+
# REXML::Text
|
25
|
+
# REXML::Comment
|
26
|
+
# REXML::Entity
|
27
|
+
# REXML::Parent
|
28
|
+
# REXML::Element (+REXML::Namespace)
|
29
|
+
# REXML::Document
|
30
|
+
# REXML::DocType
|
31
|
+
#
|
32
|
+
# This is a tree building HTML parser that makes XML.
|
33
|
+
module HTMLTree #:nodoc: all
  # Tree-building parser that stores what it parses as REXML nodes: each
  # callback received from HTML::StackingParser adds a REXML object under
  # the node currently being filled.
  class XMLParser < HTML::StackingParser

    # verbose:: if true, will warn to $stderr on unknown
    #   tags/entities/characters, as well as missing end tags and extra end
    #   tags.
    # strip_white:: if true, remove all non-essential whitespace. Note
    #   that there are browser bugs that may cause this to change the
    #   appearance of HTML (even though it shouldn't by the standard).
    #
    # Both arguments are forwarded unchanged to the superclass.
    def initialize(verbose=false, strip_white=true)
      super
      reset
    end

    # Reset this parser so that it can parse a new document.
    # A fresh REXML::Document becomes both the root and the current node.
    def reset
      super
      @rootNode = @currentNode = REXML::Document.new()
    end

    # Return the document that was built. This will be an
    # REXML::Document that represents the whole document. The \<html>
    # node is a child of this.
    def document
      @rootNode
    end

    # Same as #document.
    def tree
      document()
    end

    # Return the root of the document, if any.
    def root
      @rootNode.root()
    end

    # Return the <html> node, if any.
    #
    # The lookup is made on the document itself: the <html> element is a
    # child of the document, not of the root element, and the document may
    # have no root element at all.
    # NOTE(review): the name is matched as 'html' exactly; this assumes the
    # parser hands tag names over in lowercase -- confirm against stparser.
    def html
      @rootNode.elements['html']
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create a REXML::Element for +tag+ under the current node and copy
    # +attrs+ onto it; each entry is read as [name, value].
    # Returns the new element; the current node is not changed.
    def add_child_to_current(tag, attrs)
      element = REXML::Element.new(tag, @currentNode)
      attrs.each { |attr| element.attributes[attr[0]] = attr[1] }
      element
    end

    # callbacks

    # add a child to the current node and descend
    def handle_start_tag(tag, attrs)
      element = add_child_to_current(tag, attrs)
      # Only matters if no root exists yet; reset normally installs one.
      @rootNode = element unless @rootNode
      @currentNode = element
    end

    # go up to parent
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # add a child to the current node
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Add a child to the current node and descend
    # Assume that the unknown tag has an end tag.
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # go up to parent
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # ignore
    def handle_extra_end_tag(tag)
      super
    end

    # Store character data as a REXML::Text under the current node. The
    # second argument (REXML's respect-whitespace flag) is the inverse of
    # @stripWhitespace.
    def handle_cdata(data)
      REXML::Text.new(data, !@stripWhitespace, @currentNode)
    end

    # Script bodies are stored as REXML::Comment nodes.
    def handle_script(data)
      REXML::Comment.new(data, @currentNode)
    end

    def handle_unknown_character(name)
      super # that is, do nothing
    end

    def handle_unknown_entity(name)
      super # that is, do nothing
    end

    # Store a comment under the current node. The superclass hook runs
    # first (original note: "strip white").
    def handle_comment(data)
      super
      REXML::Comment.new(data, @currentNode)
    end

    # Store a special (markup declaration) as a REXML::DocType.
    def handle_special(data)
      REXML::DocType.new(data, @currentNode) # TODO
    end

  end
end
|
148
|
+
|
149
|
+
# Manual smoke test, run only when this file is executed directly:
# parse the file named on the command line (default 'ebay.html') and write
# the resulting REXML document to 'xx.html'.
if $0 == __FILE__
  $stdout.sync = true
  $DEBUG = false

  class TestStackingParser < HTMLTree::XMLParser #:nodoc: all
  end

  parser = TestStackingParser.new(true, false)
  parser.parse_file_named(ARGV[0] || 'ebay.html')
  File.open('xx.html', 'w') { |of|
    parser.document.write(of)
  }
end
|