ruby-web 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +474 -0
- data/INSTALL.txt +9 -0
- data/InstalledFiles +180 -0
- data/LICENSE.txt +74 -0
- data/Rakefile +529 -0
- data/TODO +65 -0
- data/doc/additional.xml +149 -0
- data/doc/core.xml +652 -0
- data/doc/credits/index.xml +52 -0
- data/doc/credits/php.contributors.xml +118 -0
- data/doc/credits/php.language-snippets.ent +622 -0
- data/doc/install/index.xml +136 -0
- data/doc/install/mac/index.xml +21 -0
- data/doc/install/ruby-web.install.rb.instructions.xml +7 -0
- data/doc/install/unix/index.xml +46 -0
- data/doc/install/win/apache1.xml +166 -0
- data/doc/install/win/apache2.xml +141 -0
- data/doc/install/win/iis.xml +162 -0
- data/doc/install/win/index.xml +24 -0
- data/doc/install/win/installer.xml +31 -0
- data/doc/install/win/manual.xml +43 -0
- data/doc/manual.xml +69 -0
- data/doc/old/apache_cgi.txt +23 -0
- data/doc/old/fastcgi.txt +23 -0
- data/doc/old/mod_ruby.txt +21 -0
- data/doc/old/snippets.rdoc +183 -0
- data/doc/old/webrick.txt +23 -0
- data/doc/old/windows_cgi.txt +9 -0
- data/doc/tutorial.xml +14 -0
- data/doc/xsl/manual-multi.xsl +10 -0
- data/doc/xsl/manual-pdf.xsl +6 -0
- data/doc/xsl/manual-single.xsl +6 -0
- data/doc/xsl/manual.css +22 -0
- data/install.rb +1022 -0
- data/lib/formatter.rb +314 -0
- data/lib/html-parser.rb +429 -0
- data/lib/htmlrepair.rb +113 -0
- data/lib/htmlsplit.rb +842 -0
- data/lib/sgml-parser.rb +332 -0
- data/lib/web.rb +68 -0
- data/lib/web/assertinclude.rb +129 -0
- data/lib/web/config.rb +50 -0
- data/lib/web/connection.rb +1070 -0
- data/lib/web/convenience.rb +154 -0
- data/lib/web/formreader.rb +318 -0
- data/lib/web/htmlparser/html-parser.rb +429 -0
- data/lib/web/htmlparser/sgml-parser.rb +332 -0
- data/lib/web/htmltools/element.rb +296 -0
- data/lib/web/htmltools/stparser.rb +276 -0
- data/lib/web/htmltools/tags.rb +286 -0
- data/lib/web/htmltools/tree.rb +139 -0
- data/lib/web/htmltools/xmltree.rb +160 -0
- data/lib/web/htmltools/xpath.rb +71 -0
- data/lib/web/info.rb +63 -0
- data/lib/web/load.rb +210 -0
- data/lib/web/mime.rb +87 -0
- data/lib/web/phprb.rb +340 -0
- data/lib/web/resources/test/cookie.rb +33 -0
- data/lib/web/resources/test/counter.rb +20 -0
- data/lib/web/resources/test/multipart.rb +14 -0
- data/lib/web/resources/test/redirect.rb +8 -0
- data/lib/web/resources/test/stock.rb +33 -0
- data/lib/web/sapi/apache.rb +129 -0
- data/lib/web/sapi/fastcgi.rb +22 -0
- data/lib/web/sapi/install/apache.rb +180 -0
- data/lib/web/sapi/install/iis.rb +93 -0
- data/lib/web/sapi/install/macosx.rb +90 -0
- data/lib/web/sapi/webrick.rb +86 -0
- data/lib/web/session.rb +83 -0
- data/lib/web/shim/cgi.rb +129 -0
- data/lib/web/shim/rails.rb +175 -0
- data/lib/web/stringio.rb +78 -0
- data/lib/web/strscanparser.rb +24 -0
- data/lib/web/tagparser.rb +96 -0
- data/lib/web/testing.rb +666 -0
- data/lib/web/traceoutput.rb +75 -0
- data/lib/web/unit.rb +56 -0
- data/lib/web/upload.rb +59 -0
- data/lib/web/validate.rb +52 -0
- data/lib/web/wiki.rb +557 -0
- data/lib/web/wiki/linker.rb +72 -0
- data/lib/web/wiki/page.rb +201 -0
- data/lib/webunit.rb +27 -0
- data/lib/webunit/assert.rb +152 -0
- data/lib/webunit/converter.rb +154 -0
- data/lib/webunit/cookie.rb +118 -0
- data/lib/webunit/domwalker.rb +185 -0
- data/lib/webunit/exception.rb +14 -0
- data/lib/webunit/form.rb +116 -0
- data/lib/webunit/frame.rb +37 -0
- data/lib/webunit/htmlelem.rb +122 -0
- data/lib/webunit/image.rb +26 -0
- data/lib/webunit/jscript.rb +31 -0
- data/lib/webunit/link.rb +33 -0
- data/lib/webunit/params.rb +321 -0
- data/lib/webunit/parser.rb +229 -0
- data/lib/webunit/response.rb +464 -0
- data/lib/webunit/runtest.rb +41 -0
- data/lib/webunit/table.rb +148 -0
- data/lib/webunit/testcase.rb +45 -0
- data/lib/webunit/ui/cui/testrunner.rb +50 -0
- data/lib/webunit/utils.rb +68 -0
- data/lib/webunit/webunit.rb +28 -0
- data/test/dev/action.rb +83 -0
- data/test/dev/forms.rb +104 -0
- data/test/dev/forms2.rb +104 -0
- data/test/dev/parser.rb +17 -0
- data/test/dev/scripts/dump.rb +24 -0
- data/test/dev/scripts/makedist.rb +62 -0
- data/test/dev/scripts/uri.rb +41 -0
- data/test/dev/scripts/uri/common.rb +432 -0
- data/test/dev/scripts/uri/ftp.rb +149 -0
- data/test/dev/scripts/uri/generic.rb +1106 -0
- data/test/dev/scripts/uri/http.rb +76 -0
- data/test/dev/scripts/uri/https.rb +26 -0
- data/test/dev/scripts/uri/ldap.rb +238 -0
- data/test/dev/scripts/uri/mailto.rb +260 -0
- data/test/dev/scripts/urireg.rb +174 -0
- data/test/dev/simpledispatcher.rb +156 -0
- data/test/dev/test.action.rb +146 -0
- data/test/dev/test.formreader.rb +463 -0
- data/test/dev/test.simpledispatcher.rb +186 -0
- data/test/dev/webunit/conv/digit-0.rb +21 -0
- data/test/dev/webunit/conv/digit-1.rb +17 -0
- data/test/dev/webunit/conv/digit.rb +23 -0
- data/test/dev/webunit/conv/test_digit-0.rb +16 -0
- data/test/dev/webunit/conv/test_digit-1.rb +19 -0
- data/test/dev/webunit/conv/test_digit.rb +26 -0
- data/test/dev/webunit/conv/test_digit_view-0.rb +76 -0
- data/test/dev/webunit/conv/test_digit_view-1.rb +102 -0
- data/test/dev/webunit/conv/test_digit_view.rb +134 -0
- data/test/installation/htdocs/cgi_test.rb +296 -0
- data/test/installation/htdocs/test_install.rb +4 -0
- data/test/installation/runwebtest.rb +5 -0
- data/test/installation/test_cookie.rb +128 -0
- data/test/installation/test_form.rb +47 -0
- data/test/installation/test_multipart.rb +51 -0
- data/test/installation/test_request.rb +24 -0
- data/test/installation/test_response.rb +35 -0
- data/test/unit/htdocs/cookie.rb +32 -0
- data/test/unit/htdocs/multipart.rb +28 -0
- data/test/unit/htdocs/redirect.rb +12 -0
- data/test/unit/htdocs/simple.rb +13 -0
- data/test/unit/htdocs/stock.rb +33 -0
- data/test/unit/test_assert.rb +162 -0
- data/test/unit/test_cookie.rb +114 -0
- data/test/unit/test_domwalker.rb +77 -0
- data/test/unit/test_form.rb +42 -0
- data/test/unit/test_frame.rb +40 -0
- data/test/unit/test_htmlelem.rb +74 -0
- data/test/unit/test_image.rb +45 -0
- data/test/unit/test_jscript.rb +57 -0
- data/test/unit/test_link.rb +85 -0
- data/test/unit/test_multipart.rb +51 -0
- data/test/unit/test_params.rb +210 -0
- data/test/unit/test_parser.rb +53 -0
- data/test/unit/test_response.rb +150 -0
- data/test/unit/test_table.rb +70 -0
- data/test/unit/test_utils.rb +106 -0
- data/test/unit/test_webunit.rb +28 -0
- data/test/web/mod_ruby_stub.rb +39 -0
- data/test/web/test.assertinclude.rb +109 -0
- data/test/web/test.buffer.rb +182 -0
- data/test/web/test.code.loader.rb +78 -0
- data/test/web/test.config.rb +31 -0
- data/test/web/test.error.handling.rb +91 -0
- data/test/web/test.formreader-2.0.rb +352 -0
- data/test/web/test.load.rb +125 -0
- data/test/web/test.mime-type.rb +23 -0
- data/test/web/test.narf.cgi.rb +106 -0
- data/test/web/test.phprb.rb +239 -0
- data/test/web/test.request.rb +368 -0
- data/test/web/test.response.rb +637 -0
- data/test/web/test.ruby-web.rb +10 -0
- data/test/web/test.session.rb +50 -0
- data/test/web/test.shim.cgi.rb +96 -0
- data/test/web/test.tagparser.rb +65 -0
- data/test/web/test.template2.rb +297 -0
- data/test/web/test.testing2.rb +318 -0
- data/test/web/test.upload.rb +45 -0
- data/test/web/test.validate.rb +46 -0
- data/test/web/test.web.test.rb +495 -0
- data/test/wiki/test.history.rb +297 -0
- data/test/wiki/test.illustration_page.rb +287 -0
- data/test/wiki/test.linker.rb +197 -0
- data/test/wiki/test.tarpit.rb +56 -0
- data/test/wiki/test.wiki.rb +300 -0
- data/test/wikitestroot/admin.rb +7 -0
- data/test/wikitestroot/wiki.rb +6 -0
- metadata +234 -0
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
# This encodes the knowledge of HTML 4.0 tags for a parser.
|
|
2
|
+
# It knows about block vs. inline tags, empty tags, and optionally
|
|
3
|
+
# omitted end tags.
|
|
4
|
+
#
|
|
5
|
+
# Copyright:: Copyright(C) 2002 Ned Konz <ned@bike-nomad.com>
|
|
6
|
+
# License:: Ruby's license
|
|
7
|
+
# CVS ID:: $Id: tags.rb,v 1.7 2002/06/04 01:55:59 ned Exp $
|
|
8
|
+
|
|
9
|
+
# Raised by <tt>HTML::Tag.named</tt> when it is asked for a tag name that
# is not present in the tag lookup table.
class NoSuchHTMLTagError < RuntimeError #:nodoc:
end
|
|
12
|
+
|
|
13
|
+
# This is the base class for all the HTML tag classes.
|
|
14
|
+
module HTML #:nodoc: all
|
|
15
|
+
|
|
16
|
+
# Describes a single HTML tag. The defaults defined here answer "no" to
# every classification question; subclasses override what applies to them.
class Tag

  # Return my tag name (stored in lower case by the constructor).
  attr_reader :name

  # tag_name:: a String, the name of the tag
  # can_omit:: a Boolean, true if end tag is optional
  def initialize(tag_name, can_omit)
    @name = tag_name.downcase
    @can_omit_end = can_omit
  end

  # Return true if my end tag can be omitted.
  def can_omit_end_tag
    @can_omit_end
  end

  # Return true if I am a block element.
  def is_block_element
    false
  end

  # Return true if I am an inline element.
  def is_inline_element
    false
  end

  # Return true if I am an empty element.
  def is_empty_element
    false
  end

  # Return true if I can contain <tt>tag</tt> if my parent is of type
  # <tt>parent</tt>. The base implementation always answers false.
  # tag:: tag name, a String
  # parent:: parent tag name, a String.
  def can_contain(tag, parent)
    false
  end

  # Return true if whitespace within me can be omitted (ignoring browser
  # bugs)
  def can_ignore_whitespace
    true
  end
end
|
|
49
|
+
|
|
50
|
+
# An HTML block element.
class BlockTag < Tag
  # Blocks can contain anything, so the answer is true for every tag and
  # every parent.
  def can_contain(tag, parent)
    true
  end

  def is_block_element
    true
  end
end
|
|
57
|
+
|
|
58
|
+
# An HTML inline element.
class InlineTag < Tag
  def is_inline_element
    true
  end

  # Inlines can only contain other inlines.
  # tag:: tag name, a String; it is resolved through Tag.named, which
  #       raises NoSuchHTMLTagError for names missing from the table.
  # parent:: parent tag name; not consulted here.
  def can_contain(tag, parent)
    child = Tag.named(tag)
    child.is_inline_element
  end
end
|
|
67
|
+
|
|
68
|
+
# An HTML element that may be treated as either a block or an inline
# element.
class BlockOrInlineTag < InlineTag

  def is_block_element
    true
  end

  # If used as inline elements (e.g., within another inline element or a P),
  # these elements should not contain any block-level elements.
  #
  # tag:: tag name, a String
  # parent:: parent tag name, a String (compared case-insensitively with 'p')
  #
  # NOTE(review): when the parent is a block element other than P this
  # returns false for every tag, which looks stricter than the sentence
  # above implies -- confirm against the parser that calls this.
  def can_contain(tag, parent)
    used_inline = parent.downcase == 'p' || Tag.named(parent).is_inline_element
    used_inline && !Tag.named(tag).is_block_element
  end
end
|
|
82
|
+
|
|
83
|
+
# An HTML tag that never has an end tag. Such a tag reports itself as
# both empty and inline, and can contain nothing.
class EmptyTag < Tag
  def can_contain(tag, parent)
    false
  end

  def is_inline_element
    true
  end

  def is_empty_element
    true
  end
end
|
|
89
|
+
|
|
90
|
+
# This block initializes the tag lookup table.
|
|
91
|
+
class Tag
|
|
92
|
+
@table = Hash.new
|
|
93
|
+
|
|
94
|
+
# Register a tag in the tag lookup table, under both its upper-case and
# lower-case spelling.
#
# User code may call this to add tags the table does not know about.
#
# name:: the tag name, a String.
# is_block:: true if I am a block element.
# is_inline:: true if I am an inline element.
# is_empty:: true if I am an empty element (takes precedence over the
#            block/inline flags).
# can_omit:: true if my end tag can be omitted.
#
# Returns the tag object that was registered.
def Tag.add_tag(name, is_block, is_inline, is_empty, can_omit)
  entry =
    if is_empty
      # can_omit is not consulted for empty tags; they are always created
      # with an omissible end tag.
      EmptyTag.new(name, true)
    elsif is_block && is_inline
      BlockOrInlineTag.new(name, can_omit)
    elsif is_block
      BlockTag.new(name, can_omit)
    else
      InlineTag.new(name, can_omit)
    end
  @table[name.downcase] = entry
  @table[name.upcase] = entry
end
|
|
118
|
+
|
|
119
|
+
# Return the Tag registered under the given name, or raise a
# NoSuchHTMLTagError if there is none.
#
# tagname:: the tag name, a String. The lookup is case-insensitive: the
#           table only holds the all-upper-case and all-lower-case
#           spellings, so any other spelling (e.g. "Div") is retried in
#           lower case before giving up.
def Tag.named(tagname)
  @table[tagname] ||
    @table[tagname.to_s.downcase] ||
    raise(NoSuchHTMLTagError.exception(tagname))
end
|
|
124
|
+
|
|
125
|
+
# Block Inline Empty can_omit_end
|
|
126
|
+
[
|
|
127
|
+
[ 'A', false, true, false, false ], # Anchor
|
|
128
|
+
[ 'ABBR', false, true, false, false ], # Abbreviation
|
|
129
|
+
[ 'ACRONYM', false, true, false, false ], # Acronym
|
|
130
|
+
[ 'ADDRESS', true, false, false, false ], # Address
|
|
131
|
+
[ 'APPLET', true, true, false, false ], # Java applet
|
|
132
|
+
[ 'AREA', true, false, true, true ], # Image map region
|
|
133
|
+
[ 'B', false, true, false, false ], # Bold text
|
|
134
|
+
[ 'BASE', false, false, true, true ], # Document base URI
|
|
135
|
+
[ 'BASEFONT', false, true, true, true ], # Base font change
|
|
136
|
+
[ 'BDO', false, true, false, false ], # Bi_di override
|
|
137
|
+
[ 'BIG', false, true, false, false ], # Large text
|
|
138
|
+
[ 'BLOCKQUOTE', true, false, false, false ], # Block quotation
|
|
139
|
+
[ 'BODY', true, false, false, false ], # Document body
|
|
140
|
+
[ 'BR', false, true, true, true ], # Line break
|
|
141
|
+
[ 'BUTTON', true, true, false, false ], # Button
|
|
142
|
+
[ 'CAPTION', false, true, false, false ], # Table caption
|
|
143
|
+
[ 'CENTER', false, true, false, false ], # Centered block
|
|
144
|
+
[ 'CITE', false, true, false, false ], # Citation
|
|
145
|
+
[ 'CODE', false, true, false, false ], # Computer code
|
|
146
|
+
[ 'COL', false, false, true, true ], # Table column
|
|
147
|
+
[ 'COLGROUP', true, false, false, true ], # Table column group
|
|
148
|
+
[ 'DD', true, false, false, true ], # Definition description
|
|
149
|
+
[ 'DEL', true, true, false, false ], # Deleted text
|
|
150
|
+
[ 'DFN', false, true, false, false ], # Defined term
|
|
151
|
+
[ 'DIR', true, false, false, false ], # Directory list
|
|
152
|
+
[ 'DIV', true, false, false, false ], # Generic block-level container
|
|
153
|
+
[ 'DL', true, false, false, false ], # Definition list
|
|
154
|
+
[ 'DT', false, true, false, true ], # Definition term
|
|
155
|
+
[ 'EM', false, true, false, false ], # Emphasis
|
|
156
|
+
[ 'FIELDSET', true, false, false, false ], # Form control group
|
|
157
|
+
[ 'FONT', false, true, false, false ], # Font change
|
|
158
|
+
[ 'FORM', true, false, false, false ], # Interactive form
|
|
159
|
+
[ 'FRAME', false, false, true, true ], # Frame
|
|
160
|
+
[ 'FRAMESET', true, false, false, false ], # Frameset
|
|
161
|
+
[ 'H1', true, false, false, false ], # Level-one heading
|
|
162
|
+
[ 'H2', true, false, false, false ], # Level-two heading
|
|
163
|
+
[ 'H3', true, false, false, false ], # Level-three heading
|
|
164
|
+
[ 'H4', true, false, false, false ], # Level-four heading
|
|
165
|
+
[ 'H5', true, false, false, false ], # Level-five heading
|
|
166
|
+
[ 'H6', true, false, false, false ], # Level-six heading
|
|
167
|
+
[ 'HEAD', true, false, false, false ], # Document head
|
|
168
|
+
[ 'HR', false, true, true, true ], # Horizontal rule
|
|
169
|
+
[ 'HTML', true, false, false, false ], # HTML document
|
|
170
|
+
[ 'I', false, true, false, false ], # Italic text
|
|
171
|
+
[ 'IFRAME', true, true, false, false ], # Inline frame
|
|
172
|
+
[ 'IMG', false, true, true, true ], # Inline image
|
|
173
|
+
[ 'INPUT', false, true, true, true ], # Form input
|
|
174
|
+
[ 'INS', true, true, false, false ], # Inserted text
|
|
175
|
+
[ 'ISINDEX', false, true, true, true ], # Input prompt
|
|
176
|
+
[ 'KBD', false, true, false, false ], # Text to be input
|
|
177
|
+
[ 'LABEL', false, true, false, false ], # Form field label
|
|
178
|
+
[ 'LEGEND', false, true, false, false ], # Fieldset caption
|
|
179
|
+
[ 'LI', true, false, false, true ], # List item
|
|
180
|
+
[ 'LINK', true, false, false, false ], # Document relationship
|
|
181
|
+
[ 'MAP', true, true, false, false ], # Image map
|
|
182
|
+
[ 'MENU', true, false, false, false ], # Menu list
|
|
183
|
+
[ 'META', false, true, true, true ], # Metadata
|
|
184
|
+
[ 'NOFRAMES', true, false, false, false ], # Frames alternate content
|
|
185
|
+
[ 'NOSCRIPT', true, false, false, false ], # Alternate script content
|
|
186
|
+
[ 'OBJECT', true, true, false, false ], # Object
|
|
187
|
+
[ 'OL', true, false, false, false ], # Ordered list
|
|
188
|
+
[ 'OPTGROUP', true, false, false, false ], # Option group
|
|
189
|
+
[ 'OPTION', true, false, false, false ], # Menu option
|
|
190
|
+
[ 'P', true, false, false, true ], # Paragraph
|
|
191
|
+
[ 'PARAM', false, true, true, true ], # Object parameter
|
|
192
|
+
[ 'PRE', true, false, false, false ], # Preformatted text
|
|
193
|
+
[ 'Q', false, true, false, false ], # Short quotation
|
|
194
|
+
[ 'S', false, true, false, false ], # Strike-through text
|
|
195
|
+
[ 'SAMP', false, true, false, false ], # Sample output
|
|
196
|
+
[ 'SCRIPT', true, true, false, false ], # Client-side script
|
|
197
|
+
[ 'SELECT', true, false, false, false ], # Option selector
|
|
198
|
+
[ 'SMALL', false, true, false, false ], # Small text
|
|
199
|
+
[ 'SPAN', false, true, false, false ], # Generic inline container
|
|
200
|
+
[ 'STRIKE', false, true, false, false ], # Strike-through text
|
|
201
|
+
[ 'STRONG', false, true, false, false ], # Strong emphasis
|
|
202
|
+
[ 'STYLE', true, false, false, false ], # Embedded style sheet
|
|
203
|
+
[ 'SUB', false, true, false, false ], # Subscript
|
|
204
|
+
[ 'SUP', false, true, false, false ], # Superscript
|
|
205
|
+
[ 'TABLE', true, false, false, false ], # Table
|
|
206
|
+
[ 'TBODY', true, false, false, false ], # Table body
|
|
207
|
+
[ 'TD', true, false, false, true ], # Table data cell
|
|
208
|
+
[ 'TEXTAREA', false, true, false, false ], # Multi-line text input
|
|
209
|
+
[ 'TFOOT', true, false, false, true ], # Table foot
|
|
210
|
+
[ 'TH', true, false, false, true ], # Table header cell
|
|
211
|
+
[ 'THEAD', true, false, false, true ], # Table head
|
|
212
|
+
[ 'TITLE', true, false, false, false ], # Document title
|
|
213
|
+
[ 'TR', true, false, false, true ], # Table row
|
|
214
|
+
[ 'TT', false, true, false, false ], # Teletype text
|
|
215
|
+
[ 'U', false, true, false, false ], # Underlined text
|
|
216
|
+
[ 'UL', true, false, false, false ], # Unordered list
|
|
217
|
+
[ 'VAR', false, true, false, false ], # Variable
|
|
218
|
+
].each { |a| add_tag(*a) }
|
|
219
|
+
|
|
220
|
+
# EXCEPTIONS TODO
|
|
221
|
+
# A, LABEL can't contain itself
|
|
222
|
+
# several things (fonts, etc) can't be in PRE
|
|
223
|
+
# SELECT can only have OPTGROUP or OPTION
|
|
224
|
+
# TEXTAREA, OPTION only contains plain text
|
|
225
|
+
# APPLET and OBJECT have PARAM+ followed by block and/or inline
|
|
226
|
+
# BUTTON can't contain:
|
|
227
|
+
# A, INPUT, SELECT, TEXTAREA, LABEL, BUTTON, or IFRAME
|
|
228
|
+
# nor FORM, ISINDEX, and FIELDSET
|
|
229
|
+
# IFRAME can only contain block elems if parent can
|
|
230
|
+
# MAP can contain block+ *xor* AREA+
|
|
231
|
+
# SCRIPT only contains a SCRIPT (that is, until /<\/[A-Za-z]/)
|
|
232
|
+
# BODY must be in HTML or NOFRAMES
|
|
233
|
+
# COL can only be in COLGROUP or TABLE
|
|
234
|
+
# COLGROUP has only COL*, and can only be in TABLE
|
|
235
|
+
# DIR, MENU can only contain LI+, none of which may contain block elems
|
|
236
|
+
# DL must contain (DT|DD)+
|
|
237
|
+
# DT and DD are only allowed in DL
|
|
238
|
+
# FIELDSET contains LEGEND, (block|inline)*
|
|
239
|
+
# FRAMESET contains (FRAMESET|FRAME), plus NOFRAMES and must be in HTML
|
|
240
|
+
# H# can only be contained in block elems, but only contain inlines.
|
|
241
|
+
# HEAD must only contain TITLE, BASE?, ISINDEX?, SCRIPT* STYLE* META* LINK*
|
|
242
|
+
# OBJECT* HEAD must be in HTML
|
|
243
|
+
# HTML is top-level and can only contain HEAD, BODY, or HEAD, FRAMESET
|
|
244
|
+
# LI can contain blocks except when inside DIR or MENU
|
|
245
|
+
# LI can only be inside OL, UL, DIR, MENU
|
|
246
|
+
# OL, UL can only contain LI+
|
|
247
|
+
# OPTGROUP contains OPTION+
|
|
248
|
+
# P can only contain inlines. However, it is a block-level elem.
|
|
249
|
+
# PRE can only contain inlines except IMG, OBJECT, APPLET, BIG, SMALL, SUB,
|
|
250
|
+
# SUP, FONT, BASEFONT
|
|
251
|
+
|
|
252
|
+
# Per-tag content rules. Each key names a tag already registered in the
# table; each value is the source of a regular expression describing the
# child tag names that tag may contain. The pattern is anchored at both
# the beginning and the end, and is matched case-insensitively.
{
  'AREA'     => '(?!AREA)[A-Z]+',
  'COLGROUP' => 'COL',
  'DD'       => '(?!D[DT]$)[A-Z]+',
  'DT'       => '(?!D[DT]$)[A-Z]+',
  'MAP'      => 'AREA',
  'P'        => '(?!P$)[A-Z]+',
  'TD'       => '(?!T[HDR]$)[A-Z]+',
  'TFOOT'    => 'TR',
  'TH'       => '(?!T[HDR]$)[A-Z]+',
  'THEAD'    => 'TR',
  'TR'       => 'T[HD]',
}.each_pair { |tagname, pattern|
  allowed = /\A#{pattern}\z/i
  # Install the override on this one tag object only. A closure is used
  # instead of eval'ing generated source text; the parent argument is
  # accepted to keep the can_contain signature but is not consulted.
  named(tagname).define_singleton_method(:can_contain) { |tag, parent|
    (allowed =~ tag) == 0
  }
}
|
|
275
|
+
|
|
276
|
+
# TEXTAREA, PRE and OPTION override the default: whitespace inside them
# must not be ignored.
%w[TEXTAREA PRE OPTION].each { |tagname|
  named(tagname).define_singleton_method(:can_ignore_whitespace) { false }
}
|
|
285
|
+
end
|
|
286
|
+
end
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
#!/usr/bin/ruby
|
|
2
|
+
# This is an HTML parser that builds an element tree for further
|
|
3
|
+
# processing. Attributes and data are also stored.
|
|
4
|
+
#
|
|
5
|
+
# Typical usage is:
|
|
6
|
+
# parser = HTMLTree::Parser.new(false, false)
|
|
7
|
+
# parser.parse_file_named('whatever.html')
|
|
8
|
+
# # then you have the tree built..
|
|
9
|
+
# parser.tree.dump
|
|
10
|
+
#
|
|
11
|
+
# Copyright:: Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
|
12
|
+
# License:: Ruby's
|
|
13
|
+
# CVS ID:: $Id: tree.rb,v 1.14 2002/06/04 01:55:59 ned Exp $
|
|
14
|
+
|
|
15
|
+
require 'web/htmltools/tags'
|
|
16
|
+
require 'web/htmltools/stparser'
|
|
17
|
+
require 'web/htmltools/element'
|
|
18
|
+
|
|
19
|
+
# Tree-building HTML parser: the StackingParser callbacks are turned into
# a tree of HTMLTree nodes.
module HTMLTree #:nodoc: all
  class Parser < HTML::StackingParser

    # verbose:: if true, will warn to $stderr on unknown
    #   tags/entities/characters, as well as missing end tags and extra end
    #   tags.
    # strip_white:: if true, remove all non-essential whitespace. Note
    #   that there are browser bugs that may cause this to change the
    #   appearance of HTML (even though it shouldn't by the standard).
    def initialize(verbose=false, strip_white=true)
      super
      reset
    end

    # Reset this parser so that it can parse a new document. A fresh
    # Document becomes both the root and the current insertion point.
    def reset
      super
      document = Document.new
      @currentNode = document
      @rootNode = document
    end

    # Return the tree that was built: the node created by reset, which
    # represents the whole document. The \<html> node is a child of this.
    def tree
      @rootNode
    end

    # Return the <html> node, if any (delegates to the root's html_node).
    def html
      @rootNode.html_node
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create an Element for tag under the current node and copy the
    # attributes onto it. Each entry of attrs is splatted into
    # add_attribute. Returns the new element.
    def add_child_to_current(tag, attrs)
      element = Element.new(@currentNode, tag)
      attrs.each { |attribute| element.add_attribute(*attribute) }
      element
    end

    # callbacks

    # add a child to the current node and descend
    def handle_start_tag(tag, attrs)
      child = add_child_to_current(tag, attrs)
      @rootNode ||= child
      @currentNode = child
    end

    # go up to parent
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # add a child to the current node (without descending into it)
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Add a child to the current node and descend
    # Assume that the unknown tag has an end tag.
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # go up to parent, after the superclass hook has run
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # ignore: the tree is left untouched, only the superclass hook runs
    def handle_extra_end_tag(tag)
      super
    end

    # Store character data as a Data node under the current node.
    def handle_cdata(data)
      Data.new(@currentNode, data)
    end

    # Script content is stored the same way as character data.
    def handle_script(data)
      Data.new(@currentNode, data)
    end

    def handle_unknown_character(name)
      super
    end

    def handle_unknown_entity(name)
      super
    end

    # Store a comment under the current node. The superclass hook runs
    # first (per the original author's note, to strip whitespace).
    def handle_comment(data)
      super
      Comment.new(@currentNode, data)
    end

    # Store a Special node under the current node. If there is no current
    # node, a notice is printed to $stderr.
    def handle_special(data)
      special = HTMLTree::Special.new(@currentNode, data)
      return if @currentNode
      $stderr.print('special ', special, ' discarded')
    end

  end
end
|
|
126
|
+
|
|
127
|
+
# Manual smoke test, run only when this file is executed directly: parse
# the file named on the command line (default 'ebay.html'), write the
# resulting tree to 'xx.html', then dump the tree.
if __FILE__ == $0
  $stdout.sync = true

  class TestStackingParser < HTMLTree::Parser #:nodoc: all
  end

  $DEBUG = false
  parser = TestStackingParser.new(true, false)
  parser.parse_file_named(ARGV[0] || 'ebay.html')
  File.open('xx.html', 'w') do |out|
    parser.tree.write(out)
  end
  parser.tree.dump
end
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
#!/usr/bin/ruby
|
|
2
|
+
# This is an HTML parser that builds an element tree for further
|
|
3
|
+
# processing. Attributes and data are also stored.
|
|
4
|
+
# The storage is that of REXML, which is required.
|
|
5
|
+
#
|
|
6
|
+
# Typical usage is:
|
|
7
|
+
# parser = HTMLTree::XMLParser.new(false, false)
|
|
8
|
+
# parser.parse_file_named('whatever.html')
|
|
9
|
+
# # then you have the tree built..
|
|
10
|
+
# parser.tree # is a REXML::Document
|
|
11
|
+
#
|
|
12
|
+
# Copyright:: Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
|
13
|
+
# License:: Ruby's
|
|
14
|
+
# CVS ID:: $Id: tree.rb,v 1.14 2002/06/04 01:55:59 ned Exp $
|
|
15
|
+
|
|
16
|
+
require 'web/htmltools/tags'
|
|
17
|
+
require 'web/htmltools/stparser'
|
|
18
|
+
require 'rexml/element'
|
|
19
|
+
require 'rexml/document'
|
|
20
|
+
|
|
21
|
+
# REXML::Child
|
|
22
|
+
# REXML::XMLDecl
|
|
23
|
+
# REXML::Instruction
|
|
24
|
+
# REXML::Text
|
|
25
|
+
# REXML::Comment
|
|
26
|
+
# REXML::Entity
|
|
27
|
+
# REXML::Parent
|
|
28
|
+
# REXML::Element (+REXML::Namespace)
|
|
29
|
+
# REXML::Document
|
|
30
|
+
# REXML::DocType
|
|
31
|
+
#
|
|
32
|
+
# This is a tree building HTML parser that makes XML: the StackingParser
# callbacks are turned into nodes of a REXML::Document.
module HTMLTree #:nodoc: all
  class XMLParser < HTML::StackingParser

    # verbose:: if true, will warn to $stderr on unknown
    #   tags/entities/characters, as well as missing end tags and extra end
    #   tags.
    # strip_white:: if true, remove all non-essential whitespace. Note
    #   that there are browser bugs that may cause this to change the
    #   appearance of HTML (even though it shouldn't by the standard).
    def initialize(verbose=false, strip_white=true)
      super
      reset
    end

    # Reset this parser so that it can parse a new document. A fresh
    # REXML::Document becomes both the root and the current insertion
    # point.
    def reset
      super
      @rootNode = @currentNode = REXML::Document.new
    end

    # Return the document that was built. This will be an
    # REXML::Document that represents the whole document. The \<html>
    # node is a child of this.
    def document
      @rootNode
    end

    # Alias for #document.
    def tree
      document
    end

    # Return the root of the document, if any.
    def root
      @rootNode.root
    end

    # Return the <html> node, if any.
    #
    # The <html> element is a child of the document itself, so it is looked
    # up among the document's children. Looking inside the root element
    # would search for an <html> nested within the root, and would raise
    # on an empty document, where there is no root element.
    def html
      @rootNode.elements['html']
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create a REXML::Element for tag under the current node and copy the
    # attributes onto it. Each entry of attrs is read as [name, value].
    # Returns the new element.
    def add_child_to_current(tag, attrs)
      node = REXML::Element.new(tag, @currentNode)
      attrs.each { |a| node.attributes[a[0]] = a[1] }
      node
    end

    # callbacks

    # add a child to the current node and descend
    def handle_start_tag(tag, attrs)
      node = add_child_to_current(tag, attrs)
      @rootNode ||= node
      @currentNode = node
    end

    # go up to parent
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # add a child to the current node (without descending into it)
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Add a child to the current node and descend
    # Assume that the unknown tag has an end tag.
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # go up to parent, after the superclass hook has run
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # ignore: the tree is left untouched, only the superclass hook runs
    def handle_extra_end_tag(tag)
      super
    end

    # Store character data as a REXML::Text under the current node. The
    # second argument is the negation of @stripWhitespace (presumably set
    # by the superclass from strip_white -- TODO confirm).
    def handle_cdata(data)
      REXML::Text.new(data, !@stripWhitespace, @currentNode)
    end

    # Script content is stored as a REXML::Comment under the current node.
    def handle_script(data)
      REXML::Comment.new(data, @currentNode)
    end

    def handle_unknown_character(name)
      super # that is, do nothing
    end

    def handle_unknown_entity(name)
      super # that is, do nothing
    end

    # Store a comment under the current node. The superclass hook runs
    # first (per the original author's note, to strip whitespace).
    def handle_comment(data)
      super # strip white
      REXML::Comment.new(data, @currentNode)
    end

    # Special markup is stored as a REXML::DocType under the current node.
    def handle_special(data)
      REXML::DocType.new(data, @currentNode) # TODO
    end

  end
end
|
|
148
|
+
|
|
149
|
+
# Manual smoke test, run only when this file is executed directly: parse
# the file named on the command line (default 'ebay.html') and write the
# resulting document to 'xx.html'.
if __FILE__ == $0
  $stdout.sync = true

  class TestStackingParser < HTMLTree::XMLParser #:nodoc: all
  end

  $DEBUG = false
  parser = TestStackingParser.new(true, false)
  parser.parse_file_named(ARGV[0] || 'ebay.html')
  File.open('xx.html', 'w') do |out|
    parser.document.write(out)
  end
end
|