ruby-web 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190) hide show
  1. data/ChangeLog +474 -0
  2. data/INSTALL.txt +9 -0
  3. data/InstalledFiles +180 -0
  4. data/LICENSE.txt +74 -0
  5. data/Rakefile +529 -0
  6. data/TODO +65 -0
  7. data/doc/additional.xml +149 -0
  8. data/doc/core.xml +652 -0
  9. data/doc/credits/index.xml +52 -0
  10. data/doc/credits/php.contributors.xml +118 -0
  11. data/doc/credits/php.language-snippets.ent +622 -0
  12. data/doc/install/index.xml +136 -0
  13. data/doc/install/mac/index.xml +21 -0
  14. data/doc/install/ruby-web.install.rb.instructions.xml +7 -0
  15. data/doc/install/unix/index.xml +46 -0
  16. data/doc/install/win/apache1.xml +166 -0
  17. data/doc/install/win/apache2.xml +141 -0
  18. data/doc/install/win/iis.xml +162 -0
  19. data/doc/install/win/index.xml +24 -0
  20. data/doc/install/win/installer.xml +31 -0
  21. data/doc/install/win/manual.xml +43 -0
  22. data/doc/manual.xml +69 -0
  23. data/doc/old/apache_cgi.txt +23 -0
  24. data/doc/old/fastcgi.txt +23 -0
  25. data/doc/old/mod_ruby.txt +21 -0
  26. data/doc/old/snippets.rdoc +183 -0
  27. data/doc/old/webrick.txt +23 -0
  28. data/doc/old/windows_cgi.txt +9 -0
  29. data/doc/tutorial.xml +14 -0
  30. data/doc/xsl/manual-multi.xsl +10 -0
  31. data/doc/xsl/manual-pdf.xsl +6 -0
  32. data/doc/xsl/manual-single.xsl +6 -0
  33. data/doc/xsl/manual.css +22 -0
  34. data/install.rb +1022 -0
  35. data/lib/formatter.rb +314 -0
  36. data/lib/html-parser.rb +429 -0
  37. data/lib/htmlrepair.rb +113 -0
  38. data/lib/htmlsplit.rb +842 -0
  39. data/lib/sgml-parser.rb +332 -0
  40. data/lib/web.rb +68 -0
  41. data/lib/web/assertinclude.rb +129 -0
  42. data/lib/web/config.rb +50 -0
  43. data/lib/web/connection.rb +1070 -0
  44. data/lib/web/convenience.rb +154 -0
  45. data/lib/web/formreader.rb +318 -0
  46. data/lib/web/htmlparser/html-parser.rb +429 -0
  47. data/lib/web/htmlparser/sgml-parser.rb +332 -0
  48. data/lib/web/htmltools/element.rb +296 -0
  49. data/lib/web/htmltools/stparser.rb +276 -0
  50. data/lib/web/htmltools/tags.rb +286 -0
  51. data/lib/web/htmltools/tree.rb +139 -0
  52. data/lib/web/htmltools/xmltree.rb +160 -0
  53. data/lib/web/htmltools/xpath.rb +71 -0
  54. data/lib/web/info.rb +63 -0
  55. data/lib/web/load.rb +210 -0
  56. data/lib/web/mime.rb +87 -0
  57. data/lib/web/phprb.rb +340 -0
  58. data/lib/web/resources/test/cookie.rb +33 -0
  59. data/lib/web/resources/test/counter.rb +20 -0
  60. data/lib/web/resources/test/multipart.rb +14 -0
  61. data/lib/web/resources/test/redirect.rb +8 -0
  62. data/lib/web/resources/test/stock.rb +33 -0
  63. data/lib/web/sapi/apache.rb +129 -0
  64. data/lib/web/sapi/fastcgi.rb +22 -0
  65. data/lib/web/sapi/install/apache.rb +180 -0
  66. data/lib/web/sapi/install/iis.rb +93 -0
  67. data/lib/web/sapi/install/macosx.rb +90 -0
  68. data/lib/web/sapi/webrick.rb +86 -0
  69. data/lib/web/session.rb +83 -0
  70. data/lib/web/shim/cgi.rb +129 -0
  71. data/lib/web/shim/rails.rb +175 -0
  72. data/lib/web/stringio.rb +78 -0
  73. data/lib/web/strscanparser.rb +24 -0
  74. data/lib/web/tagparser.rb +96 -0
  75. data/lib/web/testing.rb +666 -0
  76. data/lib/web/traceoutput.rb +75 -0
  77. data/lib/web/unit.rb +56 -0
  78. data/lib/web/upload.rb +59 -0
  79. data/lib/web/validate.rb +52 -0
  80. data/lib/web/wiki.rb +557 -0
  81. data/lib/web/wiki/linker.rb +72 -0
  82. data/lib/web/wiki/page.rb +201 -0
  83. data/lib/webunit.rb +27 -0
  84. data/lib/webunit/assert.rb +152 -0
  85. data/lib/webunit/converter.rb +154 -0
  86. data/lib/webunit/cookie.rb +118 -0
  87. data/lib/webunit/domwalker.rb +185 -0
  88. data/lib/webunit/exception.rb +14 -0
  89. data/lib/webunit/form.rb +116 -0
  90. data/lib/webunit/frame.rb +37 -0
  91. data/lib/webunit/htmlelem.rb +122 -0
  92. data/lib/webunit/image.rb +26 -0
  93. data/lib/webunit/jscript.rb +31 -0
  94. data/lib/webunit/link.rb +33 -0
  95. data/lib/webunit/params.rb +321 -0
  96. data/lib/webunit/parser.rb +229 -0
  97. data/lib/webunit/response.rb +464 -0
  98. data/lib/webunit/runtest.rb +41 -0
  99. data/lib/webunit/table.rb +148 -0
  100. data/lib/webunit/testcase.rb +45 -0
  101. data/lib/webunit/ui/cui/testrunner.rb +50 -0
  102. data/lib/webunit/utils.rb +68 -0
  103. data/lib/webunit/webunit.rb +28 -0
  104. data/test/dev/action.rb +83 -0
  105. data/test/dev/forms.rb +104 -0
  106. data/test/dev/forms2.rb +104 -0
  107. data/test/dev/parser.rb +17 -0
  108. data/test/dev/scripts/dump.rb +24 -0
  109. data/test/dev/scripts/makedist.rb +62 -0
  110. data/test/dev/scripts/uri.rb +41 -0
  111. data/test/dev/scripts/uri/common.rb +432 -0
  112. data/test/dev/scripts/uri/ftp.rb +149 -0
  113. data/test/dev/scripts/uri/generic.rb +1106 -0
  114. data/test/dev/scripts/uri/http.rb +76 -0
  115. data/test/dev/scripts/uri/https.rb +26 -0
  116. data/test/dev/scripts/uri/ldap.rb +238 -0
  117. data/test/dev/scripts/uri/mailto.rb +260 -0
  118. data/test/dev/scripts/urireg.rb +174 -0
  119. data/test/dev/simpledispatcher.rb +156 -0
  120. data/test/dev/test.action.rb +146 -0
  121. data/test/dev/test.formreader.rb +463 -0
  122. data/test/dev/test.simpledispatcher.rb +186 -0
  123. data/test/dev/webunit/conv/digit-0.rb +21 -0
  124. data/test/dev/webunit/conv/digit-1.rb +17 -0
  125. data/test/dev/webunit/conv/digit.rb +23 -0
  126. data/test/dev/webunit/conv/test_digit-0.rb +16 -0
  127. data/test/dev/webunit/conv/test_digit-1.rb +19 -0
  128. data/test/dev/webunit/conv/test_digit.rb +26 -0
  129. data/test/dev/webunit/conv/test_digit_view-0.rb +76 -0
  130. data/test/dev/webunit/conv/test_digit_view-1.rb +102 -0
  131. data/test/dev/webunit/conv/test_digit_view.rb +134 -0
  132. data/test/installation/htdocs/cgi_test.rb +296 -0
  133. data/test/installation/htdocs/test_install.rb +4 -0
  134. data/test/installation/runwebtest.rb +5 -0
  135. data/test/installation/test_cookie.rb +128 -0
  136. data/test/installation/test_form.rb +47 -0
  137. data/test/installation/test_multipart.rb +51 -0
  138. data/test/installation/test_request.rb +24 -0
  139. data/test/installation/test_response.rb +35 -0
  140. data/test/unit/htdocs/cookie.rb +32 -0
  141. data/test/unit/htdocs/multipart.rb +28 -0
  142. data/test/unit/htdocs/redirect.rb +12 -0
  143. data/test/unit/htdocs/simple.rb +13 -0
  144. data/test/unit/htdocs/stock.rb +33 -0
  145. data/test/unit/test_assert.rb +162 -0
  146. data/test/unit/test_cookie.rb +114 -0
  147. data/test/unit/test_domwalker.rb +77 -0
  148. data/test/unit/test_form.rb +42 -0
  149. data/test/unit/test_frame.rb +40 -0
  150. data/test/unit/test_htmlelem.rb +74 -0
  151. data/test/unit/test_image.rb +45 -0
  152. data/test/unit/test_jscript.rb +57 -0
  153. data/test/unit/test_link.rb +85 -0
  154. data/test/unit/test_multipart.rb +51 -0
  155. data/test/unit/test_params.rb +210 -0
  156. data/test/unit/test_parser.rb +53 -0
  157. data/test/unit/test_response.rb +150 -0
  158. data/test/unit/test_table.rb +70 -0
  159. data/test/unit/test_utils.rb +106 -0
  160. data/test/unit/test_webunit.rb +28 -0
  161. data/test/web/mod_ruby_stub.rb +39 -0
  162. data/test/web/test.assertinclude.rb +109 -0
  163. data/test/web/test.buffer.rb +182 -0
  164. data/test/web/test.code.loader.rb +78 -0
  165. data/test/web/test.config.rb +31 -0
  166. data/test/web/test.error.handling.rb +91 -0
  167. data/test/web/test.formreader-2.0.rb +352 -0
  168. data/test/web/test.load.rb +125 -0
  169. data/test/web/test.mime-type.rb +23 -0
  170. data/test/web/test.narf.cgi.rb +106 -0
  171. data/test/web/test.phprb.rb +239 -0
  172. data/test/web/test.request.rb +368 -0
  173. data/test/web/test.response.rb +637 -0
  174. data/test/web/test.ruby-web.rb +10 -0
  175. data/test/web/test.session.rb +50 -0
  176. data/test/web/test.shim.cgi.rb +96 -0
  177. data/test/web/test.tagparser.rb +65 -0
  178. data/test/web/test.template2.rb +297 -0
  179. data/test/web/test.testing2.rb +318 -0
  180. data/test/web/test.upload.rb +45 -0
  181. data/test/web/test.validate.rb +46 -0
  182. data/test/web/test.web.test.rb +495 -0
  183. data/test/wiki/test.history.rb +297 -0
  184. data/test/wiki/test.illustration_page.rb +287 -0
  185. data/test/wiki/test.linker.rb +197 -0
  186. data/test/wiki/test.tarpit.rb +56 -0
  187. data/test/wiki/test.wiki.rb +300 -0
  188. data/test/wikitestroot/admin.rb +7 -0
  189. data/test/wikitestroot/wiki.rb +6 -0
  190. metadata +234 -0
@@ -0,0 +1,286 @@
1
+ # This encodes the knowledge of HTML 4.0 tags for a parser.
2
+ # It knows about block vs. inline tags, empty tags, and optionally
3
+ # omitted end tags.
4
+ #
5
+ # Copyright:: Copyright(C) 2002 Ned Konz <ned@bike-nomad.com>
6
+ # License:: Ruby's license
7
+ # CVS ID:: $Id: tags.rb,v 1.7 2002/06/04 01:55:59 ned Exp $
8
+
9
+ # This is an error raised by <tt>HTML::Tag.named()</tt> when a tag doesn't exist.
10
# Error type used by <tt>HTML::Tag.named()</tt> to report a tag name that is
# not present in the tag lookup table. It adds nothing to RuntimeError.
class NoSuchHTMLTagError < RuntimeError; end #:nodoc:
12
+
13
+ # This module holds the HTML tag classes; HTML::Tag is the base class for all of them.
14
# Namespace for the HTML 4.0 tag knowledge used by the parser.
#
# HTML::Tag is the base class of the tag descriptors; one descriptor per known
# tag is registered in a lookup table when this file is loaded and can be
# fetched with <tt>HTML::Tag.named()</tt>.
module HTML #:nodoc: all

  # Base class for all the HTML tag descriptor classes.
  class Tag

    # tag_name:: a String, the name of the tag (stored lower-cased)
    # can_omit:: a Boolean, true if end tag is optional
    def initialize(tag_name, can_omit)
      @name = tag_name.downcase
      @can_omit_end = can_omit
    end

    # Return my tag name (always lower case).
    def name; @name; end

    # Return true if my end tag can be omitted.
    def can_omit_end_tag; @can_omit_end; end

    # Return true if I am a block element.
    def is_block_element; false; end

    # Return true if I am an inline element.
    def is_inline_element; false; end

    # Return true if I am an empty element.
    def is_empty_element; false; end

    # Return true if I can contain <tt>tag</tt> if my parent is of type <tt>parent</tt>.
    # tag:: tag name, a String
    # parent:: parent tag name, a String.
    def can_contain(tag, parent); false; end

    # Return true if whitespace within me can be omitted (ignoring browser
    # bugs)
    def can_ignore_whitespace; true; end
  end

  # This represents an HTML block element.
  class BlockTag < Tag
    def is_block_element; true; end

    # Blocks can contain anything, so return true.
    def can_contain(tag, parent); true; end
  end

  # This represents an HTML inline element.
  class InlineTag < Tag
    def is_inline_element; true; end

    # Inlines can only contain other inlines.
    # Raises NoSuchHTMLTagError if <tt>tag</tt> is not a known tag.
    def can_contain(tag, parent)
      Tag.named(tag).is_inline_element
    end
  end

  # This represents an HTML element that can be regarded as either a block
  # or an inline element.
  class BlockOrInlineTag < InlineTag

    def is_block_element; true; end

    # If used as inline elements (e.g., within another inline element or a P),
    # these elements should not contain any block-level elements.
    # If used as block elements (the parent is a block other than P) they
    # can contain anything, just like a BlockTag.
    #
    # Raises NoSuchHTMLTagError if a tag name that has to be looked up is
    # unknown.
    def can_contain(tag, parent)
      used_inline = (parent.downcase == 'p' ||
                     Tag.named(parent).is_inline_element)
      # Previously a block context answered false for every tag, which
      # contradicted the rule above; only the inline use is restricted.
      return true unless used_inline
      !Tag.named(tag).is_block_element
    end
  end

  # This represents an HTML tag that never has an end tag.
  class EmptyTag < Tag
    def is_empty_element; true; end
    def is_inline_element; true; end
    def can_contain(tag, parent); false; end
  end

  # This block initializes the tag lookup table.
  class Tag
    # Maps tag names (both the all-upper-case and the all-lower-case
    # spelling) to their shared Tag instance.
    @table = {}

    # Add the given tag to the tag lookup table.
    #
    # This can be called by user code to add otherwise unknown tags to the
    # table.
    #
    # name:: the tag name, a String.
    # is_block:: true if I am a block element.
    # is_inline:: true if I am an inline element.
    # is_empty:: true if I am an empty element.
    # can_omit:: true if my end tag can be omitted.
    #
    # Returns the newly registered Tag instance. Empty tags always get
    # can_omit_end_tag == true, whatever <tt>can_omit</tt> says.
    def self.add_tag(name, is_block, is_inline, is_empty, can_omit)
      tag =
        if is_empty
          EmptyTag.new(name, true)
        elsif is_block
          if is_inline
            BlockOrInlineTag.new(name, can_omit)
          else
            BlockTag.new(name, can_omit)
          end
        else
          InlineTag.new(name, can_omit)
        end
      @table[name.upcase] = @table[name.downcase] = tag
    end

    # Return a Tag with the given name, or raise a
    # NoSuchHTMLTagError.
    #
    # HTML tag names are case-insensitive, so mixed-case spellings such as
    # 'Div' are resolved through the lower-case key as well.
    def self.named(tagname)
      @table[tagname] ||
        @table[tagname.to_s.downcase] ||
        raise(NoSuchHTMLTagError.exception(tagname))
    end

    #                 Block  Inline Empty  can_omit_end
    [
      [ 'A',          false, true,  false, false ], # Anchor
      [ 'ABBR',       false, true,  false, false ], # Abbreviation
      [ 'ACRONYM',    false, true,  false, false ], # Acronym
      [ 'ADDRESS',    true,  false, false, false ], # Address
      [ 'APPLET',     true,  true,  false, false ], # Java applet
      [ 'AREA',       true,  false, true,  true  ], # Image map region
      [ 'B',          false, true,  false, false ], # Bold text
      [ 'BASE',       false, false, true,  true  ], # Document base URI
      [ 'BASEFONT',   false, true,  true,  true  ], # Base font change
      [ 'BDO',        false, true,  false, false ], # Bi_di override
      [ 'BIG',        false, true,  false, false ], # Large text
      [ 'BLOCKQUOTE', true,  false, false, false ], # Block quotation
      [ 'BODY',       true,  false, false, false ], # Document body
      [ 'BR',         false, true,  true,  true  ], # Line break
      [ 'BUTTON',     true,  true,  false, false ], # Button
      [ 'CAPTION',    false, true,  false, false ], # Table caption
      [ 'CENTER',     false, true,  false, false ], # Centered block
      [ 'CITE',       false, true,  false, false ], # Citation
      [ 'CODE',       false, true,  false, false ], # Computer code
      [ 'COL',        false, false, true,  true  ], # Table column
      [ 'COLGROUP',   true,  false, false, true  ], # Table column group
      [ 'DD',         true,  false, false, true  ], # Definition description
      [ 'DEL',        true,  true,  false, false ], # Deleted text
      [ 'DFN',        false, true,  false, false ], # Defined term
      [ 'DIR',        true,  false, false, false ], # Directory list
      [ 'DIV',        true,  false, false, false ], # Generic block-level container
      [ 'DL',         true,  false, false, false ], # Definition list
      [ 'DT',         false, true,  false, true  ], # Definition term
      [ 'EM',         false, true,  false, false ], # Emphasis
      [ 'FIELDSET',   true,  false, false, false ], # Form control group
      [ 'FONT',       false, true,  false, false ], # Font change
      [ 'FORM',       true,  false, false, false ], # Interactive form
      [ 'FRAME',      false, false, true,  true  ], # Frame
      [ 'FRAMESET',   true,  false, false, false ], # Frameset
      [ 'H1',         true,  false, false, false ], # Level-one heading
      [ 'H2',         true,  false, false, false ], # Level-two heading
      [ 'H3',         true,  false, false, false ], # Level-three heading
      [ 'H4',         true,  false, false, false ], # Level-four heading
      [ 'H5',         true,  false, false, false ], # Level-five heading
      [ 'H6',         true,  false, false, false ], # Level-six heading
      [ 'HEAD',       true,  false, false, false ], # Document head
      [ 'HR',         false, true,  true,  true  ], # Horizontal rule
      [ 'HTML',       true,  false, false, false ], # HTML document
      [ 'I',          false, true,  false, false ], # Italic text
      [ 'IFRAME',     true,  true,  false, false ], # Inline frame
      [ 'IMG',        false, true,  true,  true  ], # Inline image
      [ 'INPUT',      false, true,  true,  true  ], # Form input
      [ 'INS',        true,  true,  false, false ], # Inserted text
      [ 'ISINDEX',    false, true,  true,  true  ], # Input prompt
      [ 'KBD',        false, true,  false, false ], # Text to be input
      [ 'LABEL',      false, true,  false, false ], # Form field label
      [ 'LEGEND',     false, true,  false, false ], # Fieldset caption
      [ 'LI',         true,  false, false, true  ], # List item
      # LINK is an EMPTY element in HTML 4 (its end tag is forbidden), like
      # BASE and META; it used to be registered as a non-empty block tag.
      [ 'LINK',       false, false, true,  true  ], # Document relationship
      [ 'MAP',        true,  true,  false, false ], # Image map
      [ 'MENU',       true,  false, false, false ], # Menu list
      [ 'META',       false, true,  true,  true  ], # Metadata
      [ 'NOFRAMES',   true,  false, false, false ], # Frames alternate content
      [ 'NOSCRIPT',   true,  false, false, false ], # Alternate script content
      [ 'OBJECT',     true,  true,  false, false ], # Object
      [ 'OL',         true,  false, false, false ], # Ordered list
      [ 'OPTGROUP',   true,  false, false, false ], # Option group
      [ 'OPTION',     true,  false, false, false ], # Menu option
      [ 'P',          true,  false, false, true  ], # Paragraph
      [ 'PARAM',      false, true,  true,  true  ], # Object parameter
      [ 'PRE',        true,  false, false, false ], # Preformatted text
      [ 'Q',          false, true,  false, false ], # Short quotation
      [ 'S',          false, true,  false, false ], # Strike-through text
      [ 'SAMP',       false, true,  false, false ], # Sample output
      [ 'SCRIPT',     true,  true,  false, false ], # Client-side script
      [ 'SELECT',     true,  false, false, false ], # Option selector
      [ 'SMALL',      false, true,  false, false ], # Small text
      [ 'SPAN',       false, true,  false, false ], # Generic inline container
      [ 'STRIKE',     false, true,  false, false ], # Strike-through text
      [ 'STRONG',     false, true,  false, false ], # Strong emphasis
      [ 'STYLE',      true,  false, false, false ], # Embedded style sheet
      [ 'SUB',        false, true,  false, false ], # Subscript
      [ 'SUP',        false, true,  false, false ], # Superscript
      [ 'TABLE',      true,  false, false, false ], # Table
      [ 'TBODY',      true,  false, false, false ], # Table body
      [ 'TD',         true,  false, false, true  ], # Table data cell
      [ 'TEXTAREA',   false, true,  false, false ], # Multi-line text input
      [ 'TFOOT',      true,  false, false, true  ], # Table foot
      [ 'TH',         true,  false, false, true  ], # Table header cell
      [ 'THEAD',      true,  false, false, true  ], # Table head
      [ 'TITLE',      true,  false, false, false ], # Document title
      [ 'TR',         true,  false, false, true  ], # Table row
      [ 'TT',         false, true,  false, false ], # Teletype text
      [ 'U',          false, true,  false, false ], # Underlined text
      [ 'UL',         true,  false, false, false ], # Unordered list
      [ 'VAR',        false, true,  false, false ], # Variable
    ].each { |row| add_tag(*row) }

    # EXCEPTIONS TODO
    # A, LABEL can't contain itself
    # several things (fonts, etc) can't be in PRE
    # SELECT can only have OPTGROUP or OPTION
    # TEXTAREA, OPTION only contains plain text
    # APPLET and OBJECT has PARAM+ followed by block and/or inline
    # BUTTON can't contain:
    #   A, INPUT, SELECT, TEXTAREA, LABEL, BUTTON, or IFRAME
    #   nor FORM, ISINDEX, and FIELDSET
    # IFRAME can only contain block elems if parent can
    # MAP can contain block+ *xor* AREA+
    # SCRIPT only contains a SCRIPT (that is, until /<\/[A-Za-z]/)
    # BODY must be in HTML or NOFRAMES
    # COL can only be in COLGROUP or TABLE
    # COLGROUP has only COL*, and can only be in TABLE
    # DIR, MENU can only contain LI+, none of which may contain block elems
    # DL must contain (DT|DD)+
    # DT and DD are only allowed in DL
    # FIELDSET contains LEGEND, (block|inline)*
    # FRAMESET contains (FRAMESET|FRAME), plus NOFRAMES and must be in HTML
    # H# can only be contained in block elems, but only contain inlines.
    # HEAD must only contain TITLE, BASE?, ISINDEX?, SCRIPT* STYLE* META* LINK*
    #   OBJECT* HEAD must be in HTML
    # HTML is top-level and can only contain HEAD, BODY, or HEAD, FRAMESET
    # LI can contain blocks except when inside DIR or MENU
    # LI can only be inside OL, UL, DIR, MENU
    # OL, UL can only contain LI+
    # OPTGROUP contains OPTION+
    # P can only contain inlines. However, it is a block-level elem.
    # PRE can only contain inlines except IMG, OBJECT, APPLET, BIG, SMALL, SUB,
    #   SUP, FONT, BASEFONT

    # tags with optional omitted endtags and their allowed contents:
    # anchor matches at beginning and end
    #
    # DD, TD and TH accept digits after the first letter so that the
    # headings H1..H6 count as allowed content; the plain [A-Z]+ form could
    # never match them.
    {
      'AREA'     => '(?!AREA)[A-Z]+',
      'COLGROUP' => 'COL',
      'DD'       => '(?!D[DT]$)[A-Z][A-Z0-9]*',
      'DT'       => '(?!D[DT]$)[A-Z]+',
      'MAP'      => 'AREA',
      'P'        => '(?!P$)[A-Z]+',
      'TD'       => '(?!T[HDR]$)[A-Z][A-Z0-9]*',
      'TFOOT'    => 'TR',
      'TH'       => '(?!T[HDR]$)[A-Z][A-Z0-9]*',
      'THEAD'    => 'TR',
      'TR'       => 'T[HD]',
    }.each_pair { |tagname, pattern|
      # Compile the pattern once and close over it instead of eval'ing
      # generated source text for every tag.
      allowed = Regexp.new("\\A(?:#{pattern})\\z", Regexp::IGNORECASE)
      singleton = class << named(tagname); self; end
      # Per-instance override: true only if the whole tag name matches.
      singleton.send(:define_method, :can_contain) { |tag, parent|
        (allowed =~ tag) == 0
      }
    }

    # Whitespace is significant inside these elements.
    %w[TEXTAREA PRE OPTION].each { |tagname|
      class << named(tagname) # :nodoc:
        def can_ignore_whitespace; false; end
      end
    }
  end
end
@@ -0,0 +1,139 @@
1
+ #!/usr/bin/ruby
2
+ # This is an HTML parser that builds an element tree for further
3
+ # processing. Attributes and data are also stored.
4
+ #
5
+ # Typical usage is:
6
+ # parser = HTMLTree::Parser.new(false, false)
7
+ # parser.parse_file_named('whatever.html')
8
+ # # then you have the tree built..
9
+ # parser.tree.dump
10
+ #
11
+ # Copyright:: Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
12
+ # License:: Ruby's
13
+ # CVS ID:: $Id: tree.rb,v 1.14 2002/06/04 01:55:59 ned Exp $
14
+
15
+ require 'web/htmltools/tags'
16
+ require 'web/htmltools/stparser'
17
+ require 'web/htmltools/element'
18
+
19
+ # This is a tree building HTML parser.
20
module HTMLTree #:nodoc: all

  # Tree-building parser: every callback from HTML::StackingParser either
  # attaches a new node below the current node or moves the current node.
  class Parser < HTML::StackingParser

    # verbose:: if true, will warn to $stderr on unknown
    #   tags/entities/characters, as well as missing end tags and extra end
    #   tags.
    # strip_white:: if true, remove all non-essential whitespace. Note
    #   that there are browser bugs that may cause this to change the
    #   appearance of HTML (even though it shouldn't by the standard).
    def initialize(verbose=false, strip_white=true)
      super(verbose, strip_white)
      reset
    end

    # Reset this parser so that it can parse a new document: the superclass
    # is reset first, then a fresh Document becomes both the root and the
    # current node.
    def reset
      super()
      document = Document.new
      @rootNode = document
      @currentNode = document
    end

    # Return the tree that was built. This will be an HTMLTree::Element that
    # represents the whole document. The \<html> node is a child of this.
    def tree
      @rootNode
    end

    # Return the <html> node, if any.
    def html
      @rootNode.html_node
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create an Element for +tag+ below the current node and copy every
    # entry of +attrs+ onto it. Returns the new element.
    def add_child_to_current(tag, attrs)
      child = Element.new(@currentNode, tag)
      attrs.each do |attribute|
        child.add_attribute(*attribute)
      end
      child
    end

    # callbacks

    # add a child to the current node and descend
    def handle_start_tag(tag, attrs)
      @currentNode = add_child_to_current(tag, attrs)
      @rootNode ||= @currentNode
      @currentNode
    end

    # go up to parent
    def handle_end_tag(tag)
      enclosing = @currentNode.parent
      @currentNode = enclosing
    end

    # add a child to the current node
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Add a child to the current node and descend
    # Assume that the unknown tag has an end tag.
    def handle_unknown_tag(tag, attrs)
      super(tag, attrs)
      handle_start_tag(tag, attrs)
    end

    # go up to parent
    def handle_missing_end_tag(tag)
      super(tag)
      handle_end_tag(tag)
    end

    # ignore
    def handle_extra_end_tag(tag)
      super(tag)
    end

    # Attach character data below the current node.
    def handle_cdata(data)
      Data.new(@currentNode, data)
    end

    # Script content is stored as a plain data node as well.
    def handle_script(data)
      Data.new(@currentNode, data)
    end

    # Nothing to add to the superclass behaviour.
    def handle_unknown_character(name)
      super(name)
    end

    # Nothing to add to the superclass behaviour.
    def handle_unknown_entity(name)
      super(name)
    end

    # Attach a comment node below the current node.
    def handle_comment(data)
      super(data) # make sure and strip whitespace.
      Comment.new(@currentNode, data)
    end

    # Attach a Special node; complain on $stderr when there is no current
    # node to hold it.
    def handle_special(data)
      special = HTMLTree::Special.new(@currentNode, data)
      return if @currentNode
      $stderr.print('special ', special, ' discarded')
    end

  end
end
126
+
127
# Manual smoke test, only executed when this file is run as a script:
# parse the file named on the command line (default 'ebay.html'), write the
# resulting tree to 'xx.html' and dump it.
if __FILE__ == $0
  $stdout.sync = true

  class TestStackingParser < HTMLTree::Parser #:nodoc: all
    $DEBUG = false
    parser = TestStackingParser.new(true, false)
    input_name = ARGV[0] || 'ebay.html'
    parser.parse_file_named(input_name)
    File.open('xx.html', 'w') do |out|
      parser.tree.write(out)
    end
    parser.tree.dump
  end
end
@@ -0,0 +1,160 @@
1
+ #!/usr/bin/ruby
2
+ # This is an HTML parser that builds an element tree for further
3
+ # processing. Attributes and data are also stored.
4
+ # The storage is that of REXML, which is required.
5
+ #
6
+ # Typical usage is:
7
+ # parser = HTMLTree::XMLParser.new(false, false)
8
+ # parser.parse_file_named('whatever.html')
9
+ # # then you have the tree built..
10
+ # parser.tree # is a REXML::Document
11
+ #
12
+ # Copyright:: Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
13
+ # License:: Ruby's
14
+ # CVS ID:: $Id: tree.rb,v 1.14 2002/06/04 01:55:59 ned Exp $
15
+
16
+ require 'web/htmltools/tags'
17
+ require 'web/htmltools/stparser'
18
+ require 'rexml/element'
19
+ require 'rexml/document'
20
+
21
+ # REXML::Child
22
+ # REXML::XMLDecl
23
+ # REXML::Instruction
24
+ # REXML::Text
25
+ # REXML::Comment
26
+ # REXML::Entity
27
+ # REXML::Parent
28
+ # REXML::Element (+REXML::Namespace)
29
+ # REXML::Document
30
+ # REXML::DocType
31
+ #
32
+ # This is a tree building HTML parser that makes XML.
33
module HTMLTree #:nodoc: all

  # Tree-building parser that stores the document as REXML nodes: each
  # callback from HTML::StackingParser either attaches a new REXML node
  # below the current node or moves the current node.
  class XMLParser < HTML::StackingParser

    # verbose:: if true, will warn to $stderr on unknown
    #   tags/entities/characters, as well as missing end tags and extra end
    #   tags.
    # strip_white:: if true, remove all non-essential whitespace. Note
    #   that there are browser bugs that may cause this to change the
    #   appearance of HTML (even though it shouldn't by the standard).
    def initialize(verbose=false, strip_white=true)
      super
      reset
    end

    # Reset this parser so that it can parse a new document. A fresh, empty
    # REXML::Document becomes both the root and the current node.
    def reset
      super
      @rootNode = @currentNode = REXML::Document.new()
    end

    # Return the document that was built. This will be an
    # REXML::Document that represents the whole document. The \<html>
    # node is a child of this.
    def document
      @rootNode
    end

    # Alias for #document, mirroring the tree accessor name used by
    # HTMLTree::Parser in its usage example.
    def tree
      document()
    end

    # Return the root of the document, if any.
    def root
      @rootNode.root()
    end

    # Return the <html> node, if any.
    #
    # The lookup is done on the document itself: the <html> element is a
    # child of the document, not of the document's root element. Looking
    # below the root element (as was done before) could never find it and
    # raised NoMethodError for a document without any element.
    def html
      @rootNode.elements['html']
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create a REXML::Element named +tag+ below the current node and copy
    # every [name, value] pair of +attrs+ onto it. Returns the new element.
    def add_child_to_current(tag, attrs)
      node = REXML::Element.new(tag, @currentNode)
      attrs.each { |a| node.attributes[a[0]] = a[1] }
      node
    end

    # callbacks

    # add a child to the current node and descend
    def handle_start_tag(tag, attrs)
      node = add_child_to_current(tag, attrs)
      @rootNode = node unless @rootNode
      @currentNode = node
    end

    # go up to parent
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # add a child to the current node
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Add a child to the current node and descend
    # Assume that the unknown tag has an end tag.
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # go up to parent
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # ignore
    def handle_extra_end_tag(tag)
      super
    end

    # Attach character data as a REXML::Text below the current node.
    # The second argument is REXML's respect_whitespace flag, hence the
    # negation. NOTE(review): @stripWhitespace is not set in this file;
    # presumably the superclass sets it from strip_white -- confirm.
    def handle_cdata(data)
      REXML::Text.new(data, !@stripWhitespace, @currentNode)
    end

    # Script content is stored as a comment node below the current node.
    def handle_script(data)
      REXML::Comment.new(data, @currentNode)
    end

    def handle_unknown_character(name)
      super # that is, do nothing
    end

    def handle_unknown_entity(name)
      super # that is, do nothing
    end

    # Attach a comment node below the current node.
    def handle_comment(data)
      super # strip white
      REXML::Comment.new(data, @currentNode)
    end

    # Special markup is stored as a REXML::DocType below the current node.
    def handle_special(data)
      REXML::DocType.new(data, @currentNode) # TODO
    end

  end
end
148
+
149
# Manual smoke test, only executed when this file is run as a script:
# parse the file named on the command line (default 'ebay.html') and write
# the resulting REXML document to 'xx.html'.
if __FILE__ == $0
  $stdout.sync = true

  class TestStackingParser < HTMLTree::XMLParser #:nodoc: all
    $DEBUG = false
    parser = TestStackingParser.new(true, false)
    input_name = ARGV[0] || 'ebay.html'
    parser.parse_file_named(input_name)
    File.open('xx.html', 'w') do |out|
      parser.document.write(out)
    end
  end
end