ruby-web 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (190) hide show
  1. data/ChangeLog +474 -0
  2. data/INSTALL.txt +9 -0
  3. data/InstalledFiles +180 -0
  4. data/LICENSE.txt +74 -0
  5. data/Rakefile +529 -0
  6. data/TODO +65 -0
  7. data/doc/additional.xml +149 -0
  8. data/doc/core.xml +652 -0
  9. data/doc/credits/index.xml +52 -0
  10. data/doc/credits/php.contributors.xml +118 -0
  11. data/doc/credits/php.language-snippets.ent +622 -0
  12. data/doc/install/index.xml +136 -0
  13. data/doc/install/mac/index.xml +21 -0
  14. data/doc/install/ruby-web.install.rb.instructions.xml +7 -0
  15. data/doc/install/unix/index.xml +46 -0
  16. data/doc/install/win/apache1.xml +166 -0
  17. data/doc/install/win/apache2.xml +141 -0
  18. data/doc/install/win/iis.xml +162 -0
  19. data/doc/install/win/index.xml +24 -0
  20. data/doc/install/win/installer.xml +31 -0
  21. data/doc/install/win/manual.xml +43 -0
  22. data/doc/manual.xml +69 -0
  23. data/doc/old/apache_cgi.txt +23 -0
  24. data/doc/old/fastcgi.txt +23 -0
  25. data/doc/old/mod_ruby.txt +21 -0
  26. data/doc/old/snippets.rdoc +183 -0
  27. data/doc/old/webrick.txt +23 -0
  28. data/doc/old/windows_cgi.txt +9 -0
  29. data/doc/tutorial.xml +14 -0
  30. data/doc/xsl/manual-multi.xsl +10 -0
  31. data/doc/xsl/manual-pdf.xsl +6 -0
  32. data/doc/xsl/manual-single.xsl +6 -0
  33. data/doc/xsl/manual.css +22 -0
  34. data/install.rb +1022 -0
  35. data/lib/formatter.rb +314 -0
  36. data/lib/html-parser.rb +429 -0
  37. data/lib/htmlrepair.rb +113 -0
  38. data/lib/htmlsplit.rb +842 -0
  39. data/lib/sgml-parser.rb +332 -0
  40. data/lib/web.rb +68 -0
  41. data/lib/web/assertinclude.rb +129 -0
  42. data/lib/web/config.rb +50 -0
  43. data/lib/web/connection.rb +1070 -0
  44. data/lib/web/convenience.rb +154 -0
  45. data/lib/web/formreader.rb +318 -0
  46. data/lib/web/htmlparser/html-parser.rb +429 -0
  47. data/lib/web/htmlparser/sgml-parser.rb +332 -0
  48. data/lib/web/htmltools/element.rb +296 -0
  49. data/lib/web/htmltools/stparser.rb +276 -0
  50. data/lib/web/htmltools/tags.rb +286 -0
  51. data/lib/web/htmltools/tree.rb +139 -0
  52. data/lib/web/htmltools/xmltree.rb +160 -0
  53. data/lib/web/htmltools/xpath.rb +71 -0
  54. data/lib/web/info.rb +63 -0
  55. data/lib/web/load.rb +210 -0
  56. data/lib/web/mime.rb +87 -0
  57. data/lib/web/phprb.rb +340 -0
  58. data/lib/web/resources/test/cookie.rb +33 -0
  59. data/lib/web/resources/test/counter.rb +20 -0
  60. data/lib/web/resources/test/multipart.rb +14 -0
  61. data/lib/web/resources/test/redirect.rb +8 -0
  62. data/lib/web/resources/test/stock.rb +33 -0
  63. data/lib/web/sapi/apache.rb +129 -0
  64. data/lib/web/sapi/fastcgi.rb +22 -0
  65. data/lib/web/sapi/install/apache.rb +180 -0
  66. data/lib/web/sapi/install/iis.rb +93 -0
  67. data/lib/web/sapi/install/macosx.rb +90 -0
  68. data/lib/web/sapi/webrick.rb +86 -0
  69. data/lib/web/session.rb +83 -0
  70. data/lib/web/shim/cgi.rb +129 -0
  71. data/lib/web/shim/rails.rb +175 -0
  72. data/lib/web/stringio.rb +78 -0
  73. data/lib/web/strscanparser.rb +24 -0
  74. data/lib/web/tagparser.rb +96 -0
  75. data/lib/web/testing.rb +666 -0
  76. data/lib/web/traceoutput.rb +75 -0
  77. data/lib/web/unit.rb +56 -0
  78. data/lib/web/upload.rb +59 -0
  79. data/lib/web/validate.rb +52 -0
  80. data/lib/web/wiki.rb +557 -0
  81. data/lib/web/wiki/linker.rb +72 -0
  82. data/lib/web/wiki/page.rb +201 -0
  83. data/lib/webunit.rb +27 -0
  84. data/lib/webunit/assert.rb +152 -0
  85. data/lib/webunit/converter.rb +154 -0
  86. data/lib/webunit/cookie.rb +118 -0
  87. data/lib/webunit/domwalker.rb +185 -0
  88. data/lib/webunit/exception.rb +14 -0
  89. data/lib/webunit/form.rb +116 -0
  90. data/lib/webunit/frame.rb +37 -0
  91. data/lib/webunit/htmlelem.rb +122 -0
  92. data/lib/webunit/image.rb +26 -0
  93. data/lib/webunit/jscript.rb +31 -0
  94. data/lib/webunit/link.rb +33 -0
  95. data/lib/webunit/params.rb +321 -0
  96. data/lib/webunit/parser.rb +229 -0
  97. data/lib/webunit/response.rb +464 -0
  98. data/lib/webunit/runtest.rb +41 -0
  99. data/lib/webunit/table.rb +148 -0
  100. data/lib/webunit/testcase.rb +45 -0
  101. data/lib/webunit/ui/cui/testrunner.rb +50 -0
  102. data/lib/webunit/utils.rb +68 -0
  103. data/lib/webunit/webunit.rb +28 -0
  104. data/test/dev/action.rb +83 -0
  105. data/test/dev/forms.rb +104 -0
  106. data/test/dev/forms2.rb +104 -0
  107. data/test/dev/parser.rb +17 -0
  108. data/test/dev/scripts/dump.rb +24 -0
  109. data/test/dev/scripts/makedist.rb +62 -0
  110. data/test/dev/scripts/uri.rb +41 -0
  111. data/test/dev/scripts/uri/common.rb +432 -0
  112. data/test/dev/scripts/uri/ftp.rb +149 -0
  113. data/test/dev/scripts/uri/generic.rb +1106 -0
  114. data/test/dev/scripts/uri/http.rb +76 -0
  115. data/test/dev/scripts/uri/https.rb +26 -0
  116. data/test/dev/scripts/uri/ldap.rb +238 -0
  117. data/test/dev/scripts/uri/mailto.rb +260 -0
  118. data/test/dev/scripts/urireg.rb +174 -0
  119. data/test/dev/simpledispatcher.rb +156 -0
  120. data/test/dev/test.action.rb +146 -0
  121. data/test/dev/test.formreader.rb +463 -0
  122. data/test/dev/test.simpledispatcher.rb +186 -0
  123. data/test/dev/webunit/conv/digit-0.rb +21 -0
  124. data/test/dev/webunit/conv/digit-1.rb +17 -0
  125. data/test/dev/webunit/conv/digit.rb +23 -0
  126. data/test/dev/webunit/conv/test_digit-0.rb +16 -0
  127. data/test/dev/webunit/conv/test_digit-1.rb +19 -0
  128. data/test/dev/webunit/conv/test_digit.rb +26 -0
  129. data/test/dev/webunit/conv/test_digit_view-0.rb +76 -0
  130. data/test/dev/webunit/conv/test_digit_view-1.rb +102 -0
  131. data/test/dev/webunit/conv/test_digit_view.rb +134 -0
  132. data/test/installation/htdocs/cgi_test.rb +296 -0
  133. data/test/installation/htdocs/test_install.rb +4 -0
  134. data/test/installation/runwebtest.rb +5 -0
  135. data/test/installation/test_cookie.rb +128 -0
  136. data/test/installation/test_form.rb +47 -0
  137. data/test/installation/test_multipart.rb +51 -0
  138. data/test/installation/test_request.rb +24 -0
  139. data/test/installation/test_response.rb +35 -0
  140. data/test/unit/htdocs/cookie.rb +32 -0
  141. data/test/unit/htdocs/multipart.rb +28 -0
  142. data/test/unit/htdocs/redirect.rb +12 -0
  143. data/test/unit/htdocs/simple.rb +13 -0
  144. data/test/unit/htdocs/stock.rb +33 -0
  145. data/test/unit/test_assert.rb +162 -0
  146. data/test/unit/test_cookie.rb +114 -0
  147. data/test/unit/test_domwalker.rb +77 -0
  148. data/test/unit/test_form.rb +42 -0
  149. data/test/unit/test_frame.rb +40 -0
  150. data/test/unit/test_htmlelem.rb +74 -0
  151. data/test/unit/test_image.rb +45 -0
  152. data/test/unit/test_jscript.rb +57 -0
  153. data/test/unit/test_link.rb +85 -0
  154. data/test/unit/test_multipart.rb +51 -0
  155. data/test/unit/test_params.rb +210 -0
  156. data/test/unit/test_parser.rb +53 -0
  157. data/test/unit/test_response.rb +150 -0
  158. data/test/unit/test_table.rb +70 -0
  159. data/test/unit/test_utils.rb +106 -0
  160. data/test/unit/test_webunit.rb +28 -0
  161. data/test/web/mod_ruby_stub.rb +39 -0
  162. data/test/web/test.assertinclude.rb +109 -0
  163. data/test/web/test.buffer.rb +182 -0
  164. data/test/web/test.code.loader.rb +78 -0
  165. data/test/web/test.config.rb +31 -0
  166. data/test/web/test.error.handling.rb +91 -0
  167. data/test/web/test.formreader-2.0.rb +352 -0
  168. data/test/web/test.load.rb +125 -0
  169. data/test/web/test.mime-type.rb +23 -0
  170. data/test/web/test.narf.cgi.rb +106 -0
  171. data/test/web/test.phprb.rb +239 -0
  172. data/test/web/test.request.rb +368 -0
  173. data/test/web/test.response.rb +637 -0
  174. data/test/web/test.ruby-web.rb +10 -0
  175. data/test/web/test.session.rb +50 -0
  176. data/test/web/test.shim.cgi.rb +96 -0
  177. data/test/web/test.tagparser.rb +65 -0
  178. data/test/web/test.template2.rb +297 -0
  179. data/test/web/test.testing2.rb +318 -0
  180. data/test/web/test.upload.rb +45 -0
  181. data/test/web/test.validate.rb +46 -0
  182. data/test/web/test.web.test.rb +495 -0
  183. data/test/wiki/test.history.rb +297 -0
  184. data/test/wiki/test.illustration_page.rb +287 -0
  185. data/test/wiki/test.linker.rb +197 -0
  186. data/test/wiki/test.tarpit.rb +56 -0
  187. data/test/wiki/test.wiki.rb +300 -0
  188. data/test/wikitestroot/admin.rb +7 -0
  189. data/test/wikitestroot/wiki.rb +6 -0
  190. metadata +234 -0
@@ -0,0 +1,286 @@
1
+ # This encodes the knowledge of HTML 4.0 tags for a parser.
2
+ # It knows about block vs. inline tags, empty tags, and optionally
3
+ # omitted end tags.
4
+ #
5
+ # Copyright:: Copyright(C) 2002 Ned Konz <ned@bike-nomad.com>
6
+ # License:: Ruby's license
7
+ # CVS ID:: $Id: tags.rb,v 1.7 2002/06/04 01:55:59 ned Exp $
8
+
9
+ # This is an error raised by <tt>HTML::Tag.named()</tt> when a tag doesn't exist.
10
# Raised by <tt>HTML::Tag.named()</tt> when asked for a tag name that is not
# in the lookup table. The exception message is the offending tag name.
class NoSuchHTMLTagError < RuntimeError #:nodoc:
end
12
+
13
+ # This module namespaces the HTML tag classes; HTML::Tag is their base class.
14
module HTML #:nodoc: all

  # Base class for all tag descriptors. A descriptor answers structural
  # questions about one HTML 4.0 tag: block vs. inline, empty or not,
  # whether its end tag may be omitted, and what it may contain.
  class Tag

    # Return my tag name (stored lower-cased).
    attr_reader :name

    # tag_name:: a String, the name of the tag
    # can_omit:: a Boolean, true if end tag is optional
    def initialize(tag_name, can_omit)
      @name = tag_name.downcase
      @can_omit_end = can_omit
    end

    # Return true if my end tag can be omitted.
    def can_omit_end_tag; @can_omit_end; end

    # Return true if I am a block element.
    def is_block_element; false; end

    # Return true if I am an inline element.
    def is_inline_element; false; end

    # Return true if I am an empty element.
    def is_empty_element; false; end

    # Return true if I can contain <tt>tag</tt> if my parent is of type <tt>parent</tt>.
    # tag:: tag name, a String
    # parent:: parent tag name, a String.
    def can_contain(tag, parent); false; end

    # Return true if whitespace within me can be omitted (ignoring browser
    # bugs)
    def can_ignore_whitespace; true; end
  end

  # This represents an HTML block element.
  class BlockTag < Tag
    def is_block_element; true; end

    # Blocks can contain anything, so return true.
    def can_contain(tag, parent); true; end
  end

  # This represents an HTML inline element.
  class InlineTag < Tag
    def is_inline_element; true; end

    # Inlines can only contain other inlines.
    # Raises NoSuchHTMLTagError if <tt>tag</tt> is unknown.
    def can_contain(tag, parent)
      Tag.named(tag).is_inline_element
    end
  end

  # This represents an HTML element that can be regarded as either a block
  # or an inline element.
  class BlockOrInlineTag < InlineTag

    def is_block_element; true; end

    # If used as inline elements (e.g., within another inline element or a P),
    # these elements should not contain any block-level elements.
    # Note that the answer is false whenever the parent is neither P nor
    # an inline element.
    def can_contain(tag, parent)
      inline_context = parent.downcase == 'p' || Tag.named(parent).is_inline_element
      inline_context && !Tag.named(tag).is_block_element
    end
  end

  # This represents an HTML tag that never has an end tag.
  class EmptyTag < Tag
    def is_empty_element; true; end
    def is_inline_element; true; end
    def can_contain(tag, parent); false; end
  end

  # This block initializes the tag lookup table. It reopens Tag because
  # the table can only be filled once the subclasses above exist.
  class Tag
    # Maps both the upper-case and the lower-case spelling of each tag name
    # to one shared descriptor object.
    @table = {}

    # Add the given tag to the tag lookup table.
    #
    # This can be called by user code to add otherwise unknown tags to the
    # table.
    #
    # name:: the tag name, a String.
    # is_block:: true if I am a block element.
    # is_inline:: true if I am an inline element.
    # is_empty:: true if I am an empty element (takes precedence over the
    #            block/inline flags; empty tags always omit the end tag).
    # can_omit:: true if my end tag can be omitted.
    #
    # Returns the newly created descriptor.
    def self.add_tag(name, is_block, is_inline, is_empty, can_omit)
      descriptor =
        if is_empty
          EmptyTag.new(name, true)
        elsif is_block && is_inline
          BlockOrInlineTag.new(name, can_omit)
        elsif is_block
          BlockTag.new(name, can_omit)
        else
          InlineTag.new(name, can_omit)
        end
      @table[name.upcase] = @table[name.downcase] = descriptor
    end

    # Return a Tag with the given name, or raise a NoSuchHTMLTagError.
    #
    # The exact spelling is tried first; mixed-case names such as 'Div'
    # then fall back to the lower-cased spelling, so lookup is
    # case-insensitive.
    def self.named(tagname)
      @table[tagname] ||
        (tagname.respond_to?(:downcase) && @table[tagname.downcase]) ||
        raise(NoSuchHTMLTagError.exception(tagname))
    end

    #                Block  Inline Empty  can_omit_end
    [
      [ 'A',          false, true,  false, false ], # Anchor
      [ 'ABBR',       false, true,  false, false ], # Abbreviation
      [ 'ACRONYM',    false, true,  false, false ], # Acronym
      [ 'ADDRESS',    true,  false, false, false ], # Address
      [ 'APPLET',     true,  true,  false, false ], # Java applet
      [ 'AREA',       true,  false, true,  true  ], # Image map region
      [ 'B',          false, true,  false, false ], # Bold text
      [ 'BASE',       false, false, true,  true  ], # Document base URI
      [ 'BASEFONT',   false, true,  true,  true  ], # Base font change
      [ 'BDO',        false, true,  false, false ], # Bi_di override
      [ 'BIG',        false, true,  false, false ], # Large text
      [ 'BLOCKQUOTE', true,  false, false, false ], # Block quotation
      [ 'BODY',       true,  false, false, false ], # Document body
      [ 'BR',         false, true,  true,  true  ], # Line break
      [ 'BUTTON',     true,  true,  false, false ], # Button
      [ 'CAPTION',    false, true,  false, false ], # Table caption
      # CENTER is a block-level container in HTML 4 (it was formerly listed
      # as inline, which made it refuse block children such as TABLE).
      [ 'CENTER',     true,  false, false, false ], # Centered block
      [ 'CITE',       false, true,  false, false ], # Citation
      [ 'CODE',       false, true,  false, false ], # Computer code
      [ 'COL',        false, false, true,  true  ], # Table column
      [ 'COLGROUP',   true,  false, false, true  ], # Table column group
      [ 'DD',         true,  false, false, true  ], # Definition description
      [ 'DEL',        true,  true,  false, false ], # Deleted text
      [ 'DFN',        false, true,  false, false ], # Defined term
      [ 'DIR',        true,  false, false, false ], # Directory list
      [ 'DIV',        true,  false, false, false ], # Generic block-level container
      [ 'DL',         true,  false, false, false ], # Definition list
      [ 'DT',         false, true,  false, true  ], # Definition term
      [ 'EM',         false, true,  false, false ], # Emphasis
      [ 'FIELDSET',   true,  false, false, false ], # Form control group
      [ 'FONT',       false, true,  false, false ], # Font change
      [ 'FORM',       true,  false, false, false ], # Interactive form
      [ 'FRAME',      false, false, true,  true  ], # Frame
      [ 'FRAMESET',   true,  false, false, false ], # Frameset
      [ 'H1',         true,  false, false, false ], # Level-one heading
      [ 'H2',         true,  false, false, false ], # Level-two heading
      [ 'H3',         true,  false, false, false ], # Level-three heading
      [ 'H4',         true,  false, false, false ], # Level-four heading
      [ 'H5',         true,  false, false, false ], # Level-five heading
      [ 'H6',         true,  false, false, false ], # Level-six heading
      [ 'HEAD',       true,  false, false, false ], # Document head
      [ 'HR',         false, true,  true,  true  ], # Horizontal rule
      [ 'HTML',       true,  false, false, false ], # HTML document
      [ 'I',          false, true,  false, false ], # Italic text
      [ 'IFRAME',     true,  true,  false, false ], # Inline frame
      [ 'IMG',        false, true,  true,  true  ], # Inline image
      [ 'INPUT',      false, true,  true,  true  ], # Form input
      [ 'INS',        true,  true,  false, false ], # Inserted text
      [ 'ISINDEX',    false, true,  true,  true  ], # Input prompt
      [ 'KBD',        false, true,  false, false ], # Text to be input
      [ 'LABEL',      false, true,  false, false ], # Form field label
      [ 'LEGEND',     false, true,  false, false ], # Fieldset caption
      [ 'LI',         true,  false, false, true  ], # List item
      # LINK is an empty element in HTML 4 (end tag forbidden); it was
      # formerly listed as a block element with a required end tag.
      [ 'LINK',       false, false, true,  true  ], # Document relationship
      [ 'MAP',        true,  true,  false, false ], # Image map
      [ 'MENU',       true,  false, false, false ], # Menu list
      [ 'META',       false, true,  true,  true  ], # Metadata
      [ 'NOFRAMES',   true,  false, false, false ], # Frames alternate content
      [ 'NOSCRIPT',   true,  false, false, false ], # Alternate script content
      [ 'OBJECT',     true,  true,  false, false ], # Object
      [ 'OL',         true,  false, false, false ], # Ordered list
      [ 'OPTGROUP',   true,  false, false, false ], # Option group
      [ 'OPTION',     true,  false, false, false ], # Menu option
      [ 'P',          true,  false, false, true  ], # Paragraph
      [ 'PARAM',      false, true,  true,  true  ], # Object parameter
      [ 'PRE',        true,  false, false, false ], # Preformatted text
      [ 'Q',          false, true,  false, false ], # Short quotation
      [ 'S',          false, true,  false, false ], # Strike-through text
      [ 'SAMP',       false, true,  false, false ], # Sample output
      [ 'SCRIPT',     true,  true,  false, false ], # Client-side script
      [ 'SELECT',     true,  false, false, false ], # Option selector
      [ 'SMALL',      false, true,  false, false ], # Small text
      [ 'SPAN',       false, true,  false, false ], # Generic inline container
      [ 'STRIKE',     false, true,  false, false ], # Strike-through text
      [ 'STRONG',     false, true,  false, false ], # Strong emphasis
      [ 'STYLE',      true,  false, false, false ], # Embedded style sheet
      [ 'SUB',        false, true,  false, false ], # Subscript
      [ 'SUP',        false, true,  false, false ], # Superscript
      [ 'TABLE',      true,  false, false, false ], # Table
      [ 'TBODY',      true,  false, false, false ], # Table body
      [ 'TD',         true,  false, false, true  ], # Table data cell
      [ 'TEXTAREA',   false, true,  false, false ], # Multi-line text input
      [ 'TFOOT',      true,  false, false, true  ], # Table foot
      [ 'TH',         true,  false, false, true  ], # Table header cell
      [ 'THEAD',      true,  false, false, true  ], # Table head
      [ 'TITLE',      true,  false, false, false ], # Document title
      [ 'TR',         true,  false, false, true  ], # Table row
      [ 'TT',         false, true,  false, false ], # Teletype text
      [ 'U',          false, true,  false, false ], # Underlined text
      [ 'UL',         true,  false, false, false ], # Unordered list
      [ 'VAR',        false, true,  false, false ], # Variable
    ].each { |a| add_tag(*a) }

    # EXCEPTIONS TODO
    # A, LABEL can't contain itself
    # several things (fonts, etc) can't be in PRE
    # SELECT can only have OPTGROUP or OPTION
    # TEXTAREA, OPTION only contains plain text
    # APPLET and OBJECT have PARAM+ followed by block and/or inline
    # BUTTON can't contain:
    #   A, INPUT, SELECT, TEXTAREA, LABEL, BUTTON, or IFRAME
    #   nor FORM, ISINDEX, and FIELDSET
    # IFRAME can only contain block elems if parent can
    # MAP can contain block+ *xor* AREA+
    # SCRIPT only contains a SCRIPT (that is, until /<\/[A-Za-z]/)
    # BODY must be in HTML or NOFRAMES
    # COL can only be in COLGROUP or TABLE
    # COLGROUP has only COL*, and can only be in TABLE
    # DIR, MENU can only contain LI+, none of which may contain block elems
    # DL must contain (DT|DD)+
    # DT and DD are only allowed in DL
    # FIELDSET contains LEGEND, (block|inline)*
    # FRAMESET contains (FRAMESET|FRAME), plus NOFRAMES and must be in HTML
    # H# can only be contained in block elems, but only contain inlines.
    # HEAD must only contain TITLE, BASE?, ISINDEX?, SCRIPT* STYLE* META* LINK*
    #   OBJECT*   HEAD must be in HTML
    # HTML is top-level and can only contain HEAD, BODY, or HEAD, FRAMESET
    # LI can contain blocks except when inside DIR or MENU
    # LI can only be inside OL, UL, DIR, MENU
    # OL, UL can only contain LI+
    # OPTGROUP contains OPTION+
    # P can only contain inlines. However, it is a block-level elem.
    # PRE can only contain inlines except IMG, OBJECT, APPLET, BIG, SMALL, SUB,
    #   SUP, FONT, BASEFONT

    # Tags with optionally omitted end tags and their allowed contents.
    # Each pattern is matched case-insensitively against the whole child
    # tag name (anchored at beginning and end).
    #
    # The DD/TD/TH patterns accept digits so that headings (H1..H6) are
    # allowed inside them; the P and DT patterns deliberately keep the
    # letters-only form, which leaves headings excluded from P and DT.
    {
      'AREA'     => '(?!AREA)[A-Z]+',
      'COLGROUP' => 'COL',
      'DD'       => '(?!D[DT]$)[A-Z][A-Z0-9]*',
      'DT'       => '(?!D[DT]$)[A-Z]+',
      'MAP'      => 'AREA',
      'P'        => '(?!P$)[A-Z]+',
      'TD'       => '(?!T[HDR]$)[A-Z][A-Z0-9]*',
      'TFOOT'    => 'TR',
      'TH'       => '(?!T[HDR]$)[A-Z][A-Z0-9]*',
      'THEAD'    => 'TR',
      'TR'       => 'T[HD]',
    }.each_pair { |tagname, pattern|
      # Compile once and override can_contain on the single descriptor
      # object (shared by the upper- and lower-case table keys).
      allowed = Regexp.new("\\A(?:#{pattern})\\z", Regexp::IGNORECASE)
      named(tagname).define_singleton_method(:can_contain) { |tag, _parent|
        (allowed =~ tag) == 0
      }
    }

    # Whitespace is significant inside these elements.
    %w[TEXTAREA PRE OPTION].each { |tagname|
      named(tagname).define_singleton_method(:can_ignore_whitespace) { false }
    }
  end
end
@@ -0,0 +1,139 @@
1
+ #!/usr/bin/ruby
2
+ # This is an HTML parser that builds an element tree for further
3
+ # processing. Attributes and data are also stored.
4
+ #
5
+ # Typical usage is:
6
+ # parser = HTMLTree::Parser.new(false, false)
7
+ # parser.parse_file_named('whatever.html')
8
+ # # then you have the tree built..
9
+ # parser.tree.dump
10
+ #
11
+ # Copyright:: Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
12
+ # License:: Ruby's
13
+ # CVS ID:: $Id: tree.rb,v 1.14 2002/06/04 01:55:59 ned Exp $
14
+
15
+ require 'web/htmltools/tags'
16
+ require 'web/htmltools/stparser'
17
+ require 'web/htmltools/element'
18
+
19
+ # This is a tree building HTML parser.
20
module HTMLTree #:nodoc: all
  # Tree-building parser. It reacts to the callbacks of
  # HTML::StackingParser by creating Element/Data/Comment/Special nodes
  # under a Document root and tracking the node currently being filled.
  class Parser < HTML::StackingParser

    # verbose:: if true, will warn to $stderr on unknown
    #   tags/entities/characters, as well as missing end tags and extra end
    #   tags.
    # strip_white:: if true, remove all non-essential whitespace. Note
    #   that there are browser bugs that may cause this to change the
    #   appearance of HTML (even though it shouldn't by the standard).
    def initialize(verbose=false, strip_white=true)
      super
      reset
    end

    # Reset this parser so that it can parse a new document.
    # A fresh Document becomes both the root and the current node.
    def reset
      super
      @rootNode = @currentNode = Document.new
    end

    # Return the tree that was built. This will be an HTMLTree::Element that
    # represents the whole document. The \<html> node is a child of this.
    def tree
      @rootNode
    end

    # Return the <html> node, if any.
    def html
      @rootNode.html_node()
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create an Element for +tag+ under the current node, copy each
    # attribute entry from +attrs+ onto it, and return it. The current
    # node is left unchanged.
    def add_child_to_current(tag, attrs)
      node = Element.new(@currentNode, tag)
      attrs.each { |a| node.add_attribute(*a) }
      node
    end

    # callbacks

    # add a child to the current node and descend
    def handle_start_tag(tag, attrs)
      node = add_child_to_current(tag, attrs)
      # Only relevant if the root was cleared; reset normally sets it.
      @rootNode = node unless @rootNode
      @currentNode = node
    end

    # go up to parent
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # add a child to the current node
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Add a child to the current node and descend
    # Assume that the unknown tag has an end tag.
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # go up to parent
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # ignore
    def handle_extra_end_tag(tag)
      super
    end

    # Store character data as a Data node under the current node.
    def handle_cdata(data)
      Data.new(@currentNode, data)
    end

    # Store script content as a Data node under the current node.
    def handle_script(data)
      Data.new(@currentNode, data)
    end

    def handle_unknown_character(name)
      super
    end

    def handle_unknown_entity(name)
      super
    end

    # Store a comment as a Comment node under the current node.
    def handle_comment(data)
      super # make sure and strip whitespace.
      Comment.new(@currentNode, data)
    end

    # Store a special as a Special node. If there is no current node to
    # attach it to, warn on $stderr (one warning per line).
    def handle_special(data)
      node = HTMLTree::Special.new(@currentNode, data)
      $stderr.puts("special #{node} discarded") unless @currentNode
    end

  end
end
126
+
127
# Self-test driver, run only when this file is executed directly.
# Parses the file named by the first command-line argument (default
# 'ebay.html'), writes the resulting tree to 'xx.html', then dumps the tree.
if $0 == __FILE__
  $stdout.sync = true
  $DEBUG = false

  class TestStackingParser < HTMLTree::Parser #:nodoc: all
  end

  # Named +parser+ rather than +p+ so that Kernel#p is not shadowed.
  parser = TestStackingParser.new(true, false)
  parser.parse_file_named(ARGV[0] || 'ebay.html')
  File.open('xx.html', 'w') { |of|
    parser.tree.write(of)
  }
  parser.tree.dump
end
@@ -0,0 +1,160 @@
1
+ #!/usr/bin/ruby
2
+ # This is an HTML parser that builds an element tree for further
3
+ # processing. Attributes and data are also stored.
4
+ # The storage is that of REXML, which is required.
5
+ #
6
+ # Typical usage is:
7
+ # parser = HTMLTree::XMLParser.new(false, false)
8
+ # parser.parse_file_named('whatever.html')
9
+ # # then you have the tree built..
10
+ # parser.tree # is a REXML::Document
11
+ #
12
+ # Copyright:: Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
13
+ # License:: Ruby's
14
+ # CVS ID:: $Id: tree.rb,v 1.14 2002/06/04 01:55:59 ned Exp $
15
+
16
+ require 'web/htmltools/tags'
17
+ require 'web/htmltools/stparser'
18
+ require 'rexml/element'
19
+ require 'rexml/document'
20
+
21
+ # REXML::Child
22
+ # REXML::XMLDecl
23
+ # REXML::Instruction
24
+ # REXML::Text
25
+ # REXML::Comment
26
+ # REXML::Entity
27
+ # REXML::Parent
28
+ # REXML::Element (+REXML::Namespace)
29
+ # REXML::Document
30
+ # REXML::DocType
31
+ #
32
+ # This is a tree building HTML parser that makes XML.
33
module HTMLTree #:nodoc: all
  # Tree-building parser that stores its result as REXML nodes. It reacts
  # to the callbacks of HTML::StackingParser by adding REXML elements,
  # text, comments and doctypes under a REXML::Document.
  class XMLParser < HTML::StackingParser

    # verbose:: if true, will warn to $stderr on unknown
    #   tags/entities/characters, as well as missing end tags and extra end
    #   tags.
    # strip_white:: if true, remove all non-essential whitespace. Note
    #   that there are browser bugs that may cause this to change the
    #   appearance of HTML (even though it shouldn't by the standard).
    def initialize(verbose=false, strip_white=true)
      super
      reset
    end

    # Reset this parser so that it can parse a new document.
    # A fresh REXML::Document becomes both the root and the current node.
    def reset
      super
      @rootNode = @currentNode = REXML::Document.new()
    end

    # Return the document that was built. This will be an
    # REXML::Document that represents the whole document. The \<html>
    # node is a child of this.
    def document
      @rootNode
    end

    # Alias for #document.
    def tree
      document()
    end

    # Return the root of the document, if any.
    def root
      @rootNode.root()
    end

    # Return the <html> node, if any.
    #
    # Returns nil for a document without a root element. When the root
    # element itself is <html> (compared case-insensitively) it is
    # returned; otherwise an <html> child of the root is looked up.
    def html
      top = @rootNode.root
      return nil unless top
      return top if top.name.casecmp('html') == 0
      top.elements['html']
    end

    # no user-serviceable parts inside...
    # though you can subclass carefully.
    private

    # Create a REXML::Element for +tag+ under the current node, copy each
    # [name, value] pair of +attrs+ into its attributes, and return it.
    # The current node is left unchanged.
    def add_child_to_current(tag, attrs)
      node = REXML::Element.new(tag, @currentNode)
      attrs.each { |a| node.attributes[a[0]] = a[1] }
      node
    end

    # callbacks

    # add a child to the current node and descend
    def handle_start_tag(tag, attrs)
      node = add_child_to_current(tag, attrs)
      # Only relevant if the root was cleared; reset normally sets it.
      @rootNode = node unless @rootNode
      @currentNode = node
    end

    # go up to parent
    def handle_end_tag(tag)
      @currentNode = @currentNode.parent
    end

    # add a child to the current node
    def handle_empty_tag(tag, attrs)
      add_child_to_current(tag, attrs)
    end

    # Add a child to the current node and descend
    # Assume that the unknown tag has an end tag.
    def handle_unknown_tag(tag, attrs)
      super
      handle_start_tag(tag, attrs)
    end

    # go up to parent
    def handle_missing_end_tag(tag)
      super
      handle_end_tag(tag)
    end

    # ignore
    def handle_extra_end_tag(tag)
      super
    end

    # Store character data as a REXML::Text under the current node.
    # The second argument is the negation of @stripWhitespace, which is
    # presumably set by the superclass from strip_white -- TODO confirm.
    def handle_cdata(data)
      REXML::Text.new(data, !@stripWhitespace, @currentNode)
    end

    # Store script content as a REXML::Comment under the current node.
    def handle_script(data)
      REXML::Comment.new(data, @currentNode)
    end

    def handle_unknown_character(name)
      super # that is, do nothing
    end

    def handle_unknown_entity(name)
      super # that is, do nothing
    end

    # Store a comment as a REXML::Comment under the current node.
    def handle_comment(data)
      super # strip white
      REXML::Comment.new(data, @currentNode)
    end

    # Store a special as a REXML::DocType under the current node.
    def handle_special(data)
      REXML::DocType.new(data, @currentNode) # TODO
    end

  end
end
148
+
149
# Self-test driver, run only when this file is executed directly.
# Parses the file named by the first command-line argument (default
# 'ebay.html') and writes the resulting REXML document to 'xx.html'.
if $0 == __FILE__
  $stdout.sync = true
  $DEBUG = false

  class TestStackingParser < HTMLTree::XMLParser #:nodoc: all
  end

  # Named +parser+ rather than +p+ so that Kernel#p is not shadowed.
  parser = TestStackingParser.new(true, false)
  parser.parse_file_named(ARGV[0] || 'ebay.html')
  File.open('xx.html', 'w') { |of|
    parser.document.write(of)
  }
end