ruby-web 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +474 -0
- data/INSTALL.txt +9 -0
- data/InstalledFiles +180 -0
- data/LICENSE.txt +74 -0
- data/Rakefile +529 -0
- data/TODO +65 -0
- data/doc/additional.xml +149 -0
- data/doc/core.xml +652 -0
- data/doc/credits/index.xml +52 -0
- data/doc/credits/php.contributors.xml +118 -0
- data/doc/credits/php.language-snippets.ent +622 -0
- data/doc/install/index.xml +136 -0
- data/doc/install/mac/index.xml +21 -0
- data/doc/install/ruby-web.install.rb.instructions.xml +7 -0
- data/doc/install/unix/index.xml +46 -0
- data/doc/install/win/apache1.xml +166 -0
- data/doc/install/win/apache2.xml +141 -0
- data/doc/install/win/iis.xml +162 -0
- data/doc/install/win/index.xml +24 -0
- data/doc/install/win/installer.xml +31 -0
- data/doc/install/win/manual.xml +43 -0
- data/doc/manual.xml +69 -0
- data/doc/old/apache_cgi.txt +23 -0
- data/doc/old/fastcgi.txt +23 -0
- data/doc/old/mod_ruby.txt +21 -0
- data/doc/old/snippets.rdoc +183 -0
- data/doc/old/webrick.txt +23 -0
- data/doc/old/windows_cgi.txt +9 -0
- data/doc/tutorial.xml +14 -0
- data/doc/xsl/manual-multi.xsl +10 -0
- data/doc/xsl/manual-pdf.xsl +6 -0
- data/doc/xsl/manual-single.xsl +6 -0
- data/doc/xsl/manual.css +22 -0
- data/install.rb +1022 -0
- data/lib/formatter.rb +314 -0
- data/lib/html-parser.rb +429 -0
- data/lib/htmlrepair.rb +113 -0
- data/lib/htmlsplit.rb +842 -0
- data/lib/sgml-parser.rb +332 -0
- data/lib/web.rb +68 -0
- data/lib/web/assertinclude.rb +129 -0
- data/lib/web/config.rb +50 -0
- data/lib/web/connection.rb +1070 -0
- data/lib/web/convenience.rb +154 -0
- data/lib/web/formreader.rb +318 -0
- data/lib/web/htmlparser/html-parser.rb +429 -0
- data/lib/web/htmlparser/sgml-parser.rb +332 -0
- data/lib/web/htmltools/element.rb +296 -0
- data/lib/web/htmltools/stparser.rb +276 -0
- data/lib/web/htmltools/tags.rb +286 -0
- data/lib/web/htmltools/tree.rb +139 -0
- data/lib/web/htmltools/xmltree.rb +160 -0
- data/lib/web/htmltools/xpath.rb +71 -0
- data/lib/web/info.rb +63 -0
- data/lib/web/load.rb +210 -0
- data/lib/web/mime.rb +87 -0
- data/lib/web/phprb.rb +340 -0
- data/lib/web/resources/test/cookie.rb +33 -0
- data/lib/web/resources/test/counter.rb +20 -0
- data/lib/web/resources/test/multipart.rb +14 -0
- data/lib/web/resources/test/redirect.rb +8 -0
- data/lib/web/resources/test/stock.rb +33 -0
- data/lib/web/sapi/apache.rb +129 -0
- data/lib/web/sapi/fastcgi.rb +22 -0
- data/lib/web/sapi/install/apache.rb +180 -0
- data/lib/web/sapi/install/iis.rb +93 -0
- data/lib/web/sapi/install/macosx.rb +90 -0
- data/lib/web/sapi/webrick.rb +86 -0
- data/lib/web/session.rb +83 -0
- data/lib/web/shim/cgi.rb +129 -0
- data/lib/web/shim/rails.rb +175 -0
- data/lib/web/stringio.rb +78 -0
- data/lib/web/strscanparser.rb +24 -0
- data/lib/web/tagparser.rb +96 -0
- data/lib/web/testing.rb +666 -0
- data/lib/web/traceoutput.rb +75 -0
- data/lib/web/unit.rb +56 -0
- data/lib/web/upload.rb +59 -0
- data/lib/web/validate.rb +52 -0
- data/lib/web/wiki.rb +557 -0
- data/lib/web/wiki/linker.rb +72 -0
- data/lib/web/wiki/page.rb +201 -0
- data/lib/webunit.rb +27 -0
- data/lib/webunit/assert.rb +152 -0
- data/lib/webunit/converter.rb +154 -0
- data/lib/webunit/cookie.rb +118 -0
- data/lib/webunit/domwalker.rb +185 -0
- data/lib/webunit/exception.rb +14 -0
- data/lib/webunit/form.rb +116 -0
- data/lib/webunit/frame.rb +37 -0
- data/lib/webunit/htmlelem.rb +122 -0
- data/lib/webunit/image.rb +26 -0
- data/lib/webunit/jscript.rb +31 -0
- data/lib/webunit/link.rb +33 -0
- data/lib/webunit/params.rb +321 -0
- data/lib/webunit/parser.rb +229 -0
- data/lib/webunit/response.rb +464 -0
- data/lib/webunit/runtest.rb +41 -0
- data/lib/webunit/table.rb +148 -0
- data/lib/webunit/testcase.rb +45 -0
- data/lib/webunit/ui/cui/testrunner.rb +50 -0
- data/lib/webunit/utils.rb +68 -0
- data/lib/webunit/webunit.rb +28 -0
- data/test/dev/action.rb +83 -0
- data/test/dev/forms.rb +104 -0
- data/test/dev/forms2.rb +104 -0
- data/test/dev/parser.rb +17 -0
- data/test/dev/scripts/dump.rb +24 -0
- data/test/dev/scripts/makedist.rb +62 -0
- data/test/dev/scripts/uri.rb +41 -0
- data/test/dev/scripts/uri/common.rb +432 -0
- data/test/dev/scripts/uri/ftp.rb +149 -0
- data/test/dev/scripts/uri/generic.rb +1106 -0
- data/test/dev/scripts/uri/http.rb +76 -0
- data/test/dev/scripts/uri/https.rb +26 -0
- data/test/dev/scripts/uri/ldap.rb +238 -0
- data/test/dev/scripts/uri/mailto.rb +260 -0
- data/test/dev/scripts/urireg.rb +174 -0
- data/test/dev/simpledispatcher.rb +156 -0
- data/test/dev/test.action.rb +146 -0
- data/test/dev/test.formreader.rb +463 -0
- data/test/dev/test.simpledispatcher.rb +186 -0
- data/test/dev/webunit/conv/digit-0.rb +21 -0
- data/test/dev/webunit/conv/digit-1.rb +17 -0
- data/test/dev/webunit/conv/digit.rb +23 -0
- data/test/dev/webunit/conv/test_digit-0.rb +16 -0
- data/test/dev/webunit/conv/test_digit-1.rb +19 -0
- data/test/dev/webunit/conv/test_digit.rb +26 -0
- data/test/dev/webunit/conv/test_digit_view-0.rb +76 -0
- data/test/dev/webunit/conv/test_digit_view-1.rb +102 -0
- data/test/dev/webunit/conv/test_digit_view.rb +134 -0
- data/test/installation/htdocs/cgi_test.rb +296 -0
- data/test/installation/htdocs/test_install.rb +4 -0
- data/test/installation/runwebtest.rb +5 -0
- data/test/installation/test_cookie.rb +128 -0
- data/test/installation/test_form.rb +47 -0
- data/test/installation/test_multipart.rb +51 -0
- data/test/installation/test_request.rb +24 -0
- data/test/installation/test_response.rb +35 -0
- data/test/unit/htdocs/cookie.rb +32 -0
- data/test/unit/htdocs/multipart.rb +28 -0
- data/test/unit/htdocs/redirect.rb +12 -0
- data/test/unit/htdocs/simple.rb +13 -0
- data/test/unit/htdocs/stock.rb +33 -0
- data/test/unit/test_assert.rb +162 -0
- data/test/unit/test_cookie.rb +114 -0
- data/test/unit/test_domwalker.rb +77 -0
- data/test/unit/test_form.rb +42 -0
- data/test/unit/test_frame.rb +40 -0
- data/test/unit/test_htmlelem.rb +74 -0
- data/test/unit/test_image.rb +45 -0
- data/test/unit/test_jscript.rb +57 -0
- data/test/unit/test_link.rb +85 -0
- data/test/unit/test_multipart.rb +51 -0
- data/test/unit/test_params.rb +210 -0
- data/test/unit/test_parser.rb +53 -0
- data/test/unit/test_response.rb +150 -0
- data/test/unit/test_table.rb +70 -0
- data/test/unit/test_utils.rb +106 -0
- data/test/unit/test_webunit.rb +28 -0
- data/test/web/mod_ruby_stub.rb +39 -0
- data/test/web/test.assertinclude.rb +109 -0
- data/test/web/test.buffer.rb +182 -0
- data/test/web/test.code.loader.rb +78 -0
- data/test/web/test.config.rb +31 -0
- data/test/web/test.error.handling.rb +91 -0
- data/test/web/test.formreader-2.0.rb +352 -0
- data/test/web/test.load.rb +125 -0
- data/test/web/test.mime-type.rb +23 -0
- data/test/web/test.narf.cgi.rb +106 -0
- data/test/web/test.phprb.rb +239 -0
- data/test/web/test.request.rb +368 -0
- data/test/web/test.response.rb +637 -0
- data/test/web/test.ruby-web.rb +10 -0
- data/test/web/test.session.rb +50 -0
- data/test/web/test.shim.cgi.rb +96 -0
- data/test/web/test.tagparser.rb +65 -0
- data/test/web/test.template2.rb +297 -0
- data/test/web/test.testing2.rb +318 -0
- data/test/web/test.upload.rb +45 -0
- data/test/web/test.validate.rb +46 -0
- data/test/web/test.web.test.rb +495 -0
- data/test/wiki/test.history.rb +297 -0
- data/test/wiki/test.illustration_page.rb +287 -0
- data/test/wiki/test.linker.rb +197 -0
- data/test/wiki/test.tarpit.rb +56 -0
- data/test/wiki/test.wiki.rb +300 -0
- data/test/wikitestroot/admin.rb +7 -0
- data/test/wikitestroot/wiki.rb +6 -0
- metadata +234 -0
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
# Copyright:: Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
|
|
2
|
+
# License:: Same as Ruby's
|
|
3
|
+
# CVS ID: $Id: element.rb,v 1.8 2002/06/04 01:55:59 ned Exp $
|
|
4
|
+
|
|
5
|
+
# This module is a mix-in that provides parent/child behavior to real
|
|
6
|
+
# Element classes. Because it defines <tt>each()</tt> and includes Enumerable,
|
|
7
|
+
# you can iterate through a tree using the usual Enumerable methods.
|
|
8
|
+
|
|
9
|
+
require 'web/htmltools/tags'
|
|
10
|
+
|
|
11
|
+
module HTMLTree #:nodoc: all
  # Mix-in that gives a class parent/child tree behaviour. A node's content
  # is either an Array of child nodes or some other object (leaf data).
  # Including classes call initialize_tree_element from their constructor.
  module TreeElement
    include Enumerable

    protected

    # Store the content and parent of a new node and, when a parent is
    # given, register this node as one of the parent's children.
    #
    # parent_or_nil::   parent node, or nil for a detached node
    # contents_or_nil:: an Array if the node may hold children, otherwise
    #                   the node's data
    def initialize_tree_element(parent_or_nil = nil, contents_or_nil = nil)
      @_content = contents_or_nil
      @_parent = parent_or_nil
      parent_or_nil.add_child(self) if parent_or_nil
    end

    # Raw parent link. Protected so that only other tree nodes can rewire
    # it; outside callers use #parent and #parent=.
    attr_accessor :_parent

    public

    # Add one or more children to this node.
    # Raises ArgumentError if this node cannot have children.
    def add_child(*children_to_add)
      raise ArgumentError, 'node cannot have children' unless can_have_children?

      children_to_add.each do |child|
        @_content << child
        child._parent = self
      end
    end

    alias_method(:add_children, :add_child)

    # Remove one or more children from this node. A child's parent link is
    # cleared only if it really was one of my children.
    # Raises ArgumentError if this node cannot have children.
    def remove_child(*children_to_remove)
      raise ArgumentError, 'node cannot have children' unless can_have_children?

      children_to_remove.each do |child|
        child._parent = nil if @_content.delete(child)
      end
    end

    alias_method(:remove_children, :remove_child)

    # Change my parent. Disconnects from prior parent, if any.
    def parent=(parent_or_nil)
      @_parent.remove_child(self) if @_parent
      parent_or_nil.add_child(self) if parent_or_nil
    end

    # Return true if my content is a collection of Elements
    # rather than actual data.
    def can_have_children?
      @_content.kind_of?(Array)
    end

    # Return a collection of my children. Returns an empty Array if I am a
    # data element, just to keep other methods simple.
    def children
      can_have_children? ? @_content : []
    end

    # Return my content; either my children or my data.
    def content
      @_content
    end

    # Return my parent element.
    def parent
      @_parent
    end

    # Return the ultimate parent: walk up the parent links until a node
    # without a parent is reached. A detached node is its own root.
    def root
      @_parent ? @_parent.root : self
    end

    # Return true if I have any children.
    def has_children?
      children.size > 0
    end

    # Depth-first (pre-order) iterator, required by Enumerable: yields
    # this node, then each child's subtree in order. Returns an Enumerator
    # when called without a block.
    def each(&block)
      return enum_for(:each) unless block

      block.call(self)
      children.each { |ch| ch.each(&block) }
    end

    # Print out to $stdout (or given IO or String)
    # a formatted dump of my structure, one node per line, indented by
    # one space per tree level.
    def dump(indent = 0, io = $stdout)
      io << ' ' * indent << to_s << "\n"
      children.each { |ea| ea.dump(indent + 1, io) }
    end
  end

  # This is a Element that represents the whole document (and makes a
  # scope for the DTD declaration)
  class Document
    include TreeElement

    def initialize
      initialize_tree_element(nil, [])
    end

    # The document itself has no textual form.
    def to_s
      ''
    end

    # Iterate over every node below the document; the document node
    # itself is not yielded. Returns an Enumerator without a block.
    def each(&block)
      return enum_for(:each) unless block

      children.each { |ch| ch.each(&block) }
    end

    # Write every top-level child (and its descendants) to io.
    def write(io)
      children.each { |t| t.write(io) }
    end

    # Return my (empty) tag String.
    def tag
      ''
    end

    # Return my child <html> node, if any.
    def html_node
      children.find { |ea| ea.tag == 'html' }
    end
  end

  # This is a TreeElement that represents tagged items in an HTML
  # document.
  class Element
    include TreeElement

    # parent_or_nil:: TreeElement or nil
    # tag_name:: String
    def initialize(parent_or_nil = nil, tag_name = nil)
      initialize_tree_element(parent_or_nil, [])
      @_tag = tag_name
      @_attributes = {}
      @_attribute_order = []
    end

    # An Element can always hold children.
    def can_have_children?; true; end

    # Return true if I'm data instead of a tag
    def data?; false; end

    # Return my start tag, with attributes in the order they were set.
    # A multi-valued attribute is written as its values joined by single
    # spaces (relying on Array#to_s would emit inspect output on
    # Ruby 1.9 and later).
    def to_s
      attrs = @_attribute_order.map do |k|
        " #{k}=\"#{Array(@_attributes[k]).join(' ')}\""
      end
      "<#{tag}#{attrs.join}>"
    end

    # Append an attribute. <tt>values</tt> are first flattened into an Array,
    # then converted into strings.
    #
    # If there is a single attribute value, it will appear as a String,
    # otherwise it will be an Array of Strings. Adding to a name that is
    # already present appends to its values and keeps its original
    # position in the attribute order.
    #
    # Example:
    #   element.add_attribute("width", "123")
    #   element.add_attribute("value", [ "a", "b" ])
    def add_attribute(name, *values)
      values = values.flatten.map { |ea| ea.to_s.strip }
      name = name.downcase
      if @_attributes.include?(name)
        # Array() rather than String#to_a, which no longer exists.
        @_attributes[name] = Array(@_attributes[name]) + values
      else
        @_attributes[name] = values.size > 1 ? values : values[0]
      end
      @_attribute_order << name unless @_attribute_order.include?(name)
      self
    end

    # Return my tag (should be a String)
    def tag; @_tag; end

    # Return an HTML::Tag for further information, or nil if this is an
    # unknown tag.
    def tag_info
      HTML::Tag.named(@_tag)
    rescue NoSuchHTMLTagError
      nil
    end

    # Return my attributes Hash.
    def attributes; @_attributes; end

    # Return the order of my attributes
    def attribute_order; @_attribute_order; end

    # Return the value of a single attribute (a String or Array).
    def attribute(name); @_attributes[name]; end

    # Return the value of a single attribute (a String or Array).
    def [](name); attribute(name); end

    # Replace an attribute. The name is moved to the end of the attribute
    # order so the attribute is still emitted by #to_s.
    def []=(name, *values)
      @_attributes[name] = values.size > 1 ? values : values[0]
      @_attribute_order.delete(name)
      @_attribute_order << name
      self
    end

    # Print me (and my descendents) on the given IO stream. The end tag
    # is written unless the tag is known to be an empty element; unknown
    # tags (tag_info is nil) are given an end tag.
    def write(io)
      io << to_s
      children.each { |t| t.write(io) }
      info = tag_info
      io.puts("</#{tag}>") unless info && info.is_empty_element
    end
  end

  # This is a TreeElement that represents leaf data nodes (CDATA, scripts,
  # comments, processing directives). It forwards unknown messages to the
  # content element, so it otherwise behaves like a String.
  class Data
    include TreeElement

    # parent_or_nil:: parent, TreeElement or nil
    # str:: contents, String
    def initialize(parent_or_nil = nil, str = '')
      initialize_tree_element(parent_or_nil, str)
    end

    # Return true because I am a data Element.
    def data?; true; end

    # Return false because I have no children.
    def can_have_children?; false; end

    # Return an empty collection because I have no children.
    def children; []; end

    # Return my (empty) tag String.
    def tag; ''; end

    # Return my (empty) attributes Hash.
    def attributes; {}; end

    # Return my content unchanged.
    def to_s
      @_content
    end

    # Print me on the given IO stream.
    def write(io)
      io << self
    end

    # Forward all other methods (and any block) to my content, so I can
    # otherwise behave like a String. Messages the content does not
    # publicly respond to go to super and raise NoMethodError, which is
    # what Ruby's own conversion probes (to_ary and friends) expect.
    def method_missing(sym, *args, &block)
      return super unless @_content.respond_to?(sym)

      @_content.public_send(sym, *args, &block)
    end

    # Keep respond_to? in step with method_missing.
    def respond_to_missing?(sym, include_private = false)
      @_content.respond_to?(sym) || super
    end
  end

  # Data node written out as an HTML comment.
  class Comment < Data
    def to_s
      "<!--#{@_content}-->"
    end
  end

  # Data node written out between angle brackets.
  class Special < Data
    def to_s
      "<#{@_content}>"
    end
  end
end
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
# Copyright:: Copyright(C) 2002 Ned Konz
|
|
2
|
+
# License:: Ruby's License
|
|
3
|
+
# CVS ID:: $Id: stparser.rb,v 1.6 2002/06/04 01:55:59 ned Exp $
|
|
4
|
+
|
|
5
|
+
require 'web/htmlparser/sgml-parser'
|
|
6
|
+
require 'web/htmltools/tags'
|
|
7
|
+
|
|
8
|
+
# This is an SGMLParser subclass that knows about HTML 4.0 rules
|
|
9
|
+
# and can spot empty tags and deal with tags that may have omitted endtags.
|
|
10
|
+
module HTML #:nodoc: all
  # SGMLParser subclass that keeps a stack of the currently open tags
  # (stored lower-cased). It uses HTML::Tag information to close tags
  # whose end tag was omitted and to recognise empty elements, then
  # reports what it finds through the overridable handle_* callbacks.
  class StackingParser < SGMLParser
    # accessors

    # The stack of open tag names, outermost first.
    def stack; @tagStack; end

    # Innermost open tag, or 'html' when the stack is empty.
    def last_tag; @tagStack[-1] || 'html'; end

    # Tag enclosing the innermost one, or 'html' when there is none.
    def parent_tag; @tagStack[-2] || 'html'; end

    # Turn whitespace stripping of data and comments on or off.
    def strip_whitespace=(flag); @stripWhitespace = flag; end

    # input methods

    # Open and parse the given file, feeding it in 64K chunks.
    def parse_file_named(name)
      File.open(name) do |f|
        while (bytes = f.read(65536))
          feed(bytes)
        end
      end
    end

    # Feed some more data to the parser. Text that skip_script put aside
    # (everything from the first end tag seen inside a script) is fed
    # again until nothing is left over.
    def feed(string)
      super
      until @saved.empty?
        pending = @saved
        @saved = ''
        super(pending)
      end
    end

    # available only to subclasses
    private

    # Debug trace, indented by the current stack depth. Only defined with
    # a body when $DEBUG is set at the time this class is loaded.
    if $DEBUG
      def dprint(*stuff)
        # Splat so the pieces are printed as text, not as an Array.
        print(' ' * @tagStack.size, *stuff) if @verbose
      end
    else
      def dprint(*stuff); end
    end

    # Write msg to $stderr, but only in verbose mode.
    def warn(msg)
      $stderr.print(msg) if @verbose
    end

    # verbose::     passed on to SGMLParser
    # strip_white:: initial value of the whitespace-stripping flag
    def initialize(verbose=false, strip_white=false)
      super(verbose)
      @tagStack = []
      @saved = ''
      @stripWhitespace = strip_white
    end

    # handle_data will call this.
    def skip_script(data)
      # is the end of the script in this buffer?
      if (m = data.index(%r{</[A-Za-z]}))
        @nomoretags = false
        @saved = data[m..-1]     # re-fed by feed once the script is handled
        handle_script(data[0, m]) # call user handler
      else
        handle_script(data)
      end
    end

    # Unfortunately, sgml-parser calls this and there's important work to do in
    # it. So the user handler has to be named something different.
    def handle_data(data)
      # need to handle scripts
      return skip_script(data) if last_tag == 'script' && @nomoretags

      if @stripWhitespace
        begin
          data.strip! if HTML::Tag.named(last_tag).can_ignore_whitespace
        rescue NoSuchHTMLTagError
          data.strip!
        end
      end
      handle_cdata(data) if data.size > 0 # call user handler
    end

    # Called for every start tag. Closes the current tag first if it
    # cannot contain the new one, pushes the new tag, then dispatches to
    # the empty/start/unknown tag handler.
    def finish_starttag(tag, attrs)
      dprint "*START* #{tag} #{attrs.inspect}\n"
      begin
        unless HTML::Tag.named(last_tag).can_contain(tag, parent_tag)
          dprint "-INSERT-\n"
          finish_endtag(last_tag)
        end
      rescue NoSuchHTMLTagError
        # hmm.. last_tag was unknown.
        # Assume it doesn't have an optional endtag.
      end

      push(tag)

      begin
        if HTML::Tag.named(tag).is_empty_element
          dprint "-EMPTY-\n"
          handle_empty_tag(tag, attrs) # call user handler
          drop_to_tag(tag)
        else
          handle_start_tag(tag, attrs) # call user handler
        end

        @nomoretags = true if tag.downcase == 'script'
      rescue NoSuchHTMLTagError
        # hmm... the start tag is unknown.
        # And we pushed it.
        # If it's empty, we'll get rid of it at the next end tag.
        handle_unknown_tag(tag, attrs)
      end
    end

    # Pop the stack down to and including the innermost open +tag+,
    # reporting every tag closed on the way whose end tag may not be
    # omitted. Return true if tag is not extra; when no such tag is open,
    # report an extra end tag and return false.
    def drop_to_tag(tag)
      # Stack entries are lower-cased by push, so compare in lower case.
      name = tag.downcase
      depth = @tagStack.rindex(name)
      if depth.nil? # got an end tag but we haven't seen start tag?
        handle_extra_end_tag(tag) # call user handler
        return false
      end

      (@tagStack.size - depth).times do
        begin
          # detect missing end tag
          if last_tag != name && !HTML::Tag.named(last_tag).can_omit_end_tag
            handle_missing_end_tag(last_tag) # call user handler
          end
        rescue NoSuchHTMLTagError
          # oops, don't recognize last_tag.
        end
        pop
      end
      true
    end

    # Called for every end tag; the user handler only runs when the tag
    # was actually open.
    def finish_endtag(tag)
      dprint "*END* #{tag}\n"
      if drop_to_tag(tag)
        dprint "-END- #{tag} #{@tagStack.inspect}\n"
        handle_end_tag(tag) # call user handler
      end
    end

    # Push the lower-cased tag name onto the stack.
    def push(tag)
      @tagStack.push(tag.downcase)
      dprint "*PUSH* #{tag} => #{@tagStack.inspect}\n"
    end

    # Pop and return the innermost tag name.
    def pop
      tag = @tagStack.pop
      dprint "*POP* #{tag} => #{@tagStack.inspect}\n"
      tag
    end

    def unknown_charref(name)
      handle_unknown_character(name)
    end

    def unknown_entityref(name)
      handle_unknown_entity(name)
    end

    # callbacks: can be overridden in subclasses

    def handle_start_tag(tag, attrs)
    end

    def handle_end_tag(tag)
    end

    # by default, an empty tag is handled as a start tag
    # with an inserted end tag.
    def handle_empty_tag(tag, attrs)
      handle_start_tag(tag, attrs)
      handle_end_tag(tag)
    end

    def handle_unknown_tag(tag, attrs)
      warn("warning: unknown tag #{tag}\n")
    end

    def handle_missing_end_tag(tag)
      warn("warning: missing end tag </#{tag}>\n")
    end

    def handle_extra_end_tag(tag)
      warn("warning: extra end tag </#{tag}>\n")
    end

    def handle_cdata(data)
    end

    def handle_script(data)
    end

    def handle_unknown_character(name)
    end

    def handle_unknown_entity(name)
    end

    # call super if you want the data stripped
    def handle_comment(data)
      data.strip! if @stripWhitespace
    end

    def handle_special(data)
    end

  end
end
|
|
227
|
+
|
|
228
|
+
# test script
|
|
229
|
+
if __FILE__ == $0
  $stdout.sync = true

  # Demo parser: prints start tags, non-blank text (prefixed with the
  # open-tag path), unknown characters/entities, comments and specials.
  class TestStackingParser < HTML::StackingParser #:nodoc: all
    # Print the open tags as a '/'-terminated path.
    def dump_stack
      stack.each do |open_tag|
        print open_tag, '/'
      end
    end

    def handle_start_tag(tag, attrs)
      print("START: #{tag} #{attrs.inspect}\n")
    end

    def handle_end_tag(tag)
      # print("END: #{tag}\n")
    end

    def handle_empty_tag(tag, attrs)
      # print("EMPTY: #{tag} #{attrs.inspect}\n")
    end

    # Show stripped text with its tag path; style content and blank
    # text are skipped.
    def handle_cdata(data)
      # print("DATA: #{data.size} chars\n")
      return if last_tag == 'style'

      text = data.strip
      return unless text.size > 0

      dump_stack
      print(text.inspect, "\n")
    end

    def handle_script(data)
      # print("SCRIPT: #{data.size} chars\n")
    end

    def handle_unknown_character(name)
      print("UNKC: #{name}\n")
    end

    def handle_unknown_entity(name)
      print("UNKE: #{name}\n")
    end

    def handle_comment(data)
      super
      print("COMMENT: #{data}\n")
    end

    def handle_special(data)
      print("SPECIAL: #{data}\n")
    end
  end

  $DEBUG = false
  parser = TestStackingParser.new(true)
  input_name = ARGV[0] || 'ebay.html'
  parser.parse_file_named(input_name)
end
|