assaf-scrapi 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,93 @@
1
+ require "time"
2
+
3
+
4
+ module Scraper
5
+
6
+ module Microformats
7
+
8
# Scrapes an hCard microformat element into name, photo and URL fields.
class HCard < Scraper::Base

  process ".fn", :fn=>:text
  process ".given-name", :given_name=>:text
  process ".family-name", :family_name=>:text
  process "img.photo", :photo=>"@src"
  process "a.url", :url=>"@href"

  result :fn, :given_name, :family_name, :photo, :url

  # Fallback when the card has no explicit "fn" element: compose the
  # formatted name from the given and family names (whichever exist).
  def collect()
    unless fn
      if given_name
        # Build fn as a new string. The original appended family_name
        # onto given_name in place (<<), which corrupted the
        # :given_name result with the family name as well.
        self.fn = family_name ? "#{given_name} #{family_name}" : given_name
      else
        self.fn = family_name
      end
    end
  end

end
29
+
30
+
31
# Scrapes hAtom feeds from a page: collects .hfeed elements, wraps any
# stray .hentry elements into an implicit feed, and fills in a default
# author from the first remaining hCard.
class HAtom < Scraper::Base

  # A single hAtom entry: title, content, summary, permalink, author
  # (an HCard), publication timestamps and tags.
  class Entry < Scraper::Base

    array :content, :tags

    process ".entry-title", :title=>:text
    process ".entry-content", :content=>:element
    process ".entry-summary", :summary=>:element
    process "a[rel~=bookmark]", :permalink=>["@href"]
    process ".author.vcard, .author .vcard", :author=>HCard
    process ".published", :published=>["abbr@title", :text]
    process ".updated", :updated=>["abbr@title", :text]
    process "a[rel~=tag]", :tags=>:text

    # Normalize timestamps to Time objects; updated falls back to
    # published when absent.
    def collect()
      # Guard: the original called Time.parse(published) unconditionally,
      # raising TypeError for entries without a .published element.
      self.published = Time.parse(published) if published
      self.updated = updated ? Time.parse(updated) : published
    end

    result :title, :content, :summary, :permalink, :author, :published, :updated, :tags

  end

  # A feed is simply the array of entries found under an .hfeed element.
  class Feed < Scraper::Base

    array :entries

    process ".hentry", :entries=>Entry

    def result()
      entries
    end

  end

  array :feeds, :entries

  # Skip feeds, so we don't process them twice.
  process ".hfeed", :skip=>true, :feeds=>Feed
  # And so we can collect unwrapped entries into a separate feed.
  process ".hentry", :skip=>true, :entries=>Entry
  # And collect the first remaining hcard as the default author.
  process ".vcard", :hcard=>HCard

  # Fold loose entries into the feed list and supply the page-level
  # hCard as author for any entry that lacks one.
  def collect()
    @feeds ||= []
    @feeds << entries if entries
    feeds.each do |feed|
      feed.each do |entry|
        entry.author = hcard unless entry.author
      end
    end
  end

  result :feeds

end
89
+
90
+ end
91
+
92
+ end
93
+
@@ -0,0 +1,240 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documention: http://labnotes.org
6
+
7
+
8
+ require "uri"
9
+ require "net/http"
10
+ require "net/https"
11
+ begin
12
+ require "rubygems"
13
+ require "tidy"
14
+ rescue LoadError
15
+ end
16
+
17
+
18
+ module Scraper
19
+
20
+ module Reader
21
+
22
# Base class for all HTTP errors raised by Reader.read_page. Optionally
# wraps a lower-level exception (e.g. a socket error) as its cause.
class HTTPError < StandardError

  # The underlying error that triggered this one, or nil.
  attr_reader :cause

  def initialize(cause = nil)
    @cause = cause
  end

  # Error description, with the wrapped cause appended when present.
  def to_s
    description = super
    return description unless @cause
    "#{description}: #{@cause}"
  end

end
36
+
37
+ class HTTPTimeoutError < HTTPError ; end
38
+ class HTTPUnspecifiedError < HTTPError ; end
39
+ class HTTPNotFoundError < HTTPError ; end
40
+ class HTTPNoAccessError < HTTPError ; end
41
+ class HTTPInvalidURLError < HTTPError ; end
42
+ class HTTPRedirectLimitError < HTTPError ; end
43
+
44
+
45
# Raised by parse_page when the HTML cannot be parsed; wraps the
# original parser exception, when one was raised, as its cause.
class HTMLParseError < StandardError

  # The parser exception that triggered this error, or nil.
  attr_reader :cause

  def initialize(cause = nil)
    @cause = cause
  end

  # Error description; includes the wrapped cause when one exists.
  def to_s
    @cause.nil? ? super : "#{super}: #{@cause}"
  end

end
58
+
59
+
60
# Guarded so re-requiring this file does not warn about redefinition.
unless const_defined? :REDIRECT_LIMIT
  # Maximum number of redirects read_page will follow by default.
  REDIRECT_LIMIT = 3
  # Default HTTP open/read timeout, in seconds.
  DEFAULT_TIMEOUT = 30
  # Parsers supported by parse_page.
  PARSERS = [:tidy, :html_parser]
end

unless const_defined? :TIDY_OPTIONS
  # Baseline Tidy options. parse_page always applies these on top of
  # caller-supplied options: quiet XHTML output, forced even on errors.
  TIDY_OPTIONS = {
    :output_xhtml=>true,
    :show_errors=>0,
    :show_warnings=>false,
    :wrap=>0,
    :wrap_sections=>false,
    :force_output=>true,
    :quiet=>true,
    :tidy_mark=>false
  }
end

# Returned by read_page: final URL, body (nil when not modified),
# declared encoding, and cache-control headers for revalidation.
Page = Struct.new(:url, :content, :encoding, :last_modified, :etag)
# Returned by parse_page: parsed HTML root element and the encoding
# that was actually used.
Parsed = Struct.new(:document, :encoding)
82
+
83
+
84
+ module_function
85
+
86
+ # :call-seq:
87
+ # read_page(url, options?) => response
88
+ #
89
+ # Reads a Web page and return its URL, content and cache control headers.
90
+ #
91
+ # The request reads a Web page at the specified URL (must be a URI object).
92
+ # It accepts the following options:
93
+ # * :last_modified -- Last modified header (from a previous request).
94
+ # * :etag -- ETag header (from a previous request).
95
+ # * :redirect_limit -- Number of redirects allowed (default is 3).
96
+ # * :user_agent -- The User-Agent header to send.
97
+ # * :timeout -- HTTP open connection/read timeouts (in second).
98
+ #
99
+ # It returns a hash with the following information:
100
+ # * :url -- The URL of the requested page (may change by permanent redirect)
101
+ # * :content -- The content of the response (may be nil if cached)
102
+ # * :content_type -- The HTML page Content-Type header
103
+ # * :last_modified -- Last modified cache control header (may be nil)
104
+ # * :etag -- ETag cache control header (may be nil)
105
+ # * :encoding -- Document encoding for the page
106
+ # If the page has not been modified from the last request, the content is nil.
107
+ #
108
+ # Raises HTTPError if an error prevents it from reading the page.
109
# Fetches a Web page over HTTP/HTTPS, following redirects up to the
# redirect limit, and returns a Page struct (url, content, encoding,
# last_modified, etag). Raises a specific HTTPError subclass on failure.
# NOTE(review): the rdoc above documents a :timeout option, but the code
# reads options[:http_timeout] — confirm which name callers use.
def read_page(url, options = nil)
  options ||= {}
  redirect_limit = options[:redirect_limit] || REDIRECT_LIMIT
  # Each recursive redirect call decrements the limit; hitting zero
  # means we followed too many redirects.
  raise HTTPRedirectLimitError if redirect_limit == 0
  if url.is_a?(URI)
    uri = url
  else
    begin
      uri = URI.parse(url)
    rescue Exception=>error
      raise HTTPInvalidURLError.new(error)
    end
  end
  # Only http and https are supported (also rejects redirects to
  # other schemes, since this method is called recursively).
  raise HTTPInvalidURLError unless uri.scheme =~ /^http(s?)$/
  begin
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = (uri.scheme == "https")
    http.close_on_empty_response = true
    http.open_timeout = http.read_timeout = options[:http_timeout] || DEFAULT_TIMEOUT
    path = uri.path.dup # required so we don't modify path
    path << "?#{uri.query}" if uri.query
    # TODO: Specify which content types are accepted.
    # TODO: GZip support.
    # NOTE(review): conditional-GET values are sent as "Last-Modified"
    # and "ETag"; standard request headers would be If-Modified-Since /
    # If-None-Match. The test suite depends on these names — confirm
    # before changing.
    headers = {}
    headers["User-Agent"] = options[:user_agent] if options[:user_agent]
    headers["Last-Modified"] = options[:last_modified] if options[:last_modified]
    headers["ETag"] = options[:etag] if options[:etag]
    response = http.request_get(path, headers)
    # TODO: Ignore content types that do not map to HTML.
  rescue TimeoutError=>error
    raise HTTPTimeoutError.new(error)
  rescue Exception=>error
    # Any other network failure is reported as unspecified, with the
    # original exception preserved as the cause.
    raise HTTPUnspecifiedError.new(error)
  end
  case response
  when Net::HTTPSuccess
    # Pull the charset out of the Content-Type header, if declared.
    encoding = if content_type = response["Content-Type"]
      if match = content_type.match(/charset=([^\s]+)/i)
        match[1]
      end
    end
    # :source_url preserves the original URL across non-permanent
    # redirects; for a direct fetch it is simply the requested URI.
    return Page[(options[:source_url] || uri), response.body, encoding,
      response["Last-Modified"], response["ETag"]]
  when Net::HTTPNotModified
    # Cached copy still valid: nil content, echo back the validators.
    return Page[(options[:source_url] || uri), nil, nil,
      options[:last_modified], options[:etag]]
  when Net::HTTPMovedPermanently
    # 301: the new URL becomes the page's canonical URL, so we do not
    # carry :source_url forward. A relative Location is resolved
    # against the current URI; an unparsable one becomes nil and fails
    # the scheme check on the recursive call.
    return read_page((uri.merge(response["location"]) rescue nil), # New URL takes effect
      :last_modified=>options[:last_modified],
      :etag=>options[:etag],
      :redirect_limit=>redirect_limit-1)
  when Net::HTTPRedirection
    # Other 3xx: follow, but keep reporting the original URL.
    return read_page((uri.merge(response["location"]) rescue nil),
      :last_modified=>options[:last_modified],
      :etag=>options[:etag],
      :redirect_limit=>redirect_limit-1,
      :source_url=>(options[:source_url] || uri)) # Old URL still in effect
  when Net::HTTPNotFound
    raise HTTPNotFoundError
  when Net::HTTPUnauthorized, Net::HTTPForbidden
    raise HTTPNoAccessError
  when Net::HTTPRequestTimeOut
    raise HTTPTimeoutError
  else
    raise HTTPUnspecifiedError
  end
end
176
+
177
+
178
+ # :call-seq:
179
+ # parse_page(html, encoding?, options?, parser) => html
180
+ #
181
+ # Parses an HTML page and returns the encoding and HTML element.
182
+ # Raises HTMLParseError exceptions if it cannot parse the HTML.
183
+ #
184
+ # Options are passed to the parser. For example, when using Tidy
185
+ # you can pass Tidy cleanup options in the hash.
186
+ #
187
+ # The last option specifies which parser to use (see PARSERS).
188
+ # By default Tidy is used.
189
# Parses an HTML string into a Parsed struct (document root element,
# effective encoding). The encoding is taken, in order of precedence,
# from the page's http-equiv meta tag, then the +encoding+ argument,
# then "utf8". Raises HTMLParseError on any failure.
def parse_page(content, encoding = nil, options = nil, parser = :tidy)
  begin
    # Get the document encoding from the meta header.
    if meta = content.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
      # Charset is extracted from the whole matched tag, so it works
      # whether it appears in a content= or value= attribute.
      if meta = meta[0].match(/charset=([\w-]*)/i)
        encoding = meta[1]
      end
    end
    encoding ||= "utf8"
    case (parser || :tidy)
    when :tidy
      # Make sure the Tidy path is set and always apply the default
      # options (these only control things like errors, output type).
      find_tidy
      # NOTE(review): update() mutates the caller's options hash and
      # lets TIDY_OPTIONS override caller-supplied keys — intentional
      # per the comment above, but worth confirming the mutation.
      options = (options || {}).update(TIDY_OPTIONS)
      # Tidy expects encoding names without dashes, lowercase.
      options[:input_encoding] = encoding.gsub("-", "").downcase
      document = Tidy.open(options) do |tidy|
        html = tidy.clean(content)
        HTML::Document.new(html).find(:tag=>"html")
      end
    when :html_parser
      document = HTML::HTMLParser.parse(content).root
    else
      raise HTMLParseError, "No parser #{parser || "unspecified"}"
    end
    return Parsed[document, encoding]
  rescue Exception=>error
    # All parser failures (including the unknown-parser error above,
    # which gets re-wrapped) surface as HTMLParseError.
    raise HTMLParseError.new(error)
  end
end
219
+
220
+
221
+ protected
222
+
223
+ module_function
224
+
225
# Points Tidy at the bundled native library if no path is configured
# yet, trying each platform's library name in turn (Linux .so,
# Windows .dll, then Mac .dylib). The last failure propagates.
def find_tidy()
  return if Tidy.path
  candidates = %w[libtidy.so libtidy.dll libtidy.dylib]
  candidates.each_with_index do |library, index|
    begin
      Tidy.path = File.join(File.dirname(__FILE__), "../tidy", library)
      break
    rescue LoadError
      # Re-raise only when we have run out of candidates.
      raise if index == candidates.size - 1
    end
  end
end
237
+
238
+ end
239
+
240
+ end
@@ -0,0 +1,8 @@
1
+ # Conditional loads, since we may have these libraries elsewhere,
2
+ # e.g. when using Rails with assert_select plugin.
3
+ require File.join(File.dirname(__FILE__), "html", "document") unless defined?(HTML::Document)
4
+ require File.join(File.dirname(__FILE__), "html", "node_ext") unless defined?(HTML::Node.detach)
5
+ require File.join(File.dirname(__FILE__), "html", "selector") unless defined?(HTML::Selector)
6
+ require File.join(File.dirname(__FILE__), "html", "htmlparser") unless defined?(HTML::HTMLParser)
7
+
8
+ require File.join(File.dirname(__FILE__), "scraper", "base") unless defined?(Scraper::Base)
Binary file
Binary file
@@ -0,0 +1,54 @@
1
+ require "net/http"
2
+
3
# Test helper: reopens Net::HTTP so tests can stub out HTTP GETs
# without a network. The stubbed behavior is stored in a class
# variable, so it applies process-wide (not parallel-safe).
class Net::HTTP

  @@on_get = nil

  # Reset get method to default behavior.
  def self.reset_on_get
    @@on_get = nil
  end


  # :call-seq:
  #   on_get { |address, path, headers| ... => [response, body] }
  #
  # Specify alternative behavior for next execution of get method.
  # This change applies to all instances of Net::HTTP, so do not use
  # this method when running tests in parallel.
  #
  # The method takes a single block that accepts three arguments:
  # the address (host), path and headers (hash). It must return an
  # array with two values: the Net::HTTPResponse object and the
  # content of the response body.
  def self.on_get(&block)
    @@on_get = block
  end


  # Guard keeps the alias from being applied twice if this file is
  # loaded more than once (which would make the real method unreachable).
  unless method_defined?(:mocked_request_get)
    # Preserve the real implementation under a new name.
    alias :mocked_request_get :request_get

    # NOTE(review): the real request_get takes (path, initheader = nil)
    # and an optional block; this override requires headers and drops
    # the block — fine for these tests, confirm before reuse.
    def request_get(path, headers)
      # If we have prescribed behavior for the next search, execute it,
      # otherwise, go with the default.
      if @@on_get
        response, body = @@on_get.call(@address, path, headers)
        # Stuff the body into the response. No other way, since read_body
        # attempts to read from a socket and we're too lazy to stub a socket.
        response.instance_variable_set(:@mock_body, body.to_s)
        # Override read_body on this one response object only.
        class << response
          def read_body()
            @mock_body
          end
        end
        response
      else
        mocked_request_get(path, headers)
      end
    end

  end

end
54
+
@@ -0,0 +1,24 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documention: http://labnotes.org
6
+
7
+
8
+ require "rubygems"
9
+ require "test/unit"
10
+ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
11
+
12
+
13
# Placeholder test case for the HTML::Node extensions (node_ext).
# NOTE(review): contains no assertions yet — the suite passes trivially.
class NodeExtTest < Test::Unit::TestCase

  # No fixtures required yet.
  def setup
  end

  def teardown
  end

  # TODO: add tests covering the node extensions (e.g. Node#detach).
  def test_add_tests
  end

end
@@ -0,0 +1,318 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documention: http://labnotes.org
6
+
7
+
8
+ require "rubygems"
9
+ require "test/unit"
10
+ require "time" # rfc2822
11
+ require "webrick"
12
+ require "webrick/https"
13
+ require "logger"
14
+ require "stringio"
15
+ require File.join(File.dirname(__FILE__), "mock_net_http")
16
+ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
17
+
18
+
19
+ class ReaderTest < Test::Unit::TestCase
20
+
21
+ include Scraper
22
+
23
+
24
# Options for the throwaway WEBrick server the tests spin up.
# Fix: the original spelled the key :BindAddredd, so WEBrick silently
# ignored it and bound to all interfaces instead of loopback only.
WEBRICK_OPTIONS = {
  :BindAddress=>"127.0.0.1",
  :Port=>2000,
  :Logger=>Logger.new(StringIO.new) # /dev/null
}

# URL of the page mounted on the local test server.
WEBRICK_TEST_URL = "http://127.0.0.1:2000/test.html"
31
+
32
+
33
# Clear any stubbed Net::HTTP behavior before each test, so a mock
# installed by a previous test never leaks in.
def setup
  Net::HTTP.reset_on_get
end

# And clear again afterwards, so tests outside this case see the
# real Net::HTTP behavior.
def teardown
  Net::HTTP.reset_on_get
end
40
+
41
+
42
+ #
43
+ # Tests read_page.
44
+ #
45
+
46
# read_page should send the path+query and the User-Agent header it
# was given, and surface body/cache headers on the returned Page.
def test_should_pass_path_and_user_agent
  # Test path, query string and user agent.
  Net::HTTP.on_get do |address, path, headers|
    assert_equal "localhost", address
    assert_equal "/path?query", path
    assert_equal "MyUserAgent", headers["User-Agent"]
    [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), "nothing"]
  end
  response = Reader.read_page("http://localhost/path?query", :user_agent=>"MyUserAgent")
  assert_equal "http://localhost/path?query", response.url.to_s
  assert_equal "nothing", response.content
  assert_equal nil, response.last_modified
  assert_equal nil, response.etag
end


# Both a raised timeout and an HTTP 408 should map to HTTPTimeoutError.
def test_should_handle_http_and_timeout_errors
  # Test timeout error and HTTP status that we can't process.
  Net::HTTP.on_get { |address, path, headers| raise TimeoutError }
  assert_raise(Reader::HTTPTimeoutError) do
    response = Reader.read_page("http://localhost/path?query")
  end
  Net::HTTP.on_get { |address, path, headers| [Net::HTTPRequestTimeOut.new(Net::HTTP.version_1_2, 408, "Timeout"),""] }
  assert_raise(Reader::HTTPTimeoutError) do
    response = Reader.read_page("http://localhost/path?query")
  end
end
73
+
74
+
75
# An endless redirect loop (301 or generic 3xx) must stop at the
# redirect limit with HTTPRedirectLimitError.
def test_should_fail_on_too_many_redirects
  # Test too many redirections.
  Net::HTTP.on_get do |address, path, headers|
    response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
    response["location"] = "http://localhost"
    [response, ""]
  end
  assert_raise(Reader::HTTPRedirectLimitError) do
    response = Reader.read_page("http://localhost/path?query")
  end
  Net::HTTP.on_get do |address, path, headers|
    response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
    response["location"] = "http://localhost"
    [response, ""]
  end
  assert_raise(Reader::HTTPRedirectLimitError) do
    response = Reader.read_page("http://localhost/path?query")
  end
end


# Redirecting to a non-HTTP scheme must raise HTTPInvalidURLError.
def test_should_validate_redirect_url
  # Test validation of redirection URI.
  Net::HTTP.on_get do |address, path, headers|
    response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
    response["location"] = "ftp://notsupported"
    [response, ""]
  end
  assert_raise(Reader::HTTPInvalidURLError) do
    response = Reader.read_page("http://localhost/path?query")
  end
end
107
+
108
+
109
# A generic (non-permanent) redirect is followed, but the returned
# Page keeps reporting the ORIGINAL request URL.
def test_should_support_redirection
  # Test working redirection. Redirect only once and test response URL.
  # Should be new URL for permanent redirect, same URL for all other redirects.
  Net::HTTP.on_get do |address, path, headers|
    # First request has a non-empty path and gets redirected; the
    # follow-up to http://localhost has an empty path and succeeds.
    if path.empty?
      [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
    else
      response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
      response["Location"] = "http://localhost"
      [response, ""]
    end
  end
  assert_nothing_raised() do
    response = Reader.read_page("http://localhost/path?query")
    assert_equal "http://localhost/path?query", response.url.to_s
  end
end


# A 301 permanent redirect is followed and the returned Page reports
# the NEW canonical URL.
def test_should_support_permanent_redirection
  # Test working redirection. Redirect only once and test response URL.
  # Should be new URL for permanent redirect, same URL for all other redirects.
  Net::HTTP.on_get do |address, path, headers|
    if path == "/"
      [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
    else
      response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
      response["location"] = "http://localhost/"
      [response, ""]
    end
  end
  assert_nothing_raised() do
    response = Reader.read_page("http://localhost/path?query")
    assert_equal "http://localhost/", response.url.to_s
  end
end


# A relative Location header ("somewhere") must be resolved against
# the current request URI.
def test_should_support_partial_location_redirection
  # Test working redirection. Redirect only once and test response URL.
  # Should be new URL for permanent redirect, same URL for all other redirects.
  Net::HTTP.on_get do |address, path, headers|
    if path == "/somewhere"
      [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
    else
      response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
      response["location"] = "somewhere"
      [response, ""]
    end
  end
  assert_nothing_raised() do
    response = Reader.read_page("http://localhost/path?query")
    assert_equal "http://localhost/somewhere", response.url.to_s
  end
end
164
+
165
+
166
# Cache validators: read_page surfaces Last-Modified/ETag from the
# response; sending them back yields a 304 with nil content and the
# validators echoed; sending none yields the full body again.
def test_should_use_cache_control
  # Test Last Modified and ETag headers. First, that they are correctly
  # returned from headers to response object. Next, that passing right
  # headers in options returns nil body and same values (no change),
  # passing wrong/no headers, returns page.
  time = Time.new.rfc2822
  Net::HTTP.on_get do |address, path, headers|
    response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
    response["Last-Modified"] = time
    response["ETag"] = "etag"
    [response, "nothing"]
  end
  response = Reader.read_page("http://localhost/path?query")
  assert_equal time, response.last_modified
  assert_equal "etag", response.etag
  Net::HTTP.on_get do |address, path, headers|
    # Mirrors read_page's request header names (Last-Modified/ETag).
    if headers["Last-Modified"] == time and headers["ETag"] == "etag"
      [Net::HTTPNotModified.new(Net::HTTP.version_1_2, 304, "Same"), ""]
    else
      [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), "nothing"]
    end
  end
  response = Reader.read_page("http://localhost/path?query")
  assert_equal "nothing", response.content
  response = Reader.read_page("http://localhost/path?query", :last_modified=>time, :etag=>"etag")
  assert_equal nil, response.content
  assert_equal time, response.last_modified
  assert_equal "etag", response.etag
end


# The charset parameter of the Content-Type header is exposed as the
# Page's encoding (value is not validated, hence "bogus" passes through).
def test_should_find_encoding
  Net::HTTP.on_get do |address, path, headers|
    response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
    response["content-type"] = "text/html; charset=bogus"
    [response, ""]
  end
  response = Reader.read_page("http://localhost/path?query")
  assert_equal "bogus", response.encoding
end
208
+
209
+
210
+ #
211
+ # Tests parse_page.
212
+ #
213
+
214
# parse_page (default parser) should produce a navigable document with
# head, body and the paragraph content intact.
def test_should_parse_html_page
  html = Reader.parse_page("<html><head></head><body><p>something</p></body></html>").document
  assert_equal 1, html.find_all(:tag=>"head").size
  assert_equal 1, html.find_all(:tag=>"body").size
  assert_equal 1, html.find(:tag=>"body").find_all(:tag=>"p").size
  assert_equal "something", html.find(:tag=>"body").find(:tag=>"p").children.join
end


# With Tidy (the default when an options hash is passed), a bare
# fragment gets the missing head/body structure added.
def test_should_use_tidy_if_specified
  # This will only work with Tidy which adds the head/body parts,
  # HTMLParser doesn't fix the HTML.
  html = Reader.parse_page("<p>something</p>", nil, {}).document
  assert_equal 1, html.find_all(:tag=>"head").size
  assert_equal 1, html.find_all(:tag=>"body").size
  assert_equal 1, html.find(:tag=>"body").find_all(:tag=>"p").size
  assert_equal "something", html.find(:tag=>"body").find(:tag=>"p").children.join
end
232
+
233
+
234
+ #
235
+ # Other tests.
236
+ #
237
+
238
# End-to-end against a real local WEBrick server: the HTTP-header
# charset is used unless the page's http-equiv meta tag overrides it.
def test_should_handle_encoding_correctly
  # Test content encoding returned from HTTP server.
  with_webrick do |server, params|
    server.mount_proc "/test.html" do |req,resp|
      resp["Content-Type"] = "text/html; charset=my-encoding"
      resp.body = "Content comes here"
    end
    page = Reader.read_page(WEBRICK_TEST_URL)
    page = Reader.parse_page(page.content, page.encoding)
    assert_equal "my-encoding", page.encoding
  end
  # Test content encoding in HTML http-equiv header
  # that overrides content encoding returned in HTTP.
  with_webrick do |server, params|
    server.mount_proc "/test.html" do |req,resp|
      resp["Content-Type"] = "text/html; charset=my-encoding"
      # NOTE(review): the meta tag uses value= rather than content=;
      # parse_page extracts charset= from the whole tag, so it still
      # picks up "other-encoding".
      resp.body = %Q{
        <html>
          <head>
            <meta http-equiv="content-type" value="text/html; charset=other-encoding">
          </head>
          <body></body>
        </html>
      }
    end
    page = Reader.read_page(WEBRICK_TEST_URL)
    page = Reader.parse_page(page.content, page.encoding)
    assert_equal "other-encoding", page.encoding
  end
end

# End-to-end HTTPS: an SSL-only WEBrick server must reject plain HTTP
# and serve the page over https.
def test_should_support_https
  begin
    options = WEBRICK_OPTIONS.dup.update(
      :SSLEnable=>true,
      :SSLVerifyClient => ::OpenSSL::SSL::VERIFY_NONE,
      :SSLCertName => [ ["C","JP"], ["O","WEBrick.Org"], ["CN", "WWW"] ]
    )
    server = WEBrick::HTTPServer.new(options)
    trap("INT") { server.shutdown }
    Thread.new { server.start }
    server.mount_proc "/test.html" do |req,resp|
      resp.body = %Q{
        <html>
          <head>
            <title>test https</title>
          </head>
          <body></body>
        </html>
      }
    end
    # Make sure page not HTTP accessible.
    assert_raises(Reader::HTTPUnspecifiedError) do
      Reader.read_page(WEBRICK_TEST_URL)
    end
    page = Reader.read_page(WEBRICK_TEST_URL.gsub("http", "https"))
    page = Reader.parse_page(page.content, page.encoding)
    assert_equal "<title>test https</title>",
      page.document.find(:tag=>"title").to_s
    server.shutdown
  ensure
    # Shutdown is idempotent; guard covers the case where construction
    # itself failed and server is nil.
    server.shutdown if server
  end
end
302
+
303
+
304
+ private
305
+
306
# Runs the given block with a live local WEBrick server, guaranteeing
# the server is shut down afterwards even if the block raises.
# Yields the server and the (optional) params to the block.
def with_webrick(params = nil)
  server = WEBrick::HTTPServer.new(WEBRICK_OPTIONS)
  trap("INT") { server.shutdown }
  Thread.new { server.start }
  yield server, params
  server.shutdown
ensure
  server.shutdown if server
end
317
+
318
+ end