assaf-scrapi 1.2.1

@@ -0,0 +1,93 @@
+ require "time"
+
+
+ module Scraper
+
+   module Microformats
+
+     class HCard < Scraper::Base
+
+       process ".fn", :fn=>:text
+       process ".given-name", :given_name=>:text
+       process ".family-name", :family_name=>:text
+       process "img.photo", :photo=>"@src"
+       process "a.url", :url=>"@href"
+
+       result :fn, :given_name, :family_name, :photo, :url
+
+       def collect()
+         unless fn
+           if self.fn = given_name # assignment, not comparison
+             self.given_name << " #{family_name}" if family_name # fn aliases this string, so it picks up the append
+           else
+             self.fn = family_name
+           end
+         end
+       end
+
+     end
+
+
+     class HAtom < Scraper::Base
+
+       class Entry < Scraper::Base
+
+         array :content, :tags
+
+         process ".entry-title", :title=>:text
+         process ".entry-content", :content=>:element
+         process ".entry-summary", :summary=>:element
+         process "a[rel~=bookmark]", :permalink=>["@href"]
+         process ".author.vcard, .author .vcard", :author=>HCard
+         process ".published", :published=>["abbr@title", :text]
+         process ".updated", :updated=>["abbr@title", :text]
+         process "a[rel~=tag]", :tags=>:text
+
+         def collect()
+           self.published = Time.parse(published)
+           self.updated = updated ? Time.parse(updated) : published
+         end
+
+         result :title, :content, :summary, :permalink, :author, :published, :updated, :tags
+
+       end
+
+       class Feed < Scraper::Base
+
+         array :entries
+
+         process ".hentry", :entries=>Entry
+
+         def result()
+           entries
+         end
+
+       end
+
+       array :feeds, :entries
+
+       # Skip feeds, so we don't process them twice.
+       process ".hfeed", :skip=>true, :feeds=>Feed
+       # And so we can collect unwrapped entries into a separate feed.
+       process ".hentry", :skip=>true, :entries=>Entry
+       # And collect the first remaining hcard as the default author.
+       process ".vcard", :hcard=>HCard
+
+       def collect()
+         @feeds ||= []
+         @feeds << entries if entries
+         for feed in feeds
+           for entry in feed
+             entry.author = hcard unless entry.author
+           end
+         end
+       end
+
+       result :feeds
+
+     end
+
+   end
+
+ end
+
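For orientation, a minimal usage sketch (editorial, not part of the diff): scraping an hCard from an HTML string with the scraper defined above. It assumes scrAPI's Scraper::Base.scrape class method, which accepts an HTML string and runs it through the configured parser; the markup and values are illustrative.

    require "scrapi"

    html = %Q{<div class="vcard"><span class="fn">Assaf Arkin</span>
              <a class="url" href="http://labnotes.org">labnotes</a></div>}
    card = Scraper::Microformats::HCard.scrape(html)
    puts card.fn    # => "Assaf Arkin"
    puts card.url   # => "http://labnotes.org"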
@@ -0,0 +1,240 @@
+ # ScrAPI toolkit for Ruby
+ #
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
+ # Developed for http://co.mments.com
+ # Code and documentation: http://labnotes.org
+
+
+ require "uri"
+ require "net/http"
+ require "net/https"
+ begin
+   require "rubygems"
+   require "tidy"
+ rescue LoadError
+ end
+
+
+ module Scraper
+
+   module Reader
+
+     class HTTPError < StandardError
+
+       attr_reader :cause
+
+       def initialize(cause = nil)
+         @cause = cause
+       end
+
+
+       def to_s
+         @cause ? "#{super}: #{@cause}" : super
+       end
+
+     end
+
+     class HTTPTimeoutError < HTTPError ; end
+     class HTTPUnspecifiedError < HTTPError ; end
+     class HTTPNotFoundError < HTTPError ; end
+     class HTTPNoAccessError < HTTPError ; end
+     class HTTPInvalidURLError < HTTPError ; end
+     class HTTPRedirectLimitError < HTTPError ; end
+
+
+     class HTMLParseError < StandardError
+
+       attr_reader :cause
+
+       def initialize(cause = nil)
+         @cause = cause
+       end
+
+       def to_s
+         @cause ? "#{super}: #{@cause}" : super
+       end
+
+     end
+
+
+     unless const_defined? :REDIRECT_LIMIT
+       REDIRECT_LIMIT = 3
+       DEFAULT_TIMEOUT = 30
+       PARSERS = [:tidy, :html_parser]
+     end
+
+     unless const_defined? :TIDY_OPTIONS
+       TIDY_OPTIONS = {
+         :output_xhtml=>true,
+         :show_errors=>0,
+         :show_warnings=>false,
+         :wrap=>0,
+         :wrap_sections=>false,
+         :force_output=>true,
+         :quiet=>true,
+         :tidy_mark=>false
+       }
+     end
+
+
+     Page = Struct.new(:url, :content, :encoding, :last_modified, :etag)
+     Parsed = Struct.new(:document, :encoding)
+
+
+     module_function
+
+     # :call-seq:
+     #   read_page(url, options?) => response
+     #
+     # Reads a Web page and returns its URL, content and cache control headers.
+     #
+     # The request reads a Web page at the specified URL (a URI object or a string).
+     # It accepts the following options:
+     # * :last_modified -- Last modified header (from a previous request).
+     # * :etag -- ETag header (from a previous request).
+     # * :redirect_limit -- Number of redirects allowed (default is 3).
+     # * :user_agent -- The User-Agent header to send.
+     # * :http_timeout -- HTTP open connection/read timeouts (in seconds).
+     #
+     # It returns a Page struct with the following members:
+     # * :url -- The URL of the requested page (may change by permanent redirect)
+     # * :content -- The content of the response (may be nil if cached)
+     # * :encoding -- Document encoding for the page, from the Content-Type
+     #   charset parameter (may be nil)
+     # * :last_modified -- Last modified cache control header (may be nil)
+     # * :etag -- ETag cache control header (may be nil)
+     # If the page has not been modified since the last request, the content is nil.
+     #
+     # Raises HTTPError if an error prevents it from reading the page.
+     def read_page(url, options = nil)
+       options ||= {}
+       redirect_limit = options[:redirect_limit] || REDIRECT_LIMIT
+       raise HTTPRedirectLimitError if redirect_limit == 0
+       if url.is_a?(URI)
+         uri = url
+       else
+         begin
+           uri = URI.parse(url)
+         rescue Exception=>error
+           raise HTTPInvalidURLError.new(error)
+         end
+       end
+       raise HTTPInvalidURLError unless uri.scheme =~ /^http(s?)$/
+       begin
+         http = Net::HTTP.new(uri.host, uri.port)
+         http.use_ssl = (uri.scheme == "https")
+         http.close_on_empty_response = true
+         http.open_timeout = http.read_timeout = options[:http_timeout] || DEFAULT_TIMEOUT
+         path = uri.path.dup # required so we don't modify path
+         path << "?#{uri.query}" if uri.query
+         # TODO: Specify which content types are accepted.
+         # TODO: GZip support.
+         headers = {}
+         headers["User-Agent"] = options[:user_agent] if options[:user_agent]
+         headers["Last-Modified"] = options[:last_modified] if options[:last_modified]
+         headers["ETag"] = options[:etag] if options[:etag]
+         response = http.request_get(path, headers)
+         # TODO: Ignore content types that do not map to HTML.
+       rescue TimeoutError=>error
+         raise HTTPTimeoutError.new(error)
+       rescue Exception=>error
+         raise HTTPUnspecifiedError.new(error)
+       end
+       case response
+       when Net::HTTPSuccess
+         encoding = if content_type = response["Content-Type"]
+           if match = content_type.match(/charset=([^\s]+)/i)
+             match[1]
+           end
+         end
+         return Page[(options[:source_url] || uri), response.body, encoding,
+                     response["Last-Modified"], response["ETag"]]
+       when Net::HTTPNotModified
+         return Page[(options[:source_url] || uri), nil, nil,
+                     options[:last_modified], options[:etag]]
+       when Net::HTTPMovedPermanently
+         return read_page((uri.merge(response["location"]) rescue nil), # New URL takes effect
+                          :last_modified=>options[:last_modified],
+                          :etag=>options[:etag],
+                          :redirect_limit=>redirect_limit-1)
+       when Net::HTTPRedirection
+         return read_page((uri.merge(response["location"]) rescue nil),
+                          :last_modified=>options[:last_modified],
+                          :etag=>options[:etag],
+                          :redirect_limit=>redirect_limit-1,
+                          :source_url=>(options[:source_url] || uri)) # Old URL still in effect
+       when Net::HTTPNotFound
+         raise HTTPNotFoundError
+       when Net::HTTPUnauthorized, Net::HTTPForbidden
+         raise HTTPNoAccessError
+       when Net::HTTPRequestTimeOut
+         raise HTTPTimeoutError
+       else
+         raise HTTPUnspecifiedError
+       end
+     end
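A short sketch (editorial) of the conditional-request flow read_page implements, with a hypothetical URL:

    page = Scraper::Reader.read_page("http://example.com/",
      :user_agent=>"MyAgent/1.0")
    # Pass the cache headers back on the next request; content comes
    # back nil when the server answers 304 Not Modified.
    again = Scraper::Reader.read_page(page.url,
      :last_modified=>page.last_modified, :etag=>page.etag)
    puts again.content ? "changed" : "not modified"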
+
+
+     # :call-seq:
+     #   parse_page(html, encoding?, options?, parser) => parsed
+     #
+     # Parses an HTML page and returns a Parsed struct holding the document
+     # (root HTML element) and the encoding. Raises HTMLParseError if it cannot parse the HTML.
+     #
+     # Options are passed to the parser. For example, when using Tidy
+     # you can pass Tidy cleanup options in the hash.
+     #
+     # The last argument specifies which parser to use (see PARSERS).
+     # By default Tidy is used.
+     def parse_page(content, encoding = nil, options = nil, parser = :tidy)
+       begin
+         # Get the document encoding from the meta header.
+         if meta = content.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
+           if meta = meta[0].match(/charset=([\w-]*)/i)
+             encoding = meta[1]
+           end
+         end
+         encoding ||= "utf8"
+         case (parser || :tidy)
+         when :tidy
+           # Make sure the Tidy path is set and always apply the default
+           # options (these only control things like errors, output type).
+           find_tidy
+           options = (options || {}).update(TIDY_OPTIONS)
+           options[:input_encoding] = encoding.gsub("-", "").downcase
+           document = Tidy.open(options) do |tidy|
+             html = tidy.clean(content)
+             HTML::Document.new(html).find(:tag=>"html")
+           end
+         when :html_parser
+           document = HTML::HTMLParser.parse(content).root
+         else
+           raise HTMLParseError, "No such parser: #{parser}"
+         end
+         return Parsed[document, encoding]
+       rescue Exception=>error
+         raise HTMLParseError.new(error)
+       end
+     end
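And a matching sketch for parse_page, forcing the bundled HTMLParser so it runs without Tidy installed:

    parsed = Scraper::Reader.parse_page(
      "<html><body><p>hello</p></body></html>", nil, nil, :html_parser)
    puts parsed.encoding                       # => "utf8" (no meta/charset found)
    puts parsed.document.find(:tag=>"p").to_s  # the <p> element, rendered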
+
+
+     protected
+
+     module_function
+
+     def find_tidy()
+       return if Tidy.path
+       begin
+         Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.so")
+       rescue LoadError
+         begin
+           Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dll")
+         rescue LoadError
+           Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dylib")
+         end
+       end
+     end
+
+   end
+
+ end
@@ -0,0 +1,8 @@
+ # Conditional loads, since we may have these libraries elsewhere,
+ # e.g. when using Rails with the assert_select plugin.
+ require File.join(File.dirname(__FILE__), "html", "document") unless defined?(HTML::Document)
+ require File.join(File.dirname(__FILE__), "html", "node_ext") unless defined?(HTML::Node.detach)
+ require File.join(File.dirname(__FILE__), "html", "selector") unless defined?(HTML::Selector)
+ require File.join(File.dirname(__FILE__), "html", "htmlparser") unless defined?(HTML::HTMLParser)
+
+ require File.join(File.dirname(__FILE__), "scraper", "base") unless defined?(Scraper::Base)
Binary file
Binary file
@@ -0,0 +1,54 @@
+ require "net/http"
+
+ class Net::HTTP
+
+   @@on_get = nil
+
+   # Reset get method to default behavior.
+   def self.reset_on_get
+     @@on_get = nil
+   end
+
+
+   # :call-seq:
+   #   on_get { |address, path, headers| ... => [response, body] }
+   #
+   # Specifies alternative behavior for the next execution of the get method.
+   # This change applies to all instances of Net::HTTP, so do not use
+   # this method when running tests in parallel.
+   #
+   # The method takes a single block that accepts three arguments:
+   # the address (host), path and headers (hash). It must return an
+   # array with two values: the Net::HTTPResponse object and the
+   # content of the response body.
+   def self.on_get(&block)
+     @@on_get = block
+   end
+
+
+   unless method_defined?(:mocked_request_get)
+     alias :mocked_request_get :request_get
+
+     def request_get(path, headers)
+       # If we have prescribed behavior for the next request, execute it,
+       # otherwise, go with the default.
+       if @@on_get
+         response, body = @@on_get.call(@address, path, headers)
+         # Stuff the body into the response. No other way, since read_body
+         # attempts to read from a socket and we're too lazy to stub a socket.
+         response.instance_variable_set(:@mock_body, body.to_s)
+         class << response
+           def read_body()
+             @mock_body
+           end
+         end
+         response
+       else
+         mocked_request_get(path, headers)
+       end
+     end
+
+   end
+
+ end
+
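Typical use, as in the tests that follow: prescribe one canned response, exercise the reader, then restore the real behavior.

    Net::HTTP.on_get do |address, path, headers|
      # Return 200 OK with a fixed body, whatever was requested.
      [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), "<html></html>"]
    end
    page = Scraper::Reader.read_page("http://localhost/anything")
    Net::HTTP.reset_on_get  # back to real sockets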
@@ -0,0 +1,24 @@
+ # ScrAPI toolkit for Ruby
+ #
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
+ # Developed for http://co.mments.com
+ # Code and documentation: http://labnotes.org
+
+
+ require "rubygems"
+ require "test/unit"
+ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+
+
+ class NodeExtTest < Test::Unit::TestCase
+
+   def setup
+   end
+
+   def teardown
+   end
+
+   def test_add_tests
+   end
+
+ end
@@ -0,0 +1,318 @@
+ # ScrAPI toolkit for Ruby
+ #
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
+ # Developed for http://co.mments.com
+ # Code and documentation: http://labnotes.org
+
+
+ require "rubygems"
+ require "test/unit"
+ require "time" # rfc2822
+ require "webrick"
+ require "webrick/https"
+ require "logger"
+ require "stringio"
+ require File.join(File.dirname(__FILE__), "mock_net_http")
+ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+
+
+ class ReaderTest < Test::Unit::TestCase
+
+   include Scraper
+
+
+   WEBRICK_OPTIONS = {
+     :BindAddress=>"127.0.0.1",
+     :Port=>2000,
+     :Logger=>Logger.new(StringIO.new) # /dev/null
+   }
+
+   WEBRICK_TEST_URL = "http://127.0.0.1:2000/test.html"
+
+
+   def setup
+     Net::HTTP.reset_on_get
+   end
+
+   def teardown
+     Net::HTTP.reset_on_get
+   end
+
+
+   #
+   # Tests read_page.
+   #
+
+   def test_should_pass_path_and_user_agent
+     # Test path, query string and user agent.
+     Net::HTTP.on_get do |address, path, headers|
+       assert_equal "localhost", address
+       assert_equal "/path?query", path
+       assert_equal "MyUserAgent", headers["User-Agent"]
+       [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), "nothing"]
+     end
+     response = Reader.read_page("http://localhost/path?query", :user_agent=>"MyUserAgent")
+     assert_equal "http://localhost/path?query", response.url.to_s
+     assert_equal "nothing", response.content
+     assert_equal nil, response.last_modified
+     assert_equal nil, response.etag
+   end
+
+
+   def test_should_handle_http_and_timeout_errors
+     # Test timeout error and HTTP status that we can't process.
+     Net::HTTP.on_get { |address, path, headers| raise TimeoutError }
+     assert_raise(Reader::HTTPTimeoutError) do
+       response = Reader.read_page("http://localhost/path?query")
+     end
+     Net::HTTP.on_get { |address, path, headers| [Net::HTTPRequestTimeOut.new(Net::HTTP.version_1_2, 408, "Timeout"),""] }
+     assert_raise(Reader::HTTPTimeoutError) do
+       response = Reader.read_page("http://localhost/path?query")
+     end
+   end
+
+
+   def test_should_fail_on_too_many_redirects
+     # Test too many redirections.
+     Net::HTTP.on_get do |address, path, headers|
+       response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
+       response["location"] = "http://localhost"
+       [response, ""]
+     end
+     assert_raise(Reader::HTTPRedirectLimitError) do
+       response = Reader.read_page("http://localhost/path?query")
+     end
+     Net::HTTP.on_get do |address, path, headers|
+       response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
+       response["location"] = "http://localhost"
+       [response, ""]
+     end
+     assert_raise(Reader::HTTPRedirectLimitError) do
+       response = Reader.read_page("http://localhost/path?query")
+     end
+   end
+
+
+   def test_should_validate_redirect_url
+     # Test validation of redirection URI.
+     Net::HTTP.on_get do |address, path, headers|
+       response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
+       response["location"] = "ftp://notsupported"
+       [response, ""]
+     end
+     assert_raise(Reader::HTTPInvalidURLError) do
+       response = Reader.read_page("http://localhost/path?query")
+     end
+   end
+
+
+   def test_should_support_redirection
+     # Test working redirection. Redirect only once and test response URL.
+     # Should be new URL for permanent redirect, same URL for all other redirects.
+     Net::HTTP.on_get do |address, path, headers|
+       if path.empty?
+         [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
+       else
+         response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
+         response["Location"] = "http://localhost"
+         [response, ""]
+       end
+     end
+     assert_nothing_raised() do
+       response = Reader.read_page("http://localhost/path?query")
+       assert_equal "http://localhost/path?query", response.url.to_s
+     end
+   end
+
+
+   def test_should_support_permanent_redirection
+     # Test working redirection. Redirect only once and test response URL.
+     # Should be new URL for permanent redirect, same URL for all other redirects.
+     Net::HTTP.on_get do |address, path, headers|
+       if path == "/"
+         [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
+       else
+         response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
+         response["location"] = "http://localhost/"
+         [response, ""]
+       end
+     end
+     assert_nothing_raised() do
+       response = Reader.read_page("http://localhost/path?query")
+       assert_equal "http://localhost/", response.url.to_s
+     end
+   end
+
+
+   def test_should_support_partial_location_redirection
+     # Test working redirection. Redirect only once and test response URL.
+     # Should be new URL for permanent redirect, same URL for all other redirects.
+     Net::HTTP.on_get do |address, path, headers|
+       if path == "/somewhere"
+         [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
+       else
+         response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
+         response["location"] = "somewhere"
+         [response, ""]
+       end
+     end
+     assert_nothing_raised() do
+       response = Reader.read_page("http://localhost/path?query")
+       assert_equal "http://localhost/somewhere", response.url.to_s
+     end
+   end
+
+
+   def test_should_use_cache_control
+     # Test Last Modified and ETag headers. First, that they are correctly
+     # returned from headers to response object. Next, that passing right
+     # headers in options returns nil body and same values (no change),
+     # passing wrong/no headers returns the page.
+     time = Time.new.rfc2822
+     Net::HTTP.on_get do |address, path, headers|
+       response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
+       response["Last-Modified"] = time
+       response["ETag"] = "etag"
+       [response, "nothing"]
+     end
+     response = Reader.read_page("http://localhost/path?query")
+     assert_equal time, response.last_modified
+     assert_equal "etag", response.etag
+     Net::HTTP.on_get do |address, path, headers|
+       if headers["Last-Modified"] == time and headers["ETag"] == "etag"
+         [Net::HTTPNotModified.new(Net::HTTP.version_1_2, 304, "Same"), ""]
+       else
+         [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), "nothing"]
+       end
+     end
+     response = Reader.read_page("http://localhost/path?query")
+     assert_equal "nothing", response.content
+     response = Reader.read_page("http://localhost/path?query", :last_modified=>time, :etag=>"etag")
+     assert_equal nil, response.content
+     assert_equal time, response.last_modified
+     assert_equal "etag", response.etag
+   end
+
+
+   def test_should_find_encoding
+     # Test that the document encoding is picked up from the charset
+     # parameter of the Content-Type response header.
+     Net::HTTP.on_get do |address, path, headers|
+       response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
+       response["content-type"] = "text/html; charset=bogus"
+       [response, ""]
+     end
+     response = Reader.read_page("http://localhost/path?query")
+     assert_equal "bogus", response.encoding
+   end
+
+
+   #
+   # Tests parse_page.
+   #
+
+   def test_should_parse_html_page
+     html = Reader.parse_page("<html><head></head><body><p>something</p></body></html>").document
+     assert_equal 1, html.find_all(:tag=>"head").size
+     assert_equal 1, html.find_all(:tag=>"body").size
+     assert_equal 1, html.find(:tag=>"body").find_all(:tag=>"p").size
+     assert_equal "something", html.find(:tag=>"body").find(:tag=>"p").children.join
+   end
+
+
+   def test_should_use_tidy_if_specified
+     # This will only work with Tidy which adds the head/body parts,
+     # HTMLParser doesn't fix the HTML.
+     html = Reader.parse_page("<p>something</p>", nil, {}).document
+     assert_equal 1, html.find_all(:tag=>"head").size
+     assert_equal 1, html.find_all(:tag=>"body").size
+     assert_equal 1, html.find(:tag=>"body").find_all(:tag=>"p").size
+     assert_equal "something", html.find(:tag=>"body").find(:tag=>"p").children.join
+   end
+
+
+   #
+   # Other tests.
+   #
+
+   def test_should_handle_encoding_correctly
+     # Test content encoding returned from HTTP server.
+     with_webrick do |server, params|
+       server.mount_proc "/test.html" do |req,resp|
+         resp["Content-Type"] = "text/html; charset=my-encoding"
+         resp.body = "Content comes here"
+       end
+       page = Reader.read_page(WEBRICK_TEST_URL)
+       page = Reader.parse_page(page.content, page.encoding)
+       assert_equal "my-encoding", page.encoding
+     end
+     # Test content encoding in HTML http-equiv header
+     # that overrides content encoding returned in HTTP.
+     with_webrick do |server, params|
+       server.mount_proc "/test.html" do |req,resp|
+         resp["Content-Type"] = "text/html; charset=my-encoding"
+         resp.body = %Q{
+           <html>
+             <head>
+               <meta http-equiv="content-type" content="text/html; charset=other-encoding">
+             </head>
+             <body></body>
+           </html>
+         }
+       end
+       page = Reader.read_page(WEBRICK_TEST_URL)
+       page = Reader.parse_page(page.content, page.encoding)
+       assert_equal "other-encoding", page.encoding
+     end
+   end
+
+   def test_should_support_https
+     begin
+       options = WEBRICK_OPTIONS.dup.update(
+         :SSLEnable=>true,
+         :SSLVerifyClient => ::OpenSSL::SSL::VERIFY_NONE,
+         :SSLCertName => [ ["C","JP"], ["O","WEBrick.Org"], ["CN", "WWW"] ]
+       )
+       server = WEBrick::HTTPServer.new(options)
+       trap("INT") { server.shutdown }
+       Thread.new { server.start }
+       server.mount_proc "/test.html" do |req,resp|
+         resp.body = %Q{
+           <html>
+             <head>
+               <title>test https</title>
+             </head>
+             <body></body>
+           </html>
+         }
+       end
+       # Make sure the page is not accessible over plain HTTP.
+       assert_raises(Reader::HTTPUnspecifiedError) do
+         Reader.read_page(WEBRICK_TEST_URL)
+       end
+       page = Reader.read_page(WEBRICK_TEST_URL.gsub("http", "https"))
+       page = Reader.parse_page(page.content, page.encoding)
+       assert_equal "<title>test https</title>",
+                    page.document.find(:tag=>"title").to_s
+       server.shutdown
+     ensure
+       server.shutdown if server
+     end
+   end
+
+
+   private
+
+   def with_webrick(params = nil)
+     begin
+       server = WEBrick::HTTPServer.new(WEBRICK_OPTIONS)
+       trap("INT") { server.shutdown }
+       Thread.new { server.start }
+       yield server, params
+       server.shutdown
+     ensure
+       server.shutdown if server
+     end
+   end
+
+ end