assaf-scrapi 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +36 -0
- data/MIT-LICENSE +20 -0
- data/README.rdoc +88 -0
- data/Rakefile +33 -0
- data/lib/html/document.rb +64 -0
- data/lib/html/htmlparser.rb +407 -0
- data/lib/html/node.rb +534 -0
- data/lib/html/node_ext.rb +86 -0
- data/lib/html/selector.rb +825 -0
- data/lib/html/tokenizer.rb +105 -0
- data/lib/html/version.rb +11 -0
- data/lib/scraper/base.rb +990 -0
- data/lib/scraper/microformats.rb +93 -0
- data/lib/scraper/reader.rb +240 -0
- data/lib/scrapi.rb +8 -0
- data/lib/tidy/libtidy.dll +0 -0
- data/lib/tidy/libtidy.so +0 -0
- data/test/mock_net_http.rb +54 -0
- data/test/node_ext_test.rb +24 -0
- data/test/reader_test.rb +318 -0
- data/test/scraper_test.rb +804 -0
- data/test/selector_test.rb +637 -0
- metadata +89 -0
@@ -0,0 +1,93 @@
|
|
1
|
+
require "time"
|
2
|
+
|
3
|
+
|
4
|
+
module Scraper
|
5
|
+
|
6
|
+
module Microformats
|
7
|
+
|
8
|
+
class HCard < Scraper::Base
|
9
|
+
|
10
|
+
process ".fn", :fn=>:text
|
11
|
+
process ".given-name", :given_name=>:text
|
12
|
+
process ".family-name", :family_name=>:text
|
13
|
+
process "img.photo", :photo=>"@src"
|
14
|
+
process "a.url", :url=>"@href"
|
15
|
+
|
16
|
+
result :fn, :given_name, :family_name, :photo, :url
|
17
|
+
|
18
|
+
def collect()
|
19
|
+
unless fn
|
20
|
+
if self.fn = given_name
|
21
|
+
self.given_name << " #{family_name}" if family_name
|
22
|
+
else
|
23
|
+
self.fn = family_name
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
29
|
+
|
30
|
+
|
31
|
+
# Scrapes an hAtom microformat page (http://microformats.org/wiki/hatom)
# into a list of feeds, each feed being a list of entries.
class HAtom < Scraper::Base

  # A single hAtom entry (an .hentry element).
  class Entry < Scraper::Base

    array :content, :tags

    process ".entry-title", :title=>:text
    process ".entry-content", :content=>:element
    process ".entry-summary", :summary=>:element
    process "a[rel~=bookmark]", :permalink=>["@href"]
    process ".author.vcard, .author .vcard", :author=>HCard
    process ".published", :published=>["abbr@title", :text]
    process ".updated", :updated=>["abbr@title", :text]
    process "a[rel~=tag]", :tags=>:text

    def collect()
      # Guard against entries with no .published element: Time.parse
      # raises (ArgumentError/TypeError) when handed nil.
      self.published = Time.parse(published) if published
      # Fall back on the published timestamp when .updated is absent.
      self.updated = updated ? Time.parse(updated) : published
    end

    result :title, :content, :summary, :permalink, :author, :published, :updated, :tags

  end

  # A feed (.hfeed element): its result is simply the list of entries.
  class Feed < Scraper::Base

    array :entries

    process ".hentry", :entries=>Entry

    def result()
      entries
    end

  end

  array :feeds, :entries

  # Skip feeds, so we don't process them twice.
  process ".hfeed", :skip=>true, :feeds=>Feed
  # And so we can collect unwrapped entries into a separate feed.
  process ".hentry", :skip=>true, :entries=>Entry
  # And collect the first remaining hcard as the default author.
  process ".vcard", :hcard=>HCard

  def collect()
    @feeds ||= []
    # Entries found outside any .hfeed become one more (implicit) feed.
    @feeds << entries if entries
    # Entries with no author of their own get the page-level hCard.
    feeds.each do |feed|
      feed.each do |entry|
        entry.author = hcard unless entry.author
      end
    end
  end

  result :feeds

end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
end
|
93
|
+
|
@@ -0,0 +1,240 @@
|
|
1
|
+
# ScrAPI toolkit for Ruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
|
4
|
+
# Developed for http://co.mments.com
|
5
|
+
# Code and documention: http://labnotes.org
|
6
|
+
|
7
|
+
|
8
|
+
require "uri"
|
9
|
+
require "net/http"
|
10
|
+
require "net/https"
|
11
|
+
begin
|
12
|
+
require "rubygems"
|
13
|
+
require "tidy"
|
14
|
+
rescue LoadError
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
module Scraper
|
19
|
+
|
20
|
+
module Reader
|
21
|
+
|
22
|
+
# Base class for all HTTP errors raised by the reader. Optionally wraps
# the lower-level exception (e.g. from URI or Net::HTTP) that caused it.
class HTTPError < StandardError

  # The wrapped lower-level exception, or nil.
  attr_reader :cause

  def initialize(cause = nil)
    @cause = cause
  end

  def to_s
    return super unless @cause
    "#{super}: #{@cause}"
  end

end
|
36
|
+
|
37
|
+
# Specific HTTP failure modes; all carry an optional wrapped cause.
class HTTPTimeoutError < HTTPError; end        # connection/read timeout or 408
class HTTPUnspecifiedError < HTTPError; end    # any other transport/status failure
class HTTPNotFoundError < HTTPError; end       # 404
class HTTPNoAccessError < HTTPError; end       # 401 / 403
class HTTPInvalidURLError < HTTPError; end     # unparsable or non-HTTP(S) URL
class HTTPRedirectLimitError < HTTPError; end  # too many redirects
|
43
|
+
|
44
|
+
|
45
|
+
# Raised when the HTML cannot be parsed (by Tidy or HTMLParser).
# Optionally wraps the parser's own exception as the cause.
class HTMLParseError < StandardError

  # The wrapped parser exception, or nil.
  attr_reader :cause

  def initialize(cause = nil)
    @cause = cause
  end

  def to_s
    return super unless @cause
    "#{super}: #{@cause}"
  end

end
|
58
|
+
|
59
|
+
|
60
|
+
unless const_defined? :REDIRECT_LIMIT
  # Maximum number of redirects read_page follows before raising
  # HTTPRedirectLimitError.
  REDIRECT_LIMIT = 3
  # Default HTTP open/read timeout, in seconds.
  DEFAULT_TIMEOUT = 30
  # Parsers accepted by parse_page.
  PARSERS = [:tidy, :html_parser]
end

unless const_defined? :TIDY_OPTIONS
  # Baseline Tidy settings, always applied on top of caller options.
  # These only control error reporting and output form, not cleanup.
  TIDY_OPTIONS = {
    :output_xhtml=>true,
    :show_errors=>0,
    :show_warnings=>false,
    :wrap=>0,
    :wrap_sections=>false,
    :force_output=>true,
    :quiet=>true,
    :tidy_mark=>false
  }
end
|
78
|
+
|
79
|
+
|
80
|
+
# A fetched page: final URL, raw content, declared encoding, and the
# cache-control headers (Last-Modified / ETag) for conditional requests.
Page = Struct.new(:url, :content, :encoding, :last_modified, :etag)
# A parsed page: the HTML document root and the encoding actually used.
Parsed = Struct.new(:document, :encoding)
|
82
|
+
|
83
|
+
|
84
|
+
module_function
|
85
|
+
|
86
|
+
# :call-seq:
#   read_page(url, options?) => response
#
# Reads a Web page and returns its URL, content and cache control headers.
#
# The request reads a Web page at the specified URL (a URI object, or a
# string that will be parsed into one). It accepts the following options:
# * :last_modified -- Last modified header (from a previous request).
# * :etag -- ETag header (from a previous request).
# * :redirect_limit -- Number of redirects allowed (default is 3).
# * :user_agent -- The User-Agent header to send.
# * :http_timeout -- HTTP open connection/read timeouts (in seconds).
#
# It returns a Page struct with the following information:
# * :url -- The URL of the requested page (may change by permanent redirect)
# * :content -- The content of the response (may be nil if cached)
# * :encoding -- Document encoding (from the Content-Type charset, may be nil)
# * :last_modified -- Last modified cache control header (may be nil)
# * :etag -- ETag cache control header (may be nil)
# If the page has not been modified from the last request, the content is nil.
#
# Raises HTTPError (or one of its subclasses) if an error prevents it from
# reading the page.
def read_page(url, options = nil)
  options ||= {}
  redirect_limit = options[:redirect_limit] || REDIRECT_LIMIT
  # <= 0 (was == 0) so a caller-supplied negative limit cannot slip past
  # the guard and recurse without bound.
  raise HTTPRedirectLimitError if redirect_limit <= 0
  if url.is_a?(URI)
    uri = url
  else
    begin
      uri = URI.parse(url)
    # StandardError (was Exception): rescuing Exception would also swallow
    # SignalException/SystemExit.
    rescue StandardError=>error
      raise HTTPInvalidURLError.new(error)
    end
  end
  raise HTTPInvalidURLError unless uri.scheme =~ /^http(s?)$/
  begin
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = (uri.scheme == "https")
    http.close_on_empty_response = true
    http.open_timeout = http.read_timeout = options[:http_timeout] || DEFAULT_TIMEOUT
    path = uri.path.dup # required so we don't modify path
    path << "?#{uri.query}" if uri.query
    # TODO: Specify which content types are accepted.
    # TODO: GZip support.
    headers = {}
    headers["User-Agent"] = options[:user_agent] if options[:user_agent]
    headers["Last-Modified"] = options[:last_modified] if options[:last_modified]
    headers["ETag"] = options[:etag] if options[:etag]
    response = http.request_get(path, headers)
    # TODO: Ignore content types that do not map to HTML.
  rescue TimeoutError=>error
    raise HTTPTimeoutError.new(error)
  # StandardError (was Exception) -- see note above.
  rescue StandardError=>error
    raise HTTPUnspecifiedError.new(error)
  end
  case response
  when Net::HTTPSuccess
    encoding = if content_type = response["Content-Type"]
      if match = content_type.match(/charset=([^\s]+)/i)
        match[1]
      end
    end
    return Page[(options[:source_url] || uri), response.body, encoding,
      response["Last-Modified"], response["ETag"]]
  when Net::HTTPNotModified
    # Cached content still good: nil content, echo the cache headers back.
    return Page[(options[:source_url] || uri), nil, nil,
      options[:last_modified], options[:etag]]
  when Net::HTTPMovedPermanently
    return read_page((uri.merge(response["location"]) rescue nil), # New URL takes effect
      :last_modified=>options[:last_modified],
      :etag=>options[:etag],
      :redirect_limit=>redirect_limit-1)
  when Net::HTTPRedirection
    return read_page((uri.merge(response["location"]) rescue nil),
      :last_modified=>options[:last_modified],
      :etag=>options[:etag],
      :redirect_limit=>redirect_limit-1,
      :source_url=>(options[:source_url] || uri)) # Old URL still in effect
  when Net::HTTPNotFound
    raise HTTPNotFoundError
  when Net::HTTPUnauthorized, Net::HTTPForbidden
    raise HTTPNoAccessError
  when Net::HTTPRequestTimeOut
    raise HTTPTimeoutError
  else
    raise HTTPUnspecifiedError
  end
end
|
176
|
+
|
177
|
+
|
178
|
+
# :call-seq:
#   parse_page(html, encoding?, options?, parser) => parsed
#
# Parses an HTML page and returns a Parsed struct holding the HTML
# element and the encoding actually used.
# Raises HTMLParseError exceptions if it cannot parse the HTML.
#
# Options are passed to the parser. For example, when using Tidy
# you can pass Tidy cleanup options in the hash.
#
# The last argument specifies which parser to use (see PARSERS).
# By default Tidy is used.
def parse_page(content, encoding = nil, options = nil, parser = :tidy)
  begin
    # Get the document encoding from the meta header; when present it
    # overrides the encoding passed in (typically from HTTP headers).
    if meta = content.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
      if meta = meta[0].match(/charset=([\w-]*)/i)
        encoding = meta[1]
      end
    end
    encoding ||= "utf8"
    case (parser || :tidy)
    when :tidy
      # Make sure the Tidy path is set and always apply the default
      # options (these only control things like errors, output type).
      find_tidy
      # Non-destructive merge (was Hash#update): the original mutated the
      # caller's options hash as a side effect.
      options = (options || {}).merge(TIDY_OPTIONS)
      options[:input_encoding] = encoding.gsub("-", "").downcase
      document = Tidy.open(options) do |tidy|
        html = tidy.clean(content)
        HTML::Document.new(html).find(:tag=>"html")
      end
    when :html_parser
      document = HTML::HTMLParser.parse(content).root
    else
      raise HTMLParseError, "No parser #{parser || "unspecified"}"
    end
    return Parsed[document, encoding]
  # StandardError (was Exception): don't swallow signals or exit.
  rescue StandardError=>error
    raise HTMLParseError.new(error)
  end
end
|
219
|
+
|
220
|
+
|
221
|
+
protected
|
222
|
+
|
223
|
+
module_function
|
224
|
+
|
225
|
+
# Points the Tidy gem at the shared library bundled with this gem.
# No-op when a path is already configured. Tries each platform variant
# in turn; if none can be loaded, the last LoadError propagates (same
# as the original nested begin/rescue chain).
def find_tidy()
  return if Tidy.path
  lib_dir = File.join(File.dirname(__FILE__), "../tidy")
  variants = ["libtidy.so", "libtidy.dll", "libtidy.dylib"]
  variants.each_with_index do |variant, index|
    begin
      Tidy.path = File.join(lib_dir, variant)
      break
    rescue LoadError
      # Re-raise only when every variant has failed.
      raise if index == variants.size - 1
    end
  end
end
|
237
|
+
|
238
|
+
end
|
239
|
+
|
240
|
+
end
|
data/lib/scrapi.rb
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
# Conditional loads, since we may have these libraries elsewhere,
|
2
|
+
# e.g. when using Rails with assert_select plugin.
|
3
|
+
require File.join(File.dirname(__FILE__), "html", "document") unless defined?(HTML::Document)
|
4
|
+
require File.join(File.dirname(__FILE__), "html", "node_ext") unless defined?(HTML::Node.detach)
|
5
|
+
require File.join(File.dirname(__FILE__), "html", "selector") unless defined?(HTML::Selector)
|
6
|
+
require File.join(File.dirname(__FILE__), "html", "htmlparser") unless defined?(HTML::HTMLParser)
|
7
|
+
|
8
|
+
require File.join(File.dirname(__FILE__), "scraper", "base") unless defined?(Scraper::Base)
|
Binary file
|
data/lib/tidy/libtidy.so
ADDED
Binary file
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require "net/http"
|
2
|
+
|
3
|
+
# Test helper: lets tests stub out Net::HTTP GET requests with a block,
# avoiding any real network traffic.
class Net::HTTP

  # Global stub hook. A class variable on purpose: the stub applies to
  # every Net::HTTP instance at once (so do not run such tests in
  # parallel).
  @@on_get = nil

  # Reset get method to default behavior.
  def self.reset_on_get
    @@on_get = nil
  end

  # :call-seq:
  #   on_get { |address, path, headers| ... => [response, body] }
  #
  # Specify alternative behavior for next execution of get method.
  # This change applies to all instances of Net::HTTP, so do not use
  # this method when running tests in parallel.
  #
  # The method takes a single block that accepts three arguments:
  # the address (host), path and headers (hash). It must return an
  # array with two values: the Net::HTTPResponse object and the
  # content of the response body.
  def self.on_get(&block)
    @@on_get = block
  end

  unless method_defined?(:mocked_request_get)
    alias :mocked_request_get :request_get

    # Mocked replacement for request_get. Signature fixed to match the
    # real API: headers are optional and an optional block is yielded
    # the response (the original override made headers mandatory and
    # dropped the block).
    def request_get(path, headers = nil, &block)
      # If we have prescribed behavior for the next request, execute it,
      # otherwise, go with the default.
      if @@on_get
        response, body = @@on_get.call(@address, path, headers)
        # Stuff the body into the response. No other way, since read_body
        # attempts to read from a socket and we're too lazy to stub a socket.
        response.instance_variable_set(:@mock_body, body.to_s)
        class << response
          def read_body()
            @mock_body
          end
        end
        yield response if block_given?
        response
      else
        mocked_request_get(path, headers, &block)
      end
    end

  end

end
|
54
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# ScrAPI toolkit for Ruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
|
4
|
+
# Developed for http://co.mments.com
|
5
|
+
# Code and documention: http://labnotes.org
|
6
|
+
|
7
|
+
|
8
|
+
require "rubygems"
|
9
|
+
require "test/unit"
|
10
|
+
require File.join(File.dirname(__FILE__), "../lib", "scrapi")
|
11
|
+
|
12
|
+
|
13
|
+
# Placeholder test case for the HTML::Node extensions. The hooks and the
# single test are intentionally empty for now.
class NodeExtTest < Test::Unit::TestCase

  def setup
    # No fixtures required yet.
  end

  def teardown
    # Nothing to clean up.
  end

  def test_add_tests
    # TODO: add real assertions covering node_ext behavior.
  end

end
|
data/test/reader_test.rb
ADDED
@@ -0,0 +1,318 @@
|
|
1
|
+
# ScrAPI toolkit for Ruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
|
4
|
+
# Developed for http://co.mments.com
|
5
|
+
# Code and documention: http://labnotes.org
|
6
|
+
|
7
|
+
|
8
|
+
require "rubygems"
|
9
|
+
require "test/unit"
|
10
|
+
require "time" # rfc2822
|
11
|
+
require "webrick"
|
12
|
+
require "webrick/https"
|
13
|
+
require "logger"
|
14
|
+
require "stringio"
|
15
|
+
require File.join(File.dirname(__FILE__), "mock_net_http")
|
16
|
+
require File.join(File.dirname(__FILE__), "../lib", "scrapi")
|
17
|
+
|
18
|
+
|
19
|
+
# Tests for Scraper::Reader: read_page (HTTP fetching, redirects, cache
# control, encoding detection) and parse_page (Tidy/HTMLParser parsing).
class ReaderTest < Test::Unit::TestCase

  include Scraper


  WEBRICK_OPTIONS = {
    # Fixed typo: the key was misspelled :BindAddredd, which WEBrick
    # silently ignored -- the test server bound to all interfaces
    # instead of loopback only.
    :BindAddress=>"127.0.0.1",
    :Port=>2000,
    :Logger=>Logger.new(StringIO.new) # /dev/null
  }

  WEBRICK_TEST_URL = "http://127.0.0.1:2000/test.html"


  def setup
    Net::HTTP.reset_on_get
  end

  def teardown
    Net::HTTP.reset_on_get
  end


  #
  # Tests read_page.
  #

  def test_should_pass_path_and_user_agent
    # Test path, query string and user agent.
    Net::HTTP.on_get do |address, path, headers|
      assert_equal "localhost", address
      assert_equal "/path?query", path
      assert_equal "MyUserAgent", headers["User-Agent"]
      [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), "nothing"]
    end
    response = Reader.read_page("http://localhost/path?query", :user_agent=>"MyUserAgent")
    assert_equal "http://localhost/path?query", response.url.to_s
    assert_equal "nothing", response.content
    assert_equal nil, response.last_modified
    assert_equal nil, response.etag
  end


  def test_should_handle_http_and_timeout_errors
    # Test timeout error and HTTP status that we can't process.
    Net::HTTP.on_get { |address, path, headers| raise TimeoutError }
    assert_raise(Reader::HTTPTimeoutError) do
      response = Reader.read_page("http://localhost/path?query")
    end
    Net::HTTP.on_get { |address, path, headers| [Net::HTTPRequestTimeOut.new(Net::HTTP.version_1_2, 408, "Timeout"),""] }
    assert_raise(Reader::HTTPTimeoutError) do
      response = Reader.read_page("http://localhost/path?query")
    end
  end


  def test_should_fail_on_too_many_redirects
    # Test too many redirections.
    Net::HTTP.on_get do |address, path, headers|
      response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
      response["location"] = "http://localhost"
      [response, ""]
    end
    assert_raise(Reader::HTTPRedirectLimitError) do
      response = Reader.read_page("http://localhost/path?query")
    end
    Net::HTTP.on_get do |address, path, headers|
      response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
      response["location"] = "http://localhost"
      [response, ""]
    end
    assert_raise(Reader::HTTPRedirectLimitError) do
      response = Reader.read_page("http://localhost/path?query")
    end
  end


  def test_should_validate_redirect_url
    # Test validation of redirection URI.
    Net::HTTP.on_get do |address, path, headers|
      response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
      response["location"] = "ftp://notsupported"
      [response, ""]
    end
    assert_raise(Reader::HTTPInvalidURLError) do
      response = Reader.read_page("http://localhost/path?query")
    end
  end


  def test_should_support_redirection
    # Test working redirection. Redirect only once and test response URL.
    # Should be new URL for permanent redirect, same URL for all other redirects.
    Net::HTTP.on_get do |address, path, headers|
      if path.empty?
        [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
      else
        response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
        response["Location"] = "http://localhost"
        [response, ""]
      end
    end
    assert_nothing_raised() do
      response = Reader.read_page("http://localhost/path?query")
      assert_equal "http://localhost/path?query", response.url.to_s
    end
  end


  def test_should_support_permanent_redirection
    # Test working redirection. Redirect only once and test response URL.
    # Should be new URL for permanent redirect, same URL for all other redirects.
    Net::HTTP.on_get do |address, path, headers|
      if path == "/"
        [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
      else
        response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
        response["location"] = "http://localhost/"
        [response, ""]
      end
    end
    assert_nothing_raised() do
      response = Reader.read_page("http://localhost/path?query")
      assert_equal "http://localhost/", response.url.to_s
    end
  end


  def test_should_support_partial_location_redirection
    # A relative Location header must be merged against the request URI.
    Net::HTTP.on_get do |address, path, headers|
      if path == "/somewhere"
        [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
      else
        response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
        response["location"] = "somewhere"
        [response, ""]
      end
    end
    assert_nothing_raised() do
      response = Reader.read_page("http://localhost/path?query")
      assert_equal "http://localhost/somewhere", response.url.to_s
    end
  end


  def test_should_use_cache_control
    # Test Last Modified and ETag headers. First, that they are correctly
    # returned from headers to response object. Next, that passing right
    # headers in options returns nil body and same values (no change),
    # passing wrong/no headers, returns page.
    time = Time.new.rfc2822
    Net::HTTP.on_get do |address, path, headers|
      response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
      response["Last-Modified"] = time
      response["ETag"] = "etag"
      [response, "nothing"]
    end
    response = Reader.read_page("http://localhost/path?query")
    assert_equal time, response.last_modified
    assert_equal "etag", response.etag
    Net::HTTP.on_get do |address, path, headers|
      if headers["Last-Modified"] == time and headers["ETag"] == "etag"
        [Net::HTTPNotModified.new(Net::HTTP.version_1_2, 304, "Same"), ""]
      else
        [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), "nothing"]
      end
    end
    response = Reader.read_page("http://localhost/path?query")
    assert_equal "nothing", response.content
    response = Reader.read_page("http://localhost/path?query", :last_modified=>time, :etag=>"etag")
    assert_equal nil, response.content
    assert_equal time, response.last_modified
    assert_equal "etag", response.etag
  end


  def test_should_find_encoding
    # The charset parameter of the Content-Type header becomes the
    # page encoding.
    Net::HTTP.on_get do |address, path, headers|
      response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
      response["content-type"] = "text/html; charset=bogus"
      [response, ""]
    end
    response = Reader.read_page("http://localhost/path?query")
    assert_equal "bogus", response.encoding
  end


  #
  # Tests parse_page.
  #

  def test_should_parse_html_page
    html = Reader.parse_page("<html><head></head><body><p>something</p></body></html>").document
    assert_equal 1, html.find_all(:tag=>"head").size
    assert_equal 1, html.find_all(:tag=>"body").size
    assert_equal 1, html.find(:tag=>"body").find_all(:tag=>"p").size
    assert_equal "something", html.find(:tag=>"body").find(:tag=>"p").children.join
  end


  def test_should_use_tidy_if_specified
    # This will only work with Tidy which adds the head/body parts,
    # HTMLParser doesn't fix the HTML.
    html = Reader.parse_page("<p>something</p>", nil, {}).document
    assert_equal 1, html.find_all(:tag=>"head").size
    assert_equal 1, html.find_all(:tag=>"body").size
    assert_equal 1, html.find(:tag=>"body").find_all(:tag=>"p").size
    assert_equal "something", html.find(:tag=>"body").find(:tag=>"p").children.join
  end


  #
  # Other tests.
  #

  def test_should_handle_encoding_correctly
    # Test content encoding returned from HTTP server.
    with_webrick do |server, params|
      server.mount_proc "/test.html" do |req,resp|
        resp["Content-Type"] = "text/html; charset=my-encoding"
        resp.body = "Content comes here"
      end
      page = Reader.read_page(WEBRICK_TEST_URL)
      page = Reader.parse_page(page.content, page.encoding)
      assert_equal "my-encoding", page.encoding
    end
    # Test content encoding in HTML http-equiv header
    # that overrides content encoding returned in HTTP.
    with_webrick do |server, params|
      server.mount_proc "/test.html" do |req,resp|
        resp["Content-Type"] = "text/html; charset=my-encoding"
        resp.body = %Q{
          <html>
            <head>
              <meta http-equiv="content-type" value="text/html; charset=other-encoding">
            </head>
            <body></body>
          </html>
        }
      end
      page = Reader.read_page(WEBRICK_TEST_URL)
      page = Reader.parse_page(page.content, page.encoding)
      assert_equal "other-encoding", page.encoding
    end
  end

  def test_should_support_https
    begin
      options = WEBRICK_OPTIONS.dup.update(
        :SSLEnable=>true,
        :SSLVerifyClient => ::OpenSSL::SSL::VERIFY_NONE,
        :SSLCertName => [ ["C","JP"], ["O","WEBrick.Org"], ["CN", "WWW"] ]
      )
      server = WEBrick::HTTPServer.new(options)
      trap("INT") { server.shutdown }
      Thread.new { server.start }
      server.mount_proc "/test.html" do |req,resp|
        resp.body = %Q{
          <html>
            <head>
              <title>test https</title>
            </head>
            <body></body>
          </html>
        }
      end
      # Make sure page not HTTP accessible.
      assert_raises(Reader::HTTPUnspecifiedError) do
        Reader.read_page(WEBRICK_TEST_URL)
      end
      page = Reader.read_page(WEBRICK_TEST_URL.gsub("http", "https"))
      page = Reader.parse_page(page.content, page.encoding)
      assert_equal "<title>test https</title>",
        page.document.find(:tag=>"title").to_s
      server.shutdown
    ensure
      server.shutdown if server
    end
  end


  private

  # Runs a WEBrick server for the duration of the block, shutting it
  # down afterwards even when the block fails.
  def with_webrick(params = nil)
    begin
      server = WEBrick::HTTPServer.new(WEBRICK_OPTIONS)
      trap("INT") { server.shutdown }
      Thread.new { server.start }
      yield server, params
      server.shutdown
    ensure
      server.shutdown if server
    end
  end

end
|