scrapi 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,239 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documentation: http://labnotes.org
6
+
7
+
8
+ require "uri"
9
+ require "net/http"
10
+ require "net/https"
11
+ begin
12
+ require "rubygems"
13
+ require "tidy"
14
+ rescue LoadError
15
+ end
16
+
17
+
18
+ module Scraper
19
+
20
+ module Reader
21
+
22
+ class HTTPError < StandardError
23
+
24
+ attr_reader :cause
25
+
26
+ def initialize(cause = nil)
27
+ @cause = cause
28
+ end
29
+
30
+
31
+ def to_s
32
+ @cause ? "#{super}: #{@cause}" : super
33
+ end
34
+
35
+ end
36
+
37
+ class HTTPTimeoutError < HTTPError ; end
38
+ class HTTPUnspecifiedError < HTTPError ; end
39
+ class HTTPNotFoundError < HTTPError ; end
40
+ class HTTPNoAccessError < HTTPError ; end
41
+ class HTTPInvalidURLError < HTTPError ; end
42
+ class HTTPRedirectLimitError < HTTPError ; end
43
+
44
+
45
+ class HTMLParseError < StandardError
46
+
47
+ attr_reader :cause
48
+
49
+ def initialize(cause = nil)
50
+ @cause = cause
51
+ end
52
+
53
+ def to_s
54
+ @cause ? "#{super}: #{@cause}" : super
55
+ end
56
+
57
+ end
58
+
59
+
60
+ unless const_defined? :REDIRECT_LIMIT
61
+ REDIRECT_LIMIT = 3
62
+ DEFAULT_TIMEOUT = 30
63
+ PARSERS = [:tidy, :html_parser]
64
+ end
65
+
66
+ unless const_defined? :TIDY_OPTIONS
67
+ TIDY_OPTIONS = {
68
+ :output_xhtml=>true,
69
+ :show_errors=>0,
70
+ :show_warnings=>false,
71
+ :wrap=>0,
72
+ :wrap_sections=>false,
73
+ :force_output=>true,
74
+ :quiet=>true,
75
+ :tidy_mark=>false
76
+ }
77
+ end
78
+
79
+
80
+ Page = Struct.new(:url, :content, :encoding, :last_modified, :etag)
81
+ Parsed = Struct.new(:document, :encoding)
82
+
83
+
84
+ module_function
85
+
86
+ # :call-seq:
87
+ # read_page(url, options?) => response
88
+ #
89
+ # Reads a Web page and returns its URL, content and cache control headers.
90
+ #
91
+ # The request reads a Web page at the specified URL (must be a URI object).
92
+ # It accepts the following options:
93
+ # * :last_modified -- Last modified header (from a previous request).
94
+ # * :etag -- ETag header (from a previous request).
95
+ # * :redirect_limit -- Number of redirects allowed (default is 3).
96
+ # * :user_agent -- The User-Agent header to send.
97
+ # * :http_timeout -- HTTP open connection/read timeouts (in seconds).
98
+ #
99
+ # It returns a hash with the following information:
100
+ # * :url -- The URL of the requested page (may change by permanent redirect)
101
+ # * :content -- The content of the response (may be nil if cached)
102
+ # * :content_type -- The HTML page Content-Type header
103
+ # * :last_modified -- Last modified cache control header (may be nil)
104
+ # * :etag -- ETag cache control header (may be nil)
105
+ # * :encoding -- Document encoding for the page
106
+ # If the page has not been modified from the last request, the content is nil.
107
+ #
108
+ # Raises HTTPError if an error prevents it from reading the page.
109
+ def read_page(url, options = nil)
110
+ options ||= {}
111
+ redirect_limit = options[:redirect_limit] || REDIRECT_LIMIT
112
+ raise HTTPRedirectLimitError if redirect_limit == 0
113
+ if url.is_a?(URI)
114
+ uri = url
115
+ else
116
+ begin
117
+ uri = URI.parse(url)
118
+ rescue Exception=>error
119
+ raise HTTPInvalidURLError.new(error)
120
+ end
121
+ end
122
+ raise HTTPInvalidURLError unless uri.scheme =~ /^http(s?)$/
123
+ begin
124
+ http = Net::HTTP.new(uri.host, uri.port)
125
+ http.use_ssl = (uri.scheme == "https")
126
+ http.close_on_empty_response = true
127
+ http.open_timeout = http.read_timeout = options[:http_timeout] || DEFAULT_TIMEOUT
128
+ path = uri.path.dup # required so we don't modify path
129
+ path << "?#{uri.query}" if uri.query
130
+ # TODO: Specify which content types are accepted.
131
+ # TODO: GZip support.
132
+ headers = {}
133
+ headers["User-Agent"] = options[:user_agent] if options[:user_agent]
134
+ headers["Last-Modified"] = options[:last_modified] if options[:last_modified]
135
+ headers["ETag"] = options[:etag] if options[:etag]
136
+ response = http.request_get(path, headers)
137
+ # TODO: Ignore content types that do not map to HTML.
138
+ rescue TimeoutError=>error
139
+ raise HTTPTimeoutError.new(error)
140
+ rescue Exception=>error
141
+ raise HTTPUnspecifiedError.new(error)
142
+ end
143
+ case response
144
+ when Net::HTTPSuccess
145
+ encoding = if content_type = response["Content-Type"]
146
+ if match = content_type.match(/charset=([^\s]+)/i)
147
+ match[1]
148
+ end
149
+ end
150
+ return Page[(options[:source_url] || uri), response.body, encoding,
151
+ response["Last-Modified"], response["ETag"]]
152
+ when Net::HTTPNotModified
153
+ return Page[(options[:source_url] || uri), nil, nil,
154
+ options[:last_modified], options[:etag]]
155
+ when Net::HTTPMovedPermanently
156
+ return read_page(response["location"], # New URL takes effect
157
+ :last_modified=>options[:last_modified],
158
+ :etag=>options[:etag],
159
+ :redirect_limit=>redirect_limit-1)
160
+ when Net::HTTPRedirection
161
+ return read_page(response["location"],
162
+ :last_modified=>options[:last_modified],
163
+ :etag=>options[:etag],
164
+ :redirect_limit=>redirect_limit-1,
165
+ :source_url=>(options[:source_url] || uri)) # Old URL still in effect
166
+ when Net::HTTPNotFound
167
+ raise HTTPNotFoundError
168
+ when Net::HTTPUnauthorized, Net::HTTPForbidden
169
+ raise HTTPNoAccessError
170
+ when Net::HTTPRequestTimeOut
171
+ raise HTTPTimeoutError
172
+ else
173
+ raise HTTPUnspecifiedError
174
+ end
175
+ end
176
+
177
+
178
+ # :call-seq:
179
+ # parse_page(html, encoding?, options?, parser) => html
180
+ #
181
+ # Parses an HTML page and returns the encoding and HTML element.
182
+ # Raises HTMLParseError exceptions if it cannot parse the HTML.
183
+ #
184
+ # Options are passed to the parser. For example, when using Tidy
185
+ # you can pass Tidy cleanup options in the hash.
186
+ #
187
+ # The last option specifies which parser to use (see PARSERS).
188
+ # By default Tidy is used.
189
+ def parse_page(content, encoding = nil, options = nil, parser = :tidy)
190
+ begin
191
+ # Get the document encoding from the meta header.
192
+ if meta = content.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
193
+ if meta = meta[0].match(/charset=([\w-]*)/i)
194
+ encoding = meta[1]
195
+ end
196
+ end
197
+ encoding ||= "utf8"
198
+ case (parser || :tidy)
199
+ when :tidy
200
+ # Make sure the Tidy path is set and always apply the default
201
+ # options (these only control things like errors, output type).
202
+ find_tidy
203
+ options = (options || {}).update(TIDY_OPTIONS)
204
+ options[:input_encoding] = encoding.gsub("-", "").downcase
205
+ document = Tidy.open(options) do |tidy|
206
+ html = tidy.clean(content)
207
+ HTML::Document.new(html).find(:tag=>"html")
208
+ end
209
+ when :html_parser
210
+ document = HTML::HTMLParser.parse(content).root
211
+ else
212
+ raise HTMLParseError, "No parser #{parser || "unspecified"}"
213
+ end
214
+ return Parsed[document, encoding]
215
+ rescue Exception=>error
216
+ raise HTMLParseError.new(error)
217
+ end
218
+ end
219
+
220
+
221
+ protected
222
+ module_function
223
+
224
+ def find_tidy()
225
+ return if Tidy.path
226
+ begin
227
+ Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.so")
228
+ rescue LoadError
229
+ begin
230
+ Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dll")
231
+ rescue LoadError
232
+ Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dylib")
233
+ end
234
+ end
235
+ end
236
+
237
+ end
238
+
239
+ end
@@ -0,0 +1,8 @@
1
+ # Conditional loads, since we may have these libraries elsewhere,
2
+ # e.g. when using Rails with assert_select plugin.
3
+ require File.join(File.dirname(__FILE__), "html", "document") unless defined?(HTML::Document)
4
+ require File.join(File.dirname(__FILE__), "html", "node_ext") unless defined?(HTML::Node.detach)
5
+ require File.join(File.dirname(__FILE__), "html", "selector") unless defined?(HTML::Selector)
6
+ require File.join(File.dirname(__FILE__), "html", "htmlparser") unless defined?(HTML::HTMLParser)
7
+
8
+ require File.join(File.dirname(__FILE__), "scraper", "base") unless defined?(Scraper::Base)
Binary file
Binary file
@@ -0,0 +1,54 @@
1
+ require "net/http"
2
+
3
+ class Net::HTTP
4
+
5
+ @@on_get = nil
6
+
7
+ # Reset get method to default behavior.
8
+ def self.reset_on_get
9
+ @@on_get = nil
10
+ end
11
+
12
+
13
+ # :call-seq:
14
+ # on_get { |address, path, headers| ... => [response, body] }
15
+ #
16
+ # Specify alternative behavior for next execution of get method.
17
+ # This change applies to all instances of Net::HTTP, so do not use
18
+ # this method when running tests in parallel.
19
+ #
20
+ # The method takes a single block that accepts three arguments:
21
+ # the address (host), path and headers (hash). It must return an
22
+ # array with two values: the Net::HTTPResponse object and the
23
+ # content of the response body.
24
+ def self.on_get(&block)
25
+ @@on_get = block
26
+ end
27
+
28
+
29
+ unless method_defined?(:mocked_request_get)
30
+ alias :mocked_request_get :request_get
31
+
32
+ def request_get(path, headers)
33
+ # If we have prescribed behavior for the next search, execute it,
34
+ # otherwise, go with the default.
35
+ if @@on_get
36
+ response, body = @@on_get.call(@address, path, headers)
37
+ # Stuff the body into the response. No other way, since read_body
38
+ # attempts to read from a socket and we're too lazy to stub a socket.
39
+ response.instance_variable_set(:@mock_body, body.to_s)
40
+ class << response
41
+ def read_body()
42
+ @mock_body
43
+ end
44
+ end
45
+ response
46
+ else
47
+ mocked_request_get(path, headers)
48
+ end
49
+ end
50
+
51
+ end
52
+
53
+ end
54
+
@@ -0,0 +1,24 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documentation: http://labnotes.org
6
+
7
+
8
+ require "rubygems"
9
+ require "test/unit"
10
+ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
11
+
12
+
13
+ class NodeExtTest < Test::Unit::TestCase
14
+
15
+ def setup
16
+ end
17
+
18
+ def teardown
19
+ end
20
+
21
+ def test_add_tests
22
+ end
23
+
24
+ end
@@ -0,0 +1,299 @@
1
+ # ScrAPI toolkit for Ruby
2
+ #
3
+ # Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
4
+ # Developed for http://co.mments.com
5
+ # Code and documentation: http://labnotes.org
6
+
7
+
8
+ require "rubygems"
9
+ require "test/unit"
10
+ require "time" # rfc2822
11
+ require "webrick"
12
+ require "webrick/https"
13
+ require "logger"
14
+ require "stringio"
15
+ require File.join(File.dirname(__FILE__), "mock_net_http")
16
+ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
17
+
18
+
19
+ class ReaderTest < Test::Unit::TestCase
20
+
21
+ include Scraper
22
+
23
+
24
+ WEBRICK_OPTIONS = {
25
+ :BindAddredd=>"127.0.0.1",
26
+ :Port=>2000,
27
+ :Logger=>Logger.new(StringIO.new) # /dev/null
28
+ }
29
+
30
+ WEBRICK_TEST_URL = "http://127.0.0.1:2000/test.html"
31
+
32
+
33
+ def setup
34
+ Net::HTTP.reset_on_get
35
+ end
36
+
37
+ def teardown
38
+ Net::HTTP.reset_on_get
39
+ end
40
+
41
+
42
+ #
43
+ # Tests read_page.
44
+ #
45
+
46
+ def test_should_pass_path_and_user_agent
47
+ # Test path, query string and user agent.
48
+ Net::HTTP.on_get do |address, path, headers|
49
+ assert_equal "localhost", address
50
+ assert_equal "/path?query", path
51
+ assert_equal "MyUserAgent", headers["User-Agent"]
52
+ [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), "nothing"]
53
+ end
54
+ response = Reader.read_page("http://localhost/path?query", :user_agent=>"MyUserAgent")
55
+ assert_equal "http://localhost/path?query", response.url.to_s
56
+ assert_equal "nothing", response.content
57
+ assert_equal nil, response.last_modified
58
+ assert_equal nil, response.etag
59
+ end
60
+
61
+
62
+ def test_should_handle_http_and_timeout_errors
63
+ # Test timeout error and HTTP status that we can't process.
64
+ Net::HTTP.on_get { |address, path, headers| raise TimeoutError }
65
+ assert_raise(Reader::HTTPTimeoutError) do
66
+ response = Reader.read_page("http://localhost/path?query")
67
+ end
68
+ Net::HTTP.on_get { |address, path, headers| [Net::HTTPRequestTimeOut.new(Net::HTTP.version_1_2, 408, "Timeout"),""] }
69
+ assert_raise(Reader::HTTPTimeoutError) do
70
+ response = Reader.read_page("http://localhost/path?query")
71
+ end
72
+ end
73
+
74
+
75
+ def test_should_fail_on_too_many_redirects
76
+ # Test too many redirections.
77
+ Net::HTTP.on_get do |address, path, headers|
78
+ response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
79
+ response["location"] = "http://localhost"
80
+ [response, ""]
81
+ end
82
+ assert_raise(Reader::HTTPRedirectLimitError) do
83
+ response = Reader.read_page("http://localhost/path?query")
84
+ end
85
+ Net::HTTP.on_get do |address, path, headers|
86
+ response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
87
+ response["location"] = "http://localhost"
88
+ [response, ""]
89
+ end
90
+ assert_raise(Reader::HTTPRedirectLimitError) do
91
+ response = Reader.read_page("http://localhost/path?query")
92
+ end
93
+ end
94
+
95
+
96
+ def test_should_validate_redirect_url
97
+ # Test validation of redirection URI.
98
+ Net::HTTP.on_get do |address, path, headers|
99
+ response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
100
+ response["location"] = "ftp://notsupported"
101
+ [response, ""]
102
+ end
103
+ assert_raise(Reader::HTTPInvalidURLError) do
104
+ response = Reader.read_page("http://localhost/path?query")
105
+ end
106
+ end
107
+
108
+
109
+ def test_should_support_redirection
110
+ # Test working redirection. Redirect only once and test response URL.
111
+ # Should be new URL for permanent redirect, same URL for all other redirects.
112
+ Net::HTTP.on_get do |address, path, headers|
113
+ if path.empty?
114
+ [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
115
+ else
116
+ response = Net::HTTPRedirection.new(Net::HTTP.version_1_2, 300, "Moved")
117
+ response["Location"] = "http://localhost"
118
+ [response, ""]
119
+ end
120
+ end
121
+ assert_nothing_raised() do
122
+ response = Reader.read_page("http://localhost/path?query")
123
+ assert_equal "http://localhost/path?query", response.url.to_s
124
+ end
125
+ end
126
+
127
+
128
+ def test_should_support_permanent_redirection
129
+ # Test working redirection. Redirect only once and test response URL.
130
+ # Should be new URL for permanent redirect, same URL for all other redirects.
131
+ Net::HTTP.on_get do |address, path, headers|
132
+ if path == "/"
133
+ [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
134
+ else
135
+ response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
136
+ response["location"] = "http://localhost/"
137
+ [response, ""]
138
+ end
139
+ end
140
+ assert_nothing_raised() do
141
+ response = Reader.read_page("http://localhost/path?query")
142
+ assert_equal "http://localhost/", response.url.to_s
143
+ end
144
+ end
145
+
146
+
147
+ def test_should_use_cache_control
148
+ # Test Last Modified and ETag headers. First, that they are correctly
149
+ # returned from headers to response object. Next, that passing right
150
+ # headers in options returns nil body and same values (no change),
151
+ # passing wrong/no headers, returns page.
152
+ time = Time.new.rfc2822
153
+ Net::HTTP.on_get do |address, path, headers|
154
+ response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
155
+ response["Last-Modified"] = time
156
+ response["ETag"] = "etag"
157
+ [response, "nothing"]
158
+ end
159
+ response = Reader.read_page("http://localhost/path?query")
160
+ assert_equal time, response.last_modified
161
+ assert_equal "etag", response.etag
162
+ Net::HTTP.on_get do |address, path, headers|
163
+ if headers["Last-Modified"] == time and headers["ETag"] == "etag"
164
+ [Net::HTTPNotModified.new(Net::HTTP.version_1_2, 304, "Same"), ""]
165
+ else
166
+ [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), "nothing"]
167
+ end
168
+ end
169
+ response = Reader.read_page("http://localhost/path?query")
170
+ assert_equal "nothing", response.content
171
+ response = Reader.read_page("http://localhost/path?query", :last_modified=>time, :etag=>"etag")
172
+ assert_equal nil, response.content
173
+ assert_equal time, response.last_modified
174
+ assert_equal "etag", response.etag
175
+ end
176
+
177
+
178
+ def test_should_find_encoding
179
+ # Test working redirection. Redirect only once and test response URL.
180
+ # Should be new URL for permanent redirect, same URL for all other redirects.
181
+ Net::HTTP.on_get do |address, path, headers|
182
+ response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
183
+ response["content-type"] = "text/html; charset=bogus"
184
+ [response, ""]
185
+ end
186
+ response = Reader.read_page("http://localhost/path?query")
187
+ assert_equal "bogus", response.encoding
188
+ end
189
+
190
+
191
+ #
192
+ # Tests parse_page.
193
+ #
194
+
195
+ def test_should_parse_html_page
196
+ html = Reader.parse_page("<html><head></head><body><p>something</p></body></html>").document
197
+ assert_equal 1, html.find_all(:tag=>"head").size
198
+ assert_equal 1, html.find_all(:tag=>"body").size
199
+ assert_equal 1, html.find(:tag=>"body").find_all(:tag=>"p").size
200
+ assert_equal "something", html.find(:tag=>"body").find(:tag=>"p").children.join
201
+ end
202
+
203
+
204
+ def test_should_use_tidy_if_specified
205
+ # This will only work with Tidy which adds the head/body parts,
206
+ # HTMLParser doesn't fix the HTML.
207
+ html = Reader.parse_page("<p>something</p>", nil, {}).document
208
+ assert_equal 1, html.find_all(:tag=>"head").size
209
+ assert_equal 1, html.find_all(:tag=>"body").size
210
+ assert_equal 1, html.find(:tag=>"body").find_all(:tag=>"p").size
211
+ assert_equal "something", html.find(:tag=>"body").find(:tag=>"p").children.join
212
+ end
213
+
214
+
215
+ #
216
+ # Other tests.
217
+ #
218
+
219
+ def test_should_handle_encoding_correctly
220
+ # Test content encoding returned from HTTP server.
221
+ with_webrick do |server, params|
222
+ server.mount_proc "/test.html" do |req,resp|
223
+ resp["Content-Type"] = "text/html; charset=my-encoding"
224
+ resp.body = "Content comes here"
225
+ end
226
+ page = Reader.read_page(WEBRICK_TEST_URL)
227
+ page = Reader.parse_page(page.content, page.encoding)
228
+ assert_equal "my-encoding", page.encoding
229
+ end
230
+ # Test content encoding in HTML http-equiv header
231
+ # that overrides content encoding returned in HTTP.
232
+ with_webrick do |server, params|
233
+ server.mount_proc "/test.html" do |req,resp|
234
+ resp["Content-Type"] = "text/html; charset=my-encoding"
235
+ resp.body = %Q{
236
+ <html>
237
+ <head>
238
+ <meta http-equiv="content-type" value="text/html; charset=other-encoding">
239
+ </head>
240
+ <body></body>
241
+ </html>
242
+ }
243
+ end
244
+ page = Reader.read_page(WEBRICK_TEST_URL)
245
+ page = Reader.parse_page(page.content, page.encoding)
246
+ assert_equal "other-encoding", page.encoding
247
+ end
248
+ end
249
+
250
+ def test_should_support_https
251
+ begin
252
+ options = WEBRICK_OPTIONS.dup.update(
253
+ :SSLEnable=>true,
254
+ :SSLVerifyClient => ::OpenSSL::SSL::VERIFY_NONE,
255
+ :SSLCertName => [ ["C","JP"], ["O","WEBrick.Org"], ["CN", "WWW"] ]
256
+ )
257
+ server = WEBrick::HTTPServer.new(options)
258
+ trap("INT") { server.shutdown }
259
+ Thread.new { server.start }
260
+ server.mount_proc "/test.html" do |req,resp|
261
+ resp.body = %Q{
262
+ <html>
263
+ <head>
264
+ <title>test https</title>
265
+ </head>
266
+ <body></body>
267
+ </html>
268
+ }
269
+ end
270
+ # Make sure page not HTTP accessible.
271
+ assert_raises(Reader::HTTPUnspecifiedError) do
272
+ Reader.read_page(WEBRICK_TEST_URL)
273
+ end
274
+ page = Reader.read_page(WEBRICK_TEST_URL.gsub("http", "https"))
275
+ page = Reader.parse_page(page.content, page.encoding)
276
+ assert_equal "<title>test https</title>",
277
+ page.document.find(:tag=>"title").to_s
278
+ server.shutdown
279
+ ensure
280
+ server.shutdown if server
281
+ end
282
+ end
283
+
284
+
285
+ private
286
+
287
+ def with_webrick(params = nil)
288
+ begin
289
+ server = WEBrick::HTTPServer.new(WEBRICK_OPTIONS)
290
+ trap("INT") { server.shutdown }
291
+ Thread.new { server.start }
292
+ yield server, params
293
+ server.shutdown
294
+ ensure
295
+ server.shutdown if server
296
+ end
297
+ end
298
+
299
+ end