sunbro 0.1.0 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/sunbro.rb +4 -1
- data/lib/sunbro/dynamic_http.rb +22 -4
- data/lib/sunbro/http.rb +42 -32
- data/lib/sunbro/initialize.rb +4 -0
- data/lib/sunbro/page.rb +2 -1
- data/lib/sunbro/settings.rb +42 -11
- data/lib/sunbro/version.rb +1 -1
- data/spec/page_spec.rb +16 -5
- data/spec/settings_spec.rb +29 -0
- data/spec/spec_helper.rb +2 -3
- data/sunbro.gemspec +2 -2
- metadata +8 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d1de54ea6b7adab57393a8a220e6ce2715a1875d
|
4
|
+
data.tar.gz: 6d1b4fa82d615a15320b9a2281dba34c44f6b4f3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d658cb22b7b57abd492ffac8f2b0b4173c6f806ed93a4d28fa699cbce6178bd97fa87af64778e88eb841bd1f59c1efe0c0e3912377046709104ddc85a48a6ba1
|
7
|
+
data.tar.gz: e96e191c25f0145f1e540237fc2c59dc8dc390871c23df01466186d15d539bb1d76ef8becc113bce6726fa627799886718f1da0bb36480925fae92ae36ea312e
|
data/lib/sunbro.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'capybara/poltergeist'
|
3
|
-
require '
|
3
|
+
require 'rest-client'
|
4
4
|
require 'webrick/cookie'
|
5
|
+
require 'active_support/all'
|
6
|
+
require 'retryable'
|
5
7
|
|
6
8
|
%w(
|
7
9
|
sunbro/version
|
@@ -9,6 +11,7 @@ require 'webrick/cookie'
|
|
9
11
|
sunbro/dynamic_http
|
10
12
|
sunbro/http
|
11
13
|
sunbro/page
|
14
|
+
sunbro/initialize
|
12
15
|
).each do |f|
|
13
16
|
require f
|
14
17
|
end
|
data/lib/sunbro/dynamic_http.rb
CHANGED
@@ -1,22 +1,26 @@
|
|
1
1
|
module Sunbro
|
2
2
|
class DynamicHTTP
|
3
|
+
|
3
4
|
attr_reader :session
|
4
5
|
|
5
6
|
def initialize(opts = {})
|
6
7
|
@opts = opts
|
7
|
-
new_session
|
8
|
+
Retryable.retryable { new_session }
|
8
9
|
end
|
9
10
|
|
10
11
|
def close
|
11
12
|
@session.driver.quit
|
13
|
+
rescue IOError
|
14
|
+
nil
|
12
15
|
end
|
13
16
|
|
14
17
|
def new_session
|
15
18
|
Capybara.register_driver :poltergeist do |app|
|
16
19
|
Capybara::Poltergeist::Driver.new(
|
17
20
|
app,
|
21
|
+
timeout: 10,
|
18
22
|
js_errors: false,
|
19
|
-
phantomjs_options:
|
23
|
+
phantomjs_options: phantomjs_options,
|
20
24
|
)
|
21
25
|
end
|
22
26
|
Capybara.default_driver = :poltergeist
|
@@ -29,13 +33,27 @@ module Sunbro
|
|
29
33
|
@session
|
30
34
|
end
|
31
35
|
|
36
|
+
def phantomjs_options
|
37
|
+
@phantomjs_options ||= begin
|
38
|
+
opts = [ '--load-images=no', '--ignore-ssl-errors=yes' ]
|
39
|
+
if Sunbro::Settings.proxy_host
|
40
|
+
if Sunbro::Settings.proxy_port
|
41
|
+
opts << "--proxy=#{Sunbro::Settings.proxy_host}:#{Sunbro::Settings.proxy_port}"
|
42
|
+
else
|
43
|
+
opts << "--proxy=#{Sunbro::Settings.proxy_host}"
|
44
|
+
end
|
45
|
+
end
|
46
|
+
opts
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
32
50
|
def user_agent
|
33
51
|
@opts[:agent] || Settings.phantomjs_user_agent
|
34
52
|
end
|
35
53
|
|
36
54
|
def restart_session
|
37
55
|
close
|
38
|
-
new_session
|
56
|
+
Retryable.retryable { new_session }
|
39
57
|
end
|
40
58
|
|
41
59
|
#
|
@@ -46,7 +64,7 @@ module Sunbro
|
|
46
64
|
begin
|
47
65
|
tries ||= 5
|
48
66
|
get_page(url, opts)
|
49
|
-
rescue Capybara::Poltergeist::DeadClient, Errno::EPIPE, NoMethodError, Capybara::Poltergeist::BrowserError => e
|
67
|
+
rescue IOError, Capybara::Poltergeist::DeadClient, Errno::EPIPE, NoMethodError, Capybara::Poltergeist::BrowserError => e
|
50
68
|
restart_session
|
51
69
|
retry unless (tries -= 1).zero?
|
52
70
|
close
|
data/lib/sunbro/http.rb
CHANGED
@@ -3,17 +3,23 @@ module Sunbro
|
|
3
3
|
# Maximum number of redirects to follow on each get_response
|
4
4
|
REDIRECT_LIMIT = 5
|
5
5
|
|
6
|
+
class RestResponse < Struct.new(:body, :headers, :code, :location)
|
7
|
+
def clean!
|
8
|
+
body.present?
|
9
|
+
rescue ArgumentError
|
10
|
+
body.encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
|
11
|
+
body.encode!('UTF-8', 'UTF-16')
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
6
15
|
def initialize(opts = {})
|
7
16
|
@connections = {}
|
8
17
|
@opts = opts
|
9
18
|
end
|
10
19
|
|
11
20
|
def close
|
12
|
-
|
13
|
-
|
14
|
-
connection.finish
|
15
|
-
end
|
16
|
-
end
|
21
|
+
# Deprecated with move to RestClient
|
22
|
+
true
|
17
23
|
end
|
18
24
|
|
19
25
|
#
|
@@ -44,15 +50,15 @@ module Sunbro
|
|
44
50
|
begin
|
45
51
|
url = convert_to_uri(url) unless url.is_a?(URI)
|
46
52
|
pages = []
|
47
|
-
get(url
|
48
|
-
pages << Page.new(location, :body
|
49
|
-
:code
|
50
|
-
:headers
|
51
|
-
:referer
|
52
|
-
:depth
|
53
|
-
:redirect_to
|
53
|
+
get(url) do |response, code, location, redirect_to, response_time|
|
54
|
+
pages << Page.new(location, :body => response.body.dup,
|
55
|
+
:code => code,
|
56
|
+
:headers => response.headers.stringify_keys,
|
57
|
+
:referer => referer,
|
58
|
+
:depth => depth,
|
59
|
+
:redirect_to => redirect_to,
|
54
60
|
:response_time => response_time,
|
55
|
-
:force_format
|
61
|
+
:force_format => force_format)
|
56
62
|
end
|
57
63
|
|
58
64
|
return pages
|
@@ -124,7 +130,7 @@ module Sunbro
|
|
124
130
|
# Yields the response object, response code, and URI location
|
125
131
|
# for each response.
|
126
132
|
#
|
127
|
-
def get(url
|
133
|
+
def get(url)
|
128
134
|
limit = redirect_limit
|
129
135
|
loc = url
|
130
136
|
begin
|
@@ -132,9 +138,9 @@ module Sunbro
|
|
132
138
|
# request url
|
133
139
|
loc = url.merge(loc) if loc.relative?
|
134
140
|
|
135
|
-
response, response_time = get_response(loc
|
141
|
+
response, response_time = get_response(loc)
|
136
142
|
code = Integer(response.code)
|
137
|
-
redirect_to =
|
143
|
+
redirect_to = 300.upto(307).include?(response['code']) ? URI(response['location']).normalize : nil
|
138
144
|
yield response, code, loc, redirect_to, response_time
|
139
145
|
limit -= 1
|
140
146
|
end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
|
@@ -147,19 +153,27 @@ module Sunbro
|
|
147
153
|
full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
|
148
154
|
|
149
155
|
opts = {}
|
150
|
-
opts[
|
151
|
-
|
156
|
+
opts[:headers] = {
|
157
|
+
user_agent: user_agent
|
158
|
+
} if user_agent
|
152
159
|
|
153
160
|
retries = 0
|
154
161
|
begin
|
155
162
|
start = Time.now()
|
156
|
-
|
157
|
-
|
158
|
-
#
|
159
|
-
|
160
|
-
|
163
|
+
response = RestResponse.new
|
164
|
+
|
165
|
+
# This causes RestClient to skip following the redirect automatically
|
166
|
+
connection(url)[full_path].get(opts) do |res, request, result|
|
167
|
+
response.body = res.body
|
168
|
+
response.headers = res.headers
|
169
|
+
response.code = res.code
|
170
|
+
response.location = res.headers[:location]
|
171
|
+
end
|
172
|
+
|
161
173
|
finish = Time.now()
|
162
174
|
response_time = ((finish - start) * 1000).round
|
175
|
+
response.clean!
|
176
|
+
|
163
177
|
return response, response_time
|
164
178
|
rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
|
165
179
|
puts e.inspect if verbose?
|
@@ -180,16 +194,12 @@ module Sunbro
|
|
180
194
|
end
|
181
195
|
|
182
196
|
def refresh_connection(url)
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
http.use_ssl = true
|
189
|
-
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
190
|
-
end
|
197
|
+
@connections[url.host][url.port] = RestClient::Resource.new(
|
198
|
+
"#{url.scheme}://#{url.host}",
|
199
|
+
timeout: read_timeout || 5,
|
200
|
+
verify_ssl: OpenSSL::SSL::VERIFY_NONE
|
201
|
+
)
|
191
202
|
|
192
|
-
@connections[url.host][url.port] = http.start
|
193
203
|
end
|
194
204
|
|
195
205
|
def verbose?
|
data/lib/sunbro/page.rb
CHANGED
@@ -98,7 +98,8 @@ module Sunbro
|
|
98
98
|
# The content-type returned by the HTTP request for this page
|
99
99
|
#
|
100
100
|
def content_type
|
101
|
-
headers['content-type'].first
|
101
|
+
return headers['content-type'].first if headers['content-type'].first.present?
|
102
|
+
headers['content_type']
|
102
103
|
end
|
103
104
|
|
104
105
|
#
|
data/lib/sunbro/settings.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'hashie'
|
2
|
-
|
3
1
|
module Sunbro
|
4
2
|
module Settings
|
5
3
|
|
@@ -8,29 +6,62 @@ module Sunbro
|
|
8
6
|
phantomjs_user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X)",
|
9
7
|
page_format: :auto
|
10
8
|
}
|
9
|
+
|
10
|
+
class SettingsData < Struct.new(:user_agent, :proxy_url, :proxy_host, :proxy_port, :phantomjs_user_agent, :page_format);end
|
11
11
|
|
12
12
|
def self.configure
|
13
|
-
|
14
|
-
yield
|
13
|
+
@configuration ||= SettingsData.new
|
14
|
+
yield @configuration
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.proxy_url
|
18
|
+
return unless configured?
|
19
|
+
if @configuration.proxy_url
|
20
|
+
@configuration.proxy_url
|
21
|
+
elsif @configuration.proxy_host
|
22
|
+
if @configuration.proxy_port
|
23
|
+
"http://#{@configuration.proxy_host}:#{@configuration.proxy_port}/"
|
24
|
+
else
|
25
|
+
"http://#{@configuration.proxy_host}/"
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def self.proxy_host
|
31
|
+
return unless configured?
|
32
|
+
if @configuration.proxy_url
|
33
|
+
@configuration.proxy_host = URI.parse(proxy_url).host
|
34
|
+
else
|
35
|
+
@configuration.proxy_host
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def self.proxy_port
|
40
|
+
return unless configured?
|
41
|
+
if @configuration.proxy_url
|
42
|
+
@configuration.proxy_port = URI.parse(proxy_url).port
|
43
|
+
else
|
44
|
+
@configuration.proxy_port
|
45
|
+
end
|
15
46
|
end
|
16
47
|
|
17
48
|
def self.user_agent
|
18
|
-
return DEFAULTS[:user_agent] unless configured?
|
19
|
-
|
49
|
+
return DEFAULTS[:user_agent] unless configured? && @configuration.user_agent
|
50
|
+
@configuration.user_agent
|
20
51
|
end
|
21
52
|
|
22
53
|
def self.phantomjs_user_agent
|
23
|
-
return DEFAULTS[:phantomjs_user_agent] unless configured?
|
24
|
-
|
54
|
+
return DEFAULTS[:phantomjs_user_agent] unless configured? && @configuration.phantomjs_user_agent
|
55
|
+
@configuration.phantomjs_user_agent
|
25
56
|
end
|
26
57
|
|
27
58
|
def self.page_format
|
28
|
-
return DEFAULTS[:page_format] unless configured?
|
29
|
-
|
59
|
+
return DEFAULTS[:page_format] unless configured? && @configuration.page_format
|
60
|
+
@configuration.page_format
|
30
61
|
end
|
31
62
|
|
32
63
|
def self.configured?
|
33
|
-
|
64
|
+
!!@configuration
|
34
65
|
end
|
35
66
|
end
|
36
67
|
end
|
data/lib/sunbro/version.rb
CHANGED
data/spec/page_spec.rb
CHANGED
@@ -1,23 +1,33 @@
|
|
1
1
|
require 'spec_helper'
|
2
|
-
require '
|
2
|
+
require 'open-uri'
|
3
3
|
|
4
4
|
describe Sunbro::Page do
|
5
5
|
|
6
6
|
before :each do
|
7
7
|
@http = Sunbro::HTTP.new(verbose: true)
|
8
|
+
@body = "<html><head><title>Title</title></head><body><p>Body text</p></body></html>"
|
8
9
|
|
9
10
|
Mocktra("www.retailer.com") do
|
10
11
|
get '/1.html' do
|
11
12
|
"<html><head><title>Title</title></head><body><p>Body text</p></body></html>"
|
12
13
|
end
|
14
|
+
|
15
|
+
get '/invalid.html' do
|
16
|
+
"<html><head><title>Title</title></head><body><p>Body text</p></body></html>\255".force_encoding('UTF-8')
|
17
|
+
end
|
13
18
|
end
|
14
19
|
end
|
15
20
|
|
16
21
|
describe "#initialize" do
|
17
|
-
it "it scrubs invalid UTF-8 from @body
|
18
|
-
|
19
|
-
|
20
|
-
|
22
|
+
it "it scrubs invalid UTF-8 from @body" do
|
23
|
+
url = "http://www.retailer.com/invalid.html"
|
24
|
+
|
25
|
+
page = @http.fetch_page(url)
|
26
|
+
expect(page.body.present?).to eq(true)
|
27
|
+
expect(page.url.to_s).to eq(url)
|
28
|
+
expect(page.redirect_to).to be_nil
|
29
|
+
expect(page.redirect_from).to be_nil
|
30
|
+
|
21
31
|
end
|
22
32
|
end
|
23
33
|
|
@@ -26,6 +36,7 @@ describe Sunbro::Page do
|
|
26
36
|
url = "http://www.retailer.com/1.html"
|
27
37
|
|
28
38
|
page = @http.fetch_page(url)
|
39
|
+
expect(page.body).to eq(@body)
|
29
40
|
expect(page.url.to_s).to eq(url)
|
30
41
|
expect(page.redirect_to).to be_nil
|
31
42
|
expect(page.redirect_from).to be_nil
|
data/spec/settings_spec.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
describe Sunbro::Settings do
|
5
|
+
|
6
|
+
before :each do
|
7
|
+
@proxy_url = 'http://proxy1.oddwonout.com:8888/'
|
8
|
+
Sunbro::Settings.configure {|c| c.proxy_url = @proxy_url}
|
9
|
+
end
|
10
|
+
|
11
|
+
describe '::proxy_url' do
|
12
|
+
it 'returns the proxy url' do
|
13
|
+
expect(Sunbro::Settings.proxy_url).to eq(@proxy_url)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
describe '::proxy_host' do
|
18
|
+
it 'returns the proxy host' do
|
19
|
+
expect(Sunbro::Settings.proxy_host).to eq('proxy1.oddwonout.com')
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe '::proxy_port' do
|
24
|
+
it 'returns the proxy port' do
|
25
|
+
expect(Sunbro::Settings.proxy_port).to eq(8888)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
data/spec/spec_helper.rb
CHANGED
data/sunbro.gemspec
CHANGED
@@ -21,9 +21,9 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.add_dependency "nokogiri"
|
22
22
|
spec.add_dependency "capybara"
|
23
23
|
spec.add_dependency "poltergeist"
|
24
|
-
spec.add_dependency "
|
24
|
+
spec.add_dependency "rest-client"
|
25
25
|
spec.add_dependency "activesupport"
|
26
|
-
spec.add_dependency "
|
26
|
+
spec.add_dependency "retryable"
|
27
27
|
|
28
28
|
spec.add_development_dependency "bundler", "~> 1.5"
|
29
29
|
spec.add_development_dependency "rake"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sunbro
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jon Stokes
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-03-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -58,7 +58,7 @@ dependencies:
|
|
58
58
|
- - '>='
|
59
59
|
- !ruby/object:Gem::Version
|
60
60
|
version: '0'
|
61
|
-
name:
|
61
|
+
name: rest-client
|
62
62
|
prerelease: false
|
63
63
|
type: :runtime
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -86,7 +86,7 @@ dependencies:
|
|
86
86
|
- - '>='
|
87
87
|
- !ruby/object:Gem::Version
|
88
88
|
version: '0'
|
89
|
-
name:
|
89
|
+
name: retryable
|
90
90
|
prerelease: false
|
91
91
|
type: :runtime
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -166,10 +166,12 @@ files:
|
|
166
166
|
- lib/sunbro/connection.rb
|
167
167
|
- lib/sunbro/dynamic_http.rb
|
168
168
|
- lib/sunbro/http.rb
|
169
|
+
- lib/sunbro/initialize.rb
|
169
170
|
- lib/sunbro/page.rb
|
170
171
|
- lib/sunbro/settings.rb
|
171
172
|
- lib/sunbro/version.rb
|
172
173
|
- spec/page_spec.rb
|
174
|
+
- spec/settings_spec.rb
|
173
175
|
- spec/spec_helper.rb
|
174
176
|
- sunbro.gemspec
|
175
177
|
- tasks/rspec.rake
|
@@ -193,10 +195,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
193
195
|
version: '0'
|
194
196
|
requirements: []
|
195
197
|
rubyforge_project:
|
196
|
-
rubygems_version: 2.
|
198
|
+
rubygems_version: 2.1.9
|
197
199
|
signing_key:
|
198
200
|
specification_version: 4
|
199
201
|
summary: Some code that I use to crawl the web at scale. Shared in the spirit of jolly cooperation.
|
200
202
|
test_files:
|
201
203
|
- spec/page_spec.rb
|
204
|
+
- spec/settings_spec.rb
|
202
205
|
- spec/spec_helper.rb
|