sunbro 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/sunbro.rb +4 -1
- data/lib/sunbro/dynamic_http.rb +22 -4
- data/lib/sunbro/http.rb +42 -32
- data/lib/sunbro/initialize.rb +4 -0
- data/lib/sunbro/page.rb +2 -1
- data/lib/sunbro/settings.rb +42 -11
- data/lib/sunbro/version.rb +1 -1
- data/spec/page_spec.rb +16 -5
- data/spec/settings_spec.rb +29 -0
- data/spec/spec_helper.rb +2 -3
- data/sunbro.gemspec +2 -2
- metadata +8 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d1de54ea6b7adab57393a8a220e6ce2715a1875d
|
4
|
+
data.tar.gz: 6d1b4fa82d615a15320b9a2281dba34c44f6b4f3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d658cb22b7b57abd492ffac8f2b0b4173c6f806ed93a4d28fa699cbce6178bd97fa87af64778e88eb841bd1f59c1efe0c0e3912377046709104ddc85a48a6ba1
|
7
|
+
data.tar.gz: e96e191c25f0145f1e540237fc2c59dc8dc390871c23df01466186d15d539bb1d76ef8becc113bce6726fa627799886718f1da0bb36480925fae92ae36ea312e
|
data/lib/sunbro.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'capybara/poltergeist'
|
3
|
-
require '
|
3
|
+
require 'rest-client'
|
4
4
|
require 'webrick/cookie'
|
5
|
+
require 'active_support/all'
|
6
|
+
require 'retryable'
|
5
7
|
|
6
8
|
%w(
|
7
9
|
sunbro/version
|
@@ -9,6 +11,7 @@ require 'webrick/cookie'
|
|
9
11
|
sunbro/dynamic_http
|
10
12
|
sunbro/http
|
11
13
|
sunbro/page
|
14
|
+
sunbro/initialize
|
12
15
|
).each do |f|
|
13
16
|
require f
|
14
17
|
end
|
data/lib/sunbro/dynamic_http.rb
CHANGED
@@ -1,22 +1,26 @@
|
|
1
1
|
module Sunbro
|
2
2
|
class DynamicHTTP
|
3
|
+
|
3
4
|
attr_reader :session
|
4
5
|
|
5
6
|
def initialize(opts = {})
|
6
7
|
@opts = opts
|
7
|
-
new_session
|
8
|
+
Retryable.retryable { new_session }
|
8
9
|
end
|
9
10
|
|
10
11
|
def close
|
11
12
|
@session.driver.quit
|
13
|
+
rescue IOError
|
14
|
+
nil
|
12
15
|
end
|
13
16
|
|
14
17
|
def new_session
|
15
18
|
Capybara.register_driver :poltergeist do |app|
|
16
19
|
Capybara::Poltergeist::Driver.new(
|
17
20
|
app,
|
21
|
+
timeout: 10,
|
18
22
|
js_errors: false,
|
19
|
-
phantomjs_options:
|
23
|
+
phantomjs_options: phantomjs_options,
|
20
24
|
)
|
21
25
|
end
|
22
26
|
Capybara.default_driver = :poltergeist
|
@@ -29,13 +33,27 @@ module Sunbro
|
|
29
33
|
@session
|
30
34
|
end
|
31
35
|
|
36
|
+
def phantomjs_options
|
37
|
+
@phantomjs_options ||= begin
|
38
|
+
opts = [ '--load-images=no', '--ignore-ssl-errors=yes' ]
|
39
|
+
if Sunbro::Settings.proxy_host
|
40
|
+
if Sunbro::Settings.proxy_port
|
41
|
+
opts << "--proxy=#{Sunbro::Settings.proxy_host}:#{Sunbro::Settings.proxy_port}"
|
42
|
+
else
|
43
|
+
opts << "--proxy=#{Sunbro::Settings.proxy_host}"
|
44
|
+
end
|
45
|
+
end
|
46
|
+
opts
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
32
50
|
def user_agent
|
33
51
|
@opts[:agent] || Settings.phantomjs_user_agent
|
34
52
|
end
|
35
53
|
|
36
54
|
def restart_session
|
37
55
|
close
|
38
|
-
new_session
|
56
|
+
Retryable.retryable { new_session }
|
39
57
|
end
|
40
58
|
|
41
59
|
#
|
@@ -46,7 +64,7 @@ module Sunbro
|
|
46
64
|
begin
|
47
65
|
tries ||= 5
|
48
66
|
get_page(url, opts)
|
49
|
-
rescue Capybara::Poltergeist::DeadClient, Errno::EPIPE, NoMethodError, Capybara::Poltergeist::BrowserError => e
|
67
|
+
rescue IOError, Capybara::Poltergeist::DeadClient, Errno::EPIPE, NoMethodError, Capybara::Poltergeist::BrowserError => e
|
50
68
|
restart_session
|
51
69
|
retry unless (tries -= 1).zero?
|
52
70
|
close
|
data/lib/sunbro/http.rb
CHANGED
@@ -3,17 +3,23 @@ module Sunbro
|
|
3
3
|
# Maximum number of redirects to follow on each get_response
|
4
4
|
REDIRECT_LIMIT = 5
|
5
5
|
|
6
|
+
class RestResponse < Struct.new(:body, :headers, :code, :location)
|
7
|
+
def clean!
|
8
|
+
body.present?
|
9
|
+
rescue ArgumentError
|
10
|
+
body.encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
|
11
|
+
body.encode!('UTF-8', 'UTF-16')
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
6
15
|
def initialize(opts = {})
|
7
16
|
@connections = {}
|
8
17
|
@opts = opts
|
9
18
|
end
|
10
19
|
|
11
20
|
def close
|
12
|
-
|
13
|
-
|
14
|
-
connection.finish
|
15
|
-
end
|
16
|
-
end
|
21
|
+
# Deprecated with move to RestClient
|
22
|
+
true
|
17
23
|
end
|
18
24
|
|
19
25
|
#
|
@@ -44,15 +50,15 @@ module Sunbro
|
|
44
50
|
begin
|
45
51
|
url = convert_to_uri(url) unless url.is_a?(URI)
|
46
52
|
pages = []
|
47
|
-
get(url
|
48
|
-
pages << Page.new(location, :body
|
49
|
-
:code
|
50
|
-
:headers
|
51
|
-
:referer
|
52
|
-
:depth
|
53
|
-
:redirect_to
|
53
|
+
get(url) do |response, code, location, redirect_to, response_time|
|
54
|
+
pages << Page.new(location, :body => response.body.dup,
|
55
|
+
:code => code,
|
56
|
+
:headers => response.headers.stringify_keys,
|
57
|
+
:referer => referer,
|
58
|
+
:depth => depth,
|
59
|
+
:redirect_to => redirect_to,
|
54
60
|
:response_time => response_time,
|
55
|
-
:force_format
|
61
|
+
:force_format => force_format)
|
56
62
|
end
|
57
63
|
|
58
64
|
return pages
|
@@ -124,7 +130,7 @@ module Sunbro
|
|
124
130
|
# Yields the response object, response code, and URI location
|
125
131
|
# for each response.
|
126
132
|
#
|
127
|
-
def get(url
|
133
|
+
def get(url)
|
128
134
|
limit = redirect_limit
|
129
135
|
loc = url
|
130
136
|
begin
|
@@ -132,9 +138,9 @@ module Sunbro
|
|
132
138
|
# request url
|
133
139
|
loc = url.merge(loc) if loc.relative?
|
134
140
|
|
135
|
-
response, response_time = get_response(loc
|
141
|
+
response, response_time = get_response(loc)
|
136
142
|
code = Integer(response.code)
|
137
|
-
redirect_to =
|
143
|
+
redirect_to = 300.upto(307).include?(response['code']) ? URI(response['location']).normalize : nil
|
138
144
|
yield response, code, loc, redirect_to, response_time
|
139
145
|
limit -= 1
|
140
146
|
end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
|
@@ -147,19 +153,27 @@ module Sunbro
|
|
147
153
|
full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
|
148
154
|
|
149
155
|
opts = {}
|
150
|
-
opts[
|
151
|
-
|
156
|
+
opts[:headers] = {
|
157
|
+
user_agent: user_agent
|
158
|
+
} if user_agent
|
152
159
|
|
153
160
|
retries = 0
|
154
161
|
begin
|
155
162
|
start = Time.now()
|
156
|
-
|
157
|
-
|
158
|
-
#
|
159
|
-
|
160
|
-
|
163
|
+
response = RestResponse.new
|
164
|
+
|
165
|
+
# This causes RestClient to skip following the redirect automatically
|
166
|
+
connection(url)[full_path].get(opts) do |res, request, result|
|
167
|
+
response.body = res.body
|
168
|
+
response.headers = res.headers
|
169
|
+
response.code = res.code
|
170
|
+
response.location = res.headers[:location]
|
171
|
+
end
|
172
|
+
|
161
173
|
finish = Time.now()
|
162
174
|
response_time = ((finish - start) * 1000).round
|
175
|
+
response.clean!
|
176
|
+
|
163
177
|
return response, response_time
|
164
178
|
rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
|
165
179
|
puts e.inspect if verbose?
|
@@ -180,16 +194,12 @@ module Sunbro
|
|
180
194
|
end
|
181
195
|
|
182
196
|
def refresh_connection(url)
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
http.use_ssl = true
|
189
|
-
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
190
|
-
end
|
197
|
+
@connections[url.host][url.port] = RestClient::Resource.new(
|
198
|
+
"#{url.scheme}://#{url.host}",
|
199
|
+
timeout: read_timeout || 5,
|
200
|
+
verify_ssl: OpenSSL::SSL::VERIFY_NONE
|
201
|
+
)
|
191
202
|
|
192
|
-
@connections[url.host][url.port] = http.start
|
193
203
|
end
|
194
204
|
|
195
205
|
def verbose?
|
data/lib/sunbro/page.rb
CHANGED
@@ -98,7 +98,8 @@ module Sunbro
|
|
98
98
|
# The content-type returned by the HTTP request for this page
|
99
99
|
#
|
100
100
|
def content_type
|
101
|
-
headers['content-type'].first
|
101
|
+
return headers['content-type'].first if headers['content-type'].first.present?
|
102
|
+
headers['content_type']
|
102
103
|
end
|
103
104
|
|
104
105
|
#
|
data/lib/sunbro/settings.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
require 'hashie'
|
2
|
-
|
3
1
|
module Sunbro
|
4
2
|
module Settings
|
5
3
|
|
@@ -8,29 +6,62 @@ module Sunbro
|
|
8
6
|
phantomjs_user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X)",
|
9
7
|
page_format: :auto
|
10
8
|
}
|
9
|
+
|
10
|
+
class SettingsData < Struct.new(:user_agent, :proxy_url, :proxy_host, :proxy_port, :phantomjs_user_agent, :page_format);end
|
11
11
|
|
12
12
|
def self.configure
|
13
|
-
|
14
|
-
yield
|
13
|
+
@configuration ||= SettingsData.new
|
14
|
+
yield @configuration
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.proxy_url
|
18
|
+
return unless configured?
|
19
|
+
if @configuration.proxy_url
|
20
|
+
@configuration.proxy_url
|
21
|
+
elsif @configuration.proxy_host
|
22
|
+
if @configuration.proxy_port
|
23
|
+
"http://#{@configuration.proxy_host}:#{@configuration.proxy_port}/"
|
24
|
+
else
|
25
|
+
"http://#{@configuration.proxy_host}/"
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def self.proxy_host
|
31
|
+
return unless configured?
|
32
|
+
if @configuration.proxy_url
|
33
|
+
@configuration.proxy_host = URI.parse(proxy_url).host
|
34
|
+
else
|
35
|
+
@configuration.proxy_host
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def self.proxy_port
|
40
|
+
return unless configured?
|
41
|
+
if @configuration.proxy_url
|
42
|
+
@configuration.proxy_port = URI.parse(proxy_url).port
|
43
|
+
else
|
44
|
+
@configuration.proxy_port
|
45
|
+
end
|
15
46
|
end
|
16
47
|
|
17
48
|
def self.user_agent
|
18
|
-
return DEFAULTS[:user_agent] unless configured?
|
19
|
-
|
49
|
+
return DEFAULTS[:user_agent] unless configured? && @configuration.user_agent
|
50
|
+
@configuration.user_agent
|
20
51
|
end
|
21
52
|
|
22
53
|
def self.phantomjs_user_agent
|
23
|
-
return DEFAULTS[:phantomjs_user_agent] unless configured?
|
24
|
-
|
54
|
+
return DEFAULTS[:phantomjs_user_agent] unless configured? && @configuration.phantomjs_user_agent
|
55
|
+
@configuration.phantomjs_user_agent
|
25
56
|
end
|
26
57
|
|
27
58
|
def self.page_format
|
28
|
-
return DEFAULTS[:page_format] unless configured?
|
29
|
-
|
59
|
+
return DEFAULTS[:page_format] unless configured? && @configuration.page_format
|
60
|
+
@configuration.page_format
|
30
61
|
end
|
31
62
|
|
32
63
|
def self.configured?
|
33
|
-
|
64
|
+
!!@configuration
|
34
65
|
end
|
35
66
|
end
|
36
67
|
end
|
data/lib/sunbro/version.rb
CHANGED
data/spec/page_spec.rb
CHANGED
@@ -1,23 +1,33 @@
|
|
1
1
|
require 'spec_helper'
|
2
|
-
require '
|
2
|
+
require 'open-uri'
|
3
3
|
|
4
4
|
describe Sunbro::Page do
|
5
5
|
|
6
6
|
before :each do
|
7
7
|
@http = Sunbro::HTTP.new(verbose: true)
|
8
|
+
@body = "<html><head><title>Title</title></head><body><p>Body text</p></body></html>"
|
8
9
|
|
9
10
|
Mocktra("www.retailer.com") do
|
10
11
|
get '/1.html' do
|
11
12
|
"<html><head><title>Title</title></head><body><p>Body text</p></body></html>"
|
12
13
|
end
|
14
|
+
|
15
|
+
get '/invalid.html' do
|
16
|
+
"<html><head><title>Title</title></head><body><p>Body text</p></body></html>\255".force_encoding('UTF-8')
|
17
|
+
end
|
13
18
|
end
|
14
19
|
end
|
15
20
|
|
16
21
|
describe "#initialize" do
|
17
|
-
it "it scrubs invalid UTF-8 from @body
|
18
|
-
|
19
|
-
|
20
|
-
|
22
|
+
it "it scrubs invalid UTF-8 from @body" do
|
23
|
+
url = "http://www.retailer.com/invalid.html"
|
24
|
+
|
25
|
+
page = @http.fetch_page(url)
|
26
|
+
expect(page.body.present?).to eq(true)
|
27
|
+
expect(page.url.to_s).to eq(url)
|
28
|
+
expect(page.redirect_to).to be_nil
|
29
|
+
expect(page.redirect_from).to be_nil
|
30
|
+
|
21
31
|
end
|
22
32
|
end
|
23
33
|
|
@@ -26,6 +36,7 @@ describe Sunbro::Page do
|
|
26
36
|
url = "http://www.retailer.com/1.html"
|
27
37
|
|
28
38
|
page = @http.fetch_page(url)
|
39
|
+
expect(page.body).to eq(@body)
|
29
40
|
expect(page.url.to_s).to eq(url)
|
30
41
|
expect(page.redirect_to).to be_nil
|
31
42
|
expect(page.redirect_from).to be_nil
|
data/spec/settings_spec.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
describe Sunbro::Settings do
|
5
|
+
|
6
|
+
before :each do
|
7
|
+
@proxy_url = 'http://proxy1.oddwonout.com:8888/'
|
8
|
+
Sunbro::Settings.configure {|c| c.proxy_url = @proxy_url}
|
9
|
+
end
|
10
|
+
|
11
|
+
describe '::proxy_url' do
|
12
|
+
it 'returns the proxy url' do
|
13
|
+
expect(Sunbro::Settings.proxy_url).to eq(@proxy_url)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
describe '::proxy_host' do
|
18
|
+
it 'returns the proxy host' do
|
19
|
+
expect(Sunbro::Settings.proxy_host).to eq('proxy1.oddwonout.com')
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe '::proxy_port' do
|
24
|
+
it 'returns the proxy port' do
|
25
|
+
expect(Sunbro::Settings.proxy_port).to eq(8888)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
data/spec/spec_helper.rb
CHANGED
data/sunbro.gemspec
CHANGED
@@ -21,9 +21,9 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.add_dependency "nokogiri"
|
22
22
|
spec.add_dependency "capybara"
|
23
23
|
spec.add_dependency "poltergeist"
|
24
|
-
spec.add_dependency "
|
24
|
+
spec.add_dependency "rest-client"
|
25
25
|
spec.add_dependency "activesupport"
|
26
|
-
spec.add_dependency "
|
26
|
+
spec.add_dependency "retryable"
|
27
27
|
|
28
28
|
spec.add_development_dependency "bundler", "~> 1.5"
|
29
29
|
spec.add_development_dependency "rake"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sunbro
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jon Stokes
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-03-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -58,7 +58,7 @@ dependencies:
|
|
58
58
|
- - '>='
|
59
59
|
- !ruby/object:Gem::Version
|
60
60
|
version: '0'
|
61
|
-
name:
|
61
|
+
name: rest-client
|
62
62
|
prerelease: false
|
63
63
|
type: :runtime
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -86,7 +86,7 @@ dependencies:
|
|
86
86
|
- - '>='
|
87
87
|
- !ruby/object:Gem::Version
|
88
88
|
version: '0'
|
89
|
-
name:
|
89
|
+
name: retryable
|
90
90
|
prerelease: false
|
91
91
|
type: :runtime
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -166,10 +166,12 @@ files:
|
|
166
166
|
- lib/sunbro/connection.rb
|
167
167
|
- lib/sunbro/dynamic_http.rb
|
168
168
|
- lib/sunbro/http.rb
|
169
|
+
- lib/sunbro/initialize.rb
|
169
170
|
- lib/sunbro/page.rb
|
170
171
|
- lib/sunbro/settings.rb
|
171
172
|
- lib/sunbro/version.rb
|
172
173
|
- spec/page_spec.rb
|
174
|
+
- spec/settings_spec.rb
|
173
175
|
- spec/spec_helper.rb
|
174
176
|
- sunbro.gemspec
|
175
177
|
- tasks/rspec.rake
|
@@ -193,10 +195,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
193
195
|
version: '0'
|
194
196
|
requirements: []
|
195
197
|
rubyforge_project:
|
196
|
-
rubygems_version: 2.
|
198
|
+
rubygems_version: 2.1.9
|
197
199
|
signing_key:
|
198
200
|
specification_version: 4
|
199
201
|
summary: Some code that I use to crawl the web at scale. Shared in the spirit of jolly cooperation.
|
200
202
|
test_files:
|
201
203
|
- spec/page_spec.rb
|
204
|
+
- spec/settings_spec.rb
|
202
205
|
- spec/spec_helper.rb
|