sunbro 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 90a10b3e643f709e46b6a0cefe53e09755a333df
4
- data.tar.gz: 124e852db53309e2284fb3daf372b61b30af669e
3
+ metadata.gz: d1de54ea6b7adab57393a8a220e6ce2715a1875d
4
+ data.tar.gz: 6d1b4fa82d615a15320b9a2281dba34c44f6b4f3
5
5
  SHA512:
6
- metadata.gz: 5294a0e2819a7ca39c3e59f73952c7b5ae73569a6337bc24128ef77e45d905de18ec2796648e53cfe53d26012a38523b815fe0c37e81a33d33302da591e0dfc5
7
- data.tar.gz: f4596d13a476704c094b1661e5c1a7297f0aa487984c7363c1eeb558853f7d30833dd238539546902ae178e6b0ebaa7c9f127400438100c34a639807bb263277
6
+ metadata.gz: d658cb22b7b57abd492ffac8f2b0b4173c6f806ed93a4d28fa699cbce6178bd97fa87af64778e88eb841bd1f59c1efe0c0e3912377046709104ddc85a48a6ba1
7
+ data.tar.gz: e96e191c25f0145f1e540237fc2c59dc8dc390871c23df01466186d15d539bb1d76ef8becc113bce6726fa627799886718f1da0bb36480925fae92ae36ea312e
@@ -1,7 +1,9 @@
1
1
  require 'nokogiri'
2
2
  require 'capybara/poltergeist'
3
- require 'net/http/persistent'
3
+ require 'rest-client'
4
4
  require 'webrick/cookie'
5
+ require 'active_support/all'
6
+ require 'retryable'
5
7
 
6
8
  %w(
7
9
  sunbro/version
@@ -9,6 +11,7 @@ require 'webrick/cookie'
9
11
  sunbro/dynamic_http
10
12
  sunbro/http
11
13
  sunbro/page
14
+ sunbro/initialize
12
15
  ).each do |f|
13
16
  require f
14
17
  end
@@ -1,22 +1,26 @@
1
1
  module Sunbro
2
2
  class DynamicHTTP
3
+
3
4
  attr_reader :session
4
5
 
5
6
  def initialize(opts = {})
6
7
  @opts = opts
7
- new_session
8
+ Retryable.retryable { new_session }
8
9
  end
9
10
 
10
11
  def close
11
12
  @session.driver.quit
13
+ rescue IOError
14
+ nil
12
15
  end
13
16
 
14
17
  def new_session
15
18
  Capybara.register_driver :poltergeist do |app|
16
19
  Capybara::Poltergeist::Driver.new(
17
20
  app,
21
+ timeout: 10,
18
22
  js_errors: false,
19
- phantomjs_options: ['--load-images=no', '--ignore-ssl-errors=yes']
23
+ phantomjs_options: phantomjs_options,
20
24
  )
21
25
  end
22
26
  Capybara.default_driver = :poltergeist
@@ -29,13 +33,27 @@ module Sunbro
29
33
  @session
30
34
  end
31
35
 
36
+ def phantomjs_options
37
+ @phantomjs_options ||= begin
38
+ opts = [ '--load-images=no', '--ignore-ssl-errors=yes' ]
39
+ if Sunbro::Settings.proxy_host
40
+ if Sunbro::Settings.proxy_port
41
+ opts << "--proxy=#{Sunbro::Settings.proxy_host}:#{Sunbro::Settings.proxy_port}"
42
+ else
43
+ opts << "--proxy=#{Sunbro::Settings.proxy_host}"
44
+ end
45
+ end
46
+ opts
47
+ end
48
+ end
49
+
32
50
  def user_agent
33
51
  @opts[:agent] || Settings.phantomjs_user_agent
34
52
  end
35
53
 
36
54
  def restart_session
37
55
  close
38
- new_session
56
+ Retryable.retryable { new_session }
39
57
  end
40
58
 
41
59
  #
@@ -46,7 +64,7 @@ module Sunbro
46
64
  begin
47
65
  tries ||= 5
48
66
  get_page(url, opts)
49
- rescue Capybara::Poltergeist::DeadClient, Errno::EPIPE, NoMethodError, Capybara::Poltergeist::BrowserError => e
67
+ rescue IOError, Capybara::Poltergeist::DeadClient, Errno::EPIPE, NoMethodError, Capybara::Poltergeist::BrowserError => e
50
68
  restart_session
51
69
  retry unless (tries -= 1).zero?
52
70
  close
@@ -3,17 +3,23 @@ module Sunbro
3
3
  # Maximum number of redirects to follow on each get_response
4
4
  REDIRECT_LIMIT = 5
5
5
 
6
+ class RestResponse < Struct.new(:body, :headers, :code, :location)
7
+ def clean!
8
+ body.present?
9
+ rescue ArgumentError
10
+ body.encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
11
+ body.encode!('UTF-8', 'UTF-16')
12
+ end
13
+ end
14
+
6
15
  def initialize(opts = {})
7
16
  @connections = {}
8
17
  @opts = opts
9
18
  end
10
19
 
11
20
  def close
12
- @connections.each do |host, ports|
13
- ports.each do |port, connection|
14
- connection.finish
15
- end
16
- end
21
+ # Deprecated with move to RestClient
22
+ true
17
23
  end
18
24
 
19
25
  #
@@ -44,15 +50,15 @@ module Sunbro
44
50
  begin
45
51
  url = convert_to_uri(url) unless url.is_a?(URI)
46
52
  pages = []
47
- get(url, referer) do |response, code, location, redirect_to, response_time|
48
- pages << Page.new(location, :body => response.body.dup,
49
- :code => code,
50
- :headers => response.to_hash,
51
- :referer => referer,
52
- :depth => depth,
53
- :redirect_to => redirect_to,
53
+ get(url) do |response, code, location, redirect_to, response_time|
54
+ pages << Page.new(location, :body => response.body.dup,
55
+ :code => code,
56
+ :headers => response.headers.stringify_keys,
57
+ :referer => referer,
58
+ :depth => depth,
59
+ :redirect_to => redirect_to,
54
60
  :response_time => response_time,
55
- :force_format => force_format)
61
+ :force_format => force_format)
56
62
  end
57
63
 
58
64
  return pages
@@ -124,7 +130,7 @@ module Sunbro
124
130
  # Yields the response object, response code, and URI location
125
131
  # for each response.
126
132
  #
127
- def get(url, referer = nil)
133
+ def get(url)
128
134
  limit = redirect_limit
129
135
  loc = url
130
136
  begin
@@ -132,9 +138,9 @@ module Sunbro
132
138
  # request url
133
139
  loc = url.merge(loc) if loc.relative?
134
140
 
135
- response, response_time = get_response(loc, referer)
141
+ response, response_time = get_response(loc)
136
142
  code = Integer(response.code)
137
- redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
143
+ redirect_to = 300.upto(307).include?(response['code']) ? URI(response['location']).normalize : nil
138
144
  yield response, code, loc, redirect_to, response_time
139
145
  limit -= 1
140
146
  end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
@@ -147,19 +153,27 @@ module Sunbro
147
153
  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
148
154
 
149
155
  opts = {}
150
- opts['User-Agent'] = user_agent if user_agent
151
- opts['Referer'] = referer.to_s if referer
156
+ opts[:headers] = {
157
+ user_agent: user_agent
158
+ } if user_agent
152
159
 
153
160
  retries = 0
154
161
  begin
155
162
  start = Time.now()
156
- # format request
157
- req = Net::HTTP::Get.new(full_path, opts)
158
- # HTTP Basic authentication
159
- req.basic_auth url.user, url.password if url.user
160
- response = connection(url).request(req)
163
+ response = RestResponse.new
164
+
165
+ # This causes RestClient to skip following the redirect automatically
166
+ connection(url)[full_path].get(opts) do |res, request, result|
167
+ response.body = res.body
168
+ response.headers = res.headers
169
+ response.code = res.code
170
+ response.location = res.headers[:location]
171
+ end
172
+
161
173
  finish = Time.now()
162
174
  response_time = ((finish - start) * 1000).round
175
+ response.clean!
176
+
163
177
  return response, response_time
164
178
  rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
165
179
  puts e.inspect if verbose?
@@ -180,16 +194,12 @@ module Sunbro
180
194
  end
181
195
 
182
196
  def refresh_connection(url)
183
- http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)
184
-
185
- http.read_timeout = read_timeout if !!read_timeout
186
-
187
- if url.scheme == 'https'
188
- http.use_ssl = true
189
- http.verify_mode = OpenSSL::SSL::VERIFY_NONE
190
- end
197
+ @connections[url.host][url.port] = RestClient::Resource.new(
198
+ "#{url.scheme}://#{url.host}",
199
+ timeout: read_timeout || 5,
200
+ verify_ssl: OpenSSL::SSL::VERIFY_NONE
201
+ )
191
202
 
192
- @connections[url.host][url.port] = http.start
193
203
  end
194
204
 
195
205
  def verbose?
@@ -0,0 +1,4 @@
1
+ if Sunbro::Settings.proxy_url
2
+ puts "## Setting RestClient proxy to #{Sunbro::Settings.proxy_url}"
3
+ RestClient.proxy = Sunbro::Settings.proxy_url
4
+ end
@@ -98,7 +98,8 @@ module Sunbro
98
98
  # The content-type returned by the HTTP request for this page
99
99
  #
100
100
  def content_type
101
- headers['content-type'].first
101
+ return headers['content-type'].first if headers['content-type'].first.present?
102
+ headers['content_type']
102
103
  end
103
104
 
104
105
  #
@@ -1,5 +1,3 @@
1
- require 'hashie'
2
-
3
1
  module Sunbro
4
2
  module Settings
5
3
 
@@ -8,29 +6,62 @@ module Sunbro
8
6
  phantomjs_user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X)",
9
7
  page_format: :auto
10
8
  }
9
+
10
+ class SettingsData < Struct.new(:user_agent, :proxy_url, :proxy_host, :proxy_port, :phantomjs_user_agent, :page_format);end
11
11
 
12
12
  def self.configure
13
- $sunbro_configuration ||= Hashie::Mash.new
14
- yield $sunbro_configuration
13
+ @configuration ||= SettingsData.new
14
+ yield @configuration
15
+ end
16
+
17
+ def self.proxy_url
18
+ return unless configured?
19
+ if @configuration.proxy_url
20
+ @configuration.proxy_url
21
+ elsif @configuration.proxy_host
22
+ if @configuration.proxy_port
23
+ "http://#{@configuration.proxy_host}:#{@configuration.proxy_port}/"
24
+ else
25
+ "http://#{@configuration.proxy_host}/"
26
+ end
27
+ end
28
+ end
29
+
30
+ def self.proxy_host
31
+ return unless configured?
32
+ if @configuration.proxy_url
33
+ @configuration.proxy_host = URI.parse(proxy_url).host
34
+ else
35
+ @configuration.proxy_host
36
+ end
37
+ end
38
+
39
+ def self.proxy_port
40
+ return unless configured?
41
+ if @configuration.proxy_url
42
+ @configuration.proxy_port = URI.parse(proxy_url).port
43
+ else
44
+ @configuration.proxy_port
45
+ end
15
46
  end
16
47
 
17
48
  def self.user_agent
18
- return DEFAULTS[:user_agent] unless configured?
19
- $sunbro_configuration.user_agent
49
+ return DEFAULTS[:user_agent] unless configured? && @configuration.user_agent
50
+ @configuration.user_agent
20
51
  end
21
52
 
22
53
  def self.phantomjs_user_agent
23
- return DEFAULTS[:phantomjs_user_agent] unless configured?
24
- $sunbro_configuration.phantomjs_user_agent
54
+ return DEFAULTS[:phantomjs_user_agent] unless configured? && @configuration.phantomjs_user_agent
55
+ @configuration.phantomjs_user_agent
25
56
  end
26
57
 
27
58
  def self.page_format
28
- return DEFAULTS[:page_format] unless configured?
29
- $sunbro_configuration.page_format
59
+ return DEFAULTS[:page_format] unless configured? && @configuration.page_format
60
+ @configuration.page_format
30
61
  end
31
62
 
32
63
  def self.configured?
33
- !!$sunbro_configuration
64
+ !!@configuration
34
65
  end
35
66
  end
36
67
  end
@@ -1,3 +1,3 @@
1
1
  module Sunbro
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.2"
3
3
  end
@@ -1,23 +1,33 @@
1
1
  require 'spec_helper'
2
- require 'mocktra'
2
+ require 'open-uri'
3
3
 
4
4
  describe Sunbro::Page do
5
5
 
6
6
  before :each do
7
7
  @http = Sunbro::HTTP.new(verbose: true)
8
+ @body = "<html><head><title>Title</title></head><body><p>Body text</p></body></html>"
8
9
 
9
10
  Mocktra("www.retailer.com") do
10
11
  get '/1.html' do
11
12
  "<html><head><title>Title</title></head><body><p>Body text</p></body></html>"
12
13
  end
14
+
15
+ get '/invalid.html' do
16
+ "<html><head><title>Title</title></head><body><p>Body text</p></body></html>\255".force_encoding('UTF-8')
17
+ end
13
18
  end
14
19
  end
15
20
 
16
21
  describe "#initialize" do
17
- it "it scrubs invalid UTF-8 from @body by converting to UTF-16, then back again" do
18
- # See http://stackoverflow.com/a/8873922/1169868
19
- pending "Example"
20
- fail
22
+ it "it scrubs invalid UTF-8 from @body" do
23
+ url = "http://www.retailer.com/invalid.html"
24
+
25
+ page = @http.fetch_page(url)
26
+ expect(page.body.present?).to eq(true)
27
+ expect(page.url.to_s).to eq(url)
28
+ expect(page.redirect_to).to be_nil
29
+ expect(page.redirect_from).to be_nil
30
+
21
31
  end
22
32
  end
23
33
 
@@ -26,6 +36,7 @@ describe Sunbro::Page do
26
36
  url = "http://www.retailer.com/1.html"
27
37
 
28
38
  page = @http.fetch_page(url)
39
+ expect(page.body).to eq(@body)
29
40
  expect(page.url.to_s).to eq(url)
30
41
  expect(page.redirect_to).to be_nil
31
42
  expect(page.redirect_from).to be_nil
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+ require 'open-uri'
3
+
4
+ describe Sunbro::Settings do
5
+
6
+ before :each do
7
+ @proxy_url = 'http://proxy1.oddwonout.com:8888/'
8
+ Sunbro::Settings.configure {|c| c.proxy_url = @proxy_url}
9
+ end
10
+
11
+ describe '::proxy_url' do
12
+ it 'returns the proxy url' do
13
+ expect(Sunbro::Settings.proxy_url).to eq(@proxy_url)
14
+ end
15
+ end
16
+
17
+ describe '::proxy_host' do
18
+ it 'returns the proxy host' do
19
+ expect(Sunbro::Settings.proxy_host).to eq('proxy1.oddwonout.com')
20
+ end
21
+ end
22
+
23
+ describe '::proxy_port' do
24
+ it 'returns the proxy port' do
25
+ expect(Sunbro::Settings.proxy_port).to eq(8888)
26
+ end
27
+ end
28
+
29
+ end
@@ -1,6 +1,5 @@
1
- require 'rubygems'
2
- require 'bundler/setup'
3
- require 'active_support'
1
+ require 'rspec'
2
+ require 'mocktra'
4
3
  require 'sunbro'
5
4
 
6
5
  RSpec.configure do |config|
@@ -21,9 +21,9 @@ Gem::Specification.new do |spec|
21
21
  spec.add_dependency "nokogiri"
22
22
  spec.add_dependency "capybara"
23
23
  spec.add_dependency "poltergeist"
24
- spec.add_dependency "net-http-persistent"
24
+ spec.add_dependency "rest-client"
25
25
  spec.add_dependency "activesupport"
26
- spec.add_dependency "hashie"
26
+ spec.add_dependency "retryable"
27
27
 
28
28
  spec.add_development_dependency "bundler", "~> 1.5"
29
29
  spec.add_development_dependency "rake"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sunbro
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jon Stokes
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-09 00:00:00.000000000 Z
11
+ date: 2015-03-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -58,7 +58,7 @@ dependencies:
58
58
  - - '>='
59
59
  - !ruby/object:Gem::Version
60
60
  version: '0'
61
- name: net-http-persistent
61
+ name: rest-client
62
62
  prerelease: false
63
63
  type: :runtime
64
64
  version_requirements: !ruby/object:Gem::Requirement
@@ -86,7 +86,7 @@ dependencies:
86
86
  - - '>='
87
87
  - !ruby/object:Gem::Version
88
88
  version: '0'
89
- name: hashie
89
+ name: retryable
90
90
  prerelease: false
91
91
  type: :runtime
92
92
  version_requirements: !ruby/object:Gem::Requirement
@@ -166,10 +166,12 @@ files:
166
166
  - lib/sunbro/connection.rb
167
167
  - lib/sunbro/dynamic_http.rb
168
168
  - lib/sunbro/http.rb
169
+ - lib/sunbro/initialize.rb
169
170
  - lib/sunbro/page.rb
170
171
  - lib/sunbro/settings.rb
171
172
  - lib/sunbro/version.rb
172
173
  - spec/page_spec.rb
174
+ - spec/settings_spec.rb
173
175
  - spec/spec_helper.rb
174
176
  - sunbro.gemspec
175
177
  - tasks/rspec.rake
@@ -193,10 +195,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
193
195
  version: '0'
194
196
  requirements: []
195
197
  rubyforge_project:
196
- rubygems_version: 2.4.5
198
+ rubygems_version: 2.1.9
197
199
  signing_key:
198
200
  specification_version: 4
199
201
  summary: Some code that I use to crawl the web at scale. Shared in the spirit of jolly cooperation.
200
202
  test_files:
201
203
  - spec/page_spec.rb
204
+ - spec/settings_spec.rb
202
205
  - spec/spec_helper.rb