sunbro 0.1.0 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 90a10b3e643f709e46b6a0cefe53e09755a333df
4
- data.tar.gz: 124e852db53309e2284fb3daf372b61b30af669e
3
+ metadata.gz: d1de54ea6b7adab57393a8a220e6ce2715a1875d
4
+ data.tar.gz: 6d1b4fa82d615a15320b9a2281dba34c44f6b4f3
5
5
  SHA512:
6
- metadata.gz: 5294a0e2819a7ca39c3e59f73952c7b5ae73569a6337bc24128ef77e45d905de18ec2796648e53cfe53d26012a38523b815fe0c37e81a33d33302da591e0dfc5
7
- data.tar.gz: f4596d13a476704c094b1661e5c1a7297f0aa487984c7363c1eeb558853f7d30833dd238539546902ae178e6b0ebaa7c9f127400438100c34a639807bb263277
6
+ metadata.gz: d658cb22b7b57abd492ffac8f2b0b4173c6f806ed93a4d28fa699cbce6178bd97fa87af64778e88eb841bd1f59c1efe0c0e3912377046709104ddc85a48a6ba1
7
+ data.tar.gz: e96e191c25f0145f1e540237fc2c59dc8dc390871c23df01466186d15d539bb1d76ef8becc113bce6726fa627799886718f1da0bb36480925fae92ae36ea312e
@@ -1,7 +1,9 @@
1
1
  require 'nokogiri'
2
2
  require 'capybara/poltergeist'
3
- require 'net/http/persistent'
3
+ require 'rest-client'
4
4
  require 'webrick/cookie'
5
+ require 'active_support/all'
6
+ require 'retryable'
5
7
 
6
8
  %w(
7
9
  sunbro/version
@@ -9,6 +11,7 @@ require 'webrick/cookie'
9
11
  sunbro/dynamic_http
10
12
  sunbro/http
11
13
  sunbro/page
14
+ sunbro/initialize
12
15
  ).each do |f|
13
16
  require f
14
17
  end
@@ -1,22 +1,26 @@
1
1
  module Sunbro
2
2
  class DynamicHTTP
3
+
3
4
  attr_reader :session
4
5
 
5
6
  def initialize(opts = {})
6
7
  @opts = opts
7
- new_session
8
+ Retryable.retryable { new_session }
8
9
  end
9
10
 
10
11
  def close
11
12
  @session.driver.quit
13
+ rescue IOError
14
+ nil
12
15
  end
13
16
 
14
17
  def new_session
15
18
  Capybara.register_driver :poltergeist do |app|
16
19
  Capybara::Poltergeist::Driver.new(
17
20
  app,
21
+ timeout: 10,
18
22
  js_errors: false,
19
- phantomjs_options: ['--load-images=no', '--ignore-ssl-errors=yes']
23
+ phantomjs_options: phantomjs_options,
20
24
  )
21
25
  end
22
26
  Capybara.default_driver = :poltergeist
@@ -29,13 +33,27 @@ module Sunbro
29
33
  @session
30
34
  end
31
35
 
36
+ def phantomjs_options
37
+ @phantomjs_options ||= begin
38
+ opts = [ '--load-images=no', '--ignore-ssl-errors=yes' ]
39
+ if Sunbro::Settings.proxy_host
40
+ if Sunbro::Settings.proxy_port
41
+ opts << "--proxy=#{Sunbro::Settings.proxy_host}:#{Sunbro::Settings.proxy_port}"
42
+ else
43
+ opts << "--proxy=#{Sunbro::Settings.proxy_host}"
44
+ end
45
+ end
46
+ opts
47
+ end
48
+ end
49
+
32
50
  def user_agent
33
51
  @opts[:agent] || Settings.phantomjs_user_agent
34
52
  end
35
53
 
36
54
  def restart_session
37
55
  close
38
- new_session
56
+ Retryable.retryable { new_session }
39
57
  end
40
58
 
41
59
  #
@@ -46,7 +64,7 @@ module Sunbro
46
64
  begin
47
65
  tries ||= 5
48
66
  get_page(url, opts)
49
- rescue Capybara::Poltergeist::DeadClient, Errno::EPIPE, NoMethodError, Capybara::Poltergeist::BrowserError => e
67
+ rescue IOError, Capybara::Poltergeist::DeadClient, Errno::EPIPE, NoMethodError, Capybara::Poltergeist::BrowserError => e
50
68
  restart_session
51
69
  retry unless (tries -= 1).zero?
52
70
  close
@@ -3,17 +3,23 @@ module Sunbro
3
3
  # Maximum number of redirects to follow on each get_response
4
4
  REDIRECT_LIMIT = 5
5
5
 
6
+ class RestResponse < Struct.new(:body, :headers, :code, :location)
7
+ def clean!
8
+ body.present?
9
+ rescue ArgumentError
10
+ body.encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
11
+ body.encode!('UTF-8', 'UTF-16')
12
+ end
13
+ end
14
+
6
15
  def initialize(opts = {})
7
16
  @connections = {}
8
17
  @opts = opts
9
18
  end
10
19
 
11
20
  def close
12
- @connections.each do |host, ports|
13
- ports.each do |port, connection|
14
- connection.finish
15
- end
16
- end
21
+ # Deprecated with move to RestClient
22
+ true
17
23
  end
18
24
 
19
25
  #
@@ -44,15 +50,15 @@ module Sunbro
44
50
  begin
45
51
  url = convert_to_uri(url) unless url.is_a?(URI)
46
52
  pages = []
47
- get(url, referer) do |response, code, location, redirect_to, response_time|
48
- pages << Page.new(location, :body => response.body.dup,
49
- :code => code,
50
- :headers => response.to_hash,
51
- :referer => referer,
52
- :depth => depth,
53
- :redirect_to => redirect_to,
53
+ get(url) do |response, code, location, redirect_to, response_time|
54
+ pages << Page.new(location, :body => response.body.dup,
55
+ :code => code,
56
+ :headers => response.headers.stringify_keys,
57
+ :referer => referer,
58
+ :depth => depth,
59
+ :redirect_to => redirect_to,
54
60
  :response_time => response_time,
55
- :force_format => force_format)
61
+ :force_format => force_format)
56
62
  end
57
63
 
58
64
  return pages
@@ -124,7 +130,7 @@ module Sunbro
124
130
  # Yields the response object, response code, and URI location
125
131
  # for each response.
126
132
  #
127
- def get(url, referer = nil)
133
+ def get(url)
128
134
  limit = redirect_limit
129
135
  loc = url
130
136
  begin
@@ -132,9 +138,9 @@ module Sunbro
132
138
  # request url
133
139
  loc = url.merge(loc) if loc.relative?
134
140
 
135
- response, response_time = get_response(loc, referer)
141
+ response, response_time = get_response(loc)
136
142
  code = Integer(response.code)
137
- redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
143
+ redirect_to = 300.upto(307).include?(response['code']) ? URI(response['location']).normalize : nil
138
144
  yield response, code, loc, redirect_to, response_time
139
145
  limit -= 1
140
146
  end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
@@ -147,19 +153,27 @@ module Sunbro
147
153
  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
148
154
 
149
155
  opts = {}
150
- opts['User-Agent'] = user_agent if user_agent
151
- opts['Referer'] = referer.to_s if referer
156
+ opts[:headers] = {
157
+ user_agent: user_agent
158
+ } if user_agent
152
159
 
153
160
  retries = 0
154
161
  begin
155
162
  start = Time.now()
156
- # format request
157
- req = Net::HTTP::Get.new(full_path, opts)
158
- # HTTP Basic authentication
159
- req.basic_auth url.user, url.password if url.user
160
- response = connection(url).request(req)
163
+ response = RestResponse.new
164
+
165
+ # This causes RestClient to skip following the redirect automatically
166
+ connection(url)[full_path].get(opts) do |res, request, result|
167
+ response.body = res.body
168
+ response.headers = res.headers
169
+ response.code = res.code
170
+ response.location = res.headers[:location]
171
+ end
172
+
161
173
  finish = Time.now()
162
174
  response_time = ((finish - start) * 1000).round
175
+ response.clean!
176
+
163
177
  return response, response_time
164
178
  rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
165
179
  puts e.inspect if verbose?
@@ -180,16 +194,12 @@ module Sunbro
180
194
  end
181
195
 
182
196
  def refresh_connection(url)
183
- http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)
184
-
185
- http.read_timeout = read_timeout if !!read_timeout
186
-
187
- if url.scheme == 'https'
188
- http.use_ssl = true
189
- http.verify_mode = OpenSSL::SSL::VERIFY_NONE
190
- end
197
+ @connections[url.host][url.port] = RestClient::Resource.new(
198
+ "#{url.scheme}://#{url.host}",
199
+ timeout: read_timeout || 5,
200
+ verify_ssl: OpenSSL::SSL::VERIFY_NONE
201
+ )
191
202
 
192
- @connections[url.host][url.port] = http.start
193
203
  end
194
204
 
195
205
  def verbose?
@@ -0,0 +1,4 @@
1
+ if Sunbro::Settings.proxy_url
2
+ puts "## Setting RestClient proxy to #{Sunbro::Settings.proxy_url}"
3
+ RestClient.proxy = Sunbro::Settings.proxy_url
4
+ end
@@ -98,7 +98,8 @@ module Sunbro
98
98
  # The content-type returned by the HTTP request for this page
99
99
  #
100
100
  def content_type
101
- headers['content-type'].first
101
+ return headers['content-type'].first if headers['content-type'].first.present?
102
+ headers['content_type']
102
103
  end
103
104
 
104
105
  #
@@ -1,5 +1,3 @@
1
- require 'hashie'
2
-
3
1
  module Sunbro
4
2
  module Settings
5
3
 
@@ -8,29 +6,62 @@ module Sunbro
8
6
  phantomjs_user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X)",
9
7
  page_format: :auto
10
8
  }
9
+
10
+ class SettingsData < Struct.new(:user_agent, :proxy_url, :proxy_host, :proxy_port, :phantomjs_user_agent, :page_format);end
11
11
 
12
12
  def self.configure
13
- $sunbro_configuration ||= Hashie::Mash.new
14
- yield $sunbro_configuration
13
+ @configuration ||= SettingsData.new
14
+ yield @configuration
15
+ end
16
+
17
+ def self.proxy_url
18
+ return unless configured?
19
+ if @configuration.proxy_url
20
+ @configuration.proxy_url
21
+ elsif @configuration.proxy_host
22
+ if @configuration.proxy_port
23
+ "http://#{@configuration.proxy_host}:#{@configuration.proxy_port}/"
24
+ else
25
+ "http://#{@configuration.proxy_host}/"
26
+ end
27
+ end
28
+ end
29
+
30
+ def self.proxy_host
31
+ return unless configured?
32
+ if @configuration.proxy_url
33
+ @configuration.proxy_host = URI.parse(proxy_url).host
34
+ else
35
+ @configuration.proxy_host
36
+ end
37
+ end
38
+
39
+ def self.proxy_port
40
+ return unless configured?
41
+ if @configuration.proxy_url
42
+ @configuration.proxy_port = URI.parse(proxy_url).port
43
+ else
44
+ @configuration.proxy_port
45
+ end
15
46
  end
16
47
 
17
48
  def self.user_agent
18
- return DEFAULTS[:user_agent] unless configured?
19
- $sunbro_configuration.user_agent
49
+ return DEFAULTS[:user_agent] unless configured? && @configuration.user_agent
50
+ @configuration.user_agent
20
51
  end
21
52
 
22
53
  def self.phantomjs_user_agent
23
- return DEFAULTS[:phantomjs_user_agent] unless configured?
24
- $sunbro_configuration.phantomjs_user_agent
54
+ return DEFAULTS[:phantomjs_user_agent] unless configured? && @configuration.phantomjs_user_agent
55
+ @configuration.phantomjs_user_agent
25
56
  end
26
57
 
27
58
  def self.page_format
28
- return DEFAULTS[:page_format] unless configured?
29
- $sunbro_configuration.page_format
59
+ return DEFAULTS[:page_format] unless configured? && @configuration.page_format
60
+ @configuration.page_format
30
61
  end
31
62
 
32
63
  def self.configured?
33
- !!$sunbro_configuration
64
+ !!@configuration
34
65
  end
35
66
  end
36
67
  end
@@ -1,3 +1,3 @@
1
1
  module Sunbro
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.2"
3
3
  end
@@ -1,23 +1,33 @@
1
1
  require 'spec_helper'
2
- require 'mocktra'
2
+ require 'open-uri'
3
3
 
4
4
  describe Sunbro::Page do
5
5
 
6
6
  before :each do
7
7
  @http = Sunbro::HTTP.new(verbose: true)
8
+ @body = "<html><head><title>Title</title></head><body><p>Body text</p></body></html>"
8
9
 
9
10
  Mocktra("www.retailer.com") do
10
11
  get '/1.html' do
11
12
  "<html><head><title>Title</title></head><body><p>Body text</p></body></html>"
12
13
  end
14
+
15
+ get '/invalid.html' do
16
+ "<html><head><title>Title</title></head><body><p>Body text</p></body></html>\255".force_encoding('UTF-8')
17
+ end
13
18
  end
14
19
  end
15
20
 
16
21
  describe "#initialize" do
17
- it "it scrubs invalid UTF-8 from @body by converting to UTF-16, then back again" do
18
- # See http://stackoverflow.com/a/8873922/1169868
19
- pending "Example"
20
- fail
22
+ it "it scrubs invalid UTF-8 from @body" do
23
+ url = "http://www.retailer.com/invalid.html"
24
+
25
+ page = @http.fetch_page(url)
26
+ expect(page.body.present?).to eq(true)
27
+ expect(page.url.to_s).to eq(url)
28
+ expect(page.redirect_to).to be_nil
29
+ expect(page.redirect_from).to be_nil
30
+
21
31
  end
22
32
  end
23
33
 
@@ -26,6 +36,7 @@ describe Sunbro::Page do
26
36
  url = "http://www.retailer.com/1.html"
27
37
 
28
38
  page = @http.fetch_page(url)
39
+ expect(page.body).to eq(@body)
29
40
  expect(page.url.to_s).to eq(url)
30
41
  expect(page.redirect_to).to be_nil
31
42
  expect(page.redirect_from).to be_nil
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+ require 'open-uri'
3
+
4
+ describe Sunbro::Settings do
5
+
6
+ before :each do
7
+ @proxy_url = 'http://proxy1.oddwonout.com:8888/'
8
+ Sunbro::Settings.configure {|c| c.proxy_url = @proxy_url}
9
+ end
10
+
11
+ describe '::proxy_url' do
12
+ it 'returns the proxy url' do
13
+ expect(Sunbro::Settings.proxy_url).to eq(@proxy_url)
14
+ end
15
+ end
16
+
17
+ describe '::proxy_host' do
18
+ it 'returns the proxy host' do
19
+ expect(Sunbro::Settings.proxy_host).to eq('proxy1.oddwonout.com')
20
+ end
21
+ end
22
+
23
+ describe '::proxy_port' do
24
+ it 'returns the proxy port' do
25
+ expect(Sunbro::Settings.proxy_port).to eq(8888)
26
+ end
27
+ end
28
+
29
+ end
@@ -1,6 +1,5 @@
1
- require 'rubygems'
2
- require 'bundler/setup'
3
- require 'active_support'
1
+ require 'rspec'
2
+ require 'mocktra'
4
3
  require 'sunbro'
5
4
 
6
5
  RSpec.configure do |config|
@@ -21,9 +21,9 @@ Gem::Specification.new do |spec|
21
21
  spec.add_dependency "nokogiri"
22
22
  spec.add_dependency "capybara"
23
23
  spec.add_dependency "poltergeist"
24
- spec.add_dependency "net-http-persistent"
24
+ spec.add_dependency "rest-client"
25
25
  spec.add_dependency "activesupport"
26
- spec.add_dependency "hashie"
26
+ spec.add_dependency "retryable"
27
27
 
28
28
  spec.add_development_dependency "bundler", "~> 1.5"
29
29
  spec.add_development_dependency "rake"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sunbro
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jon Stokes
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-09 00:00:00.000000000 Z
11
+ date: 2015-03-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -58,7 +58,7 @@ dependencies:
58
58
  - - '>='
59
59
  - !ruby/object:Gem::Version
60
60
  version: '0'
61
- name: net-http-persistent
61
+ name: rest-client
62
62
  prerelease: false
63
63
  type: :runtime
64
64
  version_requirements: !ruby/object:Gem::Requirement
@@ -86,7 +86,7 @@ dependencies:
86
86
  - - '>='
87
87
  - !ruby/object:Gem::Version
88
88
  version: '0'
89
- name: hashie
89
+ name: retryable
90
90
  prerelease: false
91
91
  type: :runtime
92
92
  version_requirements: !ruby/object:Gem::Requirement
@@ -166,10 +166,12 @@ files:
166
166
  - lib/sunbro/connection.rb
167
167
  - lib/sunbro/dynamic_http.rb
168
168
  - lib/sunbro/http.rb
169
+ - lib/sunbro/initialize.rb
169
170
  - lib/sunbro/page.rb
170
171
  - lib/sunbro/settings.rb
171
172
  - lib/sunbro/version.rb
172
173
  - spec/page_spec.rb
174
+ - spec/settings_spec.rb
173
175
  - spec/spec_helper.rb
174
176
  - sunbro.gemspec
175
177
  - tasks/rspec.rake
@@ -193,10 +195,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
193
195
  version: '0'
194
196
  requirements: []
195
197
  rubyforge_project:
196
- rubygems_version: 2.4.5
198
+ rubygems_version: 2.1.9
197
199
  signing_key:
198
200
  specification_version: 4
199
201
  summary: Some code that I use to crawl the web at scale. Shared in the spirit of jolly cooperation.
200
202
  test_files:
201
203
  - spec/page_spec.rb
204
+ - spec/settings_spec.rb
202
205
  - spec/spec_helper.rb