grell 1.6 → 1.6.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2be8992c96b83e9b1a98474ada3b49ea7e5adb69
4
- data.tar.gz: 3eed1bea205812e8e9ab7dc8678da57efea1fea1
3
+ metadata.gz: 0ef86064ca2938505dec12137ac353c08087695c
4
+ data.tar.gz: 22483420d8592db3d8e633c56a1c83b3c812a9ba
5
5
  SHA512:
6
- metadata.gz: baa6e37b2ce80491b05688618b6ad0576149236c2367b2f6c52a84dfeae25edb6d340abfdcae4e3b6f7363072db0dc0c8c052cd83410e1f28e1725305db99993
7
- data.tar.gz: 7c246e8b2a02494d5e44dc6fc4b0029ab254e63764b46791e9135ed9ec1657627d4b6f7e5cd921a951c062cfe815ac1fd7b4e7d87ffb11f786e0989d44c3083a
6
+ metadata.gz: 64c2b2d0b7e1478faed2a44d78e38c0cda20fc2742f558b610cb1a009120928ce3fa7215a5919f4e28554f8c9cc032957a775858dae5ce5870abaa1d847017fb
7
+ data.tar.gz: caf16e853905923720299543fde15318b62187fd21f4600136f135a6386eeef2669674e8eff1074ab5c66d6ed3432adc9e68561768f334b0c89052b246a4de20
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ # 1.6.1
2
+ * Use non-static name to support registering Poltergeist crawler multiple times
3
+ * More exception handling, store redirected URLs in addition to original URL
4
+
1
5
  # 1.6
2
6
  * Support custom URL comparison when adding new pages during crawling
3
7
  * Don't rescue Timeout error, so that Delayed Job can properly terminate hanging jobs
data/grell.gemspec CHANGED
@@ -31,4 +31,5 @@ Gem::Specification.new do |spec|
31
31
  spec.add_development_dependency "webmock", '~> 1.18'
32
32
  spec.add_development_dependency 'rspec', '~> 3.0'
33
33
  spec.add_development_dependency 'puffing-billy', '~> 0.5'
34
+ spec.add_development_dependency 'timecop', '~> 0.8'
34
35
  end
@@ -13,7 +13,13 @@ module Grell
13
13
 
14
14
  def setup_capybara
15
15
  @poltergeist_driver = nil
16
- Capybara.register_driver :poltergeist_crawler do |app|
16
+
17
+ # Capybara will not re-run the block if the driver name already exists, so the driver name
18
+ # will have a time integer appended to ensure uniqueness.
19
+ driver_name = "poltergeist_crawler_#{Time.now.to_i}".to_sym
20
+ Grell.logger.info "GRELL Registering poltergeist driver with name '#{driver_name}'"
21
+
22
+ Capybara.register_driver driver_name do |app|
17
23
  @poltergeist_driver = Capybara::Poltergeist::Driver.new(app, {
18
24
  js_errors: false,
19
25
  inspector: false,
@@ -24,13 +30,13 @@ module Grell
24
30
 
25
31
  Capybara.default_max_wait_time = 3
26
32
  Capybara.run_server = false
27
- Capybara.default_driver = :poltergeist_crawler
33
+ Capybara.default_driver = driver_name
28
34
  page.driver.headers = {
29
35
  "DNT" => 1,
30
36
  "User-Agent" => USER_AGENT
31
37
  }
32
38
 
33
- fail "Poltergeist Driver could not be properly initialized" unless @poltergeist_driver
39
+ raise 'Poltergeist Driver could not be properly initialized' unless @poltergeist_driver
34
40
 
35
41
  @poltergeist_driver
36
42
  end
data/lib/grell/crawler.rb CHANGED
@@ -8,13 +8,13 @@ module Grell
8
8
  # Creates a crawler
9
9
  # options allows :logger to point to an object with the same interface as Logger in the standard library
10
10
  def initialize(options = {})
11
- @driver = CapybaraDriver.setup(options)
12
-
13
11
  if options[:logger]
14
12
  Grell.logger = options[:logger]
15
13
  else
16
14
  Grell.logger = Logger.new(STDOUT)
17
15
  end
16
+
17
+ @driver = CapybaraDriver.setup(options)
18
18
  end
19
19
 
20
20
  # Restarts the PhantomJS process without modifying the state of visited and discovered pages.
@@ -51,12 +51,14 @@ module Grell
51
51
  Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
52
52
  site.navigate
53
53
  filter!(site.links)
54
+ add_redirect_url(site)
54
55
 
55
- if block #The user of this block can send us a :retry to retry accessing the page
56
- while block.call(site) == :retry
56
+ if block # The user of this block can send us a :retry to retry accessing the page
57
+ while crawl_block(block, site) == :retry
57
58
  Grell.logger.info "Retrying our visit to #{site.url}"
58
59
  site.navigate
59
60
  filter!(site.links)
61
+ add_redirect_url(site)
60
62
  end
61
63
  end
62
64
 
@@ -67,6 +69,15 @@ module Grell
67
69
 
68
70
  private
69
71
 
72
+ # Treat any exceptions from the block as an unavailable page
73
+ def crawl_block(block, site)
74
+ block.call(site)
75
+ rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
76
+ Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
77
+ Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
78
+ site.unavailable_page(404, e)
79
+ end
80
+
70
81
  def filter!(links)
71
82
  links.select! { |link| link =~ @whitelist_regexp } if @whitelist_regexp
72
83
  links.delete_if { |link| link =~ @blacklist_regexp } if @blacklist_regexp
@@ -80,6 +91,13 @@ module Grell
80
91
  end
81
92
  end
82
93
 
94
+ # Store the resulting redirected URL along with the original URL
95
+ def add_redirect_url(site)
96
+ if site.url != site.current_url
97
+ @collection.create_page(site.current_url, site.id)
98
+ end
99
+ end
100
+
83
101
  end
84
102
 
85
103
  end
data/lib/grell/page.rb CHANGED
@@ -34,15 +34,9 @@ module Grell
34
34
  @result_page = VisitedPage.new(@rawpage)
35
35
  @timestamp = Time.now
36
36
  @times_visited += 1
37
- rescue Capybara::Poltergeist::JavascriptError => e
38
- unavailable_page(404, e)
39
- rescue Capybara::Poltergeist::BrowserError => e #This may happen internally on Poltergeist, they claim is a bug.
40
- unavailable_page(404, e)
41
- rescue URI::InvalidURIError => e #No cool URL means we report error
42
- unavailable_page(404, e)
43
- rescue Capybara::Poltergeist::TimeoutError => e #Poltergeist has its own timeout which is similar to Chromes.
44
- unavailable_page(404, e)
45
- rescue Capybara::Poltergeist::StatusFailError => e
37
+ rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
38
+ Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
39
+ Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
46
40
  unavailable_page(404, e)
47
41
  end
48
42
 
@@ -73,13 +67,14 @@ module Grell
73
67
  @url
74
68
  end
75
69
 
76
- private
77
70
  def unavailable_page(status, exception)
78
71
  Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}"
79
72
  @result_page = ErroredPage.new(status, exception)
80
73
  @timestamp = Time.now
81
74
  end
82
75
 
76
+ private
77
+
83
78
  # Private class.
84
79
  # This is a result page when it has not been visited yet. Essentially empty of information
85
80
  #
data/lib/grell/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Grell
2
- VERSION = "1.6"
2
+ VERSION = "1.6.1"
3
3
  end
@@ -0,0 +1,32 @@
1
+
2
+ RSpec.describe Grell::CapybaraDriver do
3
+ let(:ts) { Time.now }
4
+
5
+ describe 'setup_capybara' do
6
+ it 'properly registers the poltergeist driver' do
7
+ Timecop.freeze(ts)
8
+ driver = Grell::CapybaraDriver.new.setup_capybara
9
+ expect(driver).to be_instance_of(Capybara::Poltergeist::Driver)
10
+ end
11
+
12
+ it 'raises an exception if the driver cannot be initialized' do
13
+ Timecop.freeze(ts + 60)
14
+
15
+ # Attempt to register twice with the same driver name
16
+ Grell::CapybaraDriver.new.setup_capybara
17
+ expect { Grell::CapybaraDriver.new.setup_capybara }.
18
+ to raise_error "Poltergeist Driver could not be properly initialized"
19
+ end
20
+
21
+ it 'can register the poltergeist driver multiple times in a row' do
22
+ Timecop.freeze(ts + 120)
23
+ driver = Grell::CapybaraDriver.new.setup_capybara
24
+ expect(driver).to be_instance_of(Capybara::Poltergeist::Driver)
25
+ end
26
+
27
+ after do
28
+ Timecop.return
29
+ end
30
+ end
31
+
32
+ end
@@ -36,13 +36,19 @@ RSpec.describe Grell::Crawler do
36
36
 
37
37
  it 'yields the result if a block is given' do
38
38
  result = []
39
- block = Proc.new {|n| result.push(n) }
39
+ block = Proc.new { |n| result.push(n) }
40
40
  crawler.crawl(page, block)
41
41
  expect(result.size).to eq(1)
42
42
  expect(result.first.url).to eq(url)
43
43
  expect(result.first.visited?).to eq(true)
44
44
  end
45
45
 
46
+ it 'rescues any specified exceptions raised during the block execution' do
47
+ block = Proc.new { |n| raise Capybara::Poltergeist::BrowserError, 'Exception' }
48
+ expect{ crawler.crawl(page, block) }.to_not raise_error
49
+ expect(page.status).to eq(404)
50
+ end
51
+
46
52
  it 'logs interesting information' do
47
53
  crawler
48
54
  expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/)
@@ -61,6 +67,13 @@ RSpec.describe Grell::Crawler do
61
67
  crawler.crawl(page, block)
62
68
  expect(counter).to eq(times_retrying)
63
69
  end
70
+
71
+ it 'handles redirects by adding the current_url to the page collection' do
72
+ redirect_url = 'http://www.example.com/test/landing_page'
73
+ allow(page).to receive(:current_url).and_return(redirect_url)
74
+ expect_any_instance_of(Grell::PageCollection).to receive(:create_page).with(redirect_url, page_id)
75
+ crawler.crawl(page, nil)
76
+ end
64
77
  end
65
78
 
66
79
  context '#start_crawling' do
@@ -80,7 +93,7 @@ RSpec.describe Grell::Crawler do
80
93
 
81
94
  it 'calls the block we used to start_crawling' do
82
95
  result = []
83
- block = Proc.new {|n| result.push(n) }
96
+ block = Proc.new { |n| result.push(n) }
84
97
  crawler.start_crawling(url, &block)
85
98
  expect(result.size).to eq(2)
86
99
  expect(result[0].url).to eq(url)
@@ -106,7 +106,8 @@ RSpec.describe Grell::Page do
106
106
  end
107
107
 
108
108
  [ Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::BrowserError, URI::InvalidURIError,
109
- Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError ].each do |error_type|
109
+ Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError,
110
+ Capybara::Poltergeist::DeadClient, Errno::ECONNRESET ].each do |error_type|
110
111
 
111
112
  context "#{error_type}" do
112
113
  let(:headers) do
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'grell'
2
2
  require 'byebug'
3
+ require 'timecop'
3
4
  require 'webmock/rspec'
4
5
  require 'billy/rspec'
5
6
  require 'rack'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: grell
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.6'
4
+ version: 1.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jordi Polo Carres
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-02-02 00:00:00.000000000 Z
11
+ date: 2016-02-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara
@@ -136,6 +136,20 @@ dependencies:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
138
  version: '0.5'
139
+ - !ruby/object:Gem::Dependency
140
+ name: timecop
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '0.8'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '0.8'
139
153
  description: Ruby web crawler using PhantomJS
140
154
  email:
141
155
  - jcarres@mdsol.com
@@ -159,6 +173,7 @@ files:
159
173
  - lib/grell/rawpage.rb
160
174
  - lib/grell/reader.rb
161
175
  - lib/grell/version.rb
176
+ - spec/lib/capybara_driver_spec.rb
162
177
  - spec/lib/crawler_spec.rb
163
178
  - spec/lib/page_collection_spec.rb
164
179
  - spec/lib/page_spec.rb
@@ -184,11 +199,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
184
199
  version: '0'
185
200
  requirements: []
186
201
  rubyforge_project:
187
- rubygems_version: 2.4.8
202
+ rubygems_version: 2.5.1
188
203
  signing_key:
189
204
  specification_version: 4
190
205
  summary: Ruby web crawler
191
206
  test_files:
207
+ - spec/lib/capybara_driver_spec.rb
192
208
  - spec/lib/crawler_spec.rb
193
209
  - spec/lib/page_collection_spec.rb
194
210
  - spec/lib/page_spec.rb