grell 1.6 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 2be8992c96b83e9b1a98474ada3b49ea7e5adb69
- data.tar.gz: 3eed1bea205812e8e9ab7dc8678da57efea1fea1
+ metadata.gz: 0ef86064ca2938505dec12137ac353c08087695c
+ data.tar.gz: 22483420d8592db3d8e633c56a1c83b3c812a9ba
  SHA512:
- metadata.gz: baa6e37b2ce80491b05688618b6ad0576149236c2367b2f6c52a84dfeae25edb6d340abfdcae4e3b6f7363072db0dc0c8c052cd83410e1f28e1725305db99993
- data.tar.gz: 7c246e8b2a02494d5e44dc6fc4b0029ab254e63764b46791e9135ed9ec1657627d4b6f7e5cd921a951c062cfe815ac1fd7b4e7d87ffb11f786e0989d44c3083a
+ metadata.gz: 64c2b2d0b7e1478faed2a44d78e38c0cda20fc2742f558b610cb1a009120928ce3fa7215a5919f4e28554f8c9cc032957a775858dae5ce5870abaa1d847017fb
+ data.tar.gz: caf16e853905923720299543fde15318b62187fd21f4600136f135a6386eeef2669674e8eff1074ab5c66d6ed3432adc9e68561768f334b0c89052b246a4de20
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ # 1.6.1
+ * Use non-static name to support registering Poltergeist crawler multiple times
+ * More exception handling, store redirected URLs in addition to original URL
+
  # 1.6
  * Support custom URL comparison when adding new pages during crawling
  * Don't rescue Timeout error, so that Delayed Job can properly terminate hanging jobs
data/grell.gemspec CHANGED
@@ -31,4 +31,5 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency "webmock", '~> 1.18'
  spec.add_development_dependency 'rspec', '~> 3.0'
  spec.add_development_dependency 'puffing-billy', '~> 0.5'
+ spec.add_development_dependency 'timecop', '~> 0.8'
  end
data/lib/grell/capybara_driver.rb CHANGED
@@ -13,7 +13,13 @@ module Grell
 
  def setup_capybara
  @poltergeist_driver = nil
- Capybara.register_driver :poltergeist_crawler do |app|
+
+ # Capybara will not re-run the block if the driver name already exists, so the driver name
+ # will have a time integer appended to ensure uniqueness.
+ driver_name = "poltergeist_crawler_#{Time.now.to_i}".to_sym
+ Grell.logger.info "GRELL Registering poltergeist driver with name '#{driver_name}'"
+
+ Capybara.register_driver driver_name do |app|
  @poltergeist_driver = Capybara::Poltergeist::Driver.new(app, {
  js_errors: false,
  inspector: false,
@@ -24,13 +30,13 @@ module Grell
 
  Capybara.default_max_wait_time = 3
  Capybara.run_server = false
- Capybara.default_driver = :poltergeist_crawler
+ Capybara.default_driver = driver_name
  page.driver.headers = {
  "DNT" => 1,
  "User-Agent" => USER_AGENT
  }
 
- fail "Poltergeist Driver could not be properly initialized" unless @poltergeist_driver
+ raise 'Poltergeist Driver could not be properly initialized' unless @poltergeist_driver
 
  @poltergeist_driver
  end
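The hunk above switches from the fixed driver name `:poltergeist_crawler` to a timestamped one, so `setup_capybara` can be called more than once per process. A minimal standalone sketch of the same pattern, assuming only the `capybara` and `capybara/poltergeist` gems; the variable names are illustrative, not the gem's code:

```ruby
require 'capybara'
require 'capybara/poltergeist'

poltergeist_driver = nil
driver_name = "poltergeist_crawler_#{Time.now.to_i}".to_sym

# Per the gem's comment, Capybara does not re-run the block for a driver name
# that already exists, so a repeat registration under the old fixed name would
# leave poltergeist_driver nil. A fresh timestamped name avoids that.
Capybara.register_driver(driver_name) do |app|
  poltergeist_driver = Capybara::Poltergeist::Driver.new(app, js_errors: false, inspector: false)
end

Capybara.run_server = false
Capybara.default_driver = driver_name
Capybara.current_session.driver # touching the session driver runs the registration block
raise 'Poltergeist Driver could not be properly initialized' unless poltergeist_driver
```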
data/lib/grell/crawler.rb CHANGED
@@ -8,13 +8,13 @@ module Grell
  # Creates a crawler
  # options allows :logger to point to an object with the same interface than Logger in the standard library
  def initialize(options = {})
- @driver = CapybaraDriver.setup(options)
-
  if options[:logger]
  Grell.logger = options[:logger]
  else
  Grell.logger = Logger.new(STDOUT)
  end
+
+ @driver = CapybaraDriver.setup(options)
  end
 
  # Restarts the PhantomJS process without modifying the state of visited and discovered pages.
@@ -51,12 +51,14 @@ module Grell
  Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
  site.navigate
  filter!(site.links)
+ add_redirect_url(site)
 
- if block #The user of this block can send us a :retry to retry accessing the page
- while block.call(site) == :retry
+ if block # The user of this block can send us a :retry to retry accessing the page
+ while crawl_block(block, site) == :retry
  Grell.logger.info "Retrying our visit to #{site.url}"
  site.navigate
  filter!(site.links)
+ add_redirect_url(site)
  end
  end
 
@@ -67,6 +69,15 @@ module Grell
 
  private
 
+ # Treat any exceptions from the block as an unavailable page
+ def crawl_block(block, site)
+ block.call(site)
+ rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
+ Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
+ Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
+ site.unavailable_page(404, e)
+ end
+
  def filter!(links)
  links.select! { |link| link =~ @whitelist_regexp } if @whitelist_regexp
  links.delete_if { |link| link =~ @blacklist_regexp } if @blacklist_regexp
@@ -80,6 +91,13 @@ module Grell
  end
  end
 
+ # Store the resulting redirected URL along with the original URL
+ def add_redirect_url(site)
+ if site.url != site.current_url
+ @collection.create_page(site.current_url, site.id)
+ end
+ end
+
  end
 
  end
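In short, 1.6.1 wraps the user-supplied block in `crawl_block`, so the listed Poltergeist and network errors raised inside it mark the page as unavailable (status 404) instead of aborting the crawl, and `add_redirect_url` records the post-redirect URL alongside the originally discovered one. A hedged usage sketch of the crawler API shown above; the URL and retry logic are illustrative, not part of the gem:

```ruby
require 'grell'
require 'logger'

crawler = Grell::Crawler.new(logger: Logger.new(STDOUT))

retries = 0
crawler.start_crawling('http://www.example.com') do |page|
  # Returning :retry asks the crawler to navigate to the same page again.
  next :retry if page.status == 404 && (retries += 1) <= 3

  # Exceptions raised here (for the error classes rescued in crawl_block)
  # no longer abort the crawl; the page is reported with status 404 instead.
  puts "#{page.url} -> #{page.status}"
end
```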
data/lib/grell/page.rb CHANGED
@@ -34,15 +34,9 @@ module Grell
  @result_page = VisitedPage.new(@rawpage)
  @timestamp = Time.now
  @times_visited += 1
- rescue Capybara::Poltergeist::JavascriptError => e
- unavailable_page(404, e)
- rescue Capybara::Poltergeist::BrowserError => e #This may happen internally on Poltergeist, they claim is a bug.
- unavailable_page(404, e)
- rescue URI::InvalidURIError => e #No cool URL means we report error
- unavailable_page(404, e)
- rescue Capybara::Poltergeist::TimeoutError => e #Poltergeist has its own timeout which is similar to Chromes.
- unavailable_page(404, e)
- rescue Capybara::Poltergeist::StatusFailError => e
+ rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
+ Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
+ Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
  unavailable_page(404, e)
  end
 
@@ -73,13 +67,14 @@ module Grell
  @url
  end
 
- private
  def unavailable_page(status, exception)
  Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}"
  @result_page = ErroredPage.new(status, exception)
  @timestamp = Time.now
  end
 
+ private
+
  # Private class.
  # This is a result page when it has not been visited yet. Essentially empty of information
  #
data/lib/grell/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Grell
- VERSION = "1.6"
+ VERSION = "1.6.1"
  end
data/spec/lib/capybara_driver_spec.rb ADDED
@@ -0,0 +1,32 @@
+
+ RSpec.describe Grell::CapybaraDriver do
+ let(:ts) { Time.now }
+
+ describe 'setup_capybara' do
+ it 'properly registers the poltergeist driver' do
+ Timecop.freeze(ts)
+ driver = Grell::CapybaraDriver.new.setup_capybara
+ expect(driver).to be_instance_of(Capybara::Poltergeist::Driver)
+ end
+
+ it 'raises an exception if the driver cannot be initialized' do
+ Timecop.freeze(ts + 60)
+
+ # Attempt to register twice with the same driver name
+ Grell::CapybaraDriver.new.setup_capybara
+ expect { Grell::CapybaraDriver.new.setup_capybara }.
+ to raise_error "Poltergeist Driver could not be properly initialized"
+ end
+
+ it 'can register the poltergeist driver multiple times in a row' do
+ Timecop.freeze(ts + 120)
+ driver = Grell::CapybaraDriver.new.setup_capybara
+ expect(driver).to be_instance_of(Capybara::Poltergeist::Driver)
+ end
+
+ after do
+ Timecop.return
+ end
+ end
+
+ end
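The new spec leans on Timecop because the driver name embeds `Time.now.to_i`: two `setup_capybara` calls within the same (frozen) second collide on the name, while calls at different frozen instants each get a unique name. A small standalone sketch of that interaction, assuming only the `timecop` gem:

```ruby
require 'timecop'

Timecop.freeze(Time.now) do
  first  = "poltergeist_crawler_#{Time.now.to_i}".to_sym
  second = "poltergeist_crawler_#{Time.now.to_i}".to_sym
  puts first == second   # => true, so a second registration would collide
end

Timecop.freeze(Time.now + 60) do
  puts "poltergeist_crawler_#{Time.now.to_i}"  # a different, unique name
end
```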
data/spec/lib/crawler_spec.rb CHANGED
@@ -36,13 +36,19 @@ RSpec.describe Grell::Crawler do
 
  it 'yields the result if a block is given' do
  result = []
- block = Proc.new {|n| result.push(n) }
+ block = Proc.new { |n| result.push(n) }
  crawler.crawl(page, block)
  expect(result.size).to eq(1)
  expect(result.first.url).to eq(url)
  expect(result.first.visited?).to eq(true)
  end
 
+ it 'rescues any specified exceptions raised during the block execution' do
+ block = Proc.new { |n| raise Capybara::Poltergeist::BrowserError, 'Exception' }
+ expect{ crawler.crawl(page, block) }.to_not raise_error
+ expect(page.status).to eq(404)
+ end
+
  it 'logs interesting information' do
  crawler
  expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/)
@@ -61,6 +67,13 @@ RSpec.describe Grell::Crawler do
  crawler.crawl(page, block)
  expect(counter).to eq(times_retrying)
  end
+
+ it 'handles redirects by adding the current_url to the page collection' do
+ redirect_url = 'http://www.example.com/test/landing_page'
+ allow(page).to receive(:current_url).and_return(redirect_url)
+ expect_any_instance_of(Grell::PageCollection).to receive(:create_page).with(redirect_url, page_id)
+ crawler.crawl(page, nil)
+ end
  end
 
  context '#start_crawling' do
@@ -80,7 +93,7 @@
 
  it 'calls the block we used to start_crawling' do
  result = []
- block = Proc.new {|n| result.push(n) }
+ block = Proc.new { |n| result.push(n) }
  crawler.start_crawling(url, &block)
  expect(result.size).to eq(2)
  expect(result[0].url).to eq(url)
data/spec/lib/page_spec.rb CHANGED
@@ -106,7 +106,8 @@ RSpec.describe Grell::Page do
  end
 
  [ Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::BrowserError, URI::InvalidURIError,
- Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError ].each do |error_type|
+ Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError,
+ Capybara::Poltergeist::DeadClient, Errno::ECONNRESET ].each do |error_type|
 
  context "#{error_type}" do
  let(:headers) do
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,6 @@
  require 'grell'
  require 'byebug'
+ require 'timecop'
  require 'webmock/rspec'
  require 'billy/rspec'
  require 'rack'
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: grell
  version: !ruby/object:Gem::Version
- version: '1.6'
+ version: 1.6.1
  platform: ruby
  authors:
  - Jordi Polo Carres
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-02-02 00:00:00.000000000 Z
+ date: 2016-02-15 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: capybara
@@ -136,6 +136,20 @@ dependencies:
  - - "~>"
  - !ruby/object:Gem::Version
  version: '0.5'
+ - !ruby/object:Gem::Dependency
+ name: timecop
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '0.8'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '0.8'
  description: Ruby web crawler using PhantomJS
  email:
  - jcarres@mdsol.com
@@ -159,6 +173,7 @@ files:
  - lib/grell/rawpage.rb
  - lib/grell/reader.rb
  - lib/grell/version.rb
+ - spec/lib/capybara_driver_spec.rb
  - spec/lib/crawler_spec.rb
  - spec/lib/page_collection_spec.rb
  - spec/lib/page_spec.rb
@@ -184,11 +199,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.4.8
+ rubygems_version: 2.5.1
  signing_key:
  specification_version: 4
  summary: Ruby web crawler
  test_files:
+ - spec/lib/capybara_driver_spec.rb
  - spec/lib/crawler_spec.rb
  - spec/lib/page_collection_spec.rb
  - spec/lib/page_spec.rb