grell 1.6 → 1.6.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/grell.gemspec +1 -0
- data/lib/grell/capybara_driver.rb +9 -3
- data/lib/grell/crawler.rb +22 -4
- data/lib/grell/page.rb +5 -10
- data/lib/grell/version.rb +1 -1
- data/spec/lib/capybara_driver_spec.rb +32 -0
- data/spec/lib/crawler_spec.rb +15 -2
- data/spec/lib/page_spec.rb +2 -1
- data/spec/spec_helper.rb +1 -0
- metadata +19 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0ef86064ca2938505dec12137ac353c08087695c
+  data.tar.gz: 22483420d8592db3d8e633c56a1c83b3c812a9ba
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 64c2b2d0b7e1478faed2a44d78e38c0cda20fc2742f558b610cb1a009120928ce3fa7215a5919f4e28554f8c9cc032957a775858dae5ce5870abaa1d847017fb
+  data.tar.gz: caf16e853905923720299543fde15318b62187fd21f4600136f135a6386eeef2669674e8eff1074ab5c66d6ed3432adc9e68561768f334b0c89052b246a4de20
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
+# 1.6.1
+* Use non-static name to support registering Poltergeist crawler multiple times
+* More exception handling, store redirected URLs in addition to original URL
+
 # 1.6
 * Support custom URL comparison when adding new pages during crawling
 * Don't rescue Timeout error, so that Delayed Job can properly terminate hanging jobs
data/grell.gemspec
CHANGED
data/lib/grell/capybara_driver.rb
CHANGED
@@ -13,7 +13,13 @@ module Grell
 
     def setup_capybara
       @poltergeist_driver = nil
-
+
+      # Capybara will not re-run the block if the driver name already exists, so the driver name
+      # will have a time integer appended to ensure uniqueness.
+      driver_name = "poltergeist_crawler_#{Time.now.to_i}".to_sym
+      Grell.logger.info "GRELL Registering poltergeist driver with name '#{driver_name}'"
+
+      Capybara.register_driver driver_name do |app|
         @poltergeist_driver = Capybara::Poltergeist::Driver.new(app, {
           js_errors: false,
           inspector: false,
@@ -24,13 +30,13 @@ module Grell
 
       Capybara.default_max_wait_time = 3
       Capybara.run_server = false
-      Capybara.default_driver =
+      Capybara.default_driver = driver_name
       page.driver.headers = {
         "DNT" => 1,
         "User-Agent" => USER_AGENT
       }
 
-
+      raise 'Poltergeist Driver could not be properly initialized' unless @poltergeist_driver
 
       @poltergeist_driver
     end
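The registered driver name now embeds Time.now.to_i, so two setups inside the same clock second still collide (the new capybara_driver_spec.rb later in this diff relies on exactly that with Timecop), while setups in different seconds get distinct names. A minimal sketch of what this enables, assuming only the entry points visible elsewhere in this diff (Grell::Crawler.new calls CapybaraDriver.setup, which runs setup_capybara):

    require 'grell'

    first  = Grell::Crawler.new   # registers something like :poltergeist_crawler_1455528000
    sleep 1                       # let Time.now.to_i advance so the next name is unique
    second = Grell::Crawler.new   # registers a second, independently named Poltergeist driver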
data/lib/grell/crawler.rb
CHANGED
@@ -8,13 +8,13 @@ module Grell
     # Creates a crawler
     # options allows :logger to point to an object with the same interface than Logger in the standard library
     def initialize(options = {})
-      @driver = CapybaraDriver.setup(options)
-
       if options[:logger]
         Grell.logger = options[:logger]
       else
         Grell.logger = Logger.new(STDOUT)
       end
+
+      @driver = CapybaraDriver.setup(options)
     end
 
     # Restarts the PhantomJS process without modifying the state of visited and discovered pages.
@@ -51,12 +51,14 @@ module Grell
       Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
       site.navigate
       filter!(site.links)
+      add_redirect_url(site)
 
-      if block #The user of this block can send us a :retry to retry accessing the page
-        while block
+      if block # The user of this block can send us a :retry to retry accessing the page
+        while crawl_block(block, site) == :retry
           Grell.logger.info "Retrying our visit to #{site.url}"
           site.navigate
           filter!(site.links)
+          add_redirect_url(site)
         end
       end
 
@@ -67,6 +69,15 @@ module Grell
 
     private
 
+    # Treat any exceptions from the block as an unavailable page
+    def crawl_block(block, site)
+      block.call(site)
+    rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
+           Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
+           Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
+      site.unavailable_page(404, e)
+    end
+
     def filter!(links)
       links.select! { |link| link =~ @whitelist_regexp } if @whitelist_regexp
       links.delete_if { |link| link =~ @blacklist_regexp } if @blacklist_regexp
@@ -80,6 +91,13 @@ module Grell
       end
     end
 
+    # Store the resulting redirected URL along with the original URL
+    def add_redirect_url(site)
+      if site.url != site.current_url
+        @collection.create_page(site.current_url, site.id)
+      end
+    end
+
   end
 
 end
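Taken together, the crawl loop now retries while the caller's block returns :retry, records the post-redirect current_url as an extra page in the collection, and downgrades the listed Poltergeist and network exceptions raised inside the block to an unavailable (404) page instead of aborting the crawl. A hedged usage sketch, assuming only the block protocol shown in this diff and in the specs (page.url, page.status, and returning :retry from the block):

    require 'grell'

    crawler = Grell::Crawler.new
    retried = {}

    crawler.start_crawling('http://www.example.com') do |page|
      # Ask Grell to visit a failed page one more time before giving up on it.
      if page.status == 404 && !retried[page.url]
        retried[page.url] = true
        next :retry
      end
      puts "#{page.url} returned #{page.status}"
    end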
data/lib/grell/page.rb
CHANGED
@@ -34,15 +34,9 @@ module Grell
       @result_page = VisitedPage.new(@rawpage)
       @timestamp = Time.now
       @times_visited += 1
-    rescue Capybara::Poltergeist::
-
-
-      unavailable_page(404, e)
-    rescue URI::InvalidURIError => e #No cool URL means we report error
-      unavailable_page(404, e)
-    rescue Capybara::Poltergeist::TimeoutError => e #Poltergeist has its own timeout which is similar to Chromes.
-      unavailable_page(404, e)
-    rescue Capybara::Poltergeist::StatusFailError => e
+    rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
+           Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
+           Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
       unavailable_page(404, e)
     end
 
@@ -73,13 +67,14 @@ module Grell
       @url
     end
 
-    private
     def unavailable_page(status, exception)
       Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}"
       @result_page = ErroredPage.new(status, exception)
       @timestamp = Time.now
     end
 
+    private
+
     # Private class.
     # This is a result page when it has not been visited yet. Essentially empty of information
     #
data/lib/grell/version.rb
CHANGED
data/spec/lib/capybara_driver_spec.rb
ADDED
@@ -0,0 +1,32 @@
+
+RSpec.describe Grell::CapybaraDriver do
+  let(:ts) { Time.now }
+
+  describe 'setup_capybara' do
+    it 'properly registers the poltergeist driver' do
+      Timecop.freeze(ts)
+      driver = Grell::CapybaraDriver.new.setup_capybara
+      expect(driver).to be_instance_of(Capybara::Poltergeist::Driver)
+    end
+
+    it 'raises an exception if the driver cannot be initialized' do
+      Timecop.freeze(ts + 60)
+
+      # Attempt to register twice with the same driver name
+      Grell::CapybaraDriver.new.setup_capybara
+      expect { Grell::CapybaraDriver.new.setup_capybara }.
+        to raise_error "Poltergeist Driver could not be properly initialized"
+    end
+
+    it 'can register the poltergeist driver multiple times in a row' do
+      Timecop.freeze(ts + 120)
+      driver = Grell::CapybaraDriver.new.setup_capybara
+      expect(driver).to be_instance_of(Capybara::Poltergeist::Driver)
+    end
+
+    after do
+      Timecop.return
+    end
+  end
+
+end
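The Timecop calls above pin Time.now.to_i, so the generated driver names collide within one frozen second and differ across the ts, ts + 60 and ts + 120 freezes. A small illustration of the collision the middle example provokes (an assumption-level sketch, not part of the gem):

    require 'timecop'

    Timecop.freeze(Time.now) do
      name_a = "poltergeist_crawler_#{Time.now.to_i}".to_sym
      name_b = "poltergeist_crawler_#{Time.now.to_i}".to_sym
      name_a == name_b  # => true, so the second registration is skipped and the guard clause raises
    end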
data/spec/lib/crawler_spec.rb
CHANGED
@@ -36,13 +36,19 @@ RSpec.describe Grell::Crawler do
 
     it 'yields the result if a block is given' do
       result = []
-      block = Proc.new {|n| result.push(n) }
+      block = Proc.new { |n| result.push(n) }
       crawler.crawl(page, block)
       expect(result.size).to eq(1)
       expect(result.first.url).to eq(url)
       expect(result.first.visited?).to eq(true)
     end
 
+    it 'rescues any specified exceptions raised during the block execution' do
+      block = Proc.new { |n| raise Capybara::Poltergeist::BrowserError, 'Exception' }
+      expect{ crawler.crawl(page, block) }.to_not raise_error
+      expect(page.status).to eq(404)
+    end
+
     it 'logs interesting information' do
       crawler
       expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/)
@@ -61,6 +67,13 @@ RSpec.describe Grell::Crawler do
       crawler.crawl(page, block)
       expect(counter).to eq(times_retrying)
     end
+
+    it 'handles redirects by adding the current_url to the page collection' do
+      redirect_url = 'http://www.example.com/test/landing_page'
+      allow(page).to receive(:current_url).and_return(redirect_url)
+      expect_any_instance_of(Grell::PageCollection).to receive(:create_page).with(redirect_url, page_id)
+      crawler.crawl(page, nil)
+    end
   end
 
   context '#start_crawling' do
@@ -80,7 +93,7 @@ RSpec.describe Grell::Crawler do
 
     it 'calls the block we used to start_crawling' do
       result = []
-      block = Proc.new {|n| result.push(n) }
+      block = Proc.new { |n| result.push(n) }
       crawler.start_crawling(url, &block)
       expect(result.size).to eq(2)
       expect(result[0].url).to eq(url)
data/spec/lib/page_spec.rb
CHANGED
@@ -106,7 +106,8 @@ RSpec.describe Grell::Page do
     end
 
     [ Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::BrowserError, URI::InvalidURIError,
-      Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError
+      Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError,
+      Capybara::Poltergeist::DeadClient, Errno::ECONNRESET ].each do |error_type|
 
       context "#{error_type}" do
         let(:headers) do
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: grell
 version: !ruby/object:Gem::Version
-  version:
+  version: 1.6.1
 platform: ruby
 authors:
 - Jordi Polo Carres
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-02-
+date: 2016-02-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: capybara
@@ -136,6 +136,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '0.5'
+- !ruby/object:Gem::Dependency
+  name: timecop
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.8'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.8'
 description: Ruby web crawler using PhantomJS
 email:
 - jcarres@mdsol.com
@@ -159,6 +173,7 @@ files:
 - lib/grell/rawpage.rb
 - lib/grell/reader.rb
 - lib/grell/version.rb
+- spec/lib/capybara_driver_spec.rb
 - spec/lib/crawler_spec.rb
 - spec/lib/page_collection_spec.rb
 - spec/lib/page_spec.rb
@@ -184,11 +199,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.5.1
 signing_key:
 specification_version: 4
 summary: Ruby web crawler
 test_files:
+- spec/lib/capybara_driver_spec.rb
 - spec/lib/crawler_spec.rb
 - spec/lib/page_collection_spec.rb
 - spec/lib/page_spec.rb