grell 1.6 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/grell.gemspec +1 -0
- data/lib/grell/capybara_driver.rb +9 -3
- data/lib/grell/crawler.rb +22 -4
- data/lib/grell/page.rb +5 -10
- data/lib/grell/version.rb +1 -1
- data/spec/lib/capybara_driver_spec.rb +32 -0
- data/spec/lib/crawler_spec.rb +15 -2
- data/spec/lib/page_spec.rb +2 -1
- data/spec/spec_helper.rb +1 -0
- metadata +19 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ef86064ca2938505dec12137ac353c08087695c
|
4
|
+
data.tar.gz: 22483420d8592db3d8e633c56a1c83b3c812a9ba
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 64c2b2d0b7e1478faed2a44d78e38c0cda20fc2742f558b610cb1a009120928ce3fa7215a5919f4e28554f8c9cc032957a775858dae5ce5870abaa1d847017fb
|
7
|
+
data.tar.gz: caf16e853905923720299543fde15318b62187fd21f4600136f135a6386eeef2669674e8eff1074ab5c66d6ed3432adc9e68561768f334b0c89052b246a4de20
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
# 1.6.1
|
2
|
+
* Use non-static name to support registering Poltergeist crawler multiple times
|
3
|
+
* More exception handling, store redirected URLs in addition to original URL
|
4
|
+
|
1
5
|
# 1.6
|
2
6
|
* Support custom URL comparison when adding new pages during crawling
|
3
7
|
* Don't rescue Timeout error, so that Delayed Job can properly terminate hanging jobs
|
data/grell.gemspec
CHANGED
data/lib/grell/capybara_driver.rb
CHANGED
@@ -13,7 +13,13 @@ module Grell
|
|
13
13
|
|
14
14
|
def setup_capybara
|
15
15
|
@poltergeist_driver = nil
|
16
|
-
|
16
|
+
|
17
|
+
# Capybara will not re-run the block if the driver name already exists, so the driver name
|
18
|
+
# will have a time integer appended to ensure uniqueness.
|
19
|
+
driver_name = "poltergeist_crawler_#{Time.now.to_i}".to_sym
|
20
|
+
Grell.logger.info "GRELL Registering poltergeist driver with name '#{driver_name}'"
|
21
|
+
|
22
|
+
Capybara.register_driver driver_name do |app|
|
17
23
|
@poltergeist_driver = Capybara::Poltergeist::Driver.new(app, {
|
18
24
|
js_errors: false,
|
19
25
|
inspector: false,
|
@@ -24,13 +30,13 @@ module Grell
|
|
24
30
|
|
25
31
|
Capybara.default_max_wait_time = 3
|
26
32
|
Capybara.run_server = false
|
27
|
-
Capybara.default_driver =
|
33
|
+
Capybara.default_driver = driver_name
|
28
34
|
page.driver.headers = {
|
29
35
|
"DNT" => 1,
|
30
36
|
"User-Agent" => USER_AGENT
|
31
37
|
}
|
32
38
|
|
33
|
-
|
39
|
+
raise 'Poltergeist Driver could not be properly initialized' unless @poltergeist_driver
|
34
40
|
|
35
41
|
@poltergeist_driver
|
36
42
|
end
|
data/lib/grell/crawler.rb
CHANGED
@@ -8,13 +8,13 @@ module Grell
|
|
8
8
|
# Creates a crawler
|
9
9
|
# options allows :logger to point to an object with the same interface than Logger in the standard library
|
10
10
|
def initialize(options = {})
|
11
|
-
@driver = CapybaraDriver.setup(options)
|
12
|
-
|
13
11
|
if options[:logger]
|
14
12
|
Grell.logger = options[:logger]
|
15
13
|
else
|
16
14
|
Grell.logger = Logger.new(STDOUT)
|
17
15
|
end
|
16
|
+
|
17
|
+
@driver = CapybaraDriver.setup(options)
|
18
18
|
end
|
19
19
|
|
20
20
|
# Restarts the PhantomJS process without modifying the state of visited and discovered pages.
|
@@ -51,12 +51,14 @@ module Grell
|
|
51
51
|
Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
|
52
52
|
site.navigate
|
53
53
|
filter!(site.links)
|
54
|
+
add_redirect_url(site)
|
54
55
|
|
55
|
-
if block #The user of this block can send us a :retry to retry accessing the page
|
56
|
-
while block.call(site) == :retry
|
56
|
+
if block # The user of this block can send us a :retry to retry accessing the page
|
57
|
+
while crawl_block(block, site) == :retry
|
57
58
|
Grell.logger.info "Retrying our visit to #{site.url}"
|
58
59
|
site.navigate
|
59
60
|
filter!(site.links)
|
61
|
+
add_redirect_url(site)
|
60
62
|
end
|
61
63
|
end
|
62
64
|
|
@@ -67,6 +69,15 @@ module Grell
|
|
67
69
|
|
68
70
|
private
|
69
71
|
|
72
|
+
# Treat any exceptions from the block as an unavailable page
|
73
|
+
def crawl_block(block, site)
|
74
|
+
block.call(site)
|
75
|
+
rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
|
76
|
+
Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
|
77
|
+
Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
|
78
|
+
site.unavailable_page(404, e)
|
79
|
+
end
|
80
|
+
|
70
81
|
def filter!(links)
|
71
82
|
links.select! { |link| link =~ @whitelist_regexp } if @whitelist_regexp
|
72
83
|
links.delete_if { |link| link =~ @blacklist_regexp } if @blacklist_regexp
|
@@ -80,6 +91,13 @@ module Grell
|
|
80
91
|
end
|
81
92
|
end
|
82
93
|
|
94
|
+
# Store the resulting redirected URL along with the original URL
|
95
|
+
def add_redirect_url(site)
|
96
|
+
if site.url != site.current_url
|
97
|
+
@collection.create_page(site.current_url, site.id)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
83
101
|
end
|
84
102
|
|
85
103
|
end
|
data/lib/grell/page.rb
CHANGED
@@ -34,15 +34,9 @@ module Grell
|
|
34
34
|
@result_page = VisitedPage.new(@rawpage)
|
35
35
|
@timestamp = Time.now
|
36
36
|
@times_visited += 1
|
37
|
-
rescue Capybara::Poltergeist::
|
38
|
-
|
39
|
-
|
40
|
-
unavailable_page(404, e)
|
41
|
-
rescue URI::InvalidURIError => e #No cool URL means we report error
|
42
|
-
unavailable_page(404, e)
|
43
|
-
rescue Capybara::Poltergeist::TimeoutError => e #Poltergeist has its own timeout which is similar to Chromes.
|
44
|
-
unavailable_page(404, e)
|
45
|
-
rescue Capybara::Poltergeist::StatusFailError => e
|
37
|
+
rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
|
38
|
+
Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
|
39
|
+
Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
|
46
40
|
unavailable_page(404, e)
|
47
41
|
end
|
48
42
|
|
@@ -73,13 +67,14 @@ module Grell
|
|
73
67
|
@url
|
74
68
|
end
|
75
69
|
|
76
|
-
private
|
77
70
|
def unavailable_page(status, exception)
|
78
71
|
Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}"
|
79
72
|
@result_page = ErroredPage.new(status, exception)
|
80
73
|
@timestamp = Time.now
|
81
74
|
end
|
82
75
|
|
76
|
+
private
|
77
|
+
|
83
78
|
# Private class.
|
84
79
|
# This is a result page when it has not been visited yet. Essentially empty of information
|
85
80
|
#
|
data/lib/grell/version.rb
CHANGED
data/spec/lib/capybara_driver_spec.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
|
2
|
+
RSpec.describe Grell::CapybaraDriver do
|
3
|
+
let(:ts) { Time.now }
|
4
|
+
|
5
|
+
describe 'setup_capybara' do
|
6
|
+
it 'properly registers the poltergeist driver' do
|
7
|
+
Timecop.freeze(ts)
|
8
|
+
driver = Grell::CapybaraDriver.new.setup_capybara
|
9
|
+
expect(driver).to be_instance_of(Capybara::Poltergeist::Driver)
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'raises an exception if the driver cannot be initialized' do
|
13
|
+
Timecop.freeze(ts + 60)
|
14
|
+
|
15
|
+
# Attempt to register twice with the same driver name
|
16
|
+
Grell::CapybaraDriver.new.setup_capybara
|
17
|
+
expect { Grell::CapybaraDriver.new.setup_capybara }.
|
18
|
+
to raise_error "Poltergeist Driver could not be properly initialized"
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'can register the poltergeist driver multiple times in a row' do
|
22
|
+
Timecop.freeze(ts + 120)
|
23
|
+
driver = Grell::CapybaraDriver.new.setup_capybara
|
24
|
+
expect(driver).to be_instance_of(Capybara::Poltergeist::Driver)
|
25
|
+
end
|
26
|
+
|
27
|
+
after do
|
28
|
+
Timecop.return
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
data/spec/lib/crawler_spec.rb
CHANGED
@@ -36,13 +36,19 @@ RSpec.describe Grell::Crawler do
|
|
36
36
|
|
37
37
|
it 'yields the result if a block is given' do
|
38
38
|
result = []
|
39
|
-
block = Proc.new {|n| result.push(n) }
|
39
|
+
block = Proc.new { |n| result.push(n) }
|
40
40
|
crawler.crawl(page, block)
|
41
41
|
expect(result.size).to eq(1)
|
42
42
|
expect(result.first.url).to eq(url)
|
43
43
|
expect(result.first.visited?).to eq(true)
|
44
44
|
end
|
45
45
|
|
46
|
+
it 'rescues any specified exceptions raised during the block execution' do
|
47
|
+
block = Proc.new { |n| raise Capybara::Poltergeist::BrowserError, 'Exception' }
|
48
|
+
expect{ crawler.crawl(page, block) }.to_not raise_error
|
49
|
+
expect(page.status).to eq(404)
|
50
|
+
end
|
51
|
+
|
46
52
|
it 'logs interesting information' do
|
47
53
|
crawler
|
48
54
|
expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/)
|
@@ -61,6 +67,13 @@ RSpec.describe Grell::Crawler do
|
|
61
67
|
crawler.crawl(page, block)
|
62
68
|
expect(counter).to eq(times_retrying)
|
63
69
|
end
|
70
|
+
|
71
|
+
it 'handles redirects by adding the current_url to the page collection' do
|
72
|
+
redirect_url = 'http://www.example.com/test/landing_page'
|
73
|
+
allow(page).to receive(:current_url).and_return(redirect_url)
|
74
|
+
expect_any_instance_of(Grell::PageCollection).to receive(:create_page).with(redirect_url, page_id)
|
75
|
+
crawler.crawl(page, nil)
|
76
|
+
end
|
64
77
|
end
|
65
78
|
|
66
79
|
context '#start_crawling' do
|
@@ -80,7 +93,7 @@ RSpec.describe Grell::Crawler do
|
|
80
93
|
|
81
94
|
it 'calls the block we used to start_crawling' do
|
82
95
|
result = []
|
83
|
-
block = Proc.new {|n| result.push(n) }
|
96
|
+
block = Proc.new { |n| result.push(n) }
|
84
97
|
crawler.start_crawling(url, &block)
|
85
98
|
expect(result.size).to eq(2)
|
86
99
|
expect(result[0].url).to eq(url)
|
data/spec/lib/page_spec.rb
CHANGED
@@ -106,7 +106,8 @@ RSpec.describe Grell::Page do
|
|
106
106
|
end
|
107
107
|
|
108
108
|
[ Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::BrowserError, URI::InvalidURIError,
|
109
|
-
Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError ].each do |error_type|
|
109
|
+
Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError,
|
110
|
+
Capybara::Poltergeist::DeadClient, Errno::ECONNRESET ].each do |error_type|
|
110
111
|
|
111
112
|
context "#{error_type}" do
|
112
113
|
let(:headers) do
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jordi Polo Carres
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-02-
|
11
|
+
date: 2016-02-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|
@@ -136,6 +136,20 @@ dependencies:
|
|
136
136
|
- - "~>"
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: '0.5'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: timecop
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - "~>"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0.8'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0.8'
|
139
153
|
description: Ruby web crawler using PhantomJS
|
140
154
|
email:
|
141
155
|
- jcarres@mdsol.com
|
@@ -159,6 +173,7 @@ files:
|
|
159
173
|
- lib/grell/rawpage.rb
|
160
174
|
- lib/grell/reader.rb
|
161
175
|
- lib/grell/version.rb
|
176
|
+
- spec/lib/capybara_driver_spec.rb
|
162
177
|
- spec/lib/crawler_spec.rb
|
163
178
|
- spec/lib/page_collection_spec.rb
|
164
179
|
- spec/lib/page_spec.rb
|
@@ -184,11 +199,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
184
199
|
version: '0'
|
185
200
|
requirements: []
|
186
201
|
rubyforge_project:
|
187
|
-
rubygems_version: 2.
|
202
|
+
rubygems_version: 2.5.1
|
188
203
|
signing_key:
|
189
204
|
specification_version: 4
|
190
205
|
summary: Ruby web crawler
|
191
206
|
test_files:
|
207
|
+
- spec/lib/capybara_driver_spec.rb
|
192
208
|
- spec/lib/crawler_spec.rb
|
193
209
|
- spec/lib/page_collection_spec.rb
|
194
210
|
- spec/lib/page_spec.rb
|