grell 1.3.2 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +23 -0
- data/lib/grell/capybara_driver.rb +3 -1
- data/lib/grell/crawler.rb +20 -4
- data/lib/grell/page.rb +7 -0
- data/lib/grell/version.rb +1 -1
- data/spec/lib/crawler_spec.rb +14 -1
- data/spec/lib/page_spec.rb +29 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 66dc6a02c6a3ff7c79e01fd7e84c21644ef5eb3a
|
4
|
+
data.tar.gz: 8ee3c5702299dfe64c2bd2732d2795014080e0bf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6c191dd4ec5a994d2963d5c7db5f0ebb6f36d0532a05ed02e4153c6d04fb9e6022f3a63065da6e07208e0389c2f8f0e7c008ab763c2a76bd986c13407dcd2740
|
7
|
+
data.tar.gz: 3b05002063f76bbc0916684c352554eddf75ce88c6c294c067fc91e4973a1cb10a7f2465ea874d1aa8c162ee3a6612d9bb5289b7489edab9f0dcfc5ae79dc463
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -51,6 +51,28 @@ end
|
|
51
51
|
Grell keeps a list of pages previously crawled and does not visit the same page twice.
|
52
52
|
This list is indexed by the complete url, including query parameters.
|
53
53
|
|
54
|
+
### Re-retrieving a page
|
55
|
+
If you want Grell to revisit a page and return the data to you again,
|
56
|
+
return the symbol :retry from the block you pass to the start_crawling method.
|
57
|
+
For instance
|
58
|
+
```ruby
|
59
|
+
require 'grell'
|
60
|
+
crawler = Grell::Crawler.new
|
61
|
+
crawler.start_crawling('http://www.google.com') do |current_page|
|
62
|
+
if current_page.status == 500 && current_page.retries == 0
|
63
|
+
crawler.restart
|
64
|
+
:retry
|
65
|
+
end
|
66
|
+
end
|
67
|
+
```
|
68
|
+
|
69
|
+
### Restarting PhantomJS
|
70
|
+
If you are running a long crawl, it is possible that PhantomJS starts failing.
|
71
|
+
To avoid that, you can restart it by calling "restart" on the crawler.
|
72
|
+
That will kill the PhantomJS process and start a new one. Grell will keep the status of
|
73
|
+
pages already visited and pages discovered but not yet visited, and will keep crawling
|
74
|
+
with the new PhantomJS process instead of the old one.
|
75
|
+
|
54
76
|
### Selecting links to follow
|
55
77
|
|
56
78
|
Grell by default will follow all the links it finds going to the site
|
@@ -58,6 +80,7 @@ your are crawling. It will never follow links linking outside your site.
|
|
58
80
|
If you want to further limit the amount of links crawled, you can use
|
59
81
|
whitelisting, blacklisting or manual filtering.
|
60
82
|
|
83
|
+
|
61
84
|
#### Whitelisting
|
62
85
|
|
63
86
|
```ruby
|
@@ -12,8 +12,9 @@ module Grell
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def setup_capybara
|
15
|
+
@poltergeist_driver = nil
|
15
16
|
Capybara.register_driver :poltergeist_crawler do |app|
|
16
|
-
Capybara::Poltergeist::Driver.new(app, {
|
17
|
+
@poltergeist_driver = Capybara::Poltergeist::Driver.new(app, {
|
17
18
|
js_errors: false,
|
18
19
|
inspector: false,
|
19
20
|
phantomjs_logger: open('/dev/null'),
|
@@ -28,6 +29,7 @@ module Grell
|
|
28
29
|
"DNT" => 1,
|
29
30
|
"User-Agent" => USER_AGENT
|
30
31
|
}
|
32
|
+
@poltergeist_driver
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
data/lib/grell/crawler.rb
CHANGED
@@ -5,8 +5,10 @@ module Grell
|
|
5
5
|
class Crawler
|
6
6
|
attr_reader :collection
|
7
7
|
|
8
|
+
# Creates a crawler
|
9
|
+
# options accepts :logger, which must point to an object with the same interface as Logger in the standard library
|
8
10
|
def initialize(options = {})
|
9
|
-
CapybaraDriver.setup(options)
|
11
|
+
@driver = CapybaraDriver.setup(options)
|
10
12
|
|
11
13
|
if options[:logger]
|
12
14
|
Grell.logger = options[:logger]
|
@@ -17,15 +19,24 @@ module Grell
|
|
17
19
|
@collection = PageCollection.new
|
18
20
|
end
|
19
21
|
|
22
|
+
# Restarts the PhantomJS process without modifying the state of visited and discovered pages.
|
23
|
+
def restart
|
24
|
+
Grell.logger.info "GRELL is restarting"
|
25
|
+
@driver.restart
|
26
|
+
Grell.logger.info "GRELL has restarted"
|
27
|
+
end
|
28
|
+
|
29
|
+
# Sets up a whitelist filter; accepts a regexp, a string, or an array of either to be matched.
|
20
30
|
def whitelist(list)
|
21
31
|
@whitelist_regexp = Regexp.union(list)
|
22
32
|
end
|
23
33
|
|
34
|
+
# Sets up a blacklist filter; accepts a regexp, a string, or an array of either to be matched.
|
24
35
|
def blacklist(list)
|
25
36
|
@blacklist_regexp = Regexp.union(list)
|
26
37
|
end
|
27
38
|
|
28
|
-
|
39
|
+
# Main method: starts crawling at the given URL and calls a block for each of the pages found.
|
29
40
|
def start_crawling(url, &block)
|
30
41
|
Grell.logger.info "GRELL Started crawling"
|
31
42
|
@collection = PageCollection.new
|
@@ -39,10 +50,15 @@ module Grell
|
|
39
50
|
def crawl(site, block)
|
40
51
|
Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
|
41
52
|
site.navigate
|
42
|
-
|
43
53
|
filter!(site.links)
|
44
54
|
|
45
|
-
block
|
55
|
+
if block #The user of this block can send us a :retry to retry accessing the page
|
56
|
+
while(block.call(site) == :retry)
|
57
|
+
Grell.logger.info "Retrying our visit to #{site.url}"
|
58
|
+
site.navigate
|
59
|
+
filter!(site.links)
|
60
|
+
end
|
61
|
+
end
|
46
62
|
|
47
63
|
site.links.each do |url|
|
48
64
|
@collection.create_page(url, site.id)
|
data/lib/grell/page.rb
CHANGED
@@ -11,6 +11,7 @@ module Grell
|
|
11
11
|
WAIT_INTERVAL = 0.5
|
12
12
|
|
13
13
|
attr_reader :url, :timestamp, :id, :parent_id, :rawpage
|
14
|
+
|
14
15
|
#Most of the interesting information accessed through this class is accessed by the methods below
|
15
16
|
def_delegators :@result_page, :headers, :body, :status, :links, :has_selector?, :host, :visited?
|
16
17
|
|
@@ -20,6 +21,7 @@ module Grell
|
|
20
21
|
@id = id
|
21
22
|
@parent_id = parent_id
|
22
23
|
@timestamp = nil
|
24
|
+
@times_visited = 0
|
23
25
|
@result_page = UnvisitedPage.new
|
24
26
|
end
|
25
27
|
|
@@ -31,6 +33,7 @@ module Grell
|
|
31
33
|
end
|
32
34
|
@result_page = VisitedPage.new(@rawpage)
|
33
35
|
@timestamp = Time.now
|
36
|
+
@times_visited += 1
|
34
37
|
rescue Capybara::Poltergeist::JavascriptError => e
|
35
38
|
unavailable_page(404, e)
|
36
39
|
rescue Capybara::Poltergeist::BrowserError => e #This may happen internally on Poltergeist, they claim is a bug.
|
@@ -45,6 +48,10 @@ module Grell
|
|
45
48
|
unavailable_page(404, e)
|
46
49
|
end
|
47
50
|
|
51
|
+
def retries
|
52
|
+
[@times_visited -1, 0].max
|
53
|
+
end
|
54
|
+
|
48
55
|
private
|
49
56
|
def unavailable_page(status, exception)
|
50
57
|
Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}"
|
data/lib/grell/version.rb
CHANGED
data/spec/lib/crawler_spec.rb
CHANGED
@@ -23,7 +23,7 @@ RSpec.describe Grell::Crawler do
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
|
26
|
+
describe '#crawl' do
|
27
27
|
it 'yields the result if a block is given' do
|
28
28
|
result = []
|
29
29
|
block = Proc.new {|n| result.push(n) }
|
@@ -38,6 +38,19 @@ RSpec.describe Grell::Crawler do
|
|
38
38
|
expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/)
|
39
39
|
crawler.crawl(page, nil)
|
40
40
|
end
|
41
|
+
|
42
|
+
it 'retries when the block returns :retry' do
|
43
|
+
counter = 0
|
44
|
+
times_retrying = 2
|
45
|
+
block = Proc.new do |n|
|
46
|
+
if counter < times_retrying
|
47
|
+
counter += 1
|
48
|
+
:retry
|
49
|
+
end
|
50
|
+
end
|
51
|
+
crawler.crawl(page, block)
|
52
|
+
expect(counter).to eq(times_retrying)
|
53
|
+
end
|
41
54
|
end
|
42
55
|
|
43
56
|
context '#start_crawling' do
|
data/spec/lib/page_spec.rb
CHANGED
@@ -56,6 +56,35 @@ RSpec.describe Grell::Page do
|
|
56
56
|
|
57
57
|
end
|
58
58
|
|
59
|
+
describe '#retries' do
|
60
|
+
context 'page has not been navigated' do
|
61
|
+
it '#retries return 0' do
|
62
|
+
expect(page.retries).to eq(0)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
context 'page has been navigated once' do
|
67
|
+
before do
|
68
|
+
proxy.stub(url).and_return(body: '', code: 200, headers: {})
|
69
|
+
page.navigate
|
70
|
+
end
|
71
|
+
it '#retries return 0' do
|
72
|
+
expect(page.retries).to eq(0)
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
context 'page has been navigated twice' do
|
77
|
+
before do
|
78
|
+
proxy.stub(url).and_return(body: '', code: 200, headers: {})
|
79
|
+
page.navigate
|
80
|
+
page.navigate
|
81
|
+
end
|
82
|
+
it '#retries return 1' do
|
83
|
+
expect(page.retries).to eq(1)
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
59
88
|
shared_examples_for 'an errored grell page' do
|
60
89
|
it 'returns empty status 404 page after navigating' do
|
61
90
|
expect(page.status).to eq(404)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.2
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jordi Polo Carres
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-05-
|
11
|
+
date: 2015-05-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|