grell 1.3.2 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +23 -0
- data/lib/grell/capybara_driver.rb +3 -1
- data/lib/grell/crawler.rb +20 -4
- data/lib/grell/page.rb +7 -0
- data/lib/grell/version.rb +1 -1
- data/spec/lib/crawler_spec.rb +14 -1
- data/spec/lib/page_spec.rb +29 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 66dc6a02c6a3ff7c79e01fd7e84c21644ef5eb3a
|
4
|
+
data.tar.gz: 8ee3c5702299dfe64c2bd2732d2795014080e0bf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6c191dd4ec5a994d2963d5c7db5f0ebb6f36d0532a05ed02e4153c6d04fb9e6022f3a63065da6e07208e0389c2f8f0e7c008ab763c2a76bd986c13407dcd2740
|
7
|
+
data.tar.gz: 3b05002063f76bbc0916684c352554eddf75ce88c6c294c067fc91e4973a1cb10a7f2465ea874d1aa8c162ee3a6612d9bb5289b7489edab9f0dcfc5ae79dc463
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -51,6 +51,28 @@ end
|
|
51
51
|
Grell keeps a list of pages previously crawled and does not visit the same page twice.
|
52
52
|
This list is indexed by the complete url, including query parameters.
|
53
53
|
|
54
|
+
### Re-retrieving a page
|
55
|
+
If you want Grell to revisit a page and return the data to you again,
|
56
|
+
return the symbol :retry in your block in the start_crawling method.
|
57
|
+
For instance
|
58
|
+
```ruby
|
59
|
+
require 'grell'
|
60
|
+
crawler = Grell::Crawler.new
|
61
|
+
crawler.start_crawling('http://www.google.com') do |current_page|
|
62
|
+
if current_page.status == 500 && current_page.retries == 0
|
63
|
+
crawler.restart
|
64
|
+
:retry
|
65
|
+
end
|
66
|
+
end
|
67
|
+
```
|
68
|
+
|
69
|
+
### Restarting PhantomJS
|
70
|
+
If you are doing a long crawl, it is possible that PhantomJS starts failing.
|
71
|
+
To avoid that, you can restart it by calling "restart" on crawler.
|
72
|
+
That will kill phantom and will restart it. Grell will keep the status of
|
73
|
+
pages already visited and pages discovered and to be visited, and will keep crawling
|
74
|
+
with the new phantomJS process instead of the old one.
|
75
|
+
|
54
76
|
### Selecting links to follow
|
55
77
|
|
56
78
|
Grell by default will follow all the links it finds going to the site
|
@@ -58,6 +80,7 @@ you are crawling. It will never follow links linking outside your site.
|
|
58
80
|
If you want to further limit the amount of links crawled, you can use
|
59
81
|
whitelisting, blacklisting or manual filtering.
|
60
82
|
|
83
|
+
|
61
84
|
#### Whitelisting
|
62
85
|
|
63
86
|
```ruby
|
@@ -12,8 +12,9 @@ module Grell
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def setup_capybara
|
15
|
+
@poltergeist_driver = nil
|
15
16
|
Capybara.register_driver :poltergeist_crawler do |app|
|
16
|
-
Capybara::Poltergeist::Driver.new(app, {
|
17
|
+
@poltergeist_driver = Capybara::Poltergeist::Driver.new(app, {
|
17
18
|
js_errors: false,
|
18
19
|
inspector: false,
|
19
20
|
phantomjs_logger: open('/dev/null'),
|
@@ -28,6 +29,7 @@ module Grell
|
|
28
29
|
"DNT" => 1,
|
29
30
|
"User-Agent" => USER_AGENT
|
30
31
|
}
|
32
|
+
@poltergeist_driver
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
data/lib/grell/crawler.rb
CHANGED
@@ -5,8 +5,10 @@ module Grell
|
|
5
5
|
class Crawler
|
6
6
|
attr_reader :collection
|
7
7
|
|
8
|
+
# Creates a crawler
|
9
|
+
# options allows :logger to point to an object with the same interface as Logger in the standard library
|
8
10
|
def initialize(options = {})
|
9
|
-
CapybaraDriver.setup(options)
|
11
|
+
@driver = CapybaraDriver.setup(options)
|
10
12
|
|
11
13
|
if options[:logger]
|
12
14
|
Grell.logger = options[:logger]
|
@@ -17,15 +19,24 @@ module Grell
|
|
17
19
|
@collection = PageCollection.new
|
18
20
|
end
|
19
21
|
|
22
|
+
# Restarts the PhantomJS process without modifying the state of visited and discovered pages.
|
23
|
+
def restart
|
24
|
+
Grell.logger.info "GRELL is restarting"
|
25
|
+
@driver.restart
|
26
|
+
Grell.logger.info "GRELL has restarted"
|
27
|
+
end
|
28
|
+
|
29
|
+
# Sets up a whitelist filter, allows a regexp, string or array of either to be matched.
|
20
30
|
def whitelist(list)
|
21
31
|
@whitelist_regexp = Regexp.union(list)
|
22
32
|
end
|
23
33
|
|
34
|
+
# Sets up a blacklist filter, allows a regexp, string or array of either to be matched.
|
24
35
|
def blacklist(list)
|
25
36
|
@blacklist_regexp = Regexp.union(list)
|
26
37
|
end
|
27
38
|
|
28
|
-
|
39
|
+
# Main method, it starts crawling on the given URL and calls a block for each of the pages found.
|
29
40
|
def start_crawling(url, &block)
|
30
41
|
Grell.logger.info "GRELL Started crawling"
|
31
42
|
@collection = PageCollection.new
|
@@ -39,10 +50,15 @@ module Grell
|
|
39
50
|
def crawl(site, block)
|
40
51
|
Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
|
41
52
|
site.navigate
|
42
|
-
|
43
53
|
filter!(site.links)
|
44
54
|
|
45
|
-
block
|
55
|
+
if block #The user of this block can send us a :retry to retry accessing the page
|
56
|
+
while(block.call(site) == :retry)
|
57
|
+
Grell.logger.info "Retrying our visit to #{site.url}"
|
58
|
+
site.navigate
|
59
|
+
filter!(site.links)
|
60
|
+
end
|
61
|
+
end
|
46
62
|
|
47
63
|
site.links.each do |url|
|
48
64
|
@collection.create_page(url, site.id)
|
data/lib/grell/page.rb
CHANGED
@@ -11,6 +11,7 @@ module Grell
|
|
11
11
|
WAIT_INTERVAL = 0.5
|
12
12
|
|
13
13
|
attr_reader :url, :timestamp, :id, :parent_id, :rawpage
|
14
|
+
|
14
15
|
#Most of the interesting information accessed through this class is accessed by the methods below
|
15
16
|
def_delegators :@result_page, :headers, :body, :status, :links, :has_selector?, :host, :visited?
|
16
17
|
|
@@ -20,6 +21,7 @@ module Grell
|
|
20
21
|
@id = id
|
21
22
|
@parent_id = parent_id
|
22
23
|
@timestamp = nil
|
24
|
+
@times_visited = 0
|
23
25
|
@result_page = UnvisitedPage.new
|
24
26
|
end
|
25
27
|
|
@@ -31,6 +33,7 @@ module Grell
|
|
31
33
|
end
|
32
34
|
@result_page = VisitedPage.new(@rawpage)
|
33
35
|
@timestamp = Time.now
|
36
|
+
@times_visited += 1
|
34
37
|
rescue Capybara::Poltergeist::JavascriptError => e
|
35
38
|
unavailable_page(404, e)
|
36
39
|
rescue Capybara::Poltergeist::BrowserError => e #This may happen internally on Poltergeist, they claim is a bug.
|
@@ -45,6 +48,10 @@ module Grell
|
|
45
48
|
unavailable_page(404, e)
|
46
49
|
end
|
47
50
|
|
51
|
+
def retries
|
52
|
+
[@times_visited -1, 0].max
|
53
|
+
end
|
54
|
+
|
48
55
|
private
|
49
56
|
def unavailable_page(status, exception)
|
50
57
|
Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}"
|
data/lib/grell/version.rb
CHANGED
data/spec/lib/crawler_spec.rb
CHANGED
@@ -23,7 +23,7 @@ RSpec.describe Grell::Crawler do
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
|
26
|
+
describe '#crawl' do
|
27
27
|
it 'yields the result if a block is given' do
|
28
28
|
result = []
|
29
29
|
block = Proc.new {|n| result.push(n) }
|
@@ -38,6 +38,19 @@ RSpec.describe Grell::Crawler do
|
|
38
38
|
expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/)
|
39
39
|
crawler.crawl(page, nil)
|
40
40
|
end
|
41
|
+
|
42
|
+
it 'retries when the block returns :retry' do
|
43
|
+
counter = 0
|
44
|
+
times_retrying = 2
|
45
|
+
block = Proc.new do |n|
|
46
|
+
if counter < times_retrying
|
47
|
+
counter += 1
|
48
|
+
:retry
|
49
|
+
end
|
50
|
+
end
|
51
|
+
crawler.crawl(page, block)
|
52
|
+
expect(counter).to eq(times_retrying)
|
53
|
+
end
|
41
54
|
end
|
42
55
|
|
43
56
|
context '#start_crawling' do
|
data/spec/lib/page_spec.rb
CHANGED
@@ -56,6 +56,35 @@ RSpec.describe Grell::Page do
|
|
56
56
|
|
57
57
|
end
|
58
58
|
|
59
|
+
describe '#retries' do
|
60
|
+
context 'page has not been navigated' do
|
61
|
+
it '#retries return 0' do
|
62
|
+
expect(page.retries).to eq(0)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
context 'page has been navigated once' do
|
67
|
+
before do
|
68
|
+
proxy.stub(url).and_return(body: '', code: 200, headers: {})
|
69
|
+
page.navigate
|
70
|
+
end
|
71
|
+
it '#retries return 0' do
|
72
|
+
expect(page.retries).to eq(0)
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
context 'page has been navigated twice' do
|
77
|
+
before do
|
78
|
+
proxy.stub(url).and_return(body: '', code: 200, headers: {})
|
79
|
+
page.navigate
|
80
|
+
page.navigate
|
81
|
+
end
|
82
|
+
it '#retries return 1' do
|
83
|
+
expect(page.retries).to eq(1)
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
59
88
|
shared_examples_for 'an errored grell page' do
|
60
89
|
it 'returns empty status 404 page after navigating' do
|
61
90
|
expect(page.status).to eq(404)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.2
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jordi Polo Carres
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-05-
|
11
|
+
date: 2015-05-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|