grell 1.3.2 → 1.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 7c505e81c2b0ec952d35d3ddef9829db6763939c
- data.tar.gz: ad46c404ff8acd62ae3be3f217f90a95ca9b6bcc
+ metadata.gz: 66dc6a02c6a3ff7c79e01fd7e84c21644ef5eb3a
+ data.tar.gz: 8ee3c5702299dfe64c2bd2732d2795014080e0bf
  SHA512:
- metadata.gz: 5b03ead4c1152e9d404635ca3c72ceaa41393ec46f3c80f023b7afb75cce7f193b7b2aca8ac98d599584dc913e0212842a579b5e6222c6daa973fd6fa132c0b9
- data.tar.gz: 8b92746abb6f3d032b857754541152badb4ef2e92ff47f55a9ee7a00c6fb0b22a0609be411b23926b046f9556b41550299655fc10d25da3b0ff06363291c85b4
+ metadata.gz: 6c191dd4ec5a994d2963d5c7db5f0ebb6f36d0532a05ed02e4153c6d04fb9e6022f3a63065da6e07208e0389c2f8f0e7c008ab763c2a76bd986c13407dcd2740
+ data.tar.gz: 3b05002063f76bbc0916684c352554eddf75ce88c6c294c067fc91e4973a1cb10a7f2465ea874d1aa8c162ee3a6612d9bb5289b7489edab9f0dcfc5ae79dc463
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ * Version 1.4.0
+   Added crawler.restart to restart the browser process
+   The block passed to start_crawling can now return :retry to make Grell revisit a page
+
  * Version 1.3.2
    Rescue Timeout error and return an empty page when that happens
 
data/README.md CHANGED
@@ -51,6 +51,28 @@ end
  Grell keeps a list of pages previously crawled and does not visit the same page twice.
  This list is indexed by the complete url, including query parameters.
 
+ ### Re-retrieving a page
+ If you want Grell to revisit a page and return its data to you again,
+ return the symbol :retry from the block you pass to the start_crawling method.
+ For instance:
+ ```ruby
+ require 'grell'
+ crawler = Grell::Crawler.new
+ crawler.start_crawling('http://www.google.com') do |current_page|
+   if current_page.status == 500 && current_page.retries == 0
+     crawler.restart
+     :retry
+   end
+ end
+ ```
+
+ ### Restarting PhantomJS
+ During a long crawl it is possible that PhantomJS starts failing.
+ To recover, you can restart it by calling "restart" on the crawler.
+ That will kill the PhantomJS process and start a new one. Grell will keep the status
+ of pages already visited and of pages discovered but not yet visited, and will keep
+ crawling with the new PhantomJS process instead of the old one.
+
  ### Selecting links to follow
 
  Grell by default will follow all the links it finds going to the site
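The new "Restarting PhantomJS" section above has no snippet of its own, so here is a minimal sketch of using restart during a long crawl; the URL and the every-100-pages policy are illustrative assumptions, not anything the gem prescribes:

```ruby
require 'grell'

crawler = Grell::Crawler.new
visited = 0
crawler.start_crawling('http://example.com') do |current_page|
  visited += 1
  # Hypothetical policy: recycle the PhantomJS process every 100 pages
  # to avoid the long-crawl failures described in the README.
  crawler.restart if (visited % 100).zero?
end
```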
@@ -58,6 +80,7 @@ you are crawling. It will never follow links linking outside your site.
  If you want to further limit the number of links crawled, you can use
  whitelisting, blacklisting or manual filtering.
 
+
  #### Whitelisting
 
  ```ruby
data/lib/grell/capybara_driver.rb CHANGED
@@ -12,8 +12,9 @@ module Grell
    end
 
    def setup_capybara
+     @poltergeist_driver = nil
      Capybara.register_driver :poltergeist_crawler do |app|
-       Capybara::Poltergeist::Driver.new(app, {
+       @poltergeist_driver = Capybara::Poltergeist::Driver.new(app, {
          js_errors: false,
          inspector: false,
          phantomjs_logger: open('/dev/null'),
@@ -28,6 +29,7 @@ module Grell
        "DNT" => 1,
        "User-Agent" => USER_AGENT
      }
+     @poltergeist_driver
    end
  end
 
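For context: the driver instance is now captured in @poltergeist_driver and returned from setup_capybara precisely so that Crawler#restart (in the next file) has an object to delegate to. A standalone sketch of the idea, assuming Poltergeist's Driver#restart API; the nil app argument is an illustrative shortcut, not grell's actual setup code:

```ruby
require 'capybara/poltergeist'

# Hypothetical standalone example: keeping a reference to the driver is
# what makes it possible to recycle the PhantomJS process later.
driver = Capybara::Poltergeist::Driver.new(nil, js_errors: false)
driver.restart # kills the current PhantomJS process and spawns a fresh one
```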
data/lib/grell/crawler.rb CHANGED
@@ -5,8 +5,10 @@ module Grell
  class Crawler
    attr_reader :collection
 
+   # Creates a crawler.
+   # options allows :logger to point to an object with the same interface as Logger in the standard library.
    def initialize(options = {})
-     CapybaraDriver.setup(options)
+     @driver = CapybaraDriver.setup(options)
 
      if options[:logger]
        Grell.logger = options[:logger]
@@ -17,15 +19,24 @@ module Grell
      @collection = PageCollection.new
    end
 
+   # Restarts the PhantomJS process without modifying the state of visited and discovered pages.
+   def restart
+     Grell.logger.info "GRELL is restarting"
+     @driver.restart
+     Grell.logger.info "GRELL has restarted"
+   end
+
+   # Sets up a whitelist filter; accepts a regexp, a string, or an array of either to be matched.
    def whitelist(list)
      @whitelist_regexp = Regexp.union(list)
    end
 
+   # Sets up a blacklist filter; accepts a regexp, a string, or an array of either to be matched.
    def blacklist(list)
      @blacklist_regexp = Regexp.union(list)
    end
 
-
+   # Main method: starts crawling the given URL and calls a block for each of the pages found.
    def start_crawling(url, &block)
      Grell.logger.info "GRELL Started crawling"
      @collection = PageCollection.new
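As the new comments note, whitelist and blacklist each accept a regexp, a string, or an array of either, which Regexp.union folds into a single pattern. A short usage sketch; the patterns and URL are made up for illustration:

```ruby
require 'grell'

crawler = Grell::Crawler.new
crawler.whitelist([/\/articles\//, '/about'])  # only follow links matching these
crawler.blacklist('/logout')                   # never follow links matching this
crawler.start_crawling('http://example.com') { |page| puts page.url }
```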
@@ -39,10 +50,15 @@ module Grell
    def crawl(site, block)
      Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
      site.navigate
-
      filter!(site.links)
 
-     block.call(site) if block
+     if block # The caller's block can return :retry to ask us to visit the page again
+       while block.call(site) == :retry
+         Grell.logger.info "Retrying our visit to #{site.url}"
+         site.navigate
+         filter!(site.links)
+       end
+     end
 
      site.links.each do |url|
        @collection.create_page(url, site.id)
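Since crawl keeps re-calling the block for as long as it returns :retry, a block that unconditionally returns :retry for a permanently failing page would loop forever. The Page#retries counter added below is what lets a caller bound the loop; a sketch of a bounded policy, where the 5xx check and the limit of three are illustrative assumptions:

```ruby
require 'grell'

crawler = Grell::Crawler.new
crawler.start_crawling('http://example.com') do |page|
  # Retry server errors at most three times, then give up and move on.
  :retry if page.status >= 500 && page.retries < 3
end
```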
data/lib/grell/page.rb CHANGED
@@ -11,6 +11,7 @@ module Grell
    WAIT_INTERVAL = 0.5
 
    attr_reader :url, :timestamp, :id, :parent_id, :rawpage
+
    # Most of the interesting information accessed through this class is accessed by the methods below
    def_delegators :@result_page, :headers, :body, :status, :links, :has_selector?, :host, :visited?
 
@@ -20,6 +21,7 @@ module Grell
      @id = id
      @parent_id = parent_id
      @timestamp = nil
+     @times_visited = 0
      @result_page = UnvisitedPage.new
    end
 
@@ -31,6 +33,7 @@ module Grell
      end
      @result_page = VisitedPage.new(@rawpage)
      @timestamp = Time.now
+     @times_visited += 1
    rescue Capybara::Poltergeist::JavascriptError => e
      unavailable_page(404, e)
    rescue Capybara::Poltergeist::BrowserError => e # This may happen internally in Poltergeist; they claim it is a bug.
@@ -45,6 +48,10 @@ module Grell
      unavailable_page(404, e)
    end
 
+   # Number of times the page has been revisited, i.e. visits beyond the first one.
+   def retries
+     [@times_visited - 1, 0].max
+   end
+
    private
    def unavailable_page(status, exception)
      Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}"
data/lib/grell/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Grell
-   VERSION = "1.3.2"
+   VERSION = "1.4.0"
  end
data/spec/lib/crawler_spec.rb CHANGED
@@ -23,7 +23,7 @@ RSpec.describe Grell::Crawler do
      end
    end
 
-   context '#crawl' do
+   describe '#crawl' do
      it 'yields the result if a block is given' do
        result = []
        block = Proc.new { |n| result.push(n) }
@@ -38,6 +38,19 @@ RSpec.describe Grell::Crawler do
      expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/)
      crawler.crawl(page, nil)
    end
+
+   it 'retries when the block returns :retry' do
+     counter = 0
+     times_retrying = 2
+     block = Proc.new do |n|
+       if counter < times_retrying
+         counter += 1
+         :retry
+       end
+     end
+     crawler.crawl(page, block)
+     expect(counter).to eq(times_retrying)
+   end
  end
 
  context '#start_crawling' do
data/spec/lib/page_spec.rb CHANGED
@@ -56,6 +56,35 @@ RSpec.describe Grell::Page do
 
  end
 
+ describe '#retries' do
+   context 'page has not been navigated' do
+     it '#retries returns 0' do
+       expect(page.retries).to eq(0)
+     end
+   end
+
+   context 'page has been navigated once' do
+     before do
+       proxy.stub(url).and_return(body: '', code: 200, headers: {})
+       page.navigate
+     end
+     it '#retries returns 0' do
+       expect(page.retries).to eq(0)
+     end
+   end
+
+   context 'page has been navigated twice' do
+     before do
+       proxy.stub(url).and_return(body: '', code: 200, headers: {})
+       page.navigate
+       page.navigate
+     end
+     it '#retries returns 1' do
+       expect(page.retries).to eq(1)
+     end
+   end
+ end
+
  shared_examples_for 'an errored grell page' do
    it 'returns empty status 404 page after navigating' do
      expect(page.status).to eq(404)
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: grell
  version: !ruby/object:Gem::Version
-   version: 1.3.2
+   version: 1.4.0
  platform: ruby
  authors:
  - Jordi Polo Carres
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-05-13 00:00:00.000000000 Z
+ date: 2015-05-15 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: capybara