RubyGems - grell - Versions diffs - 1.3.2 → 1.4.0 - Mend

grell 1.3.2 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 7c505e81c2b0ec952d35d3ddef9829db6763939c
-  data.tar.gz: ad46c404ff8acd62ae3be3f217f90a95ca9b6bcc
+  metadata.gz: 66dc6a02c6a3ff7c79e01fd7e84c21644ef5eb3a
+  data.tar.gz: 8ee3c5702299dfe64c2bd2732d2795014080e0bf
 SHA512:
-  metadata.gz: 5b03ead4c1152e9d404635ca3c72ceaa41393ec46f3c80f023b7afb75cce7f193b7b2aca8ac98d599584dc913e0212842a579b5e6222c6daa973fd6fa132c0b9
-  data.tar.gz: 8b92746abb6f3d032b857754541152badb4ef2e92ff47f55a9ee7a00c6fb0b22a0609be411b23926b046f9556b41550299655fc10d25da3b0ff06363291c85b4
+  metadata.gz: 6c191dd4ec5a994d2963d5c7db5f0ebb6f36d0532a05ed02e4153c6d04fb9e6022f3a63065da6e07208e0389c2f8f0e7c008ab763c2a76bd986c13407dcd2740
+  data.tar.gz: 3b05002063f76bbc0916684c352554eddf75ce88c6c294c067fc91e4973a1cb10a7f2465ea874d1aa8c162ee3a6612d9bb5289b7489edab9f0dcfc5ae79dc463

data/CHANGELOG.md CHANGED

@@ -1,3 +1,7 @@
+* Version 1.4.0
+  Added crawler.restart to restart browser process
+  The block of code can make grell retry any given page.
 * Version 1.3.2
   Rescue Timeout error and return an empty page when that happens

data/README.md CHANGED

@@ -51,6 +51,28 @@ end
 Grell keeps a list of pages previously crawled and do not visit the same page twice.
 This list is indexed by the complete url, including query parameters.
+### Re-retrieving a page
+If you want Grell to revisit a page and return the data to you again,
+return the symbol :retry in your block in the start_crawling method.
+For instance
+```ruby
+require 'grell'
+crawler = Grell::Crawler.new
+crawler.start_crawling('http://www.google.com') do |current_page|
+  if current_page.status == 500 && current_page.retries == 0
+    crawler.restart
+    :retry
+  end
+end
+```
+### Restarting PhantomJS
+If you are doing a long crawling it is possible that phantomJS starts failing.
+To avoid that, you can restart it by calling "restart" on crawler.
+That will kill phantom and will restart it. Grell will keep the status of
+pages already visited and pages discovered and to be visited. And will keep crawling
+with the new phantomJS process instead of the old one.
 ### Selecting links to follow
 Grell by default will follow all the links it finds going to the site
@@ -58,6 +80,7 @@ your are crawling. It will never follow links linking outside your site.
 If you want to further limit the amount of links crawled, you can use
 whitelisting, blacklisting or manual filtering.
 #### Whitelisting
 ```ruby

data/lib/grell/capybara_driver.rb CHANGED

@@ -12,8 +12,9 @@ module Grell
     end
     def setup_capybara
+      @poltergeist_driver = nil
       Capybara.register_driver :poltergeist_crawler do |app|
-        Capybara::Poltergeist::Driver.new(app, {
+        @poltergeist_driver = Capybara::Poltergeist::Driver.new(app, {
           js_errors: false,
           inspector: false,
           phantomjs_logger: open('/dev/null'),
@@ -28,6 +29,7 @@ module Grell
         "DNT" => 1,
         "User-Agent" => USER_AGENT
       }
+      @poltergeist_driver
     end
   end

data/lib/grell/crawler.rb CHANGED

@@ -5,8 +5,10 @@ module Grell
   class Crawler
     attr_reader :collection
+    # Creates a crawler
+    # options allows :logger to point to an object with the same interface than Logger in the standard library
     def initialize(options = {})
-      CapybaraDriver.setup(options)
+      @driver = CapybaraDriver.setup(options)
       if options[:logger]
         Grell.logger = options[:logger]
@@ -17,15 +19,24 @@ module Grell
       @collection = PageCollection.new
     end
+    # Restarts the PhantomJS process without modifying the state of visited and discovered pages.
+    def restart
+      Grell.logger.info "GRELL is restarting"
+      @driver.restart
+      Grell.logger.info "GRELL has restarted"
+    end
+    # Setups a whitelist filter, allows a regexp, string or array of either to be matched.
     def whitelist(list)
       @whitelist_regexp = Regexp.union(list)
     end
+    # Setups a blacklist filter, allows a regexp, string or array of either to be matched.
     def blacklist(list)
       @blacklist_regexp = Regexp.union(list)
     end
+    # Main method, it starts crawling on the given URL and calls a block for each of the pages found.
     def start_crawling(url, &block)
       Grell.logger.info "GRELL Started crawling"
       @collection = PageCollection.new
@@ -39,10 +50,15 @@ module Grell
     def crawl(site, block)
       Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
       site.navigate
       filter!(site.links)
-      block.call(site) if block
+      if block #The user of this block can send us a :retry to retry accessing the page
+        while(block.call(site) == :retry)
+          Grell.logger.info "Retrying our visit to #{site.url}"
+          site.navigate
+          filter!(site.links)
+        end
+      end
       site.links.each do |url|
         @collection.create_page(url, site.id)

data/lib/grell/page.rb CHANGED

@@ -11,6 +11,7 @@ module Grell
     WAIT_INTERVAL = 0.5
     attr_reader :url, :timestamp, :id, :parent_id, :rawpage
     #Most of the interesting information accessed through this class is accessed by the methods below
     def_delegators :@result_page, :headers, :body, :status, :links, :has_selector?, :host, :visited?
@@ -20,6 +21,7 @@ module Grell
       @id = id
       @parent_id = parent_id
       @timestamp = nil
+      @times_visited = 0
       @result_page = UnvisitedPage.new
     end
@@ -31,6 +33,7 @@ module Grell
       end
       @result_page = VisitedPage.new(@rawpage)
       @timestamp = Time.now
+      @times_visited += 1
     rescue Capybara::Poltergeist::JavascriptError => e
       unavailable_page(404, e)
     rescue Capybara::Poltergeist::BrowserError => e #This may happen internally on Poltergeist, they claim is a bug.
@@ -45,6 +48,10 @@ module Grell
       unavailable_page(404, e)
     end
+    def retries
+      [@times_visited -1, 0].max
+    end
     private
     def unavailable_page(status, exception)
       Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}"

data/lib/grell/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Grell
-  VERSION = "1.3.2"
+  VERSION = "1.4.0"
 end

data/spec/lib/crawler_spec.rb CHANGED

@@ -23,7 +23,7 @@ RSpec.describe Grell::Crawler do
     end
   end
-  context '#crawl' do
+  describe '#crawl' do
     it 'yields the result if a block is given' do
       result = []
       block = Proc.new {|n| result.push(n) }
@@ -38,6 +38,19 @@ RSpec.describe Grell::Crawler do
       expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/)
       crawler.crawl(page, nil)
     end
+    it 'retries when the block returns :retry' do
+      counter = 0
+      times_retrying = 2
+      block = Proc.new do |n|
+        if counter < times_retrying
+          counter += 1
+          :retry
+        end
+      end
+      crawler.crawl(page, block)
+      expect(counter).to eq(times_retrying)
+    end
   end
   context '#start_crawling' do

data/spec/lib/page_spec.rb CHANGED

@@ -56,6 +56,35 @@ RSpec.describe Grell::Page do
   end
+  describe '#retries' do
+    context 'page has not been navigated' do
+      it '#retries return 0' do
+        expect(page.retries).to eq(0)
+      end
+    end
+    context 'page has been navigated once' do
+      before do
+        proxy.stub(url).and_return(body: '', code: 200, headers: {})
+        page.navigate
+      end
+      it '#retries return 0' do
+        expect(page.retries).to eq(0)
+      end
+    end
+    context 'page has been navigated twice' do
+      before do
+        proxy.stub(url).and_return(body: '', code: 200, headers: {})
+        page.navigate
+        page.navigate
+      end
+      it '#retries return 1' do
+        expect(page.retries).to eq(1)
+      end
+    end
+  end
   shared_examples_for 'an errored grell page' do
     it 'returns empty status 404 page after navigating' do
       expect(page.status).to eq(404)

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: grell
 version: !ruby/object:Gem::Version
-  version: 1.3.2
+  version: 1.4.0
 platform: ruby
 authors:
 - Jordi Polo Carres
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-05-13 00:00:00.000000000 Z
+date: 2015-05-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: capybara