grell 1.6.10 → 2.1.2
- checksums.yaml +5 -5
- data/.travis.yml +17 -3
- data/CHANGELOG.md +24 -0
- data/Gemfile +4 -0
- data/README.md +100 -63
- data/grell.gemspec +6 -7
- data/lib/grell.rb +1 -0
- data/lib/grell/capybara_driver.rb +9 -19
- data/lib/grell/crawler.rb +27 -53
- data/lib/grell/crawler_manager.rb +84 -0
- data/lib/grell/page.rb +2 -1
- data/lib/grell/page_collection.rb +9 -1
- data/lib/grell/rawpage.rb +11 -1
- data/lib/grell/version.rb +1 -1
- data/spec/lib/capybara_driver_spec.rb +3 -8
- data/spec/lib/crawler_manager_spec.rb +174 -0
- data/spec/lib/crawler_spec.rb +52 -93
- data/spec/lib/page_spec.rb +11 -0
- data/spec/spec_helper.rb +1 -0
- metadata +15 -42
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz:
-  data.tar.gz:
+SHA256:
+  metadata.gz: c17856255ff1e871cc5e12cc2a9f0f4870156923ab924ea11db16b053a6742fb
+  data.tar.gz: d619076b40cbb4b057015a8bbcb8a07f555c282aa0ec971aa36b4e867fbfbd86
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 28860f331fc02f6976bcfd8717bf8c33ca89984ae5d2ce9eede6abb31b5f06b44e2135468c6d75374dd649378cc3d719474979c2f27e67a5a7e5301fc561113f
+  data.tar.gz: 77f68dbdb006803c517de4e0b72a11ac9eba265781703f1b03f98af52b147cab7ba02429371038d426fed1073e1a5f3dcdc0a6838cbf93c64c0c4307f605eea6
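The checksum block above pins SHA256 and SHA512 digests for the two archives packed inside the .gem file. A minimal sketch of recomputing them locally, assuming grell-2.1.2.gem has been downloaded to the current directory (the filename is illustrative):

```ruby
# Minimal sketch: recompute the digests recorded in checksums.yaml.
# A .gem is a tar archive containing metadata.gz and data.tar.gz.
require 'digest'
require 'rubygems/package'

reader = Gem::Package::TarReader.new(File.open('grell-2.1.2.gem', 'rb'))
reader.each do |entry|
  next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
  # Compare these hex digests with the SHA256 entries above.
  puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
end
```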
data/.travis.yml
CHANGED

@@ -1,14 +1,28 @@
 language: ruby
 cache: bundler
-sudo: false
 
 rvm:
 - 2.2.4
 - 2.3.0
-
+- 2.4.2
 
 before_install:
 - mkdir travis-phantomjs
-- wget https://
+- wget https://github.com/JordiPolo/phantomjs/blob/master/phantomjs-2.1.1-linux-x86_64.tar.bz2?raw=true
+  -O $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2
 - tar -xvf $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 -C $PWD/travis-phantomjs
 - export PATH=$PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64/bin:$PATH
+
+install:
+- bundle install --jobs=3 --retry=3
+
+script:
+- bundle exec rspec
+
+deploy:
+  provider: rubygems
+  api_key:
+    secure: czStDI0W6MWL70sDwu53oNNCc8vKtT61pgvii+ZWIC9A41C2p7BzmbtosXsnLk2ApxmpWvFIgtQE0XIH7jkM5mY05cHinXDphtOTkNLFVjck3ZOMkx/cc+QRFW8K4FHkrzFsC+/Xx4t2/Psh35LpzhfJd0XzKKoCstXUVgJsfGcAK3DMpjXHSUbwLXGDZ4lzmsk52OLf0oL+in2447TJfVOvGXtYmfh1PjXRwDxKB0dan7w5mVgajS52b6wUhVPTaMe/JgCbMuV7BaQ1Goq8u7V4aaxU+liPAhzHWfMB6tF4TEW8yu2tvGLdOA0+1jmM8E9Q5saPWtwKiHvBxN8CzRpkiNDzyFAf8ljrWT5yKX3aRQCyPp3NNyhoumWap36b+O/zwZ3HxoAe22Yg0rjz8z8NxMR/ELPvjPYjCiF5zY7fO9PAzmIynMRUrxDnFj+/JGHdzx0ZMo3fEXgHHSaHPNxIzEffVVQk4XLVnFHDjBLY4mVp4sbHbja5qnui20RkdM/H9Yi/fQyl1ODhk+LUPoh45ZneDZq7GPrl+WKK06oEjXIXLU+1iEuqnSqybbmJMTUJlUV+7EJdtq2DgfDB4KXwLm2LLOR/IX63AzEav4NIxx3hIXifSKa9rp6D7nMTzdQwF0FFzIj/Y3qLrAe1WWt0gx3Vxq67pSwOJthk5Fc=
+  on:
+    tags: true
+    rvm: 2.4.2
data/CHANGELOG.md
CHANGED

@@ -1,3 +1,27 @@
+# 2.1.2
+* Change white/black lists to allow/deny lists
+
+# 2.1.1
+* Update phantomjs_options to use 'TLSv1.2'
+
+# 2.1.0
+* Delete `driver_options` configuration key as it was never used.
+* `cleanup_all_processes` is a self method, as intended.
+
+# 2.0.0
+* New configuration key `on_periodic_restart`.
+* The CrawlerManager.cleanup_all_processes method destroys all instances of phantomjs on this machine.
+
+* Breaking changes
+  - Requires Ruby 2.1 or later.
+  - Crawler.start_crawling does not accept options anymore; all options are passed to Crawler.new.
+  - Crawler's methods `restart` and `quit` have been moved to CrawlerManager.
+  - Crawler gets whitelist and blacklist as configuration options instead of being set in specific methods.
+
+# 1.6.11
+* Ensure all links are loaded by waiting for Ajax requests to complete
+* Add '@evaluate_in_each_page' option to evaluate before extracting links (e.g. $('.dropdown').addClass('open');)
+
 # 1.6.10
 * Avoid following JS href links, add missing dependencies to fix Travis build
 
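Taken together, the 2.x entries above change the public API surface. A hedged sketch of migrating a 1.x caller; the URL and filter pattern are purely illustrative:

```ruby
require 'grell'
require 'logger'

# 1.x style (removed in 2.0.0): filters were set through methods on the crawler:
#   crawler = Grell::Crawler.new
#   crawler.whitelist([/games\/.*/])
#
# 2.x style: all options go to Crawler.new, and restart/quit moved to the
# CrawlerManager (2.1.2 renames the lists to allowlist/denylist).
crawler = Grell::Crawler.new(allowlist: [/games\/.*/], logger: Logger.new(STDOUT))
crawler.start_crawling('http://example.com') do |page|
  puts "#{page.status} #{page.url}"
end
crawler.manager.quit
```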
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -21,16 +21,15 @@ Or install it yourself as:
 
     $ gem install grell
 
-Grell uses PhantomJS, you will need to download and install it in your
+Grell uses PhantomJS as a browser; you will need to download and install it in your
 system. Check for instructions in http://phantomjs.org/
 Grell has been tested with PhantomJS v2.1.x
 
 ## Usage
 
-
 ### Crawling an entire site
 
-The main entry point of the library is Grell#start_crawling.
+The main entry point of the library is Grell::Crawler#start_crawling.
 Grell will yield to your code with each page it finds:
 
 ```ruby
@@ -55,85 +54,104 @@ This list is indexed by the complete url, including query parameters.
 
 ### Re-retrieving a page
 If you want Grell to revisit a page and return the data to you again,
-return the symbol :retry in your block
+return the symbol :retry in your block for the start_crawling method.
 For instance
 ```ruby
 require 'grell'
 crawler = Grell::Crawler.new
 crawler.start_crawling('http://www.google.com') do |current_page|
   if current_page.status == 500 && current_page.retries == 0
-    crawler.restart
+    crawler.manager.restart
     :retry
   end
 end
 ```
 
-###
-If you are doing a long crawling it is possible that phantomJS starts failing.
-To avoid that, you can restart it by calling "restart" on crawler.
-That will kill phantom and will restart it. Grell will keep the status of
-pages already visited and pages discovered and to be visited. And will keep crawling
-with the new phantomJS process instead of the old one.
+### Pages' id
 
-
+Each page has a unique id, accessed by the property `id`. Also each page stores the id of the page from which we found this page, accessed by the property `parent_id`.
+The page object generated by accessing the first URL passed to start_crawling (the root) has a `parent_id` equal to `nil` and an `id` equal to 0.
+Using this information it is possible to construct a directed graph.
 
-Grell by default will follow all the links it finds going to the site
-your are crawling. It will never follow links linking outside your site.
-If you want to further limit the amount of links crawled, you can use
-whitelisting, blacklisting or manual filtering.
 
-
-By default, Grell will detect new URLs to visit by comparing the full URL
-with the URLs of the discovered and visited links. This functionality can
-be changed by passing a block of code to Grells `start_crawling` method.
-In the below example, the path of the URLs (instead of the full URL) will
-be compared.
+### Restart and quit
 
+Grell can be restarted. The current lists of visited and yet-to-visit pages are not modified when restarting,
+but the browser is destroyed and recreated; all cookies and local storage are lost. After restarting, crawling is resumed with a
+new browser.
+To destroy the crawler, call the `quit` method. This will free the memory taken in Ruby and destroy the PhantomJS process.
 ```ruby
 require 'grell'
-
 crawler = Grell::Crawler.new
+crawler.manager.restart # restarts the browser
+crawler.manager.quit # quits and destroys the crawler
+```
 
-
-collection_page.path == page.path
-end
+### Options
 
-
-
-
-
+The `Grell::Crawler` class can be passed options to customize its behavior:
+- `logger`: Sets the logger object, for instance `Rails.logger`. Default: `Logger.new(STDOUT)`
+- `on_periodic_restart`: Sets periodic restarts of the crawler after a certain number of visits. Default: 100 pages.
+- `allowlist`: Sets an allowlist filter for URLs to be visited. Default: all URLs are allowlisted.
+- `denylist`: Sets a denylist filter for URLs to be avoided. Default: no URL is denylisted.
+- `add_match_block`: Block evaluated to consider if a given page should be part of the pages to be visited. Default: add unique URLs.
+- `evaluate_in_each_page`: Javascript block to be evaluated on each page visited. Default: Nothing evaluated.
 
-
+Grell by default will follow all the links it finds in the site being crawled.
+It will never follow links linking outside your site.
+If you want to further limit the amount of links crawled, you can use
+allowlisting, denylisting or manual filtering.
+Further details on these and other options follow below.
 
-```ruby
-require 'grell'
 
-
-
-crawler.
-
+#### Automatically restarting PhantomJS
+If you are doing a long crawl, it is possible that phantomJS gets into an inconsistent state or starts leaking memory.
+The crawler can be restarted manually by calling `crawler.manager.restart` or automatically by using the
+`on_periodic_restart` configuration key as follows:
 
-
-
-string match is whitelisted) or an array with regexps and/or strings.
+```ruby
+require 'grell'
 
-
+crawler = Grell::Crawler.new(on_periodic_restart: { do: my_restart_procedure, each: 200 })
 
-
-
+crawler.start_crawling('http://www.google.com') do |current_page|
+  ...
+end
+```
 
-crawler
-
-
-
+This code will set up the crawler to restart every 200 pages crawled and to call `my_restart_procedure`
+between restarts. A restart will destroy the cookies, so for instance this custom block can be used to log in again.
+
+
+#### Allowlisting
+
+```ruby
+require 'grell'
+
+crawler = Grell::Crawler.new(allowlist: [/games\/.*/, '/fun'])
+crawler.start_crawling('http://www.google.com')
+```
+
+Grell here will only follow links to games and '/fun' and ignore all
+other links. You can provide a regexp, strings (if any part of the
+string match is allowlisted) or an array with regexps and/or strings.
 
-
-this site which does not go to /games/...
+#### Denylisting
 
-
-
-
-
+```ruby
+require 'grell'
+
+crawler = Grell::Crawler.new(denylist: /games\/.*/)
+crawler.start_crawling('http://www.google.com')
+```
+
+Similar to allowlisting. But now Grell will follow every other link in
+this site which does not go to /games/...
+
+If you call both allowlist and denylist then both will apply; a link
+has to fulfill both conditions to survive. If you do not call any, then
+all links on this site will be crawled. Think of these methods as
+filters.
 
 #### Manual link filtering
 
@@ -144,12 +162,37 @@ links to visit. So you can modify in your block of code "page.links" to
 add and delete links to instruct Grell to add them to the list of links
 to visit next.
 
-
+#### Custom URL Comparison
+By default, Grell will detect new URLs to visit by comparing the full URL
+with the URLs of the discovered and visited links. This functionality can
+be changed by passing a block of code to Grell's `start_crawling` method.
+In the below example, the path of the URLs (instead of the full URL) will
+be compared.
 
-
-
-
+```ruby
+require 'grell'
+
+add_match_block = Proc.new do |collection_page, page|
+  collection_page.path == page.path
+end
+
+crawler = Grell::Crawler.new(add_match_block: add_match_block)
+
+crawler.start_crawling('http://www.google.com') do |current_page|
+  ...
+end
+```
+
+#### Evaluate script
 
+You can evaluate a JavaScript snippet in each page before extracting links by passing the snippet to the 'evaluate_in_each_page' option:
+
+```ruby
+require 'grell'
+
+crawler = Grell::Crawler.new(evaluate_in_each_page: "typeof jQuery !== 'undefined' && $('.dropdown').addClass('open');")
+
+```
 
 ### Errors
 When there is an error in the page or an internal error in the crawler (Javascript crashed the browser, etc). Grell will return with status 404 and the headers will have the following keys:
@@ -157,12 +200,6 @@ When there is an error in the page or an internal error in the crawler (Javascri
 - errorClass: The class of the error which broke this page.
 - errorMessage: A descriptive message with the information Grell could gather about the error.
 
-### Logging
-You can pass your logger to Grell. For example in a Rails app:
-```Ruby
-crawler = Grell::Crawler.new(logger: Rails.logger)
-```
-
 ## Tests
 
 Run the tests with
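The "Pages' id" section above notes that `id` and `parent_id` let you construct a directed graph, but the README stops short of code. A minimal sketch, with the URL illustrative:

```ruby
require 'grell'

# Collect edges parent_id -> id while crawling; the root page has
# id 0 and parent_id nil, so it anchors the graph.
edges = Hash.new { |hash, key| hash[key] = [] }
crawler = Grell::Crawler.new
crawler.start_crawling('http://example.com') do |page|
  edges[page.parent_id] << page.id unless page.parent_id.nil?
end
edges.each { |parent, children| puts "#{parent} -> #{children.join(', ')}" }
```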
data/grell.gemspec
CHANGED
@@ -19,19 +19,18 @@ Gem::Specification.new do |spec|
   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
   spec.require_paths = ["lib"]
 
-  spec.required_ruby_version = '>= 1.
+  spec.required_ruby_version = '>= 2.1.8'
 
-  spec.add_dependency 'capybara', '~> 2.
-  spec.add_dependency 'poltergeist', '~> 1.
+  spec.add_dependency 'capybara', '~> 2.10'
+  spec.add_dependency 'poltergeist', '~> 1.11'
 
-  spec.add_development_dependency 'bundler', '~> 1.6'
+  # spec.add_development_dependency 'bundler', '~> 1.6'
   spec.add_development_dependency 'byebug', '~> 4.0'
   spec.add_development_dependency 'kender', '~> 0.2'
   spec.add_development_dependency 'rake', '~> 10.0'
   spec.add_development_dependency 'webmock', '~> 1.18'
-  spec.add_development_dependency 'rspec', '~> 3.
-  spec.add_development_dependency 'puffing-billy', '~> 0.
+  spec.add_development_dependency 'rspec', '~> 3.5'
+  spec.add_development_dependency 'puffing-billy', '~> 0.9'
   spec.add_development_dependency 'timecop', '~> 0.8'
-  spec.add_development_dependency 'capybara-webkit', '~> 1.11.1'
   spec.add_development_dependency 'selenium-webdriver', '~> 2.53.4'
 end
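For consumers, the tightened `required_ruby_version` and runtime dependencies above matter only at resolution time. A sketch of a Gemfile pulling in this release; the version constraint is illustrative:

```ruby
# Gemfile
source 'https://rubygems.org'

ruby '>= 2.1.8'       # matches spec.required_ruby_version above
gem 'grell', '~> 2.1' # resolves capybara ~> 2.10 and poltergeist ~> 1.11 transitively
```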
data/lib/grell.rb
CHANGED

data/lib/grell/capybara_driver.rb
CHANGED

@@ -1,16 +1,10 @@
-
 module Grell
-
-  #
+  # This class sets up the driver for capybara. Used internally by the CrawlerManager
+  # It uses Poltergeist to control PhantomJS
   class CapybaraDriver
-
-
-    USER_AGENT = "Mozilla/5.0 (Grell Crawler)"
-
-    def self.setup(options)
-      new.setup_capybara unless options[:external_driver]
-    end
+    USER_AGENT = "Mozilla/5.0 (Grell Crawler)".freeze
 
+    # Returns a poltergeist driver
     def setup_capybara
       @poltergeist_driver = nil
 
@@ -20,18 +14,17 @@ module Grell
       Grell.logger.info "GRELL Registering poltergeist driver with name '#{driver_name}'"
 
       Capybara.register_driver driver_name do |app|
-        @poltergeist_driver = Capybara::Poltergeist::Driver.new(app,
+        @poltergeist_driver = Capybara::Poltergeist::Driver.new(app,
           js_errors: false,
           inspector: false,
           phantomjs_logger: FakePoltergeistLogger,
-          phantomjs_options: ['--debug=no', '--load-images=no', '--ignore-ssl-errors=yes', '--ssl-protocol=TLSv1']
-        })
+          phantomjs_options: ['--debug=no', '--load-images=no', '--ignore-ssl-errors=yes', '--ssl-protocol=TLSv1.2'])
       end
 
       Capybara.default_max_wait_time = 3
       Capybara.run_server = false
       Capybara.default_driver = driver_name
-
+      Capybara.current_session.driver.headers = { # The driver gets initialized when modified here
        "DNT" => 1,
        "User-Agent" => USER_AGENT
       }
@@ -41,14 +34,11 @@ module Grell
       @poltergeist_driver
     end
 
-
-
-  end
-
+    # Poltergeist driver needs a class with this signature. The javascript console.log is sent here.
+    # We just discard that information.
     module FakePoltergeistLogger
       def self.puts(*)
       end
     end
   end
-
 end
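A sketch of how the driver registered by `setup_capybara` is exercised through an ordinary Capybara session; the URL is illustrative, and this bypasses the CrawlerManager that normally drives it:

```ruby
require 'grell'

# Registers the Poltergeist driver and makes it Capybara's default.
driver = Grell::CapybaraDriver.new.setup_capybara
session = Capybara.current_session
session.visit('http://example.com')
puts session.status_code    # Poltergeist exposes the HTTP status of the last response
puts session.driver.headers # includes the DNT and User-Agent headers set above
```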
data/lib/grell/crawler.rb
CHANGED
@@ -1,53 +1,32 @@
-
 module Grell
-
   # This is the class that starts and controls the crawling
   class Crawler
-    attr_reader :collection
+    attr_reader :collection, :manager
 
     # Creates a crawler
-    #
-
-
-
-
-
-
-
-      @
-
-
-
-    def restart
-      Grell.logger.info "GRELL is restarting"
-      @driver.restart
-      Grell.logger.info "GRELL has restarted"
-    end
-
-    # Quits the poltergeist driver.
-    def quit
-      Grell.logger.info "GRELL is quitting the poltergeist driver"
-      @driver.quit
-    end
-
-    # Setups a whitelist filter, allows a regexp, string or array of either to be matched.
-    def whitelist(list)
-      @whitelist_regexp = Regexp.union(list)
-    end
-
-    # Setups a blacklist filter, allows a regexp, string or array of either to be matched.
-    def blacklist(list)
-      @blacklist_regexp = Regexp.union(list)
+    # evaluate_in_each_page: javascript block to evaluate in each page we crawl
+    # add_match_block: block to evaluate to consider if a page is part of the collection
+    # manager_options: options passed to the manager class
+    # allowlist: Sets an allowlist filter, allows a regexp, string or array of either to be matched.
+    # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
+    def initialize(evaluate_in_each_page: nil, add_match_block: nil, allowlist: /.*/, denylist: /a^/, **manager_options)
+      @collection = nil
+      @manager = CrawlerManager.new(manager_options)
+      @evaluate_in_each_page = evaluate_in_each_page
+      @add_match_block = add_match_block
+      @allowlist_regexp = Regexp.union(allowlist)
+      @denylist_regexp = Regexp.union(denylist)
     end
 
     # Main method, it starts crawling on the given URL and calls a block for each of the pages found.
-    def start_crawling(url,
+    def start_crawling(url, &block)
       Grell.logger.info "GRELL Started crawling"
-      @collection = PageCollection.new(
+      @collection = PageCollection.new(@add_match_block)
       @collection.create_page(url, nil)
 
       while !@collection.discovered_pages.empty?
         crawl(@collection.next_page, block)
+        @manager.check_periodic_restart(@collection)
       end
 
       Grell.logger.info "GRELL finished crawling"
@@ -55,16 +34,12 @@ module Grell
 
     def crawl(site, block)
       Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
-      site
-      filter!(site.links)
-      add_redirect_url(site)
+      crawl_site(site)
 
       if block # The user of this block can send us a :retry to retry accessing the page
         while crawl_block(block, site) == :retry
           Grell.logger.info "Retrying our visit to #{site.url}"
-          site
-          filter!(site.links)
-          add_redirect_url(site)
+          crawl_site(site)
         end
       end
 
@@ -75,6 +50,13 @@ module Grell
 
     private
 
+    def crawl_site(site)
+      site.navigate
+      site.rawpage.page.evaluate_script(@evaluate_in_each_page) if @evaluate_in_each_page
+      filter!(site.links)
+      add_redirect_url(site)
+    end
+
     # Treat any exceptions from the block as an unavailable page
     def crawl_block(block, site)
       block.call(site)
@@ -85,16 +67,8 @@ module Grell
     end
 
     def filter!(links)
-      links.select! { |link| link =~ @
-      links.delete_if { |link| link =~ @
-    end
-
-    # If options[:add_match_block] is not provided, url matching to determine if a
-    # new page should be added the page collection will default to this proc
-    def default_add_match
-      Proc.new do |collection_page, page|
-        collection_page.url.downcase == page.url.downcase
-      end
+      links.select! { |link| link =~ @allowlist_regexp } if @allowlist_regexp
+      links.delete_if { |link| link =~ @denylist_regexp } if @denylist_regexp
     end
 
     # Store the resulting redirected URL along with the original URL