kimurai 1.2.0 → 1.3.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +58 -99
- data/kimurai.gemspec +1 -1
- data/lib/kimurai/automation/setup.yml +3 -3
- data/lib/kimurai/base.rb +17 -19
- data/lib/kimurai/base/storage.rb +14 -61
- data/lib/kimurai/browser_builder/mechanize_builder.rb +4 -4
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +6 -2
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +8 -4
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +6 -1
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +16 -0
- data/lib/kimurai/capybara_ext/selenium/driver.rb +10 -0
- data/lib/kimurai/capybara_ext/session.rb +26 -12
- data/lib/kimurai/cli.rb +8 -3
- data/lib/kimurai/runner.rb +7 -18
- data/lib/kimurai/template/config/schedule.rb +1 -1
- data/lib/kimurai/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9fb75ab8965adb212964eee4b319642c3b83bbb4e3b957b739b6eba28b64546d
+  data.tar.gz: b03efc58a57222fdbb51e5ab6aa389f352de1f94fbd02358ccb613d7a15a5774
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 737f467deef60eb2983d86286c040d1b899732ff3e52fdab50ce07e8893070c0cd9f3f3020fe57e32ab1d9140299a9112db698e8d72112934b4fd90ee10a7644
+  data.tar.gz: de89bc1ef85f7f29986b7033d883a49c804592a21f78b409bf325f2e0e578dc78bbf67cdca4ca719f34e43f305dd99f8f227cbd8b545e42f06ca7f49d24e1e35
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,21 @@
 # CHANGELOG
+## 1.3.0
+### Breaking changes 1.3.0
+* Remove persistence database feature (because it's slow and makes things complicated)
+
+### New
+* Add `--include` and `--exclude` options to CLI#runner
+* Add Base `#create_browser` method to easily create additional browser instances
+* Add Capybara::Session `#scroll_to_bottom`
+* Add skip_on_failure feature to `retry_request_errors` config option
+* Add info about `add_event` method to the README
+
+### Fixes and improvements
+* Improve Runner
+* Fix time helper in schedule.rb
+* Add proxy validation to browser builders
+* Allow to pass different arguments to the `Base.parse` method
+
 ## 1.2.0
 ### New
 * Add possibility to add array of values to the storage (`Base::Storage#add`)
data/README.md
CHANGED
@@ -18,7 +18,7 @@
 
 <br>
 
-> Note: this readme is for `1.
+> Note: this readme is for `1.3.0` gem version. CHANGELOG [here](CHANGELOG.md).
 
 Kimurai is a modern web scraping framework written in Ruby which **works out of box with Headless Chromium/Firefox, PhantomJS**, or simple HTTP requests and **allows to scrape and interact with JavaScript rendered websites.**
 
@@ -241,10 +241,10 @@ I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scrol
 * [Skip duplicates](#skip-duplicates)
   * [Automatically skip all duplicated requests urls](#automatically-skip-all-duplicated-requests-urls)
   * [Storage object](#storage-object)
-  * [Persistence database for the storage](#persistence-database-for-the-storage)
 * [Handle request errors](#handle-request-errors)
   * [skip_request_errors](#skip_request_errors)
   * [retry_request_errors](#retry_request_errors)
+* [Logging custom events](#logging-custom-events)
 * [open_spider and close_spider callbacks](#open_spider-and-close_spider-callbacks)
 * [KIMURAI_ENV](#kimurai_env)
 * [Parallel crawling using in_parallel](#parallel-crawling-using-in_parallel)
@@ -297,8 +297,8 @@ echo 'export PATH="$HOME/.rbenv/plugins/ruby-build/bin:$PATH"' >> ~/.bashrc
 exec $SHELL
 
 # Install latest Ruby
-rbenv install 2.5.
-rbenv global 2.5.
+rbenv install 2.5.3
+rbenv global 2.5.3
 
 gem install bundler
 ```
@@ -317,8 +317,8 @@ echo 'if which rbenv > /dev/null; then eval "$(rbenv init -)"; fi' >> ~/.bash_pr
 source ~/.bash_profile
 
 # Install latest Ruby
-rbenv install 2.5.
-rbenv global 2.5.
+rbenv install 2.5.3
+rbenv global 2.5.3
 
 gem install bundler
 ```
@@ -349,17 +349,17 @@ sudo apt install -q -y xvfb
 # Install chromium-browser and firefox
 sudo apt install -q -y chromium-browser firefox
 
-# Instal chromedriver (2.
+# Instal chromedriver (2.44 version)
 # All versions located here https://sites.google.com/a/chromium.org/chromedriver/downloads
-cd /tmp && wget https://chromedriver.storage.googleapis.com/2.
+cd /tmp && wget https://chromedriver.storage.googleapis.com/2.44/chromedriver_linux64.zip
 sudo unzip chromedriver_linux64.zip -d /usr/local/bin
 rm -f chromedriver_linux64.zip
 
-# Install geckodriver (0.
+# Install geckodriver (0.23.0 version)
 # All versions located here https://github.com/mozilla/geckodriver/releases/
-cd /tmp && wget https://github.com/mozilla/geckodriver/releases/download/v0.
-sudo tar -xvzf geckodriver-v0.
-rm -f geckodriver-v0.
+cd /tmp && wget https://github.com/mozilla/geckodriver/releases/download/v0.23.0/geckodriver-v0.23.0-linux64.tar.gz
+sudo tar -xvzf geckodriver-v0.23.0-linux64.tar.gz -C /usr/local/bin
+rm -f geckodriver-v0.23.0-linux64.tar.gz
 
 # Install PhantomJS (2.1.1)
 # All versions located here http://phantomjs.org/download.html
@@ -815,113 +815,60 @@ It is possible to automatically skip all already visited urls while calling `request_to`
 * `#all` - display storage hash where keys are existing scopes.
 * `#include?(scope, value)` - return `true` if value in the scope exists, and `false` if not
 * `#add(scope, value)` - add value to the scope
-* 
-* 
+* `#unique?(scope, value)` - method already described above, will return `false` if value in the scope exists, or return `true` + add value to the scope if value in the scope not exists.
+* `#clear!` - reset the whole storage by deleting all values from all scopes.
 
-#### Persistence database for the storage
 
-
-
-Kimurai can use persistence database for a `storage` using Ruby built-in [PStore](https://ruby-doc.org/stdlib-2.5.1/libdoc/pstore/rdoc/PStore.html) database. With this option, you can automatically skip already visited urls in the next run _if previous run was failed_, otherwise _(if run was successful)_ storage database will be removed before spider stops.
-
-Also, with persistence storage enabled, [save_to](#save_to-helper) method will keep adding items to an existing file (it will not be cleared before each run).
+### Handle request errors
+It is quite common that some pages of crawling website can return different response code than `200 ok`. In such cases, method `request_to` (or `browser.visit`) can raise an exception. Kimurai provides `skip_request_errors` and `retry_request_errors` [config](#spider-config) options to handle such errors:
 
-
+#### skip_request_errors
+You can automatically skip some of errors while requesting a page using `skip_request_errors` [config](#spider-config) option. If raised error matches one of the errors in the list, then this error will be caught, and request will be skipped. It is a good idea to skip errors like NotFound(404), etc.
 
-
+Format for the option: array where elements are error classes or/and hashes. You can use _hash_ format for more flexibility:
 
-```
-
-
+```
+@config = {
+  skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }]
+}
+```
+In this case, provided `message:` will be compared with a full error message using `String#include?`. Also you can use regex instead: `{ error: RuntimeError, message: /404|403/ }`.
 
-
-
-      request_to :parse_category, url: category[:href]
-    end
-  end
+#### retry_request_errors
+You can automatically retry some of errors with a few attempts while requesting a page using `retry_request_errors` [config](#spider-config) option. If raised error matches one of the errors in the list, then this error will be caught and the request will be processed again within a delay.
 
-
-    response.xpath("//products/path").each do |product|
-      # check if product url already contains in the scope `:product_urls`, if so, skip the request:
-      next if storage.contains?(:product_urls, product[:href])
-      # Otherwise process it:
-      request_to :parse_product, url: product[:href]
-    end
-  end
+There are 3 attempts: first: delay _15 sec_, second: delay _30 sec_, third: delay _45 sec_. If after 3 attempts there is still an exception, then the exception will be raised. It is a good idea to try to retry errros like `ReadTimeout`, `HTTPBadGateway`, etc.
 
-
-    # Add visited item to the storage:
-    storage.add(:product_urls, url)
+Format for the option: same like for `skip_request_errors` option.
 
-
-  end
-end
+If you would like to skip (not raise) error after all retries gone, you can specify `skip_on_failure: true` option:
 
-
-
+```ruby
+@config = {
+  retry_request_errors: [{ error: RuntimeError, skip_on_failure: true }]
+}
 ```
 
-
+### Logging custom events
 
-
-<summary>Check the code</summary>
+It is possible to save custom messages to the [run_info](#open_spider-and-close_spider-callbacks) hash using `add_event('Some message')` method. This feature helps you to keep track on important things which happened during crawling without checking the whole spider log (in case if you're logging these messages using `logger`). Example:
 
 ```ruby
-
-
-
-    # Configure skip_duplicate_requests option:
-    skip_duplicate_requests: { scope: :product_urls, check_only: true }
-  }
-
-  def parse(response, url:, data: {})
-    response.xpath("//categories/path").each do |category|
-      request_to :parse_category, url: category[:href]
-    end
+def parse_product(response, url:, data: {})
+  unless response.at_xpath("//path/to/add_to_card_button")
+    add_event("Product is sold") and return
   end
 
-
-    response.xpath("//products/path").each do |product|
-      # Before visiting the url, `request_to` will check if it already contains
-      # in the storage scope `:product_urls`, if so, request will be skipped:
-      request_to :parse_product, url: product[:href]
-    end
-  end
-
-  def parse_product(response, url:, data: {})
-    # Add visited item url to the storage scope `:product_urls`:
-    storage.add(:product_urls, url)
-
-    # ...
-  end
+  # ...
 end
-
-# Run the spider with persistence database option:
-ProductsSpider.crawl!(continue: true)
 ```
-</details>
-
-### Handle request errors
-It is quite common that some pages of crawling website can return different response code than `200 ok`. In such cases, method `request_to` (or `browser.visit`) can raise an exception. Kimurai provides `skip_request_errors` and `retry_request_errors` [config](#spider-config) options to handle such errors:
-
-#### skip_request_errors
-You can automatically skip some of errors while requesting a page using `skip_request_errors` [config](#spider-config) option. If raised error matches one of the errors in the list, then this error will be caught, and request will be skipped. It is a good idea to skip errors like NotFound(404), etc.
 
-Format for the option: array where elements are error classes or/and hashes. You can use _hash_ format for more flexibility:
-
-```ruby
-@config = {
-  skip_request_errors: [{ error: "RuntimeError", message: "404 => Net::HTTPNotFound" }]
-}
 ```
-
-
-
-
-
-There are 3 attempts: first: delay _15 sec_, second: delay _30 sec_, third: delay _45 sec_. If after 3 attempts there is still an exception, then the exception will be raised. It is a good idea to try to retry errros like `ReadTimeout`, `HTTPBadGateway`, etc.
-
-Format for the option: same like for `skip_request_errors` option.
+...
+I, [2018-11-28 22:20:19 +0400#7402] [M: 47156576560640] INFO -- example_spider: Spider: new event (scope: custom): Product is sold
+...
+I, [2018-11-28 22:20:19 +0400#7402] [M: 47156576560640] INFO -- example_spider: Spider: stopped: {:events=>{:custom=>{"Product is sold"=>1}}}
+```
 
 ### `open_spider` and `close_spider` callbacks
 
@@ -1296,7 +1243,7 @@ set :chronic_options, hours24: true
 # crawl "google_spider.com", output: "log/google_spider.com.log"
 # end
 def local_to_utc(time_string, zone:)
-  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(
+  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
 end
 
 # Note: by default Whenever exports cron commands with :environment == "production".
@@ -1483,6 +1430,8 @@ You can automatically setup [required environment](#installation) for Kimurai on
 
 > To perform remote server setup, [Ansible](https://github.com/ansible/ansible) is required **on the desktop** machine (to install: Ubuntu: `$ sudo apt install ansible`, Mac OS X: `$ brew install ansible`)
 
+> It's recommended to use regular user to setup the server, not `root`. To create a new user, login to the server `$ ssh root@your_server_ip`, type `$ adduser username` to create a user, and `$ gpasswd -a username sudo` to add new user to a sudo group.
+
 Example:
 
 ```bash
@@ -2081,6 +2030,16 @@ $ bundle exec kimurai runner -j 3
 
 Each spider runs in a separate process. Spiders logs available at `log/` folder. Pass `-j` option to specify how many spiders should be processed at the same time (default is 1).
 
+You can provide additional arguments like `--include` or `--exclude` to specify which spiders to run:
+
+```bash
+# Run only custom_spider and example_spider:
+$ bundle exec kimurai runner --include custom_spider example_spider
+
+# Run all except github_spider:
+$ bundle exec kimurai runner --exclude github_spider
+```
+
 #### Runner callbacks
 
 You can perform custom actions before runner starts and after runner stops using `config.runner_at_start_callback` and `config.runner_at_stop_callback`. Check [config/application.rb](lib/kimurai/template/config/application.rb) to see example.
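A minimal spider sketch tying the 1.3.0 README options above together (the class name, URL and XPath are hypothetical; the option and method names come from the diff itself):

```ruby
require 'kimurai'

class ExampleSpider < Kimurai::Base
  @name = "example_spider"
  @engine = :mechanize
  @start_urls = ["https://example.com/"]
  @config = {
    # Skip 404s instead of failing the run:
    skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }],
    # Retry flaky errors with 3 attempts, then skip rather than raise:
    retry_request_errors: [{ error: RuntimeError, skip_on_failure: true }]
  }

  def parse(response, url:, data: {})
    # Custom events end up in run_info[:events][:custom] and in the spider log:
    add_event("Parsed start page") if response.at_xpath("//body")
  end
end

ExampleSpider.crawl!
```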
data/kimurai.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
     `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
   end
   spec.bindir = "exe"
-  spec.executables =
+  spec.executables = "kimurai"
   spec.require_paths = ["lib"]
   spec.required_ruby_version = ">= 2.5.0"
data/lib/kimurai/automation/setup.yml
CHANGED
@@ -1,16 +1,16 @@
 ---
 - hosts: all
   vars:
-    ruby: 2.5.
+    ruby: 2.5.3
     rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
     rbenv_shims_path: "{{ rbenv_root_path }}/shims"
     ruby_versions_path: "{{ rbenv_root_path }}/versions"
     # check latest here http://phantomjs.org/download.html
     phantomjs: 2.1.1
     # check latest here https://github.com/mozilla/geckodriver/releases/
-    geckodriver: 0.
+    geckodriver: 0.23.0
     # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
-    chromedriver: 2.
+    chromedriver: 2.44
 
   tasks:
     - name: Update apt cache
data/lib/kimurai/base.rb
CHANGED
@@ -97,15 +97,10 @@ module Kimurai
     end
   end
 
-  def self.crawl!(
+  def self.crawl!(exception_on_fail: true)
     logger.error "Spider: already running: #{name}" and return false if running?
 
-
-    if continue
-      Dir.exists?("tmp") ? "tmp/#{name}.pstore" : "#{name}.pstore"
-    end
-
-    @storage = Storage.new(storage_path)
+    @storage = Storage.new
     @savers = {}
     @update_mutex = Mutex.new
 
@@ -149,15 +144,6 @@ module Kimurai
 
     close_spider if self.respond_to? :close_spider
 
-    if @storage.path
-      if completed?
-        @storage.delete!
-        logger.info "Spider: storage: persistence database #{@storage.path} was removed (successful run)"
-      else
-        logger.info "Spider: storage: persistence database #{@storage.path} wasn't removed (failed run)"
-      end
-    end
-
     message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
     failed? ? logger.fatal(message) : logger.info(message)
 
@@ -165,9 +151,16 @@ module Kimurai
     end
   end
 
-  def self.parse!(handler,
-    spider =
-
+  def self.parse!(handler, *args, **request)
+    spider = self.new
+
+    if args.present?
+      spider.public_send(handler, *args)
+    elsif request.present?
+      spider.request_to(handler, request)
+    else
+      spider.public_send(handler)
+    end
   ensure
     spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
   end
@@ -244,12 +237,17 @@ module Kimurai
     end
 
     self.class.add_event(scope, event)
+    logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
   end
 
   ###
 
   private
 
+  def create_browser(engine, config = {})
+    Kimurai::BrowserBuilder.build(engine, config, spider: self)
+  end
+
   def unique_request?(url)
     options = @config[:skip_duplicate_requests]
     if options.class == Hash
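A sketch of how the reworked `Base.parse!` dispatch and the new private `#create_browser` helper can be used (`ExampleSpider`, `:parse_product` and `:print_stats` are hypothetical names):

```ruby
# Keyword arguments are treated as a request and routed through #request_to:
ExampleSpider.parse!(:parse_product, url: "https://example.com/product/1")

# Plain positional arguments are passed straight to the handler:
ExampleSpider.parse!(:print_stats, 2018, 11)

# Inside a spider, create_browser builds an extra, independent browser:
class ExampleSpider < Kimurai::Base
  @name = "example_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://example.com/"]

  def parse(response, url:, data: {})
    plain = create_browser(:mechanize)  # a second engine just for plain HTTP requests
    plain.visit("https://example.com/robots.txt")
  end
end
```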
data/lib/kimurai/base/storage.rb
CHANGED
@@ -1,60 +1,34 @@
-require 'pstore'
-
 module Kimurai
   class Base
     class Storage
-      attr_reader :database
+      attr_reader :database
 
-      def initialize
-        @path = path
+      def initialize
         @mutex = Mutex.new
-        @database =
+        @database = {}
       end
 
       def all(scope = nil)
         @mutex.synchronize do
-
-          database.transaction { scope ? database.fetch(scope, []) : database }
-        else
-          scope ? database.fetch(scope, []) : database
-        end
+          scope ? database.fetch(scope, []) : database
         end
       end
 
       def include?(scope, value)
         @mutex.synchronize do
-
-
-          database[scope] ||= []
-          database[scope].include?(value)
-          end
-        else
-          database[scope] ||= []
-          database[scope].include?(value)
-        end
+          database[scope] ||= []
+          database[scope].include?(value)
         end
       end
 
       def add(scope, value)
         @mutex.synchronize do
-
-
-
-
-          database[scope] += value
-          database[scope].uniq!
-          else
-          database[scope].push(value) unless database[scope].include?(value)
-          end
-        end
+          database[scope] ||= []
+          if value.kind_of?(Array)
+            database[scope] += value
+            database[scope].uniq!
           else
-          database[scope]
-          if value.class == Array
-            database[scope] += value
-            database[scope].uniq!
-          else
-            database[scope].push(value) unless database[scope].include?(value)
-          end
+            database[scope].push(value) unless database[scope].include?(value)
           end
         end
       end
@@ -63,15 +37,8 @@ module Kimurai
 
      def unique?(scope, value)
        @mutex.synchronize do
-
-
-          database[scope] ||= []
-          database[scope].include?(value) ? false : database[scope].push(value) and true
-          end
-        else
-          database[scope] ||= []
-          database[scope].include?(value) ? false : database[scope].push(value) and true
-        end
+          database[scope] ||= []
+          database[scope].include?(value) ? false : database[scope].push(value) and true
        end
      end
 
@@ -79,21 +46,7 @@ module Kimurai
 
      def clear!
        @mutex.synchronize do
-
-          database.transaction do
-            database.roots.each { |key| database.delete key }
-          end
-        else
-          database = {}
-        end
-        end
-      end
-
-      def delete!
-        @mutex.synchronize do
-          if path
-            File.delete path if File.exists? path
-          end
+          @database = {}
        end
      end
    end
data/lib/kimurai/browser_builder/mechanize_builder.rb
CHANGED
@@ -38,11 +38,11 @@ module Kimurai
       proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
       ip, port, type = proxy_string.split(":")
 
-      if type == "
-        logger.error "BrowserBuilder (mechanize): can't set socks5 proxy (not supported), skipped"
-      else
+      if type == "http"
         @browser.driver.set_proxy(*proxy_string.split(":"))
-        logger.debug "BrowserBuilder (mechanize): enabled
+        logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}"
+      else
+        logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped"
       end
     end
 
data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb
CHANGED
@@ -64,8 +64,12 @@ module Kimurai
       proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
       ip, port, type = proxy_string.split(":")
 
-
-
+      if %w(http socks5).include?(type)
+        @browser.driver.set_proxy(*proxy_string.split(":"))
+        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}"
+      else
+        logger.error "BrowserBuilder (poltergeist_phantomjs): wrong type of proxy: #{type}, skipped"
+      end
     end
 
     # Headers
data/lib/kimurai/browser_builder/selenium_chrome_builder.rb
CHANGED
@@ -44,11 +44,15 @@ module Kimurai
       proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
       ip, port, type, user, password = proxy_string.split(":")
 
-      if
-
-
+      if %w(http socks5).include?(type)
+        if user.nil? && password.nil?
+          driver_options.args << "--proxy-server=#{type}://#{ip}:#{port}"
+          logger.debug "BrowserBuilder (selenium_chrome): enabled #{type} proxy, ip: #{ip}, port: #{port}"
+        else
+          logger.error "BrowserBuilder (selenium_chrome): proxy with authentication doesn't supported by selenium, skipped"
+        end
       else
-        logger.error "BrowserBuilder (selenium_chrome):
+        logger.error "BrowserBuilder (selenium_chrome): wrong type of proxy: #{type}, skipped"
       end
     end
 
data/lib/kimurai/browser_builder/selenium_firefox_builder.rb
CHANGED
@@ -40,13 +40,18 @@ module Kimurai
         driver_options.profile["network.proxy.http_port"] = port.to_i
         driver_options.profile["network.proxy.ssl"] = ip
         driver_options.profile["network.proxy.ssl_port"] = port.to_i
+
+        logger.debug "BrowserBuilder (selenium_firefox): enabled http proxy, ip: #{ip}, port: #{port}"
       elsif type == "socks5"
         driver_options.profile["network.proxy.socks"] = ip
         driver_options.profile["network.proxy.socks_port"] = port.to_i
         driver_options.profile["network.proxy.socks_version"] = 5
         driver_options.profile["network.proxy.socks_remote_dns"] = true
+
+        logger.debug "BrowserBuilder (selenium_firefox): enabled socks5 proxy, ip: #{ip}, port: #{port}"
+      else
+        logger.error "BrowserBuilder (selenium_firefox): wrong type of proxy: #{type}, skipped"
       end
-      logger.debug "BrowserBuilder (selenium_firefox): enabled #{type} proxy, ip: #{ip}, port: #{port}"
     else
       logger.error "BrowserBuilder (selenium_firefox): proxy with authentication doesn't supported by selenium, skipped"
     end
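All four builders now validate the same proxy-string convention, `ip:port:type` with optional `:user:password` (values below are illustrative). A sketch of configs that pass or fail the new validation:

```ruby
@config = {
  # Accepted by every engine:
  proxy: "3.4.5.6:3128:http",

  # Rejected by mechanize ("can't set socks5 proxy (not supported), skipped"):
  # proxy: "3.4.5.6:3128:socks5",

  # Rejected by both selenium builders ("proxy with authentication doesn't supported by selenium"):
  # proxy: "3.4.5.6:3128:http:user:password",

  # Logged as "wrong type of proxy" by the validating builders:
  # proxy: "3.4.5.6:3128:ftp"
}
```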
data/lib/kimurai/capybara_ext/mechanize/driver.rb
CHANGED
@@ -10,6 +10,8 @@ class Capybara::Mechanize::Driver
     browser.agent.set_proxy(ip, port, user, password)
   end
 
+  ###
+
   def headers
     browser.agent.request_headers
   end
@@ -22,6 +24,12 @@ class Capybara::Mechanize::Driver
     browser.agent.request_headers[name] = value
   end
 
+  ###
+
+  def get_cookies
+    browser.agent.cookies
+  end
+
   def set_cookie(name, value, options = {})
     options[:name] ||= name
     options[:value] ||= value
@@ -30,10 +38,18 @@ class Capybara::Mechanize::Driver
     browser.agent.cookie_jar << cookie
   end
 
+  def set_cookies(cookies)
+    cookies.each do |cookie|
+      set_cookie(cookie[:name], cookie[:value], cookie)
+    end
+  end
+
   def clear_cookies
     browser.agent.cookie_jar.clear!
   end
 
+  ###
+
   def quit
     browser.agent.shutdown
   end
data/lib/kimurai/capybara_ext/selenium/driver.rb
CHANGED
@@ -1,6 +1,10 @@
 require_relative '../driver/base'
 
 class Capybara::Selenium::Driver
+  def get_cookies
+    browser.manage.all_cookies
+  end
+
   def set_cookie(name, value, options = {})
     options[:name] ||= name
     options[:value] ||= value
@@ -8,6 +12,12 @@ class Capybara::Selenium::Driver
     browser.manage.add_cookie(options)
   end
 
+  def set_cookies(cookies)
+    cookies.each do |cookie|
+      set_cookie(cookie[:name], cookie[:value], cookie)
+    end
+  end
+
   def clear_cookies
     browser.manage.delete_all_cookies
   end
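With `#get_cookies`/`#set_cookies` on both drivers, a session can be exported from one browser and replayed in another. A sketch inside a spider method (assumes the selenium driver, where cookies come back as plain hashes with `:name`/`:value` keys):

```ruby
def parse(response, url:, data: {})
  cookies = browser.driver.get_cookies     # export the current session

  other = create_browser(:selenium_chrome) # new Base#create_browser helper
  other.visit(url)                         # selenium cookies need a matching domain
  other.driver.set_cookies(cookies)        # replays each via set_cookie
end
```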
data/lib/kimurai/capybara_ext/session.rb
CHANGED
@@ -33,7 +33,7 @@ module Capybara
           sleep sleep_interval and retry
         else
           logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
-          raise e
+          raise e unless skip_error_on_failure?(e)
         end
       else
         raise e
@@ -127,26 +127,40 @@ module Capybara
 
     ###
 
+    def scroll_to_bottom
+      execute_script("window.scrollBy(0,10000)")
+    end
+
     private
 
+    def skip_error_on_failure?(e)
+      config.retry_request_errors.any? do |error|
+        error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.kind_of?(Hash)
+      end
+    end
+
     def match_error?(e, type:)
-      errors =
-
-
-
-
+      errors =
+        case type
+        when :to_retry then config.retry_request_errors
+        when :to_skip then config.skip_request_errors
+        end
+
+      errors.any? do |error|
+        if error.kind_of?(Hash)
+          match_class = e.class.ancestors.include?(error[:error])
+          if error[:message].present?
+            if error[:message].kind_of?(Regexp)
               e.message&.match?(error[:message])
             else
               e.message&.include?(error[:message])
-            end
-
-        e.class == error[:error] && match
+            end && match_class
           else
-
+            match_class
           end
+        else
+          e.class.ancestors.include?(error)
         end
-      else
-        false
       end
     end
 
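The new `#scroll_to_bottom` helper pairs naturally with infinite-scroll pages. A sketch of a spider method using it (the loop count and sleep are illustrative):

```ruby
def parse(response, url:, data: {})
  10.times do
    browser.scroll_to_bottom            # runs window.scrollBy(0,10000) via JS
    sleep 2                             # give lazy-loaded content time to render
  end

  response = browser.current_response   # re-parse the page after scrolling
  response.xpath("//div[@class='item']").size
end
```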
data/lib/kimurai/cli.rb
CHANGED
@@ -66,7 +66,6 @@ module Kimurai
     ###
 
     desc "crawl", "Run a particular spider by it's name"
-    option :continue, aliases: :c, type: :boolean, default: false, banner: "Continue previous crawling"
     def crawl(spider_name)
       raise "Can't find Kimurai project" unless inside_project?
       require './config/boot'
@@ -81,7 +80,7 @@ module Kimurai
         Kimurai.time_zone = time_zone
       end
 
-      klass.crawl!
+      klass.crawl!
     end
 
     desc "parse", "Parse url in the particular spider method"
@@ -129,6 +128,8 @@ module Kimurai
     end
 
     desc "runner", "Run all spiders in the project in queue"
+    option :include, type: :array, default: [], banner: "List of spiders to run"
+    option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
     option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
     def runner
       raise "Can't find Kimurai project" unless inside_project?
@@ -138,7 +139,11 @@ module Kimurai
 
       require './config/boot'
       require 'kimurai/runner'
-
+
+      spiders = options["include"].presence || Kimurai.list.keys
+      spiders -= options["exclude"]
+
+      Runner.new(spiders, jobs).run!
     end
 
     desc "--version, -v", "Print the version"
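The include/exclude filtering reduces to plain array arithmetic over the project's spider list. A standalone sketch (spider names are hypothetical; the real code uses `Kimurai.list.keys` and ActiveSupport's `presence`):

```ruby
all_spiders = ["github_spider", "custom_spider", "example_spider"]
include_opt = ["custom_spider", "example_spider"]
exclude_opt = ["github_spider"]

spiders = include_opt.empty? ? all_spiders : include_opt
spiders -= exclude_opt
# => ["custom_spider", "example_spider"]
```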
data/lib/kimurai/runner.rb
CHANGED
@@ -4,9 +4,9 @@ module Kimurai
   class Runner
     attr_reader :jobs, :spiders, :session_info
 
-    def initialize(parallel_jobs
+    def initialize(spiders, parallel_jobs)
       @jobs = parallel_jobs
-      @spiders =
+      @spiders = spiders
       @start_time = Time.now
 
       @session_info = {
@@ -16,7 +16,7 @@ module Kimurai
         stop_time: nil,
         environment: Kimurai.env,
         concurrent_jobs: @jobs,
-        spiders: @spiders
+        spiders: @spiders
       }
 
       if time_zone = Kimurai.configuration.time_zone
@@ -28,8 +28,6 @@ module Kimurai
     end
 
     def run!(exception_on_fail: true)
-      running_pids = []
-
       puts ">>> Runner: started: #{session_info}"
       if at_start_callback = Kimurai.configuration.runner_at_start_callback
         at_start_callback.call(session_info)
@@ -39,29 +37,20 @@ module Kimurai
       spiders.peach_with_index(jobs) do |spider, i|
         next unless running
 
-
-
-
-        pid = spawn("bundle", "exec", "kimurai", "crawl", spider_name, [:out, :err] => "log/#{spider_name}.log")
-        running_pids << pid
+        puts "> Runner: started spider: #{spider}, index: #{i}"
+        pid = spawn("bundle", "exec", "kimurai", "crawl", spider, [:out, :err] => "log/#{spider}.log")
         Process.wait pid
 
-
-        puts "< Runner: stopped spider: #{spider_name}, index: #{i}"
+        puts "< Runner: stopped spider: #{spider}, index: #{i}"
       end
     rescue StandardError, SignalException, SystemExit => e
       running = false
+
       session_info.merge!(status: :failed, error: e.inspect, stop_time: Time.now)
       exception_on_fail ? raise(e) : [session_info, e]
     else
       session_info.merge!(status: :completed, stop_time: Time.now)
     ensure
-      running = false
-      Thread.list.each { |t| t.kill if t != Thread.main }
-
-      # Kill currently running spiders (if any, in case of fail)
-      running_pids.each { |pid| Process.kill("INT", pid) }
-
       if at_stop_callback = Kimurai.configuration.runner_at_stop_callback
         at_stop_callback.call(session_info)
       end
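Runner's constructor now takes the spider list explicitly instead of discovering it internally. A usage sketch (spider names are illustrative; assumes the project's `config/boot` is already loaded):

```ruby
require 'kimurai/runner'

spiders = ["custom_spider", "example_spider"] # names as registered in Kimurai.list
Kimurai::Runner.new(spiders, 2).run!          # run with 2 concurrent jobs
```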
data/lib/kimurai/template/config/schedule.rb
CHANGED
@@ -16,7 +16,7 @@ set :chronic_options, hours24: true
 # crawl "google_spider.com", output: "log/google_spider.com.log"
 # end
 def local_to_utc(time_string, zone:)
-  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(
+  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
 end
 
 # Note: by default Whenever exports cron commands with :environment == "production".
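The fix makes the helper actually parse the string it receives (the old line dropped the `time_string` argument). A usage sketch inside schedule.rb, following the commented `crawl` example above (time and zone values are illustrative):

```ruby
every :day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
  crawl "example_spider", output: "log/example_spider.log"
end
```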
data/lib/kimurai/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kimurai
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: 1.3.0
 platform: ruby
 authors:
 - Victor Afanasev
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-
+date: 2018-11-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor