RubyGems - kimurai_dynamic - Versions diffs - 1.4.1 - Mend

kimurai_dynamic 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

checksums.yaml +7 -0
data/.gitignore +11 -0
data/.travis.yml +5 -0
data/CHANGELOG.md +111 -0
data/Gemfile +6 -0
data/LICENSE.txt +21 -0
data/README.md +2038 -0
data/Rakefile +10 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/exe/kimurai +6 -0
data/kimurai.gemspec +48 -0
data/lib/kimurai/automation/deploy.yml +54 -0
data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
data/lib/kimurai/automation/setup.yml +45 -0
data/lib/kimurai/base/saver.rb +106 -0
data/lib/kimurai/base/storage.rb +54 -0
data/lib/kimurai/base.rb +330 -0
data/lib/kimurai/base_helper.rb +22 -0
data/lib/kimurai/browser_builder/mechanize_builder.rb +154 -0
data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +199 -0
data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +204 -0
data/lib/kimurai/browser_builder.rb +20 -0
data/lib/kimurai/capybara_configuration.rb +10 -0
data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
data/lib/kimurai/capybara_ext/mechanize/driver.rb +71 -0
data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
data/lib/kimurai/capybara_ext/selenium/driver.rb +34 -0
data/lib/kimurai/capybara_ext/session/config.rb +22 -0
data/lib/kimurai/capybara_ext/session.rb +249 -0
data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
data/lib/kimurai/cli/generator.rb +57 -0
data/lib/kimurai/cli.rb +183 -0
data/lib/kimurai/core_ext/array.rb +14 -0
data/lib/kimurai/core_ext/hash.rb +5 -0
data/lib/kimurai/core_ext/numeric.rb +19 -0
data/lib/kimurai/core_ext/string.rb +7 -0
data/lib/kimurai/pipeline.rb +33 -0
data/lib/kimurai/runner.rb +60 -0
data/lib/kimurai/template/.gitignore +18 -0
data/lib/kimurai/template/Gemfile +28 -0
data/lib/kimurai/template/README.md +3 -0
data/lib/kimurai/template/config/application.rb +37 -0
data/lib/kimurai/template/config/automation.yml +13 -0
data/lib/kimurai/template/config/boot.rb +22 -0
data/lib/kimurai/template/config/initializers/.keep +0 -0
data/lib/kimurai/template/config/schedule.rb +57 -0
data/lib/kimurai/template/db/.keep +0 -0
data/lib/kimurai/template/helpers/application_helper.rb +3 -0
data/lib/kimurai/template/lib/.keep +0 -0
data/lib/kimurai/template/log/.keep +0 -0
data/lib/kimurai/template/pipelines/saver.rb +11 -0
data/lib/kimurai/template/pipelines/validator.rb +24 -0
data/lib/kimurai/template/spiders/application_spider.rb +143 -0
data/lib/kimurai/template/tmp/.keep +0 -0
data/lib/kimurai/version.rb +3 -0
data/lib/kimurai.rb +54 -0
metadata +349 -0

data/lib/kimurai/runner.rb ADDED Viewed

@@ -0,0 +1,60 @@
+require 'pmap'
+module Kimurai
+  class Runner
+    attr_reader :jobs, :spiders, :session_info
+    def initialize(spiders, parallel_jobs)
+      @jobs = parallel_jobs
+      @spiders = spiders
+      @start_time = Time.now
+      @session_info = {
+        id: @start_time.to_i,
+        status: :processing,
+        start_time: @start_time,
+        stop_time: nil,
+        environment: Kimurai.env,
+        concurrent_jobs: @jobs,
+        spiders: @spiders
+      }
+      if time_zone = Kimurai.configuration.time_zone
+        Kimurai.time_zone = time_zone
+      end
+      ENV.store("SESSION_ID", @start_time.to_i.to_s)
+      ENV.store("RBCAT_COLORIZER", "false")
+    end
+    def run!(exception_on_fail: true)
+      puts ">>> Runner: started: #{session_info}"
+      if at_start_callback = Kimurai.configuration.runner_at_start_callback
+        at_start_callback.call(session_info)
+      end
+      running = true
+      spiders.peach_with_index(jobs) do |spider, i|
+        next unless running
+        puts "> Runner: started spider: #{spider}, index: #{i}"
+        pid = spawn("bundle", "exec", "kimurai", "crawl", spider, [:out, :err] => "log/#{spider}.log")
+        Process.wait pid
+        puts "< Runner: stopped spider: #{spider}, index: #{i}"
+      end
+    rescue StandardError, SignalException, SystemExit => e
+      running = false
+      session_info.merge!(status: :failed, error: e.inspect, stop_time: Time.now)
+      exception_on_fail ? raise(e) : [session_info, e]
+    else
+      session_info.merge!(status: :completed, stop_time: Time.now)
+    ensure
+      if at_stop_callback = Kimurai.configuration.runner_at_stop_callback
+        at_stop_callback.call(session_info)
+      end
+      puts "<<< Runner: stopped: #{session_info}"
+    end
+  end
+end

data/lib/kimurai/template/.gitignore ADDED Viewed

@@ -0,0 +1,18 @@
+/.bundle
+/cache
+/node_modules
+/log/*
+!/log/.keep
+/tmp/*
+!/tmp/.keep
+/db/*
+!/db/.keep
+.byebug_history
+*.swp
+.env
+capybara-*.png

data/lib/kimurai/template/Gemfile ADDED Viewed

@@ -0,0 +1,28 @@
+source 'https://rubygems.org'
+git_source(:github) { |repo| "https://github.com/#{repo}.git" }
+ruby '>= 2.5'
+# Framework
+gem 'kimurai', '~> 1.4'
+# Require files in directory and child directories recursively
+gem 'require_all'
+# Dotenv
+gem 'dotenv'
+# To debug spiders:
+group :development do
+  gem 'byebug', platforms: :mri
+  gem 'pry'
+end
+# If you want to save items to the database, require one of these gems:
+# gem 'sqlite3'
+# gem 'pg'
+# gem 'mysql2'
+# And use your preferred ORM/database connector:
+# gem 'activerecord', require: 'active_record'
+# gem 'sequel'

data/lib/kimurai/template/README.md ADDED Viewed

@@ -0,0 +1,3 @@
+# README
+New Kimurai project readme

data/lib/kimurai/template/config/application.rb ADDED Viewed

@@ -0,0 +1,37 @@
+Kimurai.configure do |config|
+  # Default logger has colored mode in development.
+  # If you would like to disable it, set `colorize_logger` to false.
+  # config.colorize_logger = false
+  # Logger level for default logger:
+  # config.log_level = :info
+  # Custom logger:
+  # config.logger = Logger.new(STDOUT)
+  # Custom time zone (for logs):
+  # config.time_zone = "UTC"
+  # config.time_zone = "Europe/Moscow"
+  # At start callback for a runner. Accepts argument with info as hash with
+  # keys: id, status, start_time, environment, concurrent_jobs, spiders list.
+  # For example, you can use this callback to send notification when runner was started:
+  # config.runner_at_start_callback = lambda do |info|
+  #   json = JSON.pretty_generate(info)
+  #   Sender.send_notification("Started session: #{json}")
+  # end
+  # At stop callback for a runner. Accepts argument with info as hash with
+  # all `runner_at_start_callback` keys plus additional `stop_time` key. Also `status` contains
+  # stop status of a runner (completed or failed).
+  # You can use this callback to send notification when runner has been stopped:
+  # config.runner_at_stop_callback = lambda do |info|
+  #   json = JSON.pretty_generate(info)
+  #   Sender.send_notification("Stopped session: #{json}")
+  # end
+  # Provide custom chrome binary path (default is any available chrome/chromium in the PATH):
+  # config.selenium_chrome_path = "/usr/bin/chromium-browser"
+  # Provide custom selenium chromedriver path (default is "/usr/local/bin/chromedriver"):
+  # config.chromedriver_path = "/usr/local/bin/chromedriver"
+end

data/lib/kimurai/template/config/automation.yml ADDED Viewed

@@ -0,0 +1,13 @@
+# software versions to install for `setup` command
+setup:
+  ruby: 2.5.1
+  # check latest here http://phantomjs.org/download.html
+  phantomjs: 2.1.1
+  # check latest here https://github.com/mozilla/geckodriver/releases/
+  geckodriver: 0.21.0
+  # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
+  chromedriver: 2.39
+# settings for deploy command, you can use cli options as well (--repo-url, --git-key-path)
+deploy:
+  # repo_url: git@bitbucket.org:username/repo_name.git
+  # repo_key_path: ~/.ssh/id_rsa

data/lib/kimurai/template/config/boot.rb ADDED Viewed

@@ -0,0 +1,22 @@
+# require project gems
+require 'bundler/setup'
+Bundler.require(:default, Kimurai.env)
+# require custom ENV variables located in .env file
+require 'dotenv/load'
+# require initializers
+Dir.glob(File.join("./config/initializers", "*.rb"), &method(:require))
+# require helpers
+Dir.glob(File.join("./helpers", "*.rb"), &method(:require))
+# require pipelines
+Dir.glob(File.join("./pipelines", "*.rb"), &method(:require))
+# require spiders recursively in the `spiders/` folder
+require_relative '../spiders/application_spider'
+require_all "spiders"
+# require Kimurai configuration
+require_relative 'application'

data/lib/kimurai/template/config/initializers/.keep ADDED Viewed

File without changes

data/lib/kimurai/template/config/schedule.rb ADDED Viewed

@@ -0,0 +1,57 @@
+### Settings ###
+require 'tzinfo'
+# Export current PATH to the cron
+env :PATH, ENV["PATH"]
+# Use 24 hour format when using `at:` option
+set :chronic_options, hours24: true
+# Use local_to_utc helper to setup execution time using your local timezone instead
+# of server's timezone (which is probably and should be UTC, to check run `$ timedatectl`).
+# Also maybe you'll want to set same timezone in kimurai as well (use `Kimurai.configuration.time_zone =` for that),
+# to have spiders logs in a specific time zone format.
+# Example usage of helper:
+# every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
+#   crawl "google_spider.com", output: "log/google_spider.com.log"
+# end
+def local_to_utc(time_string, zone:)
+  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
+end
+# Note: by default Whenever exports cron commands with :environment == "production".
+# Note: Whenever can only append log data to a log file (>>). If you want
+# to overwrite (>) log file before each run, pass lambda:
+# crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
+# Project job types: `crawl` runs `kimurai crawl`, `runner` runs `kimurai runner --jobs` (both via `bundle exec` from the project path)
+job_type :crawl,  "cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output"
+job_type :runner, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output"
+# Single file job type: runs the given file with plain `ruby`
+job_type :single, "cd :path && KIMURAI_ENV=:environment ruby :task :output"
+# Single file job type, run through `bundle exec`
+job_type :single_bundle, "cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output"
+### Schedule ###
+# Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
+# every 1.day do
+  # Example to schedule a single spider in the project:
+  # crawl "google_spider.com", output: "log/google_spider.com.log"
+  # Example to schedule all spiders in the project using the runner. Each spider will write
+  # its own output to the `log/spider_name.log` file (handled by the runner itself).
+  # Runner output will be written to the log/runner.log file.
+  # The numeric argument is the number of concurrent jobs:
+  # runner 3, output:"log/runner.log"
+  # Example to schedule single spider (without project):
+  # single "single_spider.rb", output: "single_spider.log"
+# end
+### How to set a cron schedule ###
+# Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
+# If you don't have whenever command, install the gem: `$ gem install whenever`.
+### How to cancel a schedule ###
+# Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.

data/lib/kimurai/template/db/.keep ADDED Viewed

File without changes

data/lib/kimurai/template/helpers/application_helper.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module ApplicationHelper
+  # Put custom methods here; they will be available in any spider
+end

data/lib/kimurai/template/lib/.keep ADDED Viewed

File without changes

data/lib/kimurai/template/log/.keep ADDED Viewed

File without changes

data/lib/kimurai/template/pipelines/saver.rb ADDED Viewed

@@ -0,0 +1,11 @@
+class Saver < Kimurai::Pipeline
+  def process_item(item, options: {})
+    # Here you can save the item to a database, send it to a remote API, or
+    # simply write it to a file using the `save_to` helper.
+    # To get the name of the current spider: `spider.class.name`
+    # save_to "db/#{spider.class.name}.json", item, format: :pretty_json
+    item
+  end
+end

data/lib/kimurai/template/pipelines/validator.rb ADDED Viewed

@@ -0,0 +1,24 @@
+class Validator < Kimurai::Pipeline
+  def process_item(item, options: {})
+    # Here you can validate the item and raise `DropItemError`
+    # if one of the validations fails. Examples:
+    # Check item sku for uniqueness using the built-in `unique?` helper:
+    # unless unique?(:sku, item[:sku])
+    #   raise DropItemError, "Item sku is not unique"
+    # end
+    # Drop the item if its title is shorter than 5 characters:
+    # if item[:title].size < 5
+    #   raise DropItemError, "Item title is short"
+    # end
+    # Drop the item if it doesn't contain any images:
+    # unless item[:images].present?
+    #   raise DropItemError, "Item images are not present"
+    # end
+    # Pass the item to the next pipeline (if it wasn't dropped)
+    item
+  end
+end

data/lib/kimurai/template/spiders/application_spider.rb ADDED Viewed

@@ -0,0 +1,143 @@
+# ApplicationSpider is the default base spider class. Set here the
+# default settings for all spiders inherited from ApplicationSpider.
+# To generate a new spider, run: `$ kimurai generate spider spider_name`
+class ApplicationSpider < Kimurai::Base
+  include ApplicationHelper
+  # Default engine for spiders (available engines: :mechanize, :poltergeist_phantomjs,
+  # :selenium_firefox, :selenium_chrome)
+  @engine = :poltergeist_phantomjs
+  # Pipelines list, in order.
+  # To process an item through the pipelines, pass it to the `send_item` method
+  @pipelines = [:validator, :saver]
+  # Default config. Set here options which are default for all spiders inherited
+  # from ApplicationSpider. A child class's config will be deep merged with this one
+  @config = {
+    # Custom headers, format: hash. Example: { "some header" => "some value", "another header" => "another value" }
+    # Works only for :mechanize and :poltergeist_phantomjs engines (Selenium doesn't allow setting/getting headers)
+    # headers: {},
+    # Custom User Agent, format: string or lambda.
+    # Use lambda if you want to rotate user agents before each run:
+    # user_agent: -> { ARRAY_OF_USER_AGENTS.sample }
+    # Works for all engines
+    # user_agent: "Mozilla/5.0 Firefox/61.0",
+    # Custom cookies, format: array of hashes.
+    # Format for a single cookie: { name: "cookie name", value: "cookie value", domain: ".example.com" }
+    # Works for all engines
+    # cookies: [],
+    # Proxy, format: string or lambda. Format of a proxy string: "ip:port:protocol:user:password"
+    # `protocol` can be http or socks5. User and password are optional.
+    # Use lambda if you want to rotate proxies before each run:
+    # proxy: -> { ARRAY_OF_PROXIES.sample }
+    # Works for all engines, but keep in mind that Selenium drivers don't support proxies
+    # with authorization. Also, Mechanize doesn't support socks5 proxy format (only http)
+    # proxy: "3.4.5.6:3128:http:user:pass",
+    # If enabled, browser will ignore any https errors. It's handy while using a proxy
+    # with self-signed SSL cert (for example Crawlera or Mitmproxy)
+    # It also allows visiting webpages with an expired SSL certificate.
+    # Works for all engines
+    ignore_ssl_errors: true,
+    # Custom window size, works for all engines
+    # window_size: [1366, 768],
+    # Skip images downloading if true, works for all engines
+    disable_images: true,
+    # Selenium engines only: headless mode, `:native` or `:virtual_display` (default is :native)
+    # Although native mode has a better performance, virtual display mode
+    # sometimes can be useful. For example, some websites can detect (and block)
+    # headless chrome, so you can use virtual_display mode instead
+    # headless_mode: :native,
+    # This option tells the browser not to use a proxy for the provided list of domains or IP addresses.
+    # Format: array of strings. Works only for :selenium_firefox and :selenium_chrome
+    # proxy_bypass_list: [],
+    # Option to provide custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
+    # ssl_cert_path: "path/to/ssl_cert",
+    # Inject some JavaScript code to the browser.
+    # Format: array of strings, where each string is a path to JS file.
+    # Works only for poltergeist_phantomjs engine (Selenium doesn't support JS code injection)
+    # extensions: ["lib/code_to_inject.js"],
+    # Automatically skip duplicated (already visited) urls when using `request_to` method.
+    # Possible values: `true` or `hash` with options.
+    # In case of `true`, all visited urls will be added to the storage's scope `:requests_urls`
+    # and if the url is already in this scope, the request will be skipped.
+    # You can configure this setting by providing additional options as hash:
+    # `skip_duplicate_requests: { scope: :custom_scope, check_only: true }`, where:
+    # `scope:` - use a custom scope instead of `:requests_urls`
+    # `check_only:` - if true, then the scope will only be checked for the url; the url will not
+    # be added to the scope if the scope doesn't contain it.
+    # works for all drivers
+    # skip_duplicate_requests: true,
+    # Automatically skip provided errors while requesting a page.
+    # If raised error matches one of the errors in the list, then this error will be caught,
+    # and request will be skipped.
+    # It is a good idea to skip errors like NotFound(404), etc.
+    # Format: array where elements are error classes and/or hashes. You can use hash format
+    # for more flexibility: `{ error: "RuntimeError", message: "404 => Net::HTTPNotFound" }`.
+    # Provided `message:` will be compared with a full error message using `String#include?`. Also
+    # you can use regex instead: `{ error: "RuntimeError", message: /404|403/ }`.
+    # skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }],
+    # Automatically retry provided errors with a few attempts while requesting a page.
+    # If raised error matches one of the errors in the list, then this error will be caught
+    # and the request will be processed again after a delay. There are 3 attempts:
+    # first: delay 15 sec, second: delay 30 sec, third: delay 45 sec.
+    # If after 3 attempts there is still an exception, then the exception will be raised.
+    # It is a good idea to retry errors like `ReadTimeout`, `HTTPBadGateway`, etc.
+    # Format: same as for the `skip_request_errors` option.
+    # retry_request_errors: [Net::ReadTimeout],
+    # Handle page encoding while parsing html response using Nokogiri. There are two modes:
+    # Auto (`:auto`) (try to fetch correct encoding from <meta http-equiv="Content-Type"> or <meta charset> tags)
+    # Manual (set the required encoding yourself), example: `encoding: "GB2312"`
+    # By default this option is unset.
+    # encoding: nil,
+    # Restart browser if one of the options is true:
+    restart_if: {
+      # Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
+      # memory_limit: 350_000,
+      # Restart browser if provided requests limit is exceeded (works for all engines)
+      # requests_limit: 100
+    },
+    # Perform several actions before each request:
+    before_request: {
+      # Change proxy before each request. The `proxy:` option above must be present
+      # and be a lambda. Works only for poltergeist and mechanize engines
+      # (Selenium doesn't support proxy rotation).
+      # change_proxy: true,
+      # Change user agent before each request. The `user_agent:` option above must be present
+      # and be a lambda. Works only for poltergeist and mechanize engines
+      # (Selenium doesn't support getting/setting headers).
+      # change_user_agent: true,
+      # Clear all cookies before each request, works for all engines
+      # clear_cookies: true,
+      # If you want to clear all cookies + set custom cookies (`cookies:` option above must be present)
+      # use this option instead (works for all engines)
+      # clear_and_set_cookies: true,
+      # Global option to set delay between requests.
+      # Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range,
+      # delay number will be chosen randomly for each request: `rand (2..5) # => 3`
+      # delay: 1..3
+    }
+  }
+end

data/lib/kimurai/template/tmp/.keep ADDED Viewed

File without changes

data/lib/kimurai/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Kimurai
+  VERSION = "1.4.1" # version string of this gem release
+end

data/lib/kimurai.rb ADDED Viewed

@@ -0,0 +1,54 @@
+require 'ostruct'
+require 'logger'
+require 'json'
+require 'active_support'
+require 'active_support/core_ext'
+require 'rbcat'
+require_relative 'kimurai/version'
+require_relative 'kimurai/core_ext/numeric'
+require_relative 'kimurai/core_ext/string'
+require_relative 'kimurai/core_ext/array'
+require_relative 'kimurai/core_ext/hash'
+require_relative 'kimurai/browser_builder'
+require_relative 'kimurai/base_helper'
+require_relative 'kimurai/pipeline'
+require_relative 'kimurai/base'
+module Kimurai
+  class << self
+    def configuration
+      @configuration ||= OpenStruct.new
+    end
+    def configure
+      yield(configuration)
+    end
+    def env
+      ENV.fetch("KIMURAI_ENV") { "development" }
+    end
+    def time_zone
+      ENV["TZ"]
+    end
+    def time_zone=(value)
+      ENV.store("TZ", value)
+    end
+    def list
+      Base.descendants.map do |klass|
+        next unless klass.name
+        [klass.name, klass]
+      end.compact.to_h
+    end
+    def find_by_name(name)
+      return unless name
+      Base.descendants.find { |klass| klass.name == name }
+    end
+  end
+end