kimurai_dynamic 1.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.travis.yml +5 -0
- data/CHANGELOG.md +111 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +2038 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/kimurai +6 -0
- data/kimurai.gemspec +48 -0
- data/lib/kimurai/automation/deploy.yml +54 -0
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
- data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
- data/lib/kimurai/automation/setup.yml +45 -0
- data/lib/kimurai/base/saver.rb +106 -0
- data/lib/kimurai/base/storage.rb +54 -0
- data/lib/kimurai/base.rb +330 -0
- data/lib/kimurai/base_helper.rb +22 -0
- data/lib/kimurai/browser_builder/mechanize_builder.rb +154 -0
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +199 -0
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +204 -0
- data/lib/kimurai/browser_builder.rb +20 -0
- data/lib/kimurai/capybara_configuration.rb +10 -0
- data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +71 -0
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/kimurai/capybara_ext/selenium/driver.rb +34 -0
- data/lib/kimurai/capybara_ext/session/config.rb +22 -0
- data/lib/kimurai/capybara_ext/session.rb +249 -0
- data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
- data/lib/kimurai/cli/generator.rb +57 -0
- data/lib/kimurai/cli.rb +183 -0
- data/lib/kimurai/core_ext/array.rb +14 -0
- data/lib/kimurai/core_ext/hash.rb +5 -0
- data/lib/kimurai/core_ext/numeric.rb +19 -0
- data/lib/kimurai/core_ext/string.rb +7 -0
- data/lib/kimurai/pipeline.rb +33 -0
- data/lib/kimurai/runner.rb +60 -0
- data/lib/kimurai/template/.gitignore +18 -0
- data/lib/kimurai/template/Gemfile +28 -0
- data/lib/kimurai/template/README.md +3 -0
- data/lib/kimurai/template/config/application.rb +37 -0
- data/lib/kimurai/template/config/automation.yml +13 -0
- data/lib/kimurai/template/config/boot.rb +22 -0
- data/lib/kimurai/template/config/initializers/.keep +0 -0
- data/lib/kimurai/template/config/schedule.rb +57 -0
- data/lib/kimurai/template/db/.keep +0 -0
- data/lib/kimurai/template/helpers/application_helper.rb +3 -0
- data/lib/kimurai/template/lib/.keep +0 -0
- data/lib/kimurai/template/log/.keep +0 -0
- data/lib/kimurai/template/pipelines/saver.rb +11 -0
- data/lib/kimurai/template/pipelines/validator.rb +24 -0
- data/lib/kimurai/template/spiders/application_spider.rb +143 -0
- data/lib/kimurai/template/tmp/.keep +0 -0
- data/lib/kimurai/version.rb +3 -0
- data/lib/kimurai.rb +54 -0
- metadata +349 -0
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'pmap'
|
2
|
+
|
3
|
+
module Kimurai
  # Runs a list of spiders as separate `kimurai crawl` processes, several at
  # a time, and keeps a `session_info` hash describing the whole run.
  class Runner
    attr_reader :jobs, :spiders, :session_info

    # @param spiders [Array] spider names; each is passed to `kimurai crawl`
    # @param parallel_jobs [Integer] value handed to `peach_with_index`
    #   (presumably the maximum number of spiders crawled concurrently)
    def initialize(spiders, parallel_jobs)
      @jobs = parallel_jobs
      @spiders = spiders
      @start_time = Time.now

      @session_info = {
        id: @start_time.to_i,
        status: :processing,
        start_time: @start_time,
        stop_time: nil,
        environment: Kimurai.env,
        concurrent_jobs: @jobs,
        spiders: @spiders
      }

      # Apply the configured time zone (if any) before spiders are spawned.
      time_zone = Kimurai.configuration.time_zone
      Kimurai.time_zone = time_zone if time_zone

      # Stored in ENV, so processes spawned by #run! inherit these values.
      ENV.store("SESSION_ID", @start_time.to_i.to_s)
      ENV.store("RBCAT_COLORIZER", "false")
    end

    # Crawls every spider, writing each spider's stdout and stderr to
    # `log/<spider>.log`. The configured `runner_at_start_callback` and
    # `runner_at_stop_callback` (if set) are called with `session_info`.
    #
    # @param exception_on_fail [Boolean] re-raise the caught error when true
    # @return [Hash] the updated `session_info` on success
    # @return [Array] `[session_info, error]` on failure when
    #   `exception_on_fail` is false
    def run!(exception_on_fail: true)
      puts ">>> Runner: started: #{session_info}"
      at_start_callback = Kimurai.configuration.runner_at_start_callback
      at_start_callback.call(session_info) if at_start_callback

      # Flipped to false in the rescue clause so that iterations which have
      # not started yet are skipped once the run has failed.
      running = true
      spiders.peach_with_index(jobs) do |spider, i|
        next unless running

        puts "> Runner: started spider: #{spider}, index: #{i}"
        pid = spawn("bundle", "exec", "kimurai", "crawl", spider, [:out, :err] => "log/#{spider}.log")
        Process.wait pid

        puts "< Runner: stopped spider: #{spider}, index: #{i}"
      end
    rescue StandardError, SignalException, SystemExit => e
      running = false

      session_info.merge!(status: :failed, error: e.inspect, stop_time: Time.now)
      exception_on_fail ? raise(e) : [session_info, e]
    else
      session_info.merge!(status: :completed, stop_time: Time.now)
    ensure
      # Runs on success and failure alike.
      at_stop_callback = Kimurai.configuration.runner_at_stop_callback
      at_stop_callback.call(session_info) if at_stop_callback
      puts "<<< Runner: stopped: #{session_info}"
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
git_source(:github) { |repo| "https://github.com/#{repo}.git" }
|
3
|
+
|
4
|
+
ruby '>= 2.5'
|
5
|
+
|
6
|
+
# Framework
|
7
|
+
gem 'kimurai', '~> 1.4'
|
8
|
+
|
9
|
+
# Require files in directory and child directories recursively
|
10
|
+
gem 'require_all'
|
11
|
+
|
12
|
+
# Dotenv
|
13
|
+
gem 'dotenv'
|
14
|
+
|
15
|
+
# To debug spiders:
|
16
|
+
group :development do
|
17
|
+
gem 'byebug', platforms: :mri
|
18
|
+
gem 'pry'
|
19
|
+
end
|
20
|
+
|
21
|
+
# If you want to save items to the database, require one of these gems:
|
22
|
+
# gem 'sqlite3'
|
23
|
+
# gem 'pg'
|
24
|
+
# gem 'mysql2'
|
25
|
+
|
26
|
+
# And use your preferred ORM/database connector:
|
27
|
+
# gem 'activerecord', require: 'active_record'
|
28
|
+
# gem 'sequel'
|
@@ -0,0 +1,37 @@
|
|
1
|
+
Kimurai.configure do |config|
|
2
|
+
# Default logger has colored mode in development.
|
3
|
+
# If you would like to disable it, set `colorize_logger` to false.
|
4
|
+
# config.colorize_logger = false
|
5
|
+
|
6
|
+
# Logger level for default logger:
|
7
|
+
# config.log_level = :info
|
8
|
+
|
9
|
+
# Custom logger:
|
10
|
+
# config.logger = Logger.new(STDOUT)
|
11
|
+
|
12
|
+
# Custom time zone (for logs):
|
13
|
+
# config.time_zone = "UTC"
|
14
|
+
# config.time_zone = "Europe/Moscow"
|
15
|
+
|
16
|
+
# At start callback for a runner. Accepts argument with info as hash with
|
17
|
+
# keys: id, status, start_time, environment, concurrent_jobs, spiders list.
|
18
|
+
# For example, you can use this callback to send a notification when the runner has started:
|
19
|
+
# config.runner_at_start_callback = lambda do |info|
|
20
|
+
# json = JSON.pretty_generate(info)
|
21
|
+
# Sender.send_notification("Started session: #{json}")
|
22
|
+
# end
|
23
|
+
|
24
|
+
# At stop callback for a runner. Accepts argument with info as hash with
|
25
|
+
# all `runner_at_start_callback` keys plus additional `stop_time` key. Also `status` contains
|
26
|
+
# stop status of a runner (completed or failed).
|
27
|
+
# You can use this callback to send a notification when the runner has stopped:
|
28
|
+
# config.runner_at_stop_callback = lambda do |info|
|
29
|
+
# json = JSON.pretty_generate(info)
|
30
|
+
# Sender.send_notification("Stopped session: #{json}")
|
31
|
+
# end
|
32
|
+
|
33
|
+
# Provide custom chrome binary path (default is any available chrome/chromium in the PATH):
|
34
|
+
# config.selenium_chrome_path = "/usr/bin/chromium-browser"
|
35
|
+
# Provide custom selenium chromedriver path (default is "/usr/local/bin/chromedriver"):
|
36
|
+
# config.chromedriver_path = "/usr/local/bin/chromedriver"
|
37
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# software versions to install for `setup` command
|
2
|
+
setup:
|
3
|
+
ruby: 2.5.1
|
4
|
+
# check latest here http://phantomjs.org/download.html
|
5
|
+
phantomjs: 2.1.1
|
6
|
+
# check latest here https://github.com/mozilla/geckodriver/releases/
|
7
|
+
geckodriver: 0.21.0
|
8
|
+
# check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
|
9
|
+
chromedriver: 2.39
|
10
|
+
# settings for deploy command, you can use cli options as well (--repo-url, --git-key-path)
|
11
|
+
deploy:
|
12
|
+
# repo_url: git@bitbucket.org:username/repo_name.git
|
13
|
+
# repo_key_path: ~/.ssh/id_rsa
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# require project gems
|
2
|
+
require 'bundler/setup'
|
3
|
+
Bundler.require(:default, Kimurai.env)
|
4
|
+
|
5
|
+
# require custom ENV variables located in .env file
|
6
|
+
require 'dotenv/load'
|
7
|
+
|
8
|
+
# require initializers
|
9
|
+
Dir.glob(File.join("./config/initializers", "*.rb"), &method(:require))
|
10
|
+
|
11
|
+
# require helpers
|
12
|
+
Dir.glob(File.join("./helpers", "*.rb"), &method(:require))
|
13
|
+
|
14
|
+
# require pipelines
|
15
|
+
Dir.glob(File.join("./pipelines", "*.rb"), &method(:require))
|
16
|
+
|
17
|
+
# require spiders recursively in the `spiders/` folder
|
18
|
+
require_relative '../spiders/application_spider'
|
19
|
+
require_all "spiders"
|
20
|
+
|
21
|
+
# require Kimurai configuration
|
22
|
+
require_relative 'application'
|
File without changes
|
@@ -0,0 +1,57 @@
|
|
1
|
+
### Settings ###
|
2
|
+
require 'tzinfo'
|
3
|
+
|
4
|
+
# Export current PATH to the cron
|
5
|
+
env :PATH, ENV["PATH"]
|
6
|
+
|
7
|
+
# Use 24 hour format when using `at:` option
|
8
|
+
set :chronic_options, hours24: true
|
9
|
+
|
10
|
+
# Use local_to_utc helper to setup execution time using your local timezone instead
|
11
|
+
# of server's timezone (which is probably and should be UTC, to check run `$ timedatectl`).
|
12
|
+
# Also maybe you'll want to set same timezone in kimurai as well (use `Kimurai.configuration.time_zone =` for that),
|
13
|
+
# to have spiders logs in a specific time zone format.
|
14
|
+
# Example usage of helper:
|
15
|
+
# every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
|
16
|
+
# crawl "google_spider.com", output: "log/google_spider.com.log"
|
17
|
+
# end
|
18
|
+
# Parses `time_string` with `Time.parse` and passes the result to the
# `local_to_utc` method of the TZInfo time zone named by `zone`.
#
# time_string - String with the time to convert, e.g. "7:00"
# zone:       - TZInfo time zone identifier, e.g. "Europe/Moscow"
def local_to_utc(time_string, zone:)
  timezone = TZInfo::Timezone.get(zone)
  timezone.local_to_utc(Time.parse(time_string))
end
|
21
|
+
|
22
|
+
# Note: by default Whenever exports cron commands with :environment == "production".
|
23
|
+
# Note: Whenever can only append log data to a log file (>>). If you want
|
24
|
+
# to overwrite (>) log file before each run, pass lambda:
|
25
|
+
# crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
|
26
|
+
|
27
|
+
# Project job types
|
28
|
+
job_type :crawl, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output"
|
29
|
+
job_type :runner, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output"
|
30
|
+
|
31
|
+
# Single file job type
|
32
|
+
job_type :single, "cd :path && KIMURAI_ENV=:environment ruby :task :output"
|
33
|
+
# Single with bundle exec
|
34
|
+
job_type :single_bundle, "cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output"
|
35
|
+
|
36
|
+
### Schedule ###
|
37
|
+
# Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
|
38
|
+
# every 1.day do
|
39
|
+
# Example to schedule a single spider in the project:
|
40
|
+
# crawl "google_spider.com", output: "log/google_spider.com.log"
|
41
|
+
|
42
|
+
# Example to schedule all spiders in the project using runner. Each spider will write
|
43
|
+
# its own output to the `log/spider_name.log` file (handled by the runner itself).
|
44
|
+
# Runner output will be written to log/runner.log file.
|
45
|
+
# The numeric argument is the count of concurrent jobs:
|
46
|
+
# runner 3, output:"log/runner.log"
|
47
|
+
|
48
|
+
# Example to schedule single spider (without project):
|
49
|
+
# single "single_spider.rb", output: "single_spider.log"
|
50
|
+
# end
|
51
|
+
|
52
|
+
### How to set a cron schedule ###
|
53
|
+
# Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
|
54
|
+
# If you don't have whenever command, install the gem: `$ gem install whenever`.
|
55
|
+
|
56
|
+
### How to cancel a schedule ###
|
57
|
+
# Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# Pipeline template for persisting items. As generated, it returns every
# item unchanged.
class Saver < Kimurai::Pipeline
  # item    - the item to save
  # options - Hash of options (not used by this template)
  #
  # Returns the item.
  def process_item(item, options: {})
    # Here you can save the item to a database, send it to a remote API or
    # simply save it to a file using the `save_to` helper:

    # To get the name of the current spider: `spider.class.name`
    # save_to "db/#{spider.class.name}.json", item, format: :pretty_json

    item
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Pipeline template for validating items. As generated, it performs no
# checks and returns every item unchanged.
class Validator < Kimurai::Pipeline
  # item    - the item to validate
  # options - Hash of options (not used by this template)
  #
  # Returns the item.
  def process_item(item, options: {})
    # Here you can validate the item and raise `DropItemError`
    # if one of the validations fails. Examples:

    # Check the item sku for uniqueness using the built-in `unique?` helper:
    # unless unique?(:sku, item[:sku])
    #   raise DropItemError, "Item sku is not unique"
    # end

    # Drop the item if its title is shorter than 5 symbols:
    # if item[:title].size < 5
    #   raise DropItemError, "Item title is short"
    # end

    # Drop the item if it doesn't contain any images:
    # unless item[:images].present?
    #   raise DropItemError, "Item images are not present"
    # end

    # Pass the item to the next pipeline (if it wasn't dropped)
    item
  end
end
|
@@ -0,0 +1,143 @@
|
|
1
|
+
# ApplicationSpider is a default base spider class. You can set here
|
2
|
+
# default settings for all spiders inherited from ApplicationSpider.
|
3
|
+
# To generate a new spider, run: `$ kimurai generate spider spider_name`
|
4
|
+
|
5
|
+
class ApplicationSpider < Kimurai::Base
  include ApplicationHelper

  # Default engine for spiders (available engines: :mechanize, :poltergeist_phantomjs,
  # :selenium_firefox, :selenium_chrome)
  @engine = :poltergeist_phantomjs

  # Pipelines list, in order.
  # To process an item through the pipelines, pass it to the `send_item` method
  @pipelines = [:validator, :saver]

  # Default config. Set here options which are default for all spiders inherited
  # from ApplicationSpider. A child class's config will be deep merged with this one
  @config = {
    # Custom headers, format: hash. Example: { "some header" => "some value", "another header" => "another value" }
    # Works only for :mechanize and :poltergeist_phantomjs engines (Selenium doesn't allow to set/get headers)
    # headers: {},

    # Custom User Agent, format: string or lambda.
    # Use lambda if you want to rotate user agents before each run:
    # user_agent: -> { ARRAY_OF_USER_AGENTS.sample }
    # Works for all engines
    # user_agent: "Mozilla/5.0 Firefox/61.0",

    # Custom cookies, format: array of hashes.
    # Format for a single cookie: { name: "cookie name", value: "cookie value", domain: ".example.com" }
    # Works for all engines
    # cookies: [],

    # Proxy, format: string or lambda. Format of a proxy string: "ip:port:protocol:user:password"
    # `protocol` can be http or socks5. User and password are optional.
    # Use lambda if you want to rotate proxies before each run:
    # proxy: -> { ARRAY_OF_PROXIES.sample }
    # Works for all engines, but keep in mind that Selenium drivers don't support proxies
    # with authorization. Also, Mechanize doesn't support socks5 proxy format (only http)
    # proxy: "3.4.5.6:3128:http:user:pass",

    # If enabled, browser will ignore any https errors. It's handy while using a proxy
    # with self-signed SSL cert (for example Crawlera or Mitmproxy)
    # Also, it will allow to visit webpages with an expired SSL certificate.
    # Works for all engines
    ignore_ssl_errors: true,

    # Custom window size, works for all engines
    # window_size: [1366, 768],

    # Skip images downloading if true, works for all engines
    disable_images: true,

    # Selenium engines only: headless mode, `:native` or `:virtual_display` (default is :native)
    # Although native mode has a better performance, virtual display mode
    # sometimes can be useful. For example, some websites can detect (and block)
    # headless chrome, so you can use virtual_display mode instead
    # headless_mode: :native,

    # This option tells the browser not to use a proxy for the provided list of domains or IP addresses.
    # Format: array of strings. Works only for :selenium_firefox and selenium_chrome
    # proxy_bypass_list: [],

    # Option to provide custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
    # ssl_cert_path: "path/to/ssl_cert",

    # Inject some JavaScript code to the browser.
    # Format: array of strings, where each string is a path to JS file.
    # Works only for poltergeist_phantomjs engine (Selenium doesn't support JS code injection)
    # extensions: ["lib/code_to_inject.js"],

    # Automatically skip duplicated (already visited) urls when using `request_to` method.
    # Possible values: `true` or `hash` with options.
    # In case of `true`, all visited urls will be added to the storage's scope `:requests_urls`
    # and if the url is already in this scope, the request will be skipped.
    # You can configure this setting by providing additional options as hash:
    # `skip_duplicate_requests: { scope: :custom_scope, check_only: true }`, where:
    # `scope:` - use a custom scope instead of `:requests_urls`
    # `check_only:` - if true, then scope will be only checked for url, url will not
    # be added to the scope if the scope doesn't contain it.
    # works for all drivers
    # skip_duplicate_requests: true,

    # Automatically skip provided errors while requesting a page.
    # If raised error matches one of the errors in the list, then this error will be caught,
    # and request will be skipped.
    # It is a good idea to skip errors like NotFound(404), etc.
    # Format: array where elements are error classes or/and hashes. You can use hash format
    # for more flexibility: `{ error: "RuntimeError", message: "404 => Net::HTTPNotFound" }`.
    # Provided `message:` will be compared with a full error message using `String#include?`. Also
    # you can use regex instead: `{ error: "RuntimeError", message: /404|403/ }`.
    # skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }],

    # Automatically retry provided errors with a few attempts while requesting a page.
    # If raised error matches one of the errors in the list, then this error will be caught
    # and the request will be processed again within a delay. There are 3 attempts:
    # first: delay 15 sec, second: delay 30 sec, third: delay 45 sec.
    # If after 3 attempts there is still an exception, then the exception will be raised.
    # It is a good idea to retry errors like `ReadTimeout`, `HTTPBadGateway`, etc.
    # Format: same as for the `skip_request_errors` option.
    # retry_request_errors: [Net::ReadTimeout],

    # Handle page encoding while parsing html response using Nokogiri. There are two modes:
    # Auto (`:auto`) (try to fetch correct encoding from <meta http-equiv="Content-Type"> or <meta charset> tags)
    # Set required encoding manually, example: `encoding: "GB2312"` (Set required encoding manually)
    # By default this option is unset.
    # encoding: nil,

    # Restart browser if one of the options is true:
    restart_if: {
      # Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
      # memory_limit: 350_000,

      # Restart browser if provided requests limit is exceeded (works for all engines)
      # requests_limit: 100
    },

    # Perform several actions before each request:
    before_request: {
      # Change proxy before each request. The `proxy:` option above should be present
      # and have lambda format. Works only for poltergeist and mechanize engines
      # (Selenium doesn't support proxy rotation).
      # change_proxy: true,

      # Change user agent before each request. The `user_agent:` option above should be present
      # and have lambda format. Works only for poltergeist and mechanize engines
      # (Selenium doesn't support getting/setting headers).
      # change_user_agent: true,

      # Clear all cookies before each request, works for all engines
      # clear_cookies: true,

      # If you want to clear all cookies + set custom cookies (`cookies:` option above should be present)
      # use this option instead (works for all engines)
      # clear_and_set_cookies: true,

      # Global option to set delay between requests.
      # Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range,
      # delay number will be chosen randomly for each request: `rand (2..5) # => 3`
      # delay: 1..3
    }
  }
end
|
File without changes
|
data/lib/kimurai.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'ostruct'
|
2
|
+
require 'logger'
|
3
|
+
require 'json'
|
4
|
+
require 'active_support'
|
5
|
+
require 'active_support/core_ext'
|
6
|
+
require 'rbcat'
|
7
|
+
|
8
|
+
require_relative 'kimurai/version'
|
9
|
+
|
10
|
+
require_relative 'kimurai/core_ext/numeric'
|
11
|
+
require_relative 'kimurai/core_ext/string'
|
12
|
+
require_relative 'kimurai/core_ext/array'
|
13
|
+
require_relative 'kimurai/core_ext/hash'
|
14
|
+
|
15
|
+
require_relative 'kimurai/browser_builder'
|
16
|
+
require_relative 'kimurai/base_helper'
|
17
|
+
require_relative 'kimurai/pipeline'
|
18
|
+
require_relative 'kimurai/base'
|
19
|
+
|
20
|
+
# Module-level API: global configuration, current environment, time zone
# (kept in ENV["TZ"]) and lookup of spider classes descending from Base.
module Kimurai
  class << self
    # Returns the settings object, creating an empty OpenStruct on first use.
    def configuration
      @configuration ||= OpenStruct.new
    end

    # Yields the settings object so the caller can assign options on it.
    def configure
      yield(configuration)
    end

    # Returns the value of ENV["KIMURAI_ENV"], or "development" when unset.
    def env
      ENV.fetch("KIMURAI_ENV") { "development" }
    end

    # Returns the current value of ENV["TZ"] (nil when unset).
    def time_zone
      ENV["TZ"]
    end

    # Stores the given value in ENV["TZ"].
    def time_zone=(value)
      ENV.store("TZ", value)
    end

    # Returns a Hash mapping class name => class for every descendant of
    # Base; classes without a name (anonymous classes) are left out.
    def list
      named = Base.descendants.select { |klass| klass.name }
      named.map { |klass| [klass.name, klass] }.to_h
    end

    # Returns the descendant of Base whose class name equals `name`,
    # or nil when `name` is nil/false or nothing matches.
    def find_by_name(name)
      return unless name

      Base.descendants.find { |klass| klass.name == name }
    end
  end
end
|