kimurai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +1923 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/kimurai +6 -0
- data/kimurai.gemspec +48 -0
- data/lib/kimurai.rb +53 -0
- data/lib/kimurai/automation/deploy.yml +54 -0
- data/lib/kimurai/automation/setup.yml +44 -0
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
- data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
- data/lib/kimurai/base.rb +249 -0
- data/lib/kimurai/base/simple_saver.rb +98 -0
- data/lib/kimurai/base/uniq_checker.rb +22 -0
- data/lib/kimurai/base_helper.rb +22 -0
- data/lib/kimurai/browser_builder.rb +32 -0
- data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
- data/lib/kimurai/capybara_configuration.rb +10 -0
- data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
- data/lib/kimurai/capybara_ext/session.rb +150 -0
- data/lib/kimurai/capybara_ext/session/config.rb +18 -0
- data/lib/kimurai/cli.rb +157 -0
- data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
- data/lib/kimurai/cli/generator.rb +57 -0
- data/lib/kimurai/core_ext/array.rb +14 -0
- data/lib/kimurai/core_ext/numeric.rb +19 -0
- data/lib/kimurai/core_ext/string.rb +7 -0
- data/lib/kimurai/pipeline.rb +25 -0
- data/lib/kimurai/runner.rb +72 -0
- data/lib/kimurai/template/.gitignore +18 -0
- data/lib/kimurai/template/.ruby-version +1 -0
- data/lib/kimurai/template/Gemfile +20 -0
- data/lib/kimurai/template/README.md +3 -0
- data/lib/kimurai/template/config/application.rb +32 -0
- data/lib/kimurai/template/config/automation.yml +13 -0
- data/lib/kimurai/template/config/boot.rb +22 -0
- data/lib/kimurai/template/config/initializers/.keep +0 -0
- data/lib/kimurai/template/config/schedule.rb +57 -0
- data/lib/kimurai/template/db/.keep +0 -0
- data/lib/kimurai/template/helpers/application_helper.rb +3 -0
- data/lib/kimurai/template/lib/.keep +0 -0
- data/lib/kimurai/template/log/.keep +0 -0
- data/lib/kimurai/template/pipelines/saver.rb +11 -0
- data/lib/kimurai/template/pipelines/validator.rb +24 -0
- data/lib/kimurai/template/spiders/application_spider.rb +104 -0
- data/lib/kimurai/template/tmp/.keep +0 -0
- data/lib/kimurai/version.rb +3 -0
- metadata +349 -0
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'cliver'
|
2
|
+
|
3
|
+
module Kimurai
  class CLI
    # Assembles the argument list for an `ansible-playbook` run of one of the
    # playbooks shipped inside the gem (`lib/kimurai/automation/<name>.yml`)
    # against a single host.
    class AnsibleCommandBuilder
      # @param user_host [String] target given as `user@host` or just `host`
      # @param options [Hash] options hash; the keys read here are "port",
      #   "local", "ask-sudo", "ask-auth-pass" and "ssh-key-path"
      # @param playbook [String] playbook name, i.e. the file name without `.yml`
      # @param vars [Hash] variables forwarded to Ansible via `--extra-vars`
      def initialize(user_host, options, playbook:, vars: {})
        @user_host = user_host
        @options = options
        @playbook = playbook
        @vars = vars
      end

      # Builds the command as an array of arguments (suitable for `spawn`/`system`).
      #
      # Variables missing from `vars` are filled in from the section of
      # `config/automation.yml` (relative to the current directory) named
      # after the playbook. Variables whose value is blank are not passed on.
      #
      # @return [Array<String>] the executable name followed by its arguments
      # @raise [RuntimeError] when `ansible-playbook` is not installed, or when
      #   "ask-auth-pass" is requested and `sshpass` is not installed
      def get
        unless Cliver.detect("ansible-playbook")
          raise "Can't find `ansible-playbook` executable, to install: " \
            "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
        end

        # `user` is nil when no "user@" prefix was given.
        user = @user_host[/(.*?)\@/, 1]
        host = @user_host[/\@(.+)/, 1] || @user_host
        # The trailing comma makes Ansible treat the value as a host list
        # rather than a path to an inventory file.
        inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"

        gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir
        playbook_path = gem_dir + "/lib/kimurai/automation/" + "#{@playbook}.yml"

        command = [
          "ansible-playbook", playbook_path,
          "--inventory", inventory,
          "--ssh-extra-args", "-oForwardAgent=yes",
          "--connection", @options["local"] ? "local" : "smart",
          "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
        ]

        extra_vars.each do |key, value|
          next unless value.present?
          command.push "--extra-vars", "#{key}=#{value}"
        end

        command.push "--user", user if user
        command.push "--ask-become-pass" if @options["ask-sudo"]

        if @options["ask-auth-pass"]
          unless Cliver.detect("sshpass")
            raise "Can't find `sshpass` executable for password authentication, to install: " \
              "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
          end

          command.push "--ask-pass"
        end

        if ssh_key_path = @options["ssh-key-path"]
          command.push "--private-key", ssh_key_path
        end

        command
      end

      private

      # Returns the variables given to the constructor, completed with the
      # defaults found in `config/automation.yml` under the playbook's key.
      # Explicitly given (truthy) values always win. Works on a copy so the
      # hash passed in by the caller is never modified.
      def extra_vars
        merged = @vars.dup
        return merged unless File.exist?("config/automation.yml")

        # Loaded lazily: YAML is only needed when a project config exists.
        require 'yaml'
        config = YAML.load_file("config/automation.yml")
        # An empty file or a non-mapping document has no usable section.
        section = config[@playbook] if config.is_a?(Hash)
        return merged unless section.is_a?(Hash)

        section.each { |key, value| merged[key] ||= value }
        merged
      end
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module Kimurai
  class CLI
    # Thor-based generator that scaffolds a new project, a single spider file
    # or a `schedule.rb` file.
    class Generator < Thor::Group
      include Thor::Actions

      # Root directory for template lookups: the parent of the directory this
      # file lives in (the directory that contains `template/`).
      def self.source_root
        File.dirname(File.expand_path('..', __FILE__))
      end

      # Copies the bundled `template` directory to `project_name`, then runs
      # `bundle install` and `git init` inside the new directory.
      #
      # @param project_name [String] destination directory
      def generate_project(project_name)
        directory "template", project_name
        inside(project_name) do
          run "bundle install"
          run "git init"
        end
      end

      # Creates a spider skeleton.
      #
      # Inside a project the file goes to `spiders/<name>.rb` and the class
      # inherits from `ApplicationSpider`. Outside a project a self-contained
      # script is written to `./<name>.rb`: it requires `kimurai`, inherits
      # from `Kimurai::Base`, sets the `:mechanize` engine and calls `crawl!`.
      #
      # @param spider_name [String] spider name, also used as the file name
      # @param in_project [Boolean] whether the spider belongs to a project
      # @raise [RuntimeError] if the target file already exists
      def generate_spider(spider_name, in_project:)
        spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
        # `File.exists?` was removed in Ruby 3.2; `File.exist?` is the supported spelling.
        raise "Spider #{spider_path} already exists" if File.exist? spider_path

        spider_class = to_spider_class(spider_name)
        create_file spider_path do
          <<~RUBY
            class #{spider_class} < #{in_project ? 'ApplicationSpider' : 'Kimurai::Base'}
              @name = "#{spider_name}"
              @start_urls = []
              @config = {}

              def parse(response, url:, data: {})
              end
            end
          RUBY
        end

        unless in_project
          insert_into_file spider_path, "  @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
          prepend_to_file spider_path, "require 'kimurai'\n\n"
          append_to_file spider_path, "\n#{spider_class}.crawl!"
        end
      end

      # Copies the template `config/schedule.rb` to `./schedule.rb`.
      def generate_schedule
        copy_file "template/config/schedule.rb", "./schedule.rb"
      end

      private

      # Turns a spider name into a class name: the first character is
      # capitalized, `_` and `.` separated parts are camel-cased and each `-`
      # becomes `Dash`, e.g. "google-spider.com" => "GoogleDashSpiderCom".
      def to_spider_class(string)
        string.sub(/^./) { $&.capitalize }
          .gsub(/(?:_|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
          .gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{$2.capitalize}" }
          .gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
class Array
  # Distributes the elements round-robin over `number` buckets: element 0
  # goes to bucket 0, element 1 to bucket 1, ... element `number` to bucket 0
  # again, and so on.
  #
  # @param number [Integer] how many buckets to build
  # @param fill_width [Object] forwarded unchanged as the second argument of
  #   `in_groups_of` (which is expected to be provided elsewhere, e.g. by
  #   ActiveSupport)
  # @return [Array<Array>] always `number` buckets, some possibly empty
  def in_sorted_groups(number, fill_width = nil)
    buckets = Array.new(number) { [] }

    in_groups_of(number, fill_width).each do |slice|
      # A slice may be shorter than `number`; only its existing positions
      # contribute to the buckets.
      slice.each_with_index { |element, column| buckets[column] << element }
    end

    buckets
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class Numeric
  # Formats a number of seconds as a short, human readable duration showing
  # the two most significant units: "2d, 3h", "1h, 5m", "4m, 10s" or "42s".
  # Based on https://stackoverflow.com/a/1679963
  #
  # The receiver is truncated with `to_int` first.
  #
  # @return [String, nil] the formatted duration, or nil for negative values
  def duration
    total_seconds = to_int
    # Negative values match none of the units and yield nil.
    return nil if total_seconds.negative?

    total_minutes, seconds = total_seconds.divmod(60)
    total_hours, minutes = total_minutes.divmod(60)
    days, hours = total_hours.divmod(24)

    if days.positive?
      "#{days}d, #{hours}h"
    elsif total_hours.positive?
      "#{hours}h, #{minutes}m"
    elsif total_minutes.positive?
      "#{minutes}m, #{seconds}s"
    else
      "#{seconds}s"
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Kimurai
  # Base class for item pipelines. A pipeline holds a reference to the spider
  # it works for and forwards `unique?` and `save_to` to that spider.
  class Pipeline
    # Error class for signalling that an item should be dropped.
    class DropItemError < StandardError; end

    include BaseHelper

    # The spider this pipeline instance is attached to.
    attr_accessor :spider

    class << self
      # Pipeline identifier derived from the class name: everything up to and
      # including the first `::` is removed, the rest is underscored and
      # returned as a symbol.
      def name
        to_s.sub(/.*?::/, "").underscore.to_sym
      end
    end

    # Same identifier as the class-level `name`.
    def name
      self.class.name
    end

    ###

    # Delegates the uniqueness check to the spider.
    def unique?(scope, value)
      spider.unique?(scope, value)
    end

    # Delegates saving of `item` to the spider, passing all arguments through.
    def save_to(path, item, format:, position: true)
      spider.save_to(path, item, format: format, position: position)
    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'pmap'
|
2
|
+
|
3
|
+
module Kimurai
  # Runs every spider returned by `Kimurai.list`, each one as a separate
  # `bundle exec kimurai crawl <name>` process, several at a time.
  class Runner
    attr_reader :jobs, :spiders

    # @param parallel_jobs [Integer] value passed to `peach_with_index`
    #   (presumably the number of spiders processed concurrently)
    def initialize(parallel_jobs:)
      @jobs = parallel_jobs
      @spiders = Kimurai.list

      if time_zone = Kimurai.configuration.time_zone
        Kimurai.time_zone = time_zone
      end
    end

    # Starts all spiders and blocks until each child process has finished.
    #
    # Calls the configured `runner_at_start_callback` before the first spider
    # starts and, from an `at_exit` hook, the `runner_at_stop_callback` with
    # the final run info (status `:completed`, or `:failed` plus the error).
    # Output of every spider is redirected to `log/<spider_name>.log`.
    def run!
      start_time = Time.now
      run_id = start_time.to_i
      running_pids = []

      ENV.store("RBCAT_COLORIZER", "false")

      run_info = {
        id: run_id,
        status: :processing,
        start_time: start_time,
        stop_time: nil,
        environment: Kimurai.env,
        concurrent_jobs: jobs,
        spiders: spiders.keys
      }

      at_exit do
        # Prevent the queue from processing new items while the at_exit body runs
        Thread.list.each { |t| t.kill if t != Thread.main }
        # Kill currently running spiders
        running_pids.each do |pid|
          begin
            Process.kill("INT", pid)
          rescue Errno::ESRCH
            # The child is already gone; nothing to interrupt. Without this
            # rescue the hook would abort before the stop callback is called.
          end
        end

        # Exception that is terminating the process, nil on a normal exit
        error = $!
        stop_time = Time.now

        if error.nil?
          run_info.merge!(status: :completed, stop_time: stop_time)
        else
          run_info.merge!(status: :failed, error: error.inspect, stop_time: stop_time)
        end

        if at_stop_callback = Kimurai.configuration.runner_at_stop_callback
          at_stop_callback.call(run_info)
        end
        puts "<<< Runner: stopped: #{run_info}"
      end

      puts ">>> Runner: started: #{run_info}"
      if at_start_callback = Kimurai.configuration.runner_at_start_callback
        at_start_callback.call(run_info)
      end

      spiders.peach_with_index(jobs) do |spider, i|
        spider_name = spider[0]
        puts "> Runner: started spider: #{spider_name}, index: #{i}"

        pid = spawn("bundle", "exec", "kimurai", "crawl", spider_name, [:out, :err] => "log/#{spider_name}.log")
        # Tracked so the at_exit hook can interrupt spiders that are still running
        running_pids << pid
        Process.wait pid

        running_pids.delete(pid)
        puts "< Runner: stopped spider: #{spider_name}, index: #{i}"
      end
    end
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
2.5.1
|
@@ -0,0 +1,20 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
git_source(:github) { |repo| "https://github.com/#{repo}.git" }
|
3
|
+
|
4
|
+
ruby '2.5.1'
|
5
|
+
|
6
|
+
# Framework
|
7
|
+
gem 'kimurai', '~> 1.0'
|
8
|
+
|
9
|
+
# Require files in directory and child directories recursively
|
10
|
+
gem 'require_all'
|
11
|
+
|
12
|
+
# Dotenv
|
13
|
+
gem 'dotenv'
|
14
|
+
|
15
|
+
# To debug spiders:
|
16
|
+
group :development do
|
17
|
+
gem 'byebug', platforms: :mri
|
18
|
+
gem 'pry'
|
19
|
+
end
|
20
|
+
|
@@ -0,0 +1,32 @@
|
|
1
|
+
Kimurai.configure do |config|
|
2
|
+
# Default logger has colored mode in development.
|
3
|
+
# If you would like to disable it, set `colorize_logger` to false.
|
4
|
+
# config.colorize_logger = false
|
5
|
+
|
6
|
+
# Logger level for default logger:
|
7
|
+
# config.log_level = :info
|
8
|
+
|
9
|
+
# Custom logger:
|
10
|
+
# config.logger = Logger.new(STDOUT)
|
11
|
+
|
12
|
+
# Custom time zone (for logs):
|
13
|
+
# config.time_zone = "UTC"
|
14
|
+
# config.time_zone = "Europe/Moscow"
|
15
|
+
|
16
|
+
# At start callback for a runner. Accepts argument with info as hash with
|
17
|
+
# keys: id, status, start_time, environment, concurrent_jobs, spiders list.
|
18
|
+
# For example, you can use this callback to send notification when runner was started:
|
19
|
+
# config.runner_at_start_callback = lambda do |info|
|
20
|
+
# json = JSON.pretty_generate(info)
|
21
|
+
# Sender.send_notification("Started session: #{json}")
|
22
|
+
# end
|
23
|
+
|
24
|
+
# At stop callback for a runner. Accepts argument with info as hash with
|
25
|
+
# all `runner_at_start_callback` keys plus additional `stop_time` key. Also `status` contains
|
26
|
+
# stop status of a runner (completed or failed).
|
27
|
+
# You can use this callback to send notification when runner has been stopped:
|
28
|
+
# config.runner_at_stop_callback = lambda do |info|
|
29
|
+
# json = JSON.pretty_generate(info)
|
30
|
+
# Sender.send_notification("Stopped session: #{json}")
|
31
|
+
# end
|
32
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# software versions to install for `setup` command
|
2
|
+
setup:
|
3
|
+
ruby: 2.5.1
|
4
|
+
# check latest here http://phantomjs.org/download.html
|
5
|
+
phantomjs: 2.1.1
|
6
|
+
# check latest here https://github.com/mozilla/geckodriver/releases/
|
7
|
+
geckodriver: 0.21.0
|
8
|
+
# check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
|
9
|
+
chromedriver: 2.39
|
10
|
+
# settings for deploy command, you can use cli options as well (--repo-url, --git-key-path)
|
11
|
+
deploy:
|
12
|
+
# repo_url: git@bitbucket.org:username/repo_name.git
|
13
|
+
# repo_key_path: ~/.ssh/id_rsa
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Load the project's gems: the default group plus the group named after the
# current Kimurai environment.
require 'bundler/setup'
Bundler.require(:default, Kimurai.env)

# Load custom ENV variables located in the .env file.
require 'dotenv/load'

# Load initializers, then helpers, then pipelines (only `*.rb` files placed
# directly inside each directory).
%w[./config/initializers ./helpers ./pipelines].each do |dir|
  Dir.glob(File.join(dir, "*.rb"), &method(:require))
end

# Load the base spider first, then everything in the `spiders/` folder
# recursively.
require_relative '../spiders/application_spider'
require_all "spiders"

# Load the Kimurai configuration.
require_relative 'application'
|
File without changes
|
@@ -0,0 +1,57 @@
|
|
1
|
+
### Settings ###
|
2
|
+
require 'time'
require 'tzinfo'
|
3
|
+
|
4
|
+
# Export current PATH to the cron
|
5
|
+
env :PATH, ENV["PATH"]
|
6
|
+
|
7
|
+
# Use 24 hour format when using `at:` option
|
8
|
+
set :chronic_options, hours24: true
|
9
|
+
|
10
|
+
# Use local_to_utc helper to setup execution time using your local timezone instead
|
11
|
+
# of server's timezone (which is probably and should be UTC, to check run `$ timedatectl`).
|
12
|
+
# Also maybe you'll want to set same timezone in kimurai as well (use `Kimurai.configuration.time_zone =` for that),
|
13
|
+
# to have spiders logs in a specific time zone format.
|
14
|
+
# Example usage of helper:
|
15
|
+
# every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
|
16
|
+
# crawl "google_spider.com", output: "log/google_spider.com.log"
|
17
|
+
# end
|
18
|
+
# Parses `time_string` and converts it from the local time of `zone` to UTC
# using TZInfo.
#
# @param time_string [String] a time understood by `Time.parse`, e.g. "7:00"
# @param zone [String] time zone identifier, e.g. "Europe/Moscow"
# @return the value returned by `TZInfo::Timezone#local_to_utc`
def local_to_utc(time_string, zone:)
  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
end
|
21
|
+
|
22
|
+
# Note: by default Whenever exports cron commands with :environment == "production".
|
23
|
+
# Note: Whenever can only append log data to a log file (>>). If you want
|
24
|
+
# to overwrite (>) log file before each run, pass lambda:
|
25
|
+
# crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
|
26
|
+
|
27
|
+
# Project job types
|
28
|
+
job_type :crawl, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output"
|
29
|
+
job_type :runner, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output"
|
30
|
+
|
31
|
+
# Single file job type
|
32
|
+
job_type :single, "cd :path && KIMURAI_ENV=:environment ruby :task :output"
|
33
|
+
# Single with bundle exec
|
34
|
+
job_type :single_bundle, "cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output"
|
35
|
+
|
36
|
+
### Schedule ###
|
37
|
+
# Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
|
38
|
+
# every 1.day do
|
39
|
+
# Example to schedule a single spider in the project:
|
40
|
+
# crawl "google_spider.com", output: "log/google_spider.com.log"
|
41
|
+
|
42
|
+
# Example to schedule all spiders in the project using runner. Each spider will write
|
43
|
+
# its own output to the `log/spider_name.log` file (handled by the runner itself).
|
44
|
+
# Runner output will be written to log/runner.log file.
|
45
|
+
# The numeric argument is the number of concurrent jobs:
|
46
|
+
# runner 3, output:"log/runner.log"
|
47
|
+
|
48
|
+
# Example to schedule single spider (without project):
|
49
|
+
# single "single_spider.rb", output: "single_spider.log"
|
50
|
+
# end
|
51
|
+
|
52
|
+
### How to set a cron schedule ###
|
53
|
+
# Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
|
54
|
+
# If you don't have whenever command, install the gem: `$ gem install whenever`.
|
55
|
+
|
56
|
+
### How to cancel a schedule ###
|
57
|
+
# Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.
|