kimurai 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +1923 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/kimurai +6 -0
- data/kimurai.gemspec +48 -0
- data/lib/kimurai.rb +53 -0
- data/lib/kimurai/automation/deploy.yml +54 -0
- data/lib/kimurai/automation/setup.yml +44 -0
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
- data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
- data/lib/kimurai/base.rb +249 -0
- data/lib/kimurai/base/simple_saver.rb +98 -0
- data/lib/kimurai/base/uniq_checker.rb +22 -0
- data/lib/kimurai/base_helper.rb +22 -0
- data/lib/kimurai/browser_builder.rb +32 -0
- data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
- data/lib/kimurai/capybara_configuration.rb +10 -0
- data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
- data/lib/kimurai/capybara_ext/session.rb +150 -0
- data/lib/kimurai/capybara_ext/session/config.rb +18 -0
- data/lib/kimurai/cli.rb +157 -0
- data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
- data/lib/kimurai/cli/generator.rb +57 -0
- data/lib/kimurai/core_ext/array.rb +14 -0
- data/lib/kimurai/core_ext/numeric.rb +19 -0
- data/lib/kimurai/core_ext/string.rb +7 -0
- data/lib/kimurai/pipeline.rb +25 -0
- data/lib/kimurai/runner.rb +72 -0
- data/lib/kimurai/template/.gitignore +18 -0
- data/lib/kimurai/template/.ruby-version +1 -0
- data/lib/kimurai/template/Gemfile +20 -0
- data/lib/kimurai/template/README.md +3 -0
- data/lib/kimurai/template/config/application.rb +32 -0
- data/lib/kimurai/template/config/automation.yml +13 -0
- data/lib/kimurai/template/config/boot.rb +22 -0
- data/lib/kimurai/template/config/initializers/.keep +0 -0
- data/lib/kimurai/template/config/schedule.rb +57 -0
- data/lib/kimurai/template/db/.keep +0 -0
- data/lib/kimurai/template/helpers/application_helper.rb +3 -0
- data/lib/kimurai/template/lib/.keep +0 -0
- data/lib/kimurai/template/log/.keep +0 -0
- data/lib/kimurai/template/pipelines/saver.rb +11 -0
- data/lib/kimurai/template/pipelines/validator.rb +24 -0
- data/lib/kimurai/template/spiders/application_spider.rb +104 -0
- data/lib/kimurai/template/tmp/.keep +0 -0
- data/lib/kimurai/version.rb +3 -0
- metadata +349 -0
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'cliver'
|
2
|
+
|
3
|
+
module Kimurai
  class CLI
    # Builds the argument list for an `ansible-playbook` invocation that runs
    # one of the playbooks shipped inside the installed kimurai gem.
    class AnsibleCommandBuilder
      # @param user_host [String] target given as "user@host" or just "host"
      # @param options [Hash] CLI options, read by string key: "port", "local",
      #   "ask-sudo", "ask-auth-pass" and "ssh-key-path"
      # @param playbook [String] playbook name; resolved to
      #   "<gem_dir>/lib/kimurai/automation/<playbook>.yml" and also used as the
      #   section key looked up in config/automation.yml
      # @param vars [Hash] extra variables to pass to the playbook
      def initialize(user_host, options, playbook:, vars: {})
        @user_host = user_host
        @options = options
        @playbook = playbook
        @vars = vars
      end

      # Assembles the command.
      #
      # @return [Array<String>] "ansible-playbook" followed by its arguments
      # @raise [RuntimeError] if `ansible-playbook` is not detected, or if
      #   "ask-auth-pass" is set and `sshpass` is not detected
      def get
        unless Cliver.detect("ansible-playbook")
          raise "Can't find `ansible-playbook` executable, to install: " \
            "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
        end

        user = @user_host[/(.*?)\@/, 1]
        # Without an "@" the whole string is treated as the host.
        host = @user_host[/\@(.+)/, 1] || @user_host
        # The trailing comma is part of the inventory string handed to --inventory.
        inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"

        command = [
          "ansible-playbook", playbook_path,
          "--inventory", inventory,
          "--ssh-extra-args", "-oForwardAgent=yes",
          "--connection", @options["local"] ? "local" : "smart",
          "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
        ]

        extra_vars.each do |key, value|
          next unless value.present?
          command.push "--extra-vars", "#{key}=#{value}"
        end

        command.push "--user", user if user
        command.push "--ask-become-pass" if @options["ask-sudo"]

        if @options["ask-auth-pass"]
          unless Cliver.detect("sshpass")
            raise "Can't find `sshpass` executable for password authentication, to install: " \
              "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
          end

          command.push "--ask-pass"
        end

        if ssh_key_path = @options["ssh-key-path"]
          command.push "--private-key", ssh_key_path
        end

        command
      end

      private

      # Absolute path of the requested playbook inside the installed gem.
      def playbook_path
        gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir
        gem_dir + "/lib/kimurai/automation/" + "#{@playbook}.yml"
      end

      # Variables given to the constructor, completed with the values found in
      # the playbook's section of ./config/automation.yml (when that file
      # exists). Explicitly given truthy values win over the file's values.
      # Returns a new hash; the hash passed by the caller is not modified.
      def extra_vars
        # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
        return @vars unless File.exist?("config/automation.yml")

        require 'yaml'
        config = YAML.load_file("config/automation.yml")
        # An empty or non-mapping YAML file yields a non-Hash value; ignore it.
        defaults = config.is_a?(Hash) ? config[@playbook] : nil
        return @vars unless defaults.is_a?(Hash)

        @vars.merge(defaults) { |_key, given, default| given || default }
      end
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module Kimurai
  class CLI
    # Thor generator behind the CLI's scaffolding commands: new project,
    # new spider and schedule file.
    class Generator < Thor::Group
      include Thor::Actions

      # Directory that Thor actions resolve relative source paths against:
      # the parent of this file's directory.
      def self.source_root
        File.dirname(File.expand_path('..', __FILE__))
      end

      # Copies the bundled "template" directory to `project_name`, then runs
      # `bundle install` and `git init` inside the new directory.
      #
      # @param project_name [String] destination directory
      def generate_project(project_name)
        directory "template", project_name
        inside(project_name) do
          run "bundle install"
          run "git init"
        end
      end

      # Writes a skeleton spider file.
      #
      # @param spider_name [String] spider name, also used as the file name
      # @param in_project [Boolean] when truthy the file goes to spiders/ and
      #   inherits from ApplicationSpider; otherwise a standalone script
      #   inheriting from Kimurai::Base is written to the current directory
      # @raise [RuntimeError] if the target file already exists
      def generate_spider(spider_name, in_project:)
        spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
        # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
        raise "Spider #{spider_path} already exists" if File.exist?(spider_path)

        spider_class = to_spider_class(spider_name)
        create_file spider_path do
          <<~RUBY
            class #{spider_class} < #{in_project ? 'ApplicationSpider' : 'Kimurai::Base'}
              @name = "#{spider_name}"
              @start_urls = []
              @config = {}

              def parse(response, url:, data: {})
              end
            end
          RUBY
        end

        # A standalone spider additionally gets an explicit engine, the
        # `require` of the gem, and a trailing call that starts the crawl.
        unless in_project
          insert_into_file spider_path, "  @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
          prepend_to_file spider_path, "require 'kimurai'\n\n"
          append_to_file spider_path, "\n#{spider_class}.crawl!"
        end
      end

      # Copies the template schedule file to ./schedule.rb.
      def generate_schedule
        copy_file "template/config/schedule.rb", "./schedule.rb"
      end

      private

      # Derives a class name from a spider name: capitalizes the first
      # character, then rewrites `_`, `-` and `.` separated segments, e.g.
      # "google_spider.com" => "GoogleSpiderCom", "my-spider" => "MyDashSpider".
      def to_spider_class(string)
        string.sub(/^./) { $&.capitalize }
          .gsub(/(?:_|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
          .gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{$2.capitalize}" }
          .gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
class Array
  # Distributes the elements round-robin into `number` arrays: the element at
  # position i of every slice produced by `in_groups_of(number, fill_width)`
  # is appended to the i-th result array.
  #
  # @param number [Integer] how many result arrays to build
  # @param fill_width passed through unchanged as the second argument of
  #   `in_groups_of`
  # @return [Array<Array>] `number` arrays, in slot order
  def in_sorted_groups(number, fill_width = nil)
    buckets = Array.new(number) { [] }

    in_groups_of(number, fill_width).each do |slice|
      # A slice may be shorter than `number`; only its existing slots are used.
      slice.each_with_index { |element, slot| buckets[slot] << element }
    end

    buckets
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class Numeric
  # Formats a number of seconds as a short string showing the two largest
  # applicable units: "1d, 1h", "1h, 2m", "1m, 15s" or "42s".
  # Based on https://stackoverflow.com/a/1679963
  #
  # @return [String, nil] the formatted duration; nil for negative values
  def duration
    total_seconds = to_int
    total_minutes, seconds = total_seconds.divmod(60)
    total_hours, minutes = total_minutes.divmod(60)
    days, hours = total_hours.divmod(24)

    return "#{days}d, #{hours}h" if days > 0
    return "#{total_hours}h, #{minutes}m" if total_hours > 0
    return "#{total_minutes}m, #{seconds}s" if total_minutes > 0

    "#{total_seconds}s" if total_seconds >= 0
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Kimurai
  # Base class for item pipelines. An instance holds a reference to a spider
  # and forwards `unique?` and `save_to` to it.
  class Pipeline
    # Error class for signalling that an item should be dropped.
    class DropItemError < StandardError; end

    include BaseHelper
    attr_accessor :spider

    class << self
      # Pipeline identifier: the class name with its first namespace segment
      # removed, underscored and converted to a Symbol.
      def name
        to_s.sub(/.*?::/, "").underscore.to_sym
      end
    end

    # Same identifier as the class-level `name`.
    def name
      self.class.name
    end

    ###

    # Forwards to the spider's `unique?` with the same arguments.
    def unique?(scope, value)
      spider.unique?(scope, value)
    end

    # Forwards to the spider's `save_to` with the same arguments.
    def save_to(path, item, format:, position: true)
      spider.save_to(path, item, format: format, position: position)
    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'pmap'
|
2
|
+
|
3
|
+
module Kimurai
  # Runs every spider returned by `Kimurai.list`, each one in its own
  # `bundle exec kimurai crawl <name>` child process.
  class Runner
    attr_reader :jobs, :spiders

    # @param parallel_jobs passed as the argument of `peach_with_index` in
    #   `run!` and reported as `concurrent_jobs` in the run info
    def initialize(parallel_jobs:)
      @jobs = parallel_jobs
      @spiders = Kimurai.list

      if (time_zone = Kimurai.configuration.time_zone)
        Kimurai.time_zone = time_zone
      end
    end

    # Starts all spiders and blocks until every child process has finished.
    # Calls the configured `runner_at_start_callback` before starting and,
    # from an `at_exit` hook, the `runner_at_stop_callback` with the final
    # run info (status :completed, or :failed together with the error).
    def run!
      start_time = Time.now
      run_id = start_time.to_i
      running_pids = []

      ENV.store("RBCAT_COLORIZER", "false")

      run_info = {
        id: run_id,
        status: :processing,
        start_time: start_time,
        stop_time: nil,
        environment: Kimurai.env,
        concurrent_jobs: jobs,
        spiders: spiders.keys
      }

      at_exit do
        # Prevent the queue from processing new items while the at_exit body runs
        Thread.list.each { |t| t.kill if t != Thread.main }

        # Kill currently running spiders
        running_pids.each do |pid|
          begin
            Process.kill("INT", pid)
          rescue Errno::ESRCH
            # The child has already exited (its pid was not yet removed from
            # the list). Ignore it so the stop callback below still runs.
          end
        end

        # In an at_exit hook `$!` holds the exception terminating the
        # process, or nil on a normal exit.
        error = $!
        stop_time = Time.now

        if error.nil?
          run_info.merge!(status: :completed, stop_time: stop_time)
        else
          run_info.merge!(status: :failed, error: error.inspect, stop_time: stop_time)
        end

        if (at_stop_callback = Kimurai.configuration.runner_at_stop_callback)
          at_stop_callback.call(run_info)
        end
        puts "<<< Runner: stopped: #{run_info}"
      end

      puts ">>> Runner: started: #{run_info}"
      if (at_start_callback = Kimurai.configuration.runner_at_start_callback)
        at_start_callback.call(run_info)
      end

      spiders.peach_with_index(jobs) do |spider, i|
        spider_name = spider[0]
        puts "> Runner: started spider: #{spider_name}, index: #{i}"

        # The child's stdout and stderr both go to log/<spider_name>.log.
        pid = spawn("bundle", "exec", "kimurai", "crawl", spider_name, [:out, :err] => "log/#{spider_name}.log")
        running_pids << pid
        Process.wait pid

        running_pids.delete(pid)
        puts "< Runner: stopped spider: #{spider_name}, index: #{i}"
      end
    end
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
2.5.1
|
@@ -0,0 +1,20 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
git_source(:github) { |repo| "https://github.com/#{repo}.git" }
|
3
|
+
|
4
|
+
ruby '2.5.1'
|
5
|
+
|
6
|
+
# Framework
|
7
|
+
gem 'kimurai', '~> 1.0'
|
8
|
+
|
9
|
+
# Require files in directory and child directories recursively
|
10
|
+
gem 'require_all'
|
11
|
+
|
12
|
+
# Dotenv
|
13
|
+
gem 'dotenv'
|
14
|
+
|
15
|
+
# To debug spiders:
|
16
|
+
group :development do
|
17
|
+
gem 'byebug', platforms: :mri
|
18
|
+
gem 'pry'
|
19
|
+
end
|
20
|
+
|
@@ -0,0 +1,32 @@
|
|
1
|
+
Kimurai.configure do |config|
|
2
|
+
# Default logger has colored mode in development.
|
3
|
+
# If you would like to disable it, set `colorize_logger` to false.
|
4
|
+
# config.colorize_logger = false
|
5
|
+
|
6
|
+
# Logger level for default logger:
|
7
|
+
# config.log_level = :info
|
8
|
+
|
9
|
+
# Custom logger:
|
10
|
+
# config.logger = Logger.new(STDOUT)
|
11
|
+
|
12
|
+
# Custom time zone (for logs):
|
13
|
+
# config.time_zone = "UTC"
|
14
|
+
# config.time_zone = "Europe/Moscow"
|
15
|
+
|
16
|
+
# At start callback for a runner. Accepts argument with info as hash with
|
17
|
+
# keys: id, status, start_time, environment, concurrent_jobs, spiders list.
|
18
|
+
# For example, you can use this callback to send a notification when the runner has started:
|
19
|
+
# config.runner_at_start_callback = lambda do |info|
|
20
|
+
# json = JSON.pretty_generate(info)
|
21
|
+
# Sender.send_notification("Started session: #{json}")
|
22
|
+
# end
|
23
|
+
|
24
|
+
# At stop callback for a runner. Accepts argument with info as hash with
|
25
|
+
# all `runner_at_start_callback` keys plus additional `stop_time` key. Also `status` contains
|
26
|
+
# stop status of a runner (completed or failed).
|
27
|
+
# You can use this callback to send notification when runner has been stopped:
|
28
|
+
# config.runner_at_stop_callback = lambda do |info|
|
29
|
+
# json = JSON.pretty_generate(info)
|
30
|
+
# Sender.send_notification("Stopped session: #{json}")
|
31
|
+
# end
|
32
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# software versions to install for `setup` command
|
2
|
+
setup:
|
3
|
+
ruby: 2.5.1
|
4
|
+
# check latest here http://phantomjs.org/download.html
|
5
|
+
phantomjs: 2.1.1
|
6
|
+
# check latest here https://github.com/mozilla/geckodriver/releases/
|
7
|
+
geckodriver: 0.21.0
|
8
|
+
# check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
|
9
|
+
chromedriver: 2.39
|
10
|
+
# settings for deploy command, you can use cli options as well (--repo-url, --git-key-path)
|
11
|
+
deploy:
|
12
|
+
# repo_url: git@bitbucket.org:username/repo_name.git
|
13
|
+
# repo_key_path: ~/.ssh/id_rsa
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Load the project's gems: the :default group plus the group named by Kimurai.env
require 'bundler/setup'
Bundler.require(:default, Kimurai.env)

# Load custom ENV variables located in the .env file
require 'dotenv/load'

# Load initializers, then helpers, then pipelines (top-level *.rb files only)
%w[./config/initializers ./helpers ./pipelines].each do |dir|
  Dir.glob(File.join(dir, "*.rb")).each { |file| require file }
end

# Load the base spider first, then everything under `spiders/` recursively
require_relative '../spiders/application_spider'
require_all "spiders"

# Load the Kimurai configuration
require_relative 'application'
|
File without changes
|
@@ -0,0 +1,57 @@
|
|
1
|
+
### Settings ###
|
2
|
+
require 'tzinfo'
|
3
|
+
|
4
|
+
# Export current PATH to the cron
|
5
|
+
env :PATH, ENV["PATH"]
|
6
|
+
|
7
|
+
# Use 24 hour format when using `at:` option
|
8
|
+
set :chronic_options, hours24: true
|
9
|
+
|
10
|
+
# Use local_to_utc helper to setup execution time using your local timezone instead
|
11
|
+
# of server's timezone (which is probably and should be UTC, to check run `$ timedatectl`).
|
12
|
+
# You may also want to set the same time zone in Kimurai (use `Kimurai.configuration.time_zone =` for that),
|
13
|
+
# to have spiders logs in a specific time zone format.
|
14
|
+
# Example usage of helper:
|
15
|
+
# every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
|
16
|
+
# crawl "google_spider.com", output: "log/google_spider.com.log"
|
17
|
+
# end
|
18
|
+
# Converts a wall-clock time expressed in the given time zone to UTC, so it
# can be passed to Whenever's `at:` option.
#
# @param time_string [String] a time accepted by Time.parse, e.g. "7:00"
# @param zone [String] time zone identifier looked up with
#   TZInfo::Timezone.get, e.g. "Europe/Moscow"
# @return the result of TZInfo's `local_to_utc` for the parsed time
def local_to_utc(time_string, zone:)
  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
end
|
21
|
+
|
22
|
+
# Note: by default Whenever exports cron commands with :environment == "production".
|
23
|
+
# Note: Whenever can only append log data to a log file (>>). If you want
|
24
|
+
# to overwrite (>) log file before each run, pass lambda:
|
25
|
+
# crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
|
26
|
+
|
27
|
+
# Project job types
|
28
|
+
job_type :crawl, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output"
|
29
|
+
job_type :runner, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output"
|
30
|
+
|
31
|
+
# Single file job type
|
32
|
+
job_type :single, "cd :path && KIMURAI_ENV=:environment ruby :task :output"
|
33
|
+
# Single with bundle exec
|
34
|
+
job_type :single_bundle, "cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output"
|
35
|
+
|
36
|
+
### Schedule ###
|
37
|
+
# Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
|
38
|
+
# every 1.day do
|
39
|
+
# Example to schedule a single spider in the project:
|
40
|
+
# crawl "google_spider.com", output: "log/google_spider.com.log"
|
41
|
+
|
42
|
+
# Example to schedule all spiders in the project using runner. Each spider will write
|
43
|
+
# its own output to the `log/spider_name.log` file (handled by the runner itself).
|
44
|
+
# Runner output will be written to log/runner.log file.
|
45
|
+
# The numeric argument is the number of concurrent jobs:
|
46
|
+
# runner 3, output:"log/runner.log"
|
47
|
+
|
48
|
+
# Example to schedule single spider (without project):
|
49
|
+
# single "single_spider.rb", output: "single_spider.log"
|
50
|
+
# end
|
51
|
+
|
52
|
+
### How to set a cron schedule ###
|
53
|
+
# Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
|
54
|
+
# If you don't have whenever command, install the gem: `$ gem install whenever`.
|
55
|
+
|
56
|
+
### How to cancel a schedule ###
|
57
|
+
# Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.
|