kimurai 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +1923 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai.rb +53 -0
  14. data/lib/kimurai/automation/deploy.yml +54 -0
  15. data/lib/kimurai/automation/setup.yml +44 -0
  16. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  17. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  18. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  19. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  20. data/lib/kimurai/base.rb +249 -0
  21. data/lib/kimurai/base/simple_saver.rb +98 -0
  22. data/lib/kimurai/base/uniq_checker.rb +22 -0
  23. data/lib/kimurai/base_helper.rb +22 -0
  24. data/lib/kimurai/browser_builder.rb +32 -0
  25. data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
  26. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
  27. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
  28. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
  29. data/lib/kimurai/capybara_configuration.rb +10 -0
  30. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  31. data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
  32. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  33. data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
  34. data/lib/kimurai/capybara_ext/session.rb +150 -0
  35. data/lib/kimurai/capybara_ext/session/config.rb +18 -0
  36. data/lib/kimurai/cli.rb +157 -0
  37. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  38. data/lib/kimurai/cli/generator.rb +57 -0
  39. data/lib/kimurai/core_ext/array.rb +14 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +25 -0
  43. data/lib/kimurai/runner.rb +72 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/.ruby-version +1 -0
  46. data/lib/kimurai/template/Gemfile +20 -0
  47. data/lib/kimurai/template/README.md +3 -0
  48. data/lib/kimurai/template/config/application.rb +32 -0
  49. data/lib/kimurai/template/config/automation.yml +13 -0
  50. data/lib/kimurai/template/config/boot.rb +22 -0
  51. data/lib/kimurai/template/config/initializers/.keep +0 -0
  52. data/lib/kimurai/template/config/schedule.rb +57 -0
  53. data/lib/kimurai/template/db/.keep +0 -0
  54. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  55. data/lib/kimurai/template/lib/.keep +0 -0
  56. data/lib/kimurai/template/log/.keep +0 -0
  57. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  58. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  59. data/lib/kimurai/template/spiders/application_spider.rb +104 -0
  60. data/lib/kimurai/template/tmp/.keep +0 -0
  61. data/lib/kimurai/version.rb +3 -0
  62. metadata +349 -0
@@ -0,0 +1,71 @@
1
+ require 'cliver'
2
+
3
module Kimurai
  class CLI
    # Assembles the argument list for running one of the playbooks located in
    # the gem's `lib/kimurai/automation/` directory with `ansible-playbook`.
    class AnsibleCommandBuilder
      # @param user_host [String] target in `user@host` or plain `host` form
      # @param options [Hash] CLI options; the keys read by #get are "port",
      #   "local", "ask-sudo", "ask-auth-pass" and "ssh-key-path"
      # @param playbook [String] playbook name without the `.yml` extension
      # @param vars [Hash] variables passed to the playbook via `--extra-vars`
      def initialize(user_host, options, playbook:, vars: {})
        @user_host = user_host
        @options = options
        @playbook = playbook
        @vars = vars
      end

      # Builds the command as an array (executable first, then its arguments).
      #
      # Values found under the playbook's key in `config/automation.yml`
      # (relative to the current directory) are used for variables that were
      # not already given in `vars`.
      #
      # @return [Array<String>] the `ansible-playbook` command line
      # @raise [RuntimeError] if `ansible-playbook` is not installed, or if
      #   "ask-auth-pass" is requested and `sshpass` is not installed
      def get
        unless Cliver.detect("ansible-playbook")
          raise "Can't find `ansible-playbook` executable, to install: " \
            "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
        end

        user = @user_host[/(.*?)\@/, 1]
        host = @user_host[/\@(.+)/, 1] || @user_host
        # The trailing comma makes ansible treat the value as a host list
        # rather than a path to an inventory file.
        inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"

        gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir
        playbook_path = "#{gem_dir}/lib/kimurai/automation/#{@playbook}.yml"

        command = [
          "ansible-playbook", playbook_path,
          "--inventory", inventory,
          "--ssh-extra-args", "-oForwardAgent=yes",
          "--connection", @options["local"] ? "local" : "smart",
          "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
        ]

        merge_project_config!

        @vars.each do |key, value|
          next unless value.present?
          command.push "--extra-vars", "#{key}=#{value}"
        end

        command.push "--user", user if user
        command.push "--ask-become-pass" if @options["ask-sudo"]

        if @options["ask-auth-pass"]
          unless Cliver.detect("sshpass")
            raise "Can't find `sshpass` executable for password authentication, to install: " \
              "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
          end

          command.push "--ask-pass"
        end

        if ssh_key_path = @options["ssh-key-path"]
          command.push "--private-key", ssh_key_path
        end

        command
      end

      private

      # Fills in variables missing from @vars with the values listed under the
      # playbook's key in `config/automation.yml`, when that file exists.
      def merge_project_config!
        return unless File.exist?("config/automation.yml")

        require 'yaml'
        # An empty file, or a playbook key whose entries are all commented
        # out, does not parse to a Hash; there is nothing to merge then.
        config = YAML.load_file("config/automation.yml")
        playbook_config = config.is_a?(Hash) ? config[@playbook] : nil
        return unless playbook_config.is_a?(Hash)

        playbook_config.each { |key, value| @vars[key] ||= value }
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
module Kimurai
  class CLI
    # Thor generator that scaffolds projects, spiders and schedule files.
    class Generator < Thor::Group
      include Thor::Actions

      # Directory that relative template paths are resolved against: the
      # parent of the directory containing this file.
      def self.source_root
        File.dirname(File.expand_path('..', __FILE__))
      end

      # Copies the `template` directory into a new `project_name` directory,
      # then runs `bundle install` and `git init` inside it.
      #
      # @param project_name [String] destination directory name
      def generate_project(project_name)
        directory "template", project_name

        inside(project_name) do
          run "bundle install"
          run "git init"
        end
      end

      # Creates a spider file with an empty `parse` method.
      #
      # Inside a project the file goes to `spiders/<spider_name>.rb` and the
      # class inherits from ApplicationSpider. Otherwise a standalone
      # `./<spider_name>.rb` is written that requires kimurai, inherits from
      # Kimurai::Base, sets the mechanize engine and ends with a `crawl!` call.
      #
      # @param spider_name [String] spider name, also used for the file name
      # @param in_project [Boolean] whether the spider belongs to a project
      # @raise [RuntimeError] if the target file already exists
      def generate_spider(spider_name, in_project:)
        spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
        # File.exists? is deprecated and was removed in Ruby 3.2.
        raise "Spider #{spider_path} already exists" if File.exist?(spider_path)

        spider_class = to_spider_class(spider_name)
        # NOTE(review): the reviewed source had its whitespace collapsed, so
        # the indentation inside this template (and of the inserted @engine
        # line below) is reconstructed - confirm against the original file.
        create_file spider_path do
          <<~RUBY
            class #{spider_class} < #{in_project ? 'ApplicationSpider' : 'Kimurai::Base'}
              @name = "#{spider_name}"
              @start_urls = []
              @config = {}

              def parse(response, url:, data: {})
              end
            end
          RUBY
        end

        unless in_project
          insert_into_file spider_path, " @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
          prepend_to_file spider_path, "require 'kimurai'\n\n"
          append_to_file spider_path, "\n#{spider_class}.crawl!"
        end
      end

      # Copies the template schedule file to `./schedule.rb`.
      def generate_schedule
        copy_file "template/config/schedule.rb", "./schedule.rb"
      end

      private

      # Derives a class name from a spider name, e.g.
      # "google_spider" => "GoogleSpider", "my-spider" => "MyDashSpider",
      # "example.com" => "ExampleCom".
      def to_spider_class(string)
        string.sub(/^./) { $&.capitalize }
          .gsub(/(?:_|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
          .gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{$2.capitalize}" }
          .gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
class Array
  # Distributes the elements round-robin over `number` groups: element 0 goes
  # to group 0, element 1 to group 1, ..., element `number` to group 0 again.
  #
  # The receiver is first sliced with `in_groups_of(number, fill_width)`
  # (not defined in this file; presumably ActiveSupport's Array#in_groups_of,
  # which pads the last slice with `fill_width` unless it is `false`).
  #
  # @param number [Integer] how many groups to build
  # @param fill_width [Object, false] passed through to `in_groups_of`
  # @return [Array<Array>] exactly `number` arrays
  def in_sorted_groups(number, fill_width = nil)
    sorted_groups = Array.new(number) { [] }

    in_groups_of(number, fill_width).each do |group|
      # A slice may be shorter than `number`; iterating only over the items it
      # actually holds replaces the former `fetch ... rescue next` probing.
      group.each_with_index { |item, i| sorted_groups[i] << item }
    end

    sorted_groups
  end
end
@@ -0,0 +1,19 @@
1
class Numeric
  # Formats the receiver, taken as a number of seconds, using the two largest
  # applicable units: "1d, 1h", "2h, 5m", "3m, 7s" or "42s".
  # Fractions are truncated via `to_int`; a negative amount yields nil.
  #
  # Adapted from https://stackoverflow.com/a/1679963
  #
  # @return [String, nil]
  def duration
    total_seconds = to_int
    return if total_seconds.negative?

    total_minutes, seconds = total_seconds.divmod(60)
    total_hours, minutes = total_minutes.divmod(60)
    days, hours = total_hours.divmod(24)

    return "#{days}d, #{hours}h" if days.positive?
    return "#{total_hours}h, #{minutes}m" if total_hours.positive?
    return "#{total_minutes}m, #{seconds}s" if total_minutes.positive?

    "#{total_seconds}s"
  end
end
@@ -0,0 +1,7 @@
1
+ require 'murmurhash3'
2
+
3
class String
  # Hashes the string with `MurmurHash3::V32.str_hash` and returns the result
  # unchanged (presumably a 32-bit integer - confirm against the murmurhash3
  # gem documentation).
  def to_id
    MurmurHash3::V32.str_hash(self)
  end
end
@@ -0,0 +1,25 @@
1
module Kimurai
  # Base class for pipelines. A pipeline holds a reference to a spider and
  # forwards `unique?` and `save_to` calls to it.
  class Pipeline
    # Error class defined for pipelines; nothing in this class raises it.
    class DropItemError < StandardError
    end

    class << self
      # Pipeline identifier: the class name with everything up to and
      # including the first "::" removed, underscored and turned into a symbol.
      def name
        to_s.sub(/.*?::/, "").underscore.to_sym
      end
    end

    include BaseHelper
    attr_accessor :spider

    # Same identifier as the class-level `name`.
    def name
      self.class.name
    end

    # --- helpers delegated to the spider ---

    # Forwards to `spider.unique?` with the same arguments.
    def unique?(scope, value)
      spider.unique?(scope, value)
    end

    # Forwards to `spider.save_to` with the same arguments.
    def save_to(path, item, format:, position: true)
      spider.save_to(path, item, format: format, position: position)
    end
  end
end
@@ -0,0 +1,72 @@
1
+ require 'pmap'
2
+
3
module Kimurai
  # Runs every spider returned by `Kimurai.list` as its own
  # `bundle exec kimurai crawl <name>` subprocess, each writing its output to
  # `log/<name>.log`.
  class Runner
    attr_reader :jobs, :spiders

    # @param parallel_jobs [Integer] value handed to `peach_with_index` in #run!
    #   (presumably the number of spiders processed concurrently - see pmap)
    def initialize(parallel_jobs:)
      @jobs = parallel_jobs
      @spiders = Kimurai.list

      if time_zone = Kimurai.configuration.time_zone
        Kimurai.time_zone = time_zone
      end
    end

    # Starts all spiders and waits for each spawned process to finish.
    #
    # Calls `runner_at_start_callback` (if configured) before the first spider
    # starts, and registers an `at_exit` hook that stops running spiders and
    # calls `runner_at_stop_callback` (if configured) with the final run info.
    def run!
      start_time = Time.now
      run_id = start_time.to_i
      running_pids = []

      ENV.store("RBCAT_COLORIZER", "false")

      run_info = {
        id: run_id,
        status: :processing,
        start_time: start_time,
        stop_time: nil,
        environment: Kimurai.env,
        concurrent_jobs: jobs,
        spiders: spiders.keys
      }

      at_exit do
        # Prevent the queue from processing new items while the at_exit body runs
        Thread.list.each { |t| t.kill if t != Thread.main }

        # Interrupt spiders that are still running. A pid can be stale here
        # (the child exited, but its worker thread was killed before it could
        # remove the pid from the list); signalling it raises Errno::ESRCH,
        # which must not prevent the stop callback below from running.
        running_pids.each do |pid|
          begin
            Process.kill("INT", pid)
          rescue Errno::ESRCH
            # process has already exited - nothing to stop
          end
        end

        # `$!` holds the exception that is terminating the process, if any
        error = $!
        stop_time = Time.now

        if error.nil?
          run_info.merge!(status: :completed, stop_time: stop_time)
        else
          run_info.merge!(status: :failed, error: error.inspect, stop_time: stop_time)
        end

        if at_stop_callback = Kimurai.configuration.runner_at_stop_callback
          at_stop_callback.call(run_info)
        end
        puts "<<< Runner: stopped: #{run_info}"
      end

      puts ">>> Runner: started: #{run_info}"
      if at_start_callback = Kimurai.configuration.runner_at_start_callback
        at_start_callback.call(run_info)
      end

      spiders.peach_with_index(jobs) do |spider, i|
        # each entry is a [name, ...] pair; only the name is needed here
        spider_name = spider[0]
        puts "> Runner: started spider: #{spider_name}, index: #{i}"

        pid = spawn("bundle", "exec", "kimurai", "crawl", spider_name, [:out, :err] => "log/#{spider_name}.log")
        running_pids << pid
        Process.wait pid

        running_pids.delete(pid)
        puts "< Runner: stopped spider: #{spider_name}, index: #{i}"
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ /.bundle
2
+ /cache
3
+ /node_modules
4
+
5
+ /log/*
6
+ !/log/.keep
7
+
8
+ /tmp/*
9
+ !/tmp/.keep
10
+
11
+ /db/*
12
+ !/db/.keep
13
+
14
+ .byebug_history
15
+ *.swp
16
+ .env
17
+
18
+ capybara-*.png
@@ -0,0 +1 @@
1
+ 2.5.1
@@ -0,0 +1,20 @@
1
+ source 'https://rubygems.org'
2
+ git_source(:github) { |repo| "https://github.com/#{repo}.git" }
3
+
4
+ ruby '2.5.1'
5
+
6
+ # Framework
7
+ gem 'kimurai', '~> 1.0'
8
+
9
+ # Require files in directory and child directories recursively
10
+ gem 'require_all'
11
+
12
+ # Dotenv
13
+ gem 'dotenv'
14
+
15
+ # To debug spiders:
16
+ group :development do
17
+ gem 'byebug', platforms: :mri
18
+ gem 'pry'
19
+ end
20
+
@@ -0,0 +1,3 @@
1
+ # README
2
+
3
+ New Kimurai project readme
@@ -0,0 +1,32 @@
1
+ Kimurai.configure do |config|
2
+ # Default logger has colored mode in development.
3
+ # If you would like to disable it, set `colorize_logger` to false.
4
+ # config.colorize_logger = false
5
+
6
+ # Logger level for default logger:
7
+ # config.log_level = :info
8
+
9
+ # Custom logger:
10
+ # config.logger = Logger.new(STDOUT)
11
+
12
+ # Custom time zone (for logs):
13
+ # config.time_zone = "UTC"
14
+ # config.time_zone = "Europe/Moscow"
15
+
16
+ # At start callback for a runner. Accepts argument with info as hash with
17
+ # keys: id, status, start_time, environment, concurrent_jobs, spiders list.
18
+ # For example, you can use this callback to send notification when runner was started:
19
+ # config.runner_at_start_callback = lambda do |info|
20
+ # json = JSON.pretty_generate(info)
21
+ # Sender.send_notification("Started session: #{json}")
22
+ # end
23
+
24
+ # At stop callback for a runner. Accepts argument with info as hash with
25
+ # all `runner_at_start_callback` keys plus additional `stop_time` key. Also `status` contains
26
+ # stop status of a runner (completed or failed).
27
+ # You can use this callback to send notification when runner has been stopped:
28
+ # config.runner_at_stop_callback = lambda do |info|
29
+ # json = JSON.pretty_generate(info)
30
+ # Sender.send_notification("Stopped session: #{json}")
31
+ # end
32
+ end
@@ -0,0 +1,13 @@
1
+ # software versions to install for `setup` command
2
+ setup:
3
+ ruby: 2.5.1
4
+ # check latest here http://phantomjs.org/download.html
5
+ phantomjs: 2.1.1
6
+ # check latest here https://github.com/mozilla/geckodriver/releases/
7
+ geckodriver: 0.21.0
8
+ # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
9
+ chromedriver: 2.39
10
+ # settings for deploy command, you can use cli options as well (--repo-url, --git-key-path)
11
+ deploy:
12
+ # repo_url: git@bitbucket.org:username/repo_name.git
13
+ # repo_key_path: ~/.ssh/id_rsa
@@ -0,0 +1,22 @@
1
+ # require project gems
2
+ require 'bundler/setup'
3
+ Bundler.require(:default, Kimurai.env)
4
+
5
+ # require custom ENV variables located in .env file
6
+ require 'dotenv/load'
7
+
8
+ # require initializers
9
+ Dir.glob(File.join("./config/initializers", "*.rb"), &method(:require))
10
+
11
+ # require helpers
12
+ Dir.glob(File.join("./helpers", "*.rb"), &method(:require))
13
+
14
+ # require pipelines
15
+ Dir.glob(File.join("./pipelines", "*.rb"), &method(:require))
16
+
17
+ # require spiders recursively in the `spiders/` folder
18
+ require_relative '../spiders/application_spider'
19
+ require_all "spiders"
20
+
21
+ # require Kimurai configuration
22
+ require_relative 'application'
@@ -0,0 +1,57 @@
1
+ ### Settings ###
2
require 'time'
require 'tzinfo'
3
+
4
+ # Export current PATH to the cron
5
+ env :PATH, ENV["PATH"]
6
+
7
+ # Use 24 hour format when using `at:` option
8
+ set :chronic_options, hours24: true
9
+
10
# Use the local_to_utc helper to set an execution time in your local time zone
# instead of the server's time zone (which is probably, and should be, UTC;
# to check, run `$ timedatectl`).
# You may also want to set the same time zone in Kimurai
# (use `Kimurai.configuration.time_zone =` for that) so that spider logs are
# written in that time zone.
# Example usage of the helper:
#   every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
#     crawl "google_spider.com", output: "log/google_spider.com.log"
#   end
#
# time_string - a time that `Time.parse` understands, e.g. "7:00"
# zone        - time zone identifier passed to `TZInfo::Timezone.get`
#
# Returns the result of TZInfo's `local_to_utc` for the parsed time.
def local_to_utc(time_string, zone:)
  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
end
21
+
22
+ # Note: by default Whenever exports cron commands with :environment == "production".
23
+ # Note: Whenever can only append log data to a log file (>>). If you want
24
+ # to overwrite (>) log file before each run, pass lambda:
25
+ # crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
26
+
27
+ # Project job types
28
+ job_type :crawl, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output"
29
+ job_type :runner, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output"
30
+
31
+ # Single file job type
32
+ job_type :single, "cd :path && KIMURAI_ENV=:environment ruby :task :output"
33
+ # Single with bundle exec
34
+ job_type :single_bundle, "cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output"
35
+
36
+ ### Schedule ###
37
+ # Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
38
+ # every 1.day do
39
+ # Example to schedule a single spider in the project:
40
+ # crawl "google_spider.com", output: "log/google_spider.com.log"
41
+
42
+ # Example to schedule all spiders in the project using the runner. Each spider will write
43
+ # its own output to the `log/spider_name.log` file (handled by the runner itself).
44
+ # Runner output will be written to the log/runner.log file.
45
+ # The numeric argument is the number of concurrent jobs:
46
+ # runner 3, output:"log/runner.log"
47
+
48
+ # Example to schedule single spider (without project):
49
+ # single "single_spider.rb", output: "single_spider.log"
50
+ # end
51
+
52
+ ### How to set a cron schedule ###
53
+ # Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
54
+ # If you don't have whenever command, install the gem: `$ gem install whenever`.
55
+
56
+ ### How to cancel a schedule ###
57
+ # Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.