kimurai 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +1923 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai.rb +53 -0
  14. data/lib/kimurai/automation/deploy.yml +54 -0
  15. data/lib/kimurai/automation/setup.yml +44 -0
  16. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  17. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  18. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  19. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  20. data/lib/kimurai/base.rb +249 -0
  21. data/lib/kimurai/base/simple_saver.rb +98 -0
  22. data/lib/kimurai/base/uniq_checker.rb +22 -0
  23. data/lib/kimurai/base_helper.rb +22 -0
  24. data/lib/kimurai/browser_builder.rb +32 -0
  25. data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
  26. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
  27. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
  28. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
  29. data/lib/kimurai/capybara_configuration.rb +10 -0
  30. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  31. data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
  32. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  33. data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
  34. data/lib/kimurai/capybara_ext/session.rb +150 -0
  35. data/lib/kimurai/capybara_ext/session/config.rb +18 -0
  36. data/lib/kimurai/cli.rb +157 -0
  37. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  38. data/lib/kimurai/cli/generator.rb +57 -0
  39. data/lib/kimurai/core_ext/array.rb +14 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +25 -0
  43. data/lib/kimurai/runner.rb +72 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/.ruby-version +1 -0
  46. data/lib/kimurai/template/Gemfile +20 -0
  47. data/lib/kimurai/template/README.md +3 -0
  48. data/lib/kimurai/template/config/application.rb +32 -0
  49. data/lib/kimurai/template/config/automation.yml +13 -0
  50. data/lib/kimurai/template/config/boot.rb +22 -0
  51. data/lib/kimurai/template/config/initializers/.keep +0 -0
  52. data/lib/kimurai/template/config/schedule.rb +57 -0
  53. data/lib/kimurai/template/db/.keep +0 -0
  54. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  55. data/lib/kimurai/template/lib/.keep +0 -0
  56. data/lib/kimurai/template/log/.keep +0 -0
  57. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  58. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  59. data/lib/kimurai/template/spiders/application_spider.rb +104 -0
  60. data/lib/kimurai/template/tmp/.keep +0 -0
  61. data/lib/kimurai/version.rb +3 -0
  62. metadata +349 -0
@@ -0,0 +1,71 @@
1
+ require 'cliver'
2
+
3
module Kimurai
  class CLI
    # Builds the argument vector for an `ansible-playbook` invocation that
    # runs one of the playbooks bundled with the kimurai gem
    # (`lib/kimurai/automation/<playbook>.yml`) against a single host.
    class AnsibleCommandBuilder
      # Project-level file with default extra-vars, keyed by playbook name.
      AUTOMATION_CONFIG_PATH = "config/automation.yml"

      # @param user_host [String] target in `user@host` or plain `host` form
      # @param options [Hash] CLI options; keys read here are "port", "local",
      #   "ask-sudo", "ask-auth-pass" and "ssh-key-path"
      # @param playbook [String] playbook name (file name without `.yml`)
      # @param vars [Hash] extra variables passed to the playbook
      def initialize(user_host, options, playbook:, vars: {})
        @user_host = user_host
        @options = options
        @playbook = playbook
        @vars = vars
      end

      # Assembles the command.
      #
      # @return [Array<String>] `ansible-playbook` followed by its arguments
      # @raise [RuntimeError] when `ansible-playbook` is not installed, or
      #   when password authentication is requested and `sshpass` is missing
      def get
        unless Cliver.detect("ansible-playbook")
          raise "Can't find `ansible-playbook` executable, to install: " \
                "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
        end

        user = @user_host[/(.*?)\@/, 1]
        host = @user_host[/\@(.+)/, 1] || @user_host
        # The trailing comma makes ansible treat the value as an inline host
        # list rather than a path to an inventory file.
        inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"

        gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir
        playbook_path = gem_dir + "/lib/kimurai/automation/" + "#{@playbook}.yml"

        command = [
          "ansible-playbook", playbook_path,
          "--inventory", inventory,
          "--ssh-extra-args", "-oForwardAgent=yes",
          "--connection", @options["local"] ? "local" : "smart",
          "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
        ]

        # Explicitly passed vars win; the config file only fills the gaps.
        # Work on a copy so the caller's hash is not modified.
        extra_vars = @vars.dup
        automation_defaults.each do |key, value|
          extra_vars[key] = value unless extra_vars[key]
        end

        extra_vars.each do |key, value|
          next unless value.present?
          command.push "--extra-vars", "#{key}=#{value}"
        end

        command.push "--user", user if user
        command.push "--ask-become-pass" if @options["ask-sudo"]

        if @options["ask-auth-pass"]
          unless Cliver.detect("sshpass")
            raise "Can't find `sshpass` executable for password authentication, to install: " \
                  "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
          end

          command.push "--ask-pass"
        end

        if ssh_key_path = @options["ssh-key-path"]
          command.push "--private-key", ssh_key_path
        end

        command
      end

      private

      # Reads the section of `config/automation.yml` that belongs to the
      # current playbook. Returns an empty hash when the file is absent,
      # empty, or has no mapping for this playbook.
      def automation_defaults
        return {} unless File.exist?(AUTOMATION_CONFIG_PATH)

        require 'yaml'
        config = YAML.load_file(AUTOMATION_CONFIG_PATH)
        section = config.is_a?(Hash) ? config[@playbook] : nil
        section.is_a?(Hash) ? section : {}
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
module Kimurai
  class CLI
    # Thor generator behind the CLI scaffolding commands: creates a new
    # project from the bundled template, a single spider file, or a
    # `schedule.rb` file.
    class Generator < Thor::Group
      include Thor::Actions

      # Directory that Thor resolves relative source paths against: the
      # parent of this file's directory, which is where `template/` lives.
      def self.source_root
        File.dirname(File.expand_path('..', __FILE__))
      end

      # Copies the project template into `project_name`, then runs
      # `bundle install` and `git init` inside the new directory.
      def generate_project(project_name)
        directory "template", project_name
        inside(project_name) do
          run "bundle install"
          run "git init"
        end
      end

      # Creates a spider skeleton.
      #
      # @param spider_name [String] spider name; also used for the file name
      # @param in_project [Boolean] when true the file goes to `spiders/` and
      #   inherits from ApplicationSpider; otherwise a standalone runnable
      #   script is written to the current directory
      # @raise [RuntimeError] if the target file already exists
      def generate_spider(spider_name, in_project:)
        spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
        raise "Spider #{spider_path} already exists" if File.exist? spider_path

        spider_class = to_spider_class(spider_name)
        create_file spider_path do
          <<~RUBY
            class #{spider_class} < #{in_project ? 'ApplicationSpider' : 'Kimurai::Base'}
              @name = "#{spider_name}"
              @start_urls = []
              @config = {}

              def parse(response, url:, data: {})
              end
            end
          RUBY
        end

        unless in_project
          # A standalone spider has no ApplicationSpider to inherit settings
          # from, so give it an engine, load the gem and start the crawl.
          # The inserted line is indented to match the generated class body.
          insert_into_file spider_path, "  @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
          prepend_to_file spider_path, "require 'kimurai'\n\n"
          append_to_file spider_path, "\n#{spider_class}.crawl!"
        end
      end

      # Copies the template `schedule.rb` into the current directory.
      def generate_schedule
        copy_file "template/config/schedule.rb", "./schedule.rb"
      end

      private

      # Derives a class name from a spider name, e.g.
      # "google_spider.com" => "GoogleSpiderCom". Underscores and dots are
      # dropped and the following segment capitalized; a dash becomes "Dash".
      def to_spider_class(string)
        string.sub(/^./) { $&.capitalize }
          .gsub(/(?:_|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
          .gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{$2.capitalize}" }
          .gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
class Array
  # Deals the elements out round-robin into `number` groups: element 0 goes
  # to group 0, element 1 to group 1, ..., element `number` to group 0 again.
  #
  # Slicing is delegated to `in_groups_of`, so `fill_width` has that method's
  # meaning: the value used to pad the last, shorter slice (pass `false` to
  # disable padding).
  #
  # @return [Array<Array>] exactly `number` groups
  def in_sorted_groups(number, fill_width = nil)
    buckets = Array.new(number) { [] }

    in_groups_of(number, fill_width).each do |slice|
      # A slice never has more than `number` elements, and an unpadded last
      # slice simply contributes nothing to the remaining buckets.
      slice.each_with_index do |element, position|
        buckets[position] << element
      end
    end

    buckets
  end
end
@@ -0,0 +1,19 @@
1
class Numeric
  # Formats a number of seconds as a short human-readable duration showing
  # the two most significant units, e.g. "1d, 1h", "1h, 1m", "1m, 30s", "42s".
  # The receiver is truncated to an integer first.
  #
  # Based on https://stackoverflow.com/a/1679963
  #
  # @return [String, nil] nil when the value is negative
  def duration
    total_seconds = to_int
    total_minutes, seconds = total_seconds.divmod(60)
    total_hours, minutes = total_minutes.divmod(60)
    days, hours = total_hours.divmod(24)

    return "#{days}d, #{hours}h" if days > 0
    return "#{total_hours}h, #{minutes}m" if total_hours > 0
    return "#{total_minutes}m, #{seconds}s" if total_minutes > 0

    "#{total_seconds}s" if total_seconds >= 0
  end
end
@@ -0,0 +1,7 @@
1
+ require 'murmurhash3'
2
+
3
class String
  # Returns a numeric id for the string: the result of
  # `MurmurHash3::V32.str_hash` applied to the receiver.
  # NOTE(review): presumably a 32-bit integer, judging by the `V32`
  # namespace -- confirm against the murmurhash3 gem.
  def to_id
    hasher = MurmurHash3::V32
    hasher.str_hash(self)
  end
end
@@ -0,0 +1,25 @@
1
module Kimurai
  # Base class for item pipelines. A pipeline holds a reference to the
  # spider it is attached to and forwards uniqueness checks and saving to it.
  class Pipeline
    # Error class for signalling that an item should be dropped.
    class DropItemError < StandardError; end

    class << self
      # Pipeline identifier: the class name with everything up to the first
      # `::` removed, underscored and converted to a symbol.
      def name
        to_s.sub(/.*?::/, "").underscore.to_sym
      end
    end

    include BaseHelper
    attr_accessor :spider

    # Same identifier as the class-level `name`.
    def name
      self.class.name
    end

    ###

    # Forwards to `spider.unique?`.
    def unique?(scope, value)
      spider.unique?(scope, value)
    end

    # Forwards to `spider.save_to` with the same arguments.
    def save_to(path, item, format:, position: true)
      spider.save_to(path, item, format: format, position: position)
    end
  end
end
@@ -0,0 +1,72 @@
1
+ require 'pmap'
2
+
3
module Kimurai
  # Runs every spider returned by `Kimurai.list`, each as a separate
  # `bundle exec kimurai crawl <name>` child process, with up to `jobs`
  # children at a time. Each child's stdout and stderr are redirected to
  # `log/<spider_name>.log`.
  class Runner
    attr_reader :jobs, :spiders

    # @param parallel_jobs [Integer] number of spiders to run concurrently
    def initialize(parallel_jobs:)
      @jobs = parallel_jobs
      @spiders = Kimurai.list

      if time_zone = Kimurai.configuration.time_zone
        Kimurai.time_zone = time_zone
      end
    end

    # Starts the run and blocks until all spiders have finished.
    #
    # Calls `runner_at_start_callback` (if configured) before the first
    # spider starts. Registers an `at_exit` hook that stops any remaining
    # children, records the final status and calls `runner_at_stop_callback`
    # (if configured). Both callbacks receive the `run_info` hash.
    def run!
      start_time = Time.now
      run_id = start_time.to_i
      running_pids = []

      # NOTE(review): presumably turns off colorized log output in the
      # spawned spiders, whose output goes to log files -- confirm.
      ENV.store("RBCAT_COLORIZER", "false")

      run_info = {
        id: run_id,
        status: :processing,
        start_time: start_time,
        stop_time: nil,
        environment: Kimurai.env,
        concurrent_jobs: jobs,
        spiders: spiders.keys
      }

      at_exit do
        # Prevent the queue from processing new items while the at_exit body runs
        Thread.list.each { |t| t.kill if t != Thread.main }

        # Kill currently running spiders. A child may already have exited
        # between `Process.wait` returning and its pid being removed from the
        # list; that must not abort the hook before the stop callback runs.
        running_pids.each do |pid|
          begin
            Process.kill("INT", pid)
          rescue Errno::ESRCH
            # process is already gone, nothing to stop
          end
        end

        # `$!` holds the exception that is terminating the process, if any.
        error = $!
        stop_time = Time.now

        if error.nil?
          run_info.merge!(status: :completed, stop_time: stop_time)
        else
          run_info.merge!(status: :failed, error: error.inspect, stop_time: stop_time)
        end

        if at_stop_callback = Kimurai.configuration.runner_at_stop_callback
          at_stop_callback.call(run_info)
        end
        puts "<<< Runner: stopped: #{run_info}"
      end

      puts ">>> Runner: started: #{run_info}"
      if at_start_callback = Kimurai.configuration.runner_at_start_callback
        at_start_callback.call(run_info)
      end

      spiders.peach_with_index(jobs) do |spider, i|
        spider_name = spider[0]
        puts "> Runner: started spider: #{spider_name}, index: #{i}"

        pid = spawn("bundle", "exec", "kimurai", "crawl", spider_name, [:out, :err] => "log/#{spider_name}.log")
        running_pids << pid
        Process.wait pid

        running_pids.delete(pid)
        puts "< Runner: stopped spider: #{spider_name}, index: #{i}"
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ /.bundle
2
+ /cache
3
+ /node_modules
4
+
5
+ /log/*
6
+ !/log/.keep
7
+
8
+ /tmp/*
9
+ !/tmp/.keep
10
+
11
+ /db/*
12
+ !/db/.keep
13
+
14
+ .byebug_history
15
+ *.swp
16
+ .env
17
+
18
+ capybara-*.png
@@ -0,0 +1 @@
1
+ 2.5.1
@@ -0,0 +1,20 @@
1
+ source 'https://rubygems.org'
2
+ git_source(:github) { |repo| "https://github.com/#{repo}.git" }
3
+
4
+ ruby '2.5.1'
5
+
6
+ # Framework
7
+ gem 'kimurai', '~> 1.0'
8
+
9
+ # Require files in directory and child directories recursively
10
+ gem 'require_all'
11
+
12
+ # Dotenv
13
+ gem 'dotenv'
14
+
15
+ # To debug spiders:
16
+ group :development do
17
+ gem 'byebug', platforms: :mri
18
+ gem 'pry'
19
+ end
20
+
@@ -0,0 +1,3 @@
1
+ # README
2
+
3
+ New Kimurai project readme
@@ -0,0 +1,32 @@
1
+ Kimurai.configure do |config|
2
+ # Default logger has colored mode in development.
3
+ # If you would like to disable it, set `colorize_logger` to false.
4
+ # config.colorize_logger = false
5
+
6
+ # Logger level for default logger:
7
+ # config.log_level = :info
8
+
9
+ # Custom logger:
10
+ # config.logger = Logger.new(STDOUT)
11
+
12
+ # Custom time zone (for logs):
13
+ # config.time_zone = "UTC"
14
+ # config.time_zone = "Europe/Moscow"
15
+
16
+ # At start callback for a runner. Accepts argument with info as hash with
17
+ # keys: id, status, start_time, environment, concurrent_jobs, spiders list.
18
+ # For example, you can use this callback to send notification when runner was started:
19
+ # config.runner_at_start_callback = lambda do |info|
20
+ # json = JSON.pretty_generate(info)
21
+ # Sender.send_notification("Started session: #{json}")
22
+ # end
23
+
24
+ # At stop callback for a runner. Accepts argument with info as hash with
25
+ # all `runner_at_start_callback` keys plus additional `stop_time` key. Also `status` contains
26
+ # stop status of a runner (completed or failed).
27
+ # You can use this callback to send notification when runner has been stopped:
28
+ # config.runner_at_stop_callback = lambda do |info|
29
+ # json = JSON.pretty_generate(info)
30
+ # Sender.send_notification("Stopped session: #{json}")
31
+ # end
32
+ end
@@ -0,0 +1,13 @@
1
+ # software versions to install for `setup` command
2
+ setup:
3
+ ruby: 2.5.1
4
+ # check latest here http://phantomjs.org/download.html
5
+ phantomjs: 2.1.1
6
+ # check latest here https://github.com/mozilla/geckodriver/releases/
7
+ geckodriver: 0.21.0
8
+ # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
9
+ chromedriver: 2.39
10
+ # settings for deploy command, you can use cli options as well (--repo-url, --git-key-path)
11
+ deploy:
12
+ # repo_url: git@bitbucket.org:username/repo_name.git
13
+ # repo_key_path: ~/.ssh/id_rsa
@@ -0,0 +1,22 @@
1
+ # require project gems
2
+ require 'bundler/setup'
3
+ Bundler.require(:default, Kimurai.env)
4
+
5
+ # require custom ENV variables located in .env file
6
+ require 'dotenv/load'
7
+
8
+ # require initializers
9
# Dir.glob gives no ordering guarantee on the Ruby version this template
# targets, so sort each list to load files in the same order on every machine.
Dir.glob(File.join("./config/initializers", "*.rb")).sort.each { |file| require file }

# require helpers
Dir.glob(File.join("./helpers", "*.rb")).sort.each { |file| require file }

# require pipelines
Dir.glob(File.join("./pipelines", "*.rb")).sort.each { |file| require file }
16
+
17
+ # require spiders recursively in the `spiders/` folder
18
+ require_relative '../spiders/application_spider'
19
+ require_all "spiders"
20
+
21
+ # require Kimurai configuration
22
+ require_relative 'application'
@@ -0,0 +1,57 @@
1
+ ### Settings ###
2
+ require 'tzinfo'
3
+
4
+ # Export current PATH to the cron
5
+ env :PATH, ENV["PATH"]
6
+
7
+ # Use 24 hour format when using `at:` option
8
+ set :chronic_options, hours24: true
9
+
10
+ # Use local_to_utc helper to setup execution time using your local timezone instead
11
+ # of server's timezone (which is probably and should be UTC, to check run `$ timedatectl`).
12
+ # Also maybe you'll want to set same timezone in kimurai as well (use `Kimurai.configuration.time_zone =` for that),
13
+ # to have spiders logs in a specific time zone format.
14
+ # Example usage of helper:
15
+ # every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
16
+ # crawl "google_spider.com", output: "log/google_spider.com.log"
17
+ # end
18
# Time.parse is provided by the stdlib `time` extension.
require 'time'

# Interprets `time_string` as a local time in the given time zone and returns
# the corresponding UTC time, for use with the `at:` option.
#
# time_string - a time that Time.parse understands, e.g. "7:00"
# zone        - a TZInfo time zone identifier, e.g. "Europe/Moscow"
def local_to_utc(time_string, zone:)
  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
end
21
+
22
+ # Note: by default Whenever exports cron commands with :environment == "production".
23
+ # Note: Whenever can only append log data to a log file (>>). If you want
24
+ # to overwrite (>) log file before each run, pass lambda:
25
+ # crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
26
+
27
+ # Project job types
28
+ job_type :crawl, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output"
29
+ job_type :runner, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output"
30
+
31
+ # Single file job type
32
+ job_type :single, "cd :path && KIMURAI_ENV=:environment ruby :task :output"
33
+ # Single with bundle exec
34
+ job_type :single_bundle, "cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output"
35
+
36
+ ### Schedule ###
37
+ # Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
38
+ # every 1.day do
39
+ # Example to schedule a single spider in the project:
40
+ # crawl "google_spider.com", output: "log/google_spider.com.log"
41
+
42
+ # Example to schedule all spiders in the project using runner. Each spider will write
43
+ # its own output to the `log/spider_name.log` file (handled by the runner itself).
44
+ # Runner output will be written to log/runner.log file.
45
+ # The numeric argument is the number of concurrent jobs:
46
+ # runner 3, output:"log/runner.log"
47
+
48
+ # Example to schedule single spider (without project):
49
+ # single "single_spider.rb", output: "single_spider.log"
50
+ # end
51
+
52
+ ### How to set a cron schedule ###
53
+ # Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
54
+ # If you don't have whenever command, install the gem: `$ gem install whenever`.
55
+
56
+ ### How to cancel a schedule ###
57
+ # Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.