tanakai 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +5 -0
  4. data/CHANGELOG.md +118 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +2038 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/tanakai +6 -0
  12. data/lib/tanakai/automation/deploy.yml +54 -0
  13. data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
  14. data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
  15. data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
  16. data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
  17. data/lib/tanakai/automation/setup.yml +45 -0
  18. data/lib/tanakai/base/saver.rb +106 -0
  19. data/lib/tanakai/base/storage.rb +54 -0
  20. data/lib/tanakai/base.rb +326 -0
  21. data/lib/tanakai/base_helper.rb +22 -0
  22. data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
  23. data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
  24. data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
  25. data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
  26. data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
  27. data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
  28. data/lib/tanakai/browser_builder.rb +20 -0
  29. data/lib/tanakai/capybara_configuration.rb +10 -0
  30. data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
  31. data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
  32. data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
  33. data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
  34. data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
  35. data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
  36. data/lib/tanakai/capybara_ext/session/config.rb +22 -0
  37. data/lib/tanakai/capybara_ext/session.rb +249 -0
  38. data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
  39. data/lib/tanakai/cli/generator.rb +57 -0
  40. data/lib/tanakai/cli.rb +183 -0
  41. data/lib/tanakai/core_ext/array.rb +14 -0
  42. data/lib/tanakai/core_ext/hash.rb +5 -0
  43. data/lib/tanakai/core_ext/numeric.rb +19 -0
  44. data/lib/tanakai/core_ext/string.rb +7 -0
  45. data/lib/tanakai/pipeline.rb +33 -0
  46. data/lib/tanakai/runner.rb +60 -0
  47. data/lib/tanakai/template/.gitignore +18 -0
  48. data/lib/tanakai/template/Gemfile +28 -0
  49. data/lib/tanakai/template/README.md +3 -0
  50. data/lib/tanakai/template/config/application.rb +37 -0
  51. data/lib/tanakai/template/config/automation.yml +13 -0
  52. data/lib/tanakai/template/config/boot.rb +22 -0
  53. data/lib/tanakai/template/config/initializers/.keep +0 -0
  54. data/lib/tanakai/template/config/schedule.rb +57 -0
  55. data/lib/tanakai/template/db/.keep +0 -0
  56. data/lib/tanakai/template/helpers/application_helper.rb +3 -0
  57. data/lib/tanakai/template/lib/.keep +0 -0
  58. data/lib/tanakai/template/log/.keep +0 -0
  59. data/lib/tanakai/template/pipelines/saver.rb +11 -0
  60. data/lib/tanakai/template/pipelines/validator.rb +24 -0
  61. data/lib/tanakai/template/spiders/application_spider.rb +143 -0
  62. data/lib/tanakai/template/tmp/.keep +0 -0
  63. data/lib/tanakai/version.rb +3 -0
  64. data/lib/tanakai.rb +54 -0
  65. data/tanakai.gemspec +50 -0
  66. metadata +382 -0
@@ -0,0 +1,183 @@
1
+ require 'thor'
2
+
3
+ module Tanakai
4
+ class CLI < Thor
5
+ map %w[--version -v] => :__print_version
6
+
7
# `generate` command: dispatches to Generator based on the requested type.
#
# generator_type - "project", "spider" or "schedule".
# args           - remaining CLI arguments; the first one is used as the
#                  project/spider name where a name is required.
#
# Raises RuntimeError when the name is missing or the type is unknown.
desc "generate", "Generator, available types: project, spider, schedule"
def generate(generator_type, *args)
  target_name = args.first

  if generator_type == "project"
    raise "Provide project name to generate a new project" unless target_name.present?
    Generator.new.generate_project(target_name)
  elsif generator_type == "spider"
    raise "Provide spider name to generate a spider" unless target_name.present?
    # The generator is told whether we are currently inside a project directory.
    Generator.new.generate_spider(target_name, in_project: inside_project?)
  elsif generator_type == "schedule"
    Generator.new.generate_schedule
  else
    raise "Don't know this generator type: #{generator_type}"
  end
end
24
+
25
+ ###
26
+
27
# `setup` command: builds the command for the "setup" playbook with
# AnsibleCommandBuilder, runs it as a child process and waits for it to exit.
#
# user_host - target passed through to AnsibleCommandBuilder.
desc "setup", "Setup server"
option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
option "ask-auth-pass", type: :boolean, banner: "Auth using password"
option "ssh-key-path", type: :string, banner: "Auth using ssh key"
option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
def setup(user_host)
  ansible_command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get

  # The command is an argument list, so it is splatted into spawn.
  Process.wait(spawn(*ansible_command))
end
39
+
40
# `deploy` command: optionally verifies the local git repository state, then
# runs the "deploy" playbook command built by AnsibleCommandBuilder.
#
# user_host - target passed through to AnsibleCommandBuilder.
#
# Unless --skip-check is given, raises RuntimeError when there are uncommitted
# changes, when no git remote is configured, or when `master` and
# `origin/master` differ.
desc "deploy", "Deploy project to the server and update cron schedule"
option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
option "ask-auth-pass", type: :boolean, banner: "Auth using password"
option "ssh-key-path", type: :string, banner: "Auth using ssh key"
option "repo-url", type: :string, banner: "Repo url"
option "repo-key-path", type: :string, banner: "SSH key for a git repo"
option "skip-check", type: :boolean, default: false, banner: "Skip git repository checks"
def deploy(user_host)
  unless options["skip-check"]
    # Checks run in order; each git command is only executed if the previous check passed.
    raise "Deploy: Please commit your changes first" unless `git status --short`.empty?
    raise "Deploy: Please add remote origin repository to your repo first" if `git remote`.empty?
    unless `git rev-list master...origin/master`.empty?
      raise "Deploy: Please push your commits to the remote origin repo first"
    end
  end

  # Fall back to the `origin` remote url when --repo-url is not provided.
  repo_url = options["repo-url"] || `git remote get-url origin`.strip
  # Repository name is the path segment right before ".git".
  repo_name = repo_url[/\/([^\/]*)\.git/i, 1]
  playbook_vars = { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }

  ansible_command = AnsibleCommandBuilder.new(
    user_host, options, playbook: "deploy", vars: playbook_vars
  ).get

  Process.wait(spawn(*ansible_command))
end
68
+
69
+ ###
70
+
71
# `crawl` command: boots the project, looks the spider up by name and runs
# its `crawl!` class method.
#
# spider_name - name passed to Tanakai.find_by_name.
#
# Raises RuntimeError outside of a project or when the spider is not found.
desc "crawl", "Run a particular spider by it's name"
def crawl(spider_name)
  raise "Can't find Tanakai project" unless inside_project?
  require './config/boot'

  spider_class = Tanakai.find_by_name(spider_name)
  unless spider_class
    raise "Can't find spider with name `#{spider_name}` in the project. " \
      "To list all available spiders, run: `$ bundle exec tanakai list`"
  end

  # Apply the configured time zone, if one is set.
  configured_zone = Tanakai.configuration.time_zone
  Tanakai.time_zone = configured_zone if configured_zone

  spider_class.crawl!
end
88
+
89
# `parse` command: boots the project, looks the spider up by name and calls
# its `parse!` class method with the given method name and the --url option.
#
# spider_name - name passed to Tanakai.find_by_name.
# method_name - spider method name forwarded to `parse!`.
#
# Raises RuntimeError outside of a project or when the spider is not found.
desc "parse", "Parse url in the particular spider method"
option :url, type: :string, required: true, banner: "Url to pass to the method"
def parse(spider_name, method_name)
  raise "Can't find Tanakai project" unless inside_project?
  require './config/boot'

  spider_class = Tanakai.find_by_name(spider_name)
  unless spider_class
    raise "Can't find spider with name `#{spider_name}` in the project. " \
      "To list all available spiders, run: `$ bundle exec tanakai list`"
  end

  spider_class.parse!(method_name, url: options["url"])
end
102
+
103
# `console` command: starts a console on a spider instance.
#
# spider_name - optional spider name. When given, a project is required and
#               the spider is looked up with Tanakai.find_by_name. When
#               omitted, ApplicationSpider is used inside a project and
#               ::Tanakai::Base otherwise.
#
# With --url the instance's `request_to(:console, url: ...)` is called,
# otherwise its `console` method is invoked directly.
desc "console", "Start Tanakai console"
option :engine, type: :string, banner: "Engine to use"
option :url, type: :string, banner: "Url to process"
def console(spider_name = nil)
  require 'pry'
  require './config/boot' if inside_project?

  spider_class =
    if spider_name
      raise "Can't find Tanakai project" unless inside_project?

      found = Tanakai.find_by_name(spider_name)
      unless found
        raise "Can't find spider with name `#{spider_name}` in the project. " \
          "To list all available spiders, run: `$ bundle exec tanakai list`"
      end
      found
    elsif inside_project?
      ApplicationSpider
    else
      ::Tanakai::Base
    end

  # Accept both "engine" and ":engine" spellings on the command line.
  engine = options["engine"]&.delete(":")&.to_sym
  target_url = options["url"]

  spider = spider_class.new(engine)
  if target_url
    spider.request_to(:console, url: target_url)
  else
    spider.public_send(:console)
  end
end
128
+
129
# `list` command: boots the project and prints the keys of Tanakai.list
# in sorted order, one per line.
#
# Raises RuntimeError outside of a project.
desc "list", "List all available spiders in the current project"
def list
  raise "Can't find Tanakai project" unless inside_project?
  require './config/boot'

  Tanakai.list.keys.sort.each do |spider_name|
    puts spider_name
  end
end
136
+
137
# `runner` command: runs the selected spiders through Runner.
#
# The spider list is --include (or every key of Tanakai.list when --include
# is blank) minus --exclude. --jobs is passed to Runner as the second argument.
#
# Raises RuntimeError outside of a project or when --jobs is 0.
desc "runner", "Run all spiders in the project in queue"
option :include, type: :array, default: [], banner: "List of spiders to run"
option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
def runner
  raise "Can't find Tanakai project" unless inside_project?

  jobs_count = options["jobs"]
  raise "Jobs count can't be 0" if jobs_count.zero?

  require './config/boot'
  require 'tanakai/runner'

  selected = options["include"].presence || Tanakai.list.keys
  Runner.new(selected - options["exclude"], jobs_count).run!
end
155
+
156
# Prints the VERSION constant. Reached through the `--version` / `-v`
# mapping declared at the top of the class.
desc "--version, -v", "Print the version"
def __print_version
  puts VERSION
end
160
+
161
# `dashboard` command: boots the project and starts Tanakai::Dashboard::App.
#
# Raises RuntimeError outside of a project, or when the Tanakai::Dashboard
# constant is not defined after the project has been booted.
desc "dashboard", "Run dashboard"
def dashboard
  raise "Can't find Tanakai project" unless inside_project?

  require './config/boot'
  raise "Tanakai::Dashboard is not defined" unless Object.const_defined?("Tanakai::Dashboard")

  require 'tanakai/dashboard/app'
  Tanakai::Dashboard::App.run!
end
173
+
174
+ private
175
+
176
# Tells whether the current working directory looks like a Tanakai project:
# it must contain a `spiders` directory and a `config/boot.rb` file.
#
# Uses `exist?` because the deprecated `Dir.exists?` / `File.exists?`
# aliases were removed in Ruby 3.2 and raise NoMethodError there.
#
# Returns true or false.
def inside_project?
  Dir.exist?("spiders") && File.exist?("./config/boot.rb")
end
179
+ end
180
+ end
181
+
182
+ require_relative 'cli/generator'
183
+ require_relative 'cli/ansible_command_builder'
@@ -0,0 +1,14 @@
1
class Array
  # Distributes the elements round-robin into `number` groups: element 0 goes
  # to group 0, element 1 to group 1, ..., element `number` to group 0 again.
  #
  # number     - how many groups to build.
  # fill_width - value appended to the groups that would otherwise end up one
  #              element shorter (defaults to nil, so those groups are padded
  #              with nil). Pass false to disable padding.
  #
  # Returns an Array of `number` Arrays.
  #
  # Implemented with each_slice instead of `in_groups_of` plus an inline
  # `rescue`, so no error is silently swallowed while detecting a short
  # trailing slice.
  def in_sorted_groups(number, fill_width = nil)
    sorted_groups = Array.new(number) { [] }

    each_slice(number) do |slice|
      slice.each_with_index { |item, i| sorted_groups[i] << item }
      next if fill_width == false

      # Only the last slice can be short; pad the groups it did not reach.
      (slice.size...number).each { |i| sorted_groups[i] << fill_width }
    end

    sorted_groups
  end
end
@@ -0,0 +1,5 @@
1
class Hash
  # Merges `second` into the receiver, deep-merging every key except the
  # ones listed in `exclude`, which are merged shallowly (the value from
  # `second` replaces the receiver's value as a whole).
  #
  # second  - Hash to merge in.
  # exclude - Array of keys of `second` that must not be deep-merged.
  #
  # Returns a new Hash.
  def deep_merge_excl(second, exclude)
    shallow_part = second.slice(*exclude)
    deep_part = second.except(*exclude)

    merge(shallow_part).deep_merge(deep_part)
  end
end
@@ -0,0 +1,19 @@
1
class Numeric
  # Formats the receiver, taken as a number of seconds, as a short
  # human-readable duration built from its two largest units.
  # Based on https://stackoverflow.com/a/1679963
  #
  # The receiver is truncated with `to_int` first.
  #
  # Returns "Nd, Nh", "Nh, Nm", "Nm, Ns" or "Ns"; returns nil for a
  # negative receiver.
  def duration
    total_secs = to_int
    total_mins, secs_part = total_secs.divmod(60)
    total_hours, mins_part = total_mins.divmod(60)
    days, hours_part = total_hours.divmod(24)

    return "#{days}d, #{hours_part}h" if days.positive?
    return "#{total_hours}h, #{mins_part}m" if total_hours.positive?
    return "#{total_mins}m, #{secs_part}s" if total_mins.positive?

    "#{total_secs}s" if total_secs >= 0
  end
end
@@ -0,0 +1,7 @@
1
+ require 'murmurhash3'
2
+
3
class String
  # Hashes the string with MurmurHash3::V32.str_hash and returns the result.
  # NOTE(review): presumably a 32-bit integer id - confirm against the
  # murmurhash3 gem.
  def to_id
    MurmurHash3::V32.str_hash(self)
  end
end
@@ -0,0 +1,33 @@
1
module Tanakai
  # Base class for item pipelines. A pipeline holds a reference to the spider
  # it is attached to and forwards a few helpers to it.
  class Pipeline
    # Error class for pipelines to raise when an item should be dropped.
    class DropItemError < StandardError; end

    class << self
      # Pipeline name as a Symbol: the class name with everything up to and
      # including the first `::` removed, underscored.
      def name
        to_s.sub(/.*?::/, "").underscore.to_sym
      end
    end

    include BaseHelper
    attr_accessor :spider

    # Same as the class-level name.
    def name
      self.class.name
    end

    ### Helpers delegated to the spider

    # Returns the spider's storage.
    def storage
      spider.storage
    end

    # Forwards to the spider's `unique?` with the same scope and value.
    def unique?(scope, value)
      spider.unique?(scope, value)
    end

    # Forwards to the spider's `save_to` with the same arguments.
    def save_to(path, item, format:, position: true, append: false)
      spider.save_to(path, item, format: format, position: position, append: append)
    end

    # Returns the spider's logger.
    def logger
      spider.logger
    end
  end
end
@@ -0,0 +1,60 @@
1
+ require 'pmap'
2
+
3
module Tanakai
  # Runs a list of spiders, each one in its own `bundle exec tanakai crawl`
  # child process, and keeps a `session_info` hash describing the run.
  class Runner
    attr_reader :jobs, :spiders, :session_info

    # spiders       - list of spider names to run.
    # parallel_jobs - value passed to `peach_with_index` as the jobs count.
    #
    # Also applies the configured time zone (if any) and exports the
    # SESSION_ID and RBCAT_COLORIZER environment variables.
    def initialize(spiders, parallel_jobs)
      @jobs = parallel_jobs
      @spiders = spiders
      @start_time = Time.now
      session_id = @start_time.to_i

      @session_info = {
        id: session_id,
        status: :processing,
        start_time: @start_time,
        stop_time: nil,
        environment: Tanakai.env,
        concurrent_jobs: @jobs,
        spiders: @spiders
      }

      configured_zone = Tanakai.configuration.time_zone
      Tanakai.time_zone = configured_zone if configured_zone

      ENV["SESSION_ID"] = session_id.to_s
      ENV["RBCAT_COLORIZER"] = "false"
    end

    # Runs every spider and updates `session_info` with the final status.
    #
    # exception_on_fail - when true (default) a rescued error is re-raised;
    #                     when false, `[session_info, error]` is returned.
    #
    # Returns `session_info` on success. The configured start/stop callbacks,
    # when present, are called with `session_info`; the stop callback runs in
    # `ensure`, so it fires on failure too.
    def run!(exception_on_fail: true)
      puts ">>> Runner: started: #{session_info}"
      start_hook = Tanakai.configuration.runner_at_start_callback
      start_hook.call(session_info) if start_hook

      keep_going = true
      spiders.peach_with_index(jobs) do |spider_name, index|
        next unless keep_going

        puts "> Runner: started spider: #{spider_name}, index: #{index}"
        # Each spider's stdout and stderr go to its own log file.
        child_pid = spawn(
          "bundle", "exec", "tanakai", "crawl", spider_name,
          [:out, :err] => "log/#{spider_name}.log"
        )
        Process.wait(child_pid)

        puts "< Runner: stopped spider: #{spider_name}, index: #{index}"
      end
    rescue StandardError, SignalException, SystemExit => error
      keep_going = false

      session_info.merge!(status: :failed, error: error.inspect, stop_time: Time.now)
      exception_on_fail ? raise(error) : [session_info, error]
    else
      session_info.merge!(status: :completed, stop_time: Time.now)
    ensure
      stop_hook = Tanakai.configuration.runner_at_stop_callback
      stop_hook.call(session_info) if stop_hook

      puts "<<< Runner: stopped: #{session_info}"
    end
  end
end
@@ -0,0 +1,18 @@
1
+ /.bundle
2
+ /cache
3
+ /node_modules
4
+
5
+ /log/*
6
+ !/log/.keep
7
+
8
+ /tmp/*
9
+ !/tmp/.keep
10
+
11
+ /db/*
12
+ !/db/.keep
13
+
14
+ .byebug_history
15
+ *.swp
16
+ .env
17
+
18
+ capybara-*.png
@@ -0,0 +1,28 @@
1
+ source 'https://rubygems.org'
2
+ git_source(:github) { |repo| "https://github.com/#{repo}.git" }
3
+
4
+ ruby '>= 2.5'
5
+
6
+ # Framework
7
+ gem 'tanakai'
8
+
9
+ # Require files in directory and child directories recursively
10
+ gem 'require_all'
11
+
12
+ # Dotenv
13
+ gem 'dotenv'
14
+
15
+ # To debug spiders:
16
+ group :development do
17
+ gem 'byebug', platforms: :mri
18
+ gem 'pry'
19
+ end
20
+
21
+ # If you want to save items to the database, require one of these gems:
22
+ # gem 'sqlite3'
23
+ # gem 'pg'
24
+ # gem 'mysql2'
25
+
26
+ # And use your preferred ORM/database connector:
27
+ # gem 'activerecord', require: 'active_record'
28
+ # gem 'sequel'
@@ -0,0 +1,3 @@
1
+ # README
2
+
3
+ New Tanakai project readme
@@ -0,0 +1,37 @@
1
+ Tanakai.configure do |config|
2
+ # Default logger has colored mode in development.
3
+ # If you would like to disable it, set `colorize_logger` to false.
4
+ # config.colorize_logger = false
5
+
6
+ # Logger level for default logger:
7
+ # config.log_level = :info
8
+
9
+ # Custom logger:
10
+ # config.logger = Logger.new(STDOUT)
11
+
12
+ # Custom time zone (for logs):
13
+ # config.time_zone = "UTC"
14
+ # config.time_zone = "Europe/Moscow"
15
+
16
+ # At start callback for a runner. Accepts argument with info as hash with
17
+ # keys: id, status, start_time, environment, concurrent_jobs, spiders list.
18
+ # For example, you can use this callback to send notification when runner was started:
19
+ # config.runner_at_start_callback = lambda do |info|
20
+ # json = JSON.pretty_generate(info)
21
+ # Sender.send_notification("Started session: #{json}")
22
+ # end
23
+
24
+ # At stop callback for a runner. Accepts argument with info as hash with
25
+ # all `runner_at_start_callback` keys plus additional `stop_time` key. Also `status` contains
26
+ # stop status of a runner (completed or failed).
27
+ # You can use this callback to send notification when runner has been stopped:
28
+ # config.runner_at_stop_callback = lambda do |info|
29
+ # json = JSON.pretty_generate(info)
30
+ # Sender.send_notification("Stopped session: #{json}")
31
+ # end
32
+
33
+ # Provide custom chrome binary path (default is any available chrome/chromium in the PATH):
34
+ # config.selenium_chrome_path = "/usr/bin/chromium-browser"
35
+ # Provide custom selenium chromedriver path (default is "/usr/local/bin/chromedriver"):
36
+ # config.chromedriver_path = "/usr/local/bin/chromedriver"
37
+ end
@@ -0,0 +1,13 @@
1
+ # software versions to install for `setup` command
2
+ setup:
3
+ ruby: 2.5.1
4
+ # check latest here http://phantomjs.org/download.html
5
+ phantomjs: 2.1.1
6
+ # check latest here https://github.com/mozilla/geckodriver/releases/
7
+ geckodriver: 0.21.0
8
+ # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
9
+ chromedriver: 2.39
10
+ # settings for deploy command, you can use cli options as well (--repo-url, --repo-key-path)
11
+ deploy:
12
+ # repo_url: git@bitbucket.org:username/repo_name.git
13
+ # repo_key_path: ~/.ssh/id_rsa
@@ -0,0 +1,22 @@
1
+ # require project gems
2
+ require 'bundler/setup'
3
+ Bundler.require(:default, Tanakai.env)
4
+
5
+ # require custom ENV variables located in .env file
6
+ require 'dotenv/load'
7
+
8
+ # require initializers
9
+ Dir.glob(File.join("./config/initializers", "*.rb"), &method(:require))
10
+
11
+ # require helpers
12
+ Dir.glob(File.join("./helpers", "*.rb"), &method(:require))
13
+
14
+ # require pipelines
15
+ Dir.glob(File.join("./pipelines", "*.rb"), &method(:require))
16
+
17
+ # require spiders recursively in the `spiders/` folder
18
+ require_relative '../spiders/application_spider'
19
+ require_all "spiders"
20
+
21
+ # require Tanakai configuration
22
+ require_relative 'application'
File without changes
@@ -0,0 +1,57 @@
1
+ ### Settings ###
2
+ require 'tzinfo'
3
+
4
+ # Export current PATH to the cron
5
+ env :PATH, ENV["PATH"]
6
+
7
+ # Use 24 hour format when using `at:` option
8
+ set :chronic_options, hours24: true
9
+
10
+ # Use local_to_utc helper to setup execution time using your local timezone instead
11
+ # of server's timezone (which is probably and should be UTC, to check run `$ timedatectl`).
12
+ # Also maybe you'll want to set same timezone in tanakai as well (use `Tanakai.configuration.time_zone =` for that),
13
+ # to have spiders logs in a specific time zone format.
14
+ # Example usage of helper:
15
+ # every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
16
+ # crawl "google_spider.com", output: "log/google_spider.com.log"
17
+ # end
18
# Parses `time_string` with Time.parse and converts it with the
# `local_to_utc` method of the TZInfo timezone identified by `zone`.
#
# time_string - time of day, e.g. "7:00".
# zone        - TZInfo timezone identifier, e.g. "Europe/Moscow".
def local_to_utc(time_string, zone:)
  timezone = TZInfo::Timezone.get(zone)
  local_time = Time.parse(time_string)

  timezone.local_to_utc(local_time)
end
21
+
22
+ # Note: by default Whenever exports cron commands with :environment == "production".
23
+ # Note: Whenever can only append log data to a log file (>>). If you want
24
+ # to overwrite (>) log file before each run, pass lambda:
25
+ # crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
26
+
27
+ # Project job types
28
+ job_type :crawl, "cd :path && TANAKAI_ENV=:environment bundle exec tanakai crawl :task :output"
29
+ job_type :runner, "cd :path && TANAKAI_ENV=:environment bundle exec tanakai runner --jobs :task :output"
30
+
31
+ # Single file job type
32
+ job_type :single, "cd :path && TANAKAI_ENV=:environment ruby :task :output"
33
+ # Single with bundle exec
34
+ job_type :single_bundle, "cd :path && TANAKAI_ENV=:environment bundle exec ruby :task :output"
35
+
36
+ ### Schedule ###
37
+ # Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
38
+ # every 1.day do
39
+ # Example to schedule a single spider in the project:
40
+ # crawl "google_spider.com", output: "log/google_spider.com.log"
41
+
42
+ # Example to schedule all spiders in the project using runner. Each spider will write
43
+ # its own output to the `log/spider_name.log` file (handled by the runner itself).
44
+ # Runner output will be written to log/runner.log file.
45
+ # The numeric argument is the number of concurrent jobs:
46
+ # runner 3, output:"log/runner.log"
47
+
48
+ # Example to schedule single spider (without project):
49
+ # single "single_spider.rb", output: "single_spider.log"
50
+ # end
51
+
52
+ ### How to set a cron schedule ###
53
+ # Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
54
+ # If you don't have whenever command, install the gem: `$ gem install whenever`.
55
+
56
+ ### How to cancel a schedule ###
57
+ # Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.
File without changes
@@ -0,0 +1,3 @@
1
+ module ApplicationHelper
2
+ # Put custom methods here; they will be available to any spider
3
+ end
File without changes
File without changes
@@ -0,0 +1,11 @@
1
+ class Saver < Tanakai::Pipeline
2
+ def process_item(item, options: {})
3
+ # Here you can save item to the database, send it to a remote API or
4
+ # simply save item to a file format using `save_to` helper:
5
+
6
+ # To get the name of a current spider: `spider.class.name`
7
+ # save_to "db/#{spider.class.name}.json", item, format: :pretty_json
8
+
9
+ item
10
+ end
11
+ end
@@ -0,0 +1,24 @@
1
+ class Validator < Tanakai::Pipeline
2
+ def process_item(item, options: {})
3
+ # Here you can validate item and raise `DropItemError`
4
+ # if one of the validations failed. Examples:
5
+
6
+ # Check item sku for uniqueness using built-in `unique?` helper:
7
+ # unless unique?(:sku, item[:sku])
8
+ # raise DropItemError, "Item sku is not unique"
9
+ # end
10
+
11
+ # Drop item if title length shorter than 5 symbols:
12
+ # if item[:title].size < 5
13
+ # raise DropItemError, "Item title is short"
14
+ # end
15
+
16
+ # Drop item if it doesn't contain any images:
17
+ # unless item[:images].present?
18
+ # raise DropItemError, "Item images are not present"
19
+ # end
20
+
21
+ # Pass item to the next pipeline (if it wasn't dropped)
22
+ item
23
+ end
24
+ end