tanakai 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +5 -0
  4. data/CHANGELOG.md +118 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +2038 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/tanakai +6 -0
  12. data/lib/tanakai/automation/deploy.yml +54 -0
  13. data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
  14. data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
  15. data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
  16. data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
  17. data/lib/tanakai/automation/setup.yml +45 -0
  18. data/lib/tanakai/base/saver.rb +106 -0
  19. data/lib/tanakai/base/storage.rb +54 -0
  20. data/lib/tanakai/base.rb +326 -0
  21. data/lib/tanakai/base_helper.rb +22 -0
  22. data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
  23. data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
  24. data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
  25. data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
  26. data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
  27. data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
  28. data/lib/tanakai/browser_builder.rb +20 -0
  29. data/lib/tanakai/capybara_configuration.rb +10 -0
  30. data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
  31. data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
  32. data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
  33. data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
  34. data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
  35. data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
  36. data/lib/tanakai/capybara_ext/session/config.rb +22 -0
  37. data/lib/tanakai/capybara_ext/session.rb +249 -0
  38. data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
  39. data/lib/tanakai/cli/generator.rb +57 -0
  40. data/lib/tanakai/cli.rb +183 -0
  41. data/lib/tanakai/core_ext/array.rb +14 -0
  42. data/lib/tanakai/core_ext/hash.rb +5 -0
  43. data/lib/tanakai/core_ext/numeric.rb +19 -0
  44. data/lib/tanakai/core_ext/string.rb +7 -0
  45. data/lib/tanakai/pipeline.rb +33 -0
  46. data/lib/tanakai/runner.rb +60 -0
  47. data/lib/tanakai/template/.gitignore +18 -0
  48. data/lib/tanakai/template/Gemfile +28 -0
  49. data/lib/tanakai/template/README.md +3 -0
  50. data/lib/tanakai/template/config/application.rb +37 -0
  51. data/lib/tanakai/template/config/automation.yml +13 -0
  52. data/lib/tanakai/template/config/boot.rb +22 -0
  53. data/lib/tanakai/template/config/initializers/.keep +0 -0
  54. data/lib/tanakai/template/config/schedule.rb +57 -0
  55. data/lib/tanakai/template/db/.keep +0 -0
  56. data/lib/tanakai/template/helpers/application_helper.rb +3 -0
  57. data/lib/tanakai/template/lib/.keep +0 -0
  58. data/lib/tanakai/template/log/.keep +0 -0
  59. data/lib/tanakai/template/pipelines/saver.rb +11 -0
  60. data/lib/tanakai/template/pipelines/validator.rb +24 -0
  61. data/lib/tanakai/template/spiders/application_spider.rb +143 -0
  62. data/lib/tanakai/template/tmp/.keep +0 -0
  63. data/lib/tanakai/version.rb +3 -0
  64. data/lib/tanakai.rb +54 -0
  65. data/tanakai.gemspec +50 -0
  66. metadata +382 -0
@@ -0,0 +1,183 @@
1
+ require 'thor'
2
+
3
module Tanakai
  # Thor-based command line interface for Tanakai.
  #
  # Commands fall into three groups:
  # * generators: `generate`
  # * server automation through Ansible playbooks: `setup`, `deploy`
  # * project commands: `crawl`, `parse`, `console`, `list`, `runner`, `dashboard`
  #
  # Project commands must be run from a project root (see `inside_project?`).
  class CLI < Thor
    map %w[--version -v] => :__print_version

    desc "generate", "Generator, available types: project, spider, schedule"
    # Dispatches to the Generator for the requested type.
    # Raises RuntimeError when the type is unknown or a required name is missing.
    def generate(generator_type, *args)
      case generator_type
      when "project"
        project_name = args.shift
        raise "Provide project name to generate a new project" unless project_name.present?
        Generator.new.generate_project(project_name)
      when "spider"
        spider_name = args.shift
        raise "Provide spider name to generate a spider" unless spider_name.present?
        Generator.new.generate_spider(spider_name, in_project: inside_project?)
      when "schedule"
        Generator.new.generate_schedule
      else
        raise "Don't know this generator type: #{generator_type}"
      end
    end

    ###

    desc "setup", "Setup server"
    option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
    option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
    option "ask-auth-pass", type: :boolean, banner: "Auth using password"
    option "ssh-key-path", type: :string, banner: "Auth using ssh key"
    option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
    # Builds the command for the "setup" playbook and runs it in a child
    # process, blocking until it exits.
    def setup(user_host)
      command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get
      spawn_and_wait(command)
    end

    desc "deploy", "Deploy project to the server and update cron schedule"
    option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
    option "ask-auth-pass", type: :boolean, banner: "Auth using password"
    option "ssh-key-path", type: :string, banner: "Auth using ssh key"
    option "repo-url", type: :string, banner: "Repo url"
    option "repo-key-path", type: :string, banner: "SSH key for a git repo"
    option "skip-check", type: :boolean, default: false, banner: "Skip git repository checks"
    # Runs the "deploy" playbook for the current git repository.
    #
    # Unless --skip-check is given, refuses to deploy when the working tree is
    # dirty, when no git remote is configured, or when `master` and
    # `origin/master` differ (only the `master` branch is compared).
    def deploy(user_host)
      unless options["skip-check"]
        if !`git status --short`.empty?
          raise "Deploy: Please commit your changes first"
        elsif `git remote`.empty?
          raise "Deploy: Please add remote origin repository to your repo first"
        elsif !`git rev-list master...origin/master`.empty?
          raise "Deploy: Please push your commits to the remote origin repo first"
        end
      end

      # Fall back to the url of the `origin` remote when --repo-url is omitted.
      repo_url = options["repo-url"] || `git remote get-url origin`.strip
      # Repository name is the last path segment preceding ".git".
      repo_name = repo_url[/\/([^\/]*)\.git/i, 1]

      command = AnsibleCommandBuilder.new(
        user_host, options,
        playbook: "deploy",
        vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
      ).get

      spawn_and_wait(command)
    end

    ###

    desc "crawl", "Run a particular spider by it's name"
    # Boots the project, looks the spider up by name and starts crawling.
    def crawl(spider_name)
      raise "Can't find Tanakai project" unless inside_project?
      require './config/boot'

      klass = find_spider!(spider_name)

      # Set time_zone if exists
      time_zone = Tanakai.configuration.time_zone
      Tanakai.time_zone = time_zone if time_zone

      klass.crawl!
    end

    desc "parse", "Parse url in the particular spider method"
    option :url, type: :string, required: true, banner: "Url to pass to the method"
    # Boots the project and calls `parse!` on the spider class with the given
    # method name and the --url option.
    def parse(spider_name, method_name)
      raise "Can't find Tanakai project" unless inside_project?
      require './config/boot'

      klass = find_spider!(spider_name)
      klass.parse!(method_name, url: options["url"])
    end

    desc "console", "Start Tanakai console"
    option :engine, type: :string, banner: "Engine to use"
    option :url, type: :string, banner: "Url to process"
    # Starts a console on a spider instance.
    #
    # With a spider name the project is required and that spider is used.
    # Without one, ApplicationSpider is used inside a project and
    # Tanakai::Base outside of it.
    def console(spider_name = nil)
      require 'pry'
      require './config/boot' if inside_project?

      klass =
        if spider_name
          raise "Can't find Tanakai project" unless inside_project?

          find_spider!(spider_name)
        else
          inside_project? ? ApplicationSpider : ::Tanakai::Base
        end

      # Accept both `--engine mechanize` and `--engine :mechanize`.
      engine = options["engine"]&.delete(":")&.to_sym
      url = options["url"]
      if url
        klass.new(engine).request_to(:console, url: url)
      else
        klass.new(engine).public_send(:console)
      end
    end

    desc "list", "List all available spiders in the current project"
    # Prints the names from `Tanakai.list`, sorted, one per line.
    def list
      raise "Can't find Tanakai project" unless inside_project?
      require './config/boot'

      Tanakai.list.keys.sort.each { |name| puts name }
    end

    desc "runner", "Run all spiders in the project in queue"
    option :include, type: :array, default: [], banner: "List of spiders to run"
    option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
    option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
    # Runs the --include list (or every spider from `Tanakai.list` when the
    # list is empty), minus --exclude, through Tanakai::Runner.
    def runner
      raise "Can't find Tanakai project" unless inside_project?

      jobs = options["jobs"]
      raise "Jobs count can't be 0" if jobs == 0

      require './config/boot'
      require 'tanakai/runner'

      spiders = options["include"].presence || Tanakai.list.keys
      spiders -= options["exclude"]

      Runner.new(spiders, jobs).run!
    end

    desc "--version, -v", "Print the version"
    def __print_version
      puts VERSION
    end

    desc "dashboard", "Run dashboard"
    # Starts the dashboard app; raises unless Tanakai::Dashboard is defined
    # after the project has been booted.
    def dashboard
      raise "Can't find Tanakai project" unless inside_project?

      require './config/boot'
      if Object.const_defined?("Tanakai::Dashboard")
        require 'tanakai/dashboard/app'
        Tanakai::Dashboard::App.run!
      else
        raise "Tanakai::Dashboard is not defined"
      end
    end

    private

    # True when the current directory has a `spiders` directory and a
    # `config/boot.rb` file.
    # Uses `exist?`: the `exists?` aliases were removed in Ruby 3.2.
    def inside_project?
      Dir.exist?("spiders") && File.exist?("./config/boot.rb")
    end

    # Returns the spider class registered under `spider_name`, raising
    # RuntimeError when `Tanakai.find_by_name` returns nothing.
    def find_spider!(spider_name)
      klass = Tanakai.find_by_name(spider_name)
      return klass if klass

      raise "Can't find spider with name `#{spider_name}` in the project. " \
        "To list all available spiders, run: `$ bundle exec tanakai list`"
    end

    # Spawns `command` (an argv array) as a child process and blocks until it
    # exits. The child's exit status is not inspected.
    def spawn_and_wait(command)
      pid = spawn(*command)
      Process.wait pid
    end
  end
end
181
+
182
+ require_relative 'cli/generator'
183
+ require_relative 'cli/ansible_command_builder'
@@ -0,0 +1,14 @@
1
class Array
  # Distributes the elements round-robin into `number` groups: element 0 goes
  # to group 0, element 1 to group 1, ..., element `number` to group 0 again.
  #
  # @param number [Integer] how many groups to build
  # @param fill_width [Object] forwarded unchanged as the second argument of
  #   `in_groups_of` (presumably ActiveSupport's Array#in_groups_of, where it
  #   is the padding value and `false` disables padding -- confirm)
  # @return [Array<Array>] exactly `number` arrays
  def in_sorted_groups(number, fill_width = nil)
    sorted_groups = Array.new(number) { [] }

    in_groups_of(number, fill_width).each do |group|
      # A slice may be shorter than `number`; walking its own elements skips
      # the missing positions without rescuing IndexError as control flow.
      group.each_with_index { |item, i| sorted_groups[i] << item }
    end

    sorted_groups
  end
end
@@ -0,0 +1,5 @@
1
class Hash
  # Merges `second` into a copy of the receiver. Keys listed in `exclude` are
  # merged shallowly (the value from `second` replaces the receiver's value);
  # all remaining keys of `second` go through `deep_merge`.
  #
  # @param second [Hash] the hash merged on top of the receiver
  # @param exclude [Array] keys of `second` to merge with plain `merge`
  # @return [Hash] a new hash; the receiver is not modified
  def deep_merge_excl(second, exclude)
    replaced_wholesale = second.slice(*exclude)
    merged_deeply = second.except(*exclude)

    merge(replaced_wholesale).deep_merge(merged_deeply)
  end
end
@@ -0,0 +1,19 @@
1
class Numeric
  # Formats the receiver, taken as a number of seconds, using its two most
  # significant units: "1d, 2h", "3h, 4m", "5m, 6s" or "7s".
  # Based on https://stackoverflow.com/a/1679963
  #
  # @return [String, nil] nil when the truncated value is negative
  def duration
    total_seconds = to_int
    return if total_seconds.negative?

    total_minutes, seconds_part = total_seconds.divmod(60)
    total_hours, minutes_part = total_minutes.divmod(60)
    total_days, hours_part = total_hours.divmod(24)

    return "#{total_days}d, #{hours_part}h" if total_days.positive?
    return "#{total_hours}h, #{minutes_part}m" if total_hours.positive?
    return "#{total_minutes}m, #{seconds_part}s" if total_minutes.positive?

    "#{total_seconds}s"
  end
end
@@ -0,0 +1,7 @@
1
+ require 'murmurhash3'
2
+
3
class String
  # Returns the value of `MurmurHash3::V32.str_hash` for this string.
  def to_id
    hasher = MurmurHash3::V32
    hasher.str_hash(self)
  end
end
@@ -0,0 +1,33 @@
1
module Tanakai
  # Base class for item pipelines. A pipeline is given a `spider` and
  # forwards storage, uniqueness checks, saving and logging to it.
  class Pipeline
    # Error class pipelines can raise to drop an item.
    DropItemError = Class.new(StandardError)

    class << self
      # Symbol identifier of the pipeline class: the class name with its
      # first namespace segment removed, underscored.
      def name
        to_s.sub(/.*?::/, "").underscore.to_sym
      end
    end

    include BaseHelper
    attr_accessor :spider

    # Same identifier as the class-level `name`.
    def name
      self.class.name
    end

    ###

    # The spider's storage.
    def storage
      spider.storage
    end

    # Delegates the uniqueness check for `value` within `scope` to the spider.
    def unique?(scope, value)
      spider.unique?(scope, value)
    end

    # Delegates saving `item` to `path` to the spider, passing every option through.
    def save_to(path, item, format:, position: true, append: false)
      spider.save_to(path, item, format: format, position: position, append: append)
    end

    # The spider's logger.
    def logger
      spider.logger
    end
  end
end
@@ -0,0 +1,60 @@
1
+ require 'pmap'
2
+
3
module Tanakai
  # Runs a list of spiders, each one as a separate `bundle exec tanakai crawl`
  # child process, with up to `jobs` running at once.
  class Runner
    attr_reader :jobs, :spiders, :session_info

    # @param spiders [Array<String>] names of the spiders to run
    # @param parallel_jobs [Integer] value passed to `peach_with_index`
    def initialize(spiders, parallel_jobs)
      @jobs = parallel_jobs
      @spiders = spiders
      @start_time = Time.now

      session_id = @start_time.to_i
      @session_info = {
        id: session_id,
        status: :processing,
        start_time: @start_time,
        stop_time: nil,
        environment: Tanakai.env,
        concurrent_jobs: @jobs,
        spiders: @spiders
      }

      configured_zone = Tanakai.configuration.time_zone
      Tanakai.time_zone = configured_zone if configured_zone

      # Set in this process's environment, so spawned children inherit them.
      ENV["SESSION_ID"] = session_id.to_s
      ENV["RBCAT_COLORIZER"] = "false"
    end

    # Runs every spider and keeps `session_info` up to date.
    #
    # @param exception_on_fail [Boolean] re-raise the error that stopped the
    #   run (default) instead of returning it
    # @return [Hash, Array] `session_info` on success, or
    #   `[session_info, error]` on failure when `exception_on_fail` is false
    def run!(exception_on_fail: true)
      puts ">>> Runner: started: #{session_info}"
      start_hook = Tanakai.configuration.runner_at_start_callback
      start_hook.call(session_info) if start_hook

      # Flipped to false by the rescue clause so that queued spiders are skipped.
      running = true
      spiders.peach_with_index(jobs) do |spider, index|
        next unless running

        puts "> Runner: started spider: #{spider}, index: #{index}"
        # The child's stdout and stderr both go to the spider's own log file.
        child_pid = spawn("bundle", "exec", "tanakai", "crawl", spider, [:out, :err] => "log/#{spider}.log")
        Process.wait(child_pid)

        puts "< Runner: stopped spider: #{spider}, index: #{index}"
      end
    rescue StandardError, SignalException, SystemExit => error
      running = false

      session_info.merge!(status: :failed, error: error.inspect, stop_time: Time.now)
      raise error if exception_on_fail

      [session_info, error]
    else
      session_info.merge!(status: :completed, stop_time: Time.now)
    ensure
      stop_hook = Tanakai.configuration.runner_at_stop_callback
      stop_hook.call(session_info) if stop_hook
      puts "<<< Runner: stopped: #{session_info}"
    end
  end
end
@@ -0,0 +1,18 @@
1
+ /.bundle
2
+ /cache
3
+ /node_modules
4
+
5
+ /log/*
6
+ !/log/.keep
7
+
8
+ /tmp/*
9
+ !/tmp/.keep
10
+
11
+ /db/*
12
+ !/db/.keep
13
+
14
+ .byebug_history
15
+ *.swp
16
+ .env
17
+
18
+ capybara-*.png
@@ -0,0 +1,28 @@
1
+ source 'https://rubygems.org'
2
+ git_source(:github) { |repo| "https://github.com/#{repo}.git" }
3
+
4
+ ruby '>= 2.5'
5
+
6
+ # Framework
7
+ gem 'tanakai'
8
+
9
+ # Require files in directory and child directories recursively
10
+ gem 'require_all'
11
+
12
+ # Dotenv
13
+ gem 'dotenv'
14
+
15
+ # To debug spiders:
16
+ group :development do
17
+ gem 'byebug', platforms: :mri
18
+ gem 'pry'
19
+ end
20
+
21
+ # If you want to save items to the database, require one of these gems:
22
+ # gem 'sqlite3'
23
+ # gem 'pg'
24
+ # gem 'mysql2'
25
+
26
+ # And use your preferred ORM/database connector:
27
+ # gem 'activerecord', require: 'active_record'
28
+ # gem 'sequel'
@@ -0,0 +1,3 @@
1
+ # README
2
+
3
+ New Tanakai project readme
@@ -0,0 +1,37 @@
1
+ Tanakai.configure do |config|
2
+ # Default logger has colored mode in development.
3
+ # If you would like to disable it, set `colorize_logger` to false.
4
+ # config.colorize_logger = false
5
+
6
+ # Logger level for default logger:
7
+ # config.log_level = :info
8
+
9
+ # Custom logger:
10
+ # config.logger = Logger.new(STDOUT)
11
+
12
+ # Custom time zone (for logs):
13
+ # config.time_zone = "UTC"
14
+ # config.time_zone = "Europe/Moscow"
15
+
16
+ # At start callback for a runner. Accepts argument with info as hash with
17
+ # keys: id, status, start_time, environment, concurrent_jobs, spiders list.
18
+ # For example, you can use this callback to send notification when runner was started:
19
+ # config.runner_at_start_callback = lambda do |info|
20
+ # json = JSON.pretty_generate(info)
21
+ # Sender.send_notification("Started session: #{json}")
22
+ # end
23
+
24
+ # At stop callback for a runner. Accepts argument with info as hash with
25
+ # all `runner_at_start_callback` keys plus additional `stop_time` key. Also `status` contains
26
+ # stop status of a runner (completed or failed).
27
+ # You can use this callback to send notification when runner has been stopped:
28
+ # config.runner_at_stop_callback = lambda do |info|
29
+ # json = JSON.pretty_generate(info)
30
+ # Sender.send_notification("Stopped session: #{json}")
31
+ # end
32
+
33
+ # Provide custom chrome binary path (default is any available chrome/chromium in the PATH):
34
+ # config.selenium_chrome_path = "/usr/bin/chromium-browser"
35
+ # Provide custom selenium chromedriver path (default is "/usr/local/bin/chromedriver"):
36
+ # config.chromedriver_path = "/usr/local/bin/chromedriver"
37
+ end
@@ -0,0 +1,13 @@
1
+ # software versions to install for `setup` command
2
+ setup:
3
+ ruby: 2.5.1
4
+ # check latest here http://phantomjs.org/download.html
5
+ phantomjs: 2.1.1
6
+ # check latest here https://github.com/mozilla/geckodriver/releases/
7
+ geckodriver: 0.21.0
8
+ # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
9
+ chromedriver: 2.39
10
+ # settings for deploy command, you can use cli options as well (--repo-url, --repo-key-path)
11
+ deploy:
12
+ # repo_url: git@bitbucket.org:username/repo_name.git
13
+ # repo_key_path: ~/.ssh/id_rsa
@@ -0,0 +1,22 @@
1
+ # require project gems
2
+ require 'bundler/setup'
3
+ Bundler.require(:default, Tanakai.env)
4
+
5
+ # require custom ENV variables located in .env file
6
+ require 'dotenv/load'
7
+
8
+ # require initializers
9
+ Dir.glob(File.join("./config/initializers", "*.rb"), &method(:require))
10
+
11
+ # require helpers
12
+ Dir.glob(File.join("./helpers", "*.rb"), &method(:require))
13
+
14
+ # require pipelines
15
+ Dir.glob(File.join("./pipelines", "*.rb"), &method(:require))
16
+
17
+ # require spiders recursively in the `spiders/` folder
18
+ require_relative '../spiders/application_spider'
19
+ require_all "spiders"
20
+
21
+ # require Tanakai configuration
22
+ require_relative 'application'
File without changes
@@ -0,0 +1,57 @@
1
+ ### Settings ###
2
+ require 'tzinfo'
3
+
4
+ # Export current PATH to the cron
5
+ env :PATH, ENV["PATH"]
6
+
7
+ # Use 24 hour format when using `at:` option
8
+ set :chronic_options, hours24: true
9
+
10
+ # Use local_to_utc helper to setup execution time using your local timezone instead
11
+ # of server's timezone (which is probably and should be UTC, to check run `$ timedatectl`).
12
+ # Also maybe you'll want to set same timezone in tanakai as well (use `Tanakai.configuration.time_zone =` for that),
13
+ # to have spiders logs in a specific time zone format.
14
+ # Example usage of helper:
15
+ # every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
16
+ # crawl "google_spider.com", output: "log/google_spider.com.log"
17
+ # end
18
# Parses `time_string` with Time.parse and passes the result to
# `local_to_utc` on the TZInfo timezone named by `zone`.
def local_to_utc(time_string, zone:)
  timezone = TZInfo::Timezone.get(zone)
  local_time = Time.parse(time_string)
  timezone.local_to_utc(local_time)
end
21
+
22
+ # Note: by default Whenever exports cron commands with :environment == "production".
23
+ # Note: Whenever can only append log data to a log file (>>). If you want
24
+ # to overwrite (>) log file before each run, pass lambda:
25
+ # crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
26
+
27
+ # Project job types
28
+ job_type :crawl, "cd :path && TANAKAI_ENV=:environment bundle exec tanakai crawl :task :output"
29
+ job_type :runner, "cd :path && TANAKAI_ENV=:environment bundle exec tanakai runner --jobs :task :output"
30
+
31
+ # Single file job type
32
+ job_type :single, "cd :path && TANAKAI_ENV=:environment ruby :task :output"
33
+ # Single with bundle exec
34
+ job_type :single_bundle, "cd :path && TANAKAI_ENV=:environment bundle exec ruby :task :output"
35
+
36
+ ### Schedule ###
37
+ # Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
38
+ # every 1.day do
39
+ # Example to schedule a single spider in the project:
40
+ # crawl "google_spider.com", output: "log/google_spider.com.log"
41
+
42
+ # Example to schedule all spiders in the project using runner. Each spider will write
43
+ # its own output to the `log/spider_name.log` file (handled by the runner itself).
44
+ # Runner output will be written to log/runner.log file.
45
+ # The numeric argument is the number of concurrent jobs:
46
+ # runner 3, output:"log/runner.log"
47
+
48
+ # Example to schedule single spider (without project):
49
+ # single "single_spider.rb", output: "single_spider.log"
50
+ # end
51
+
52
+ ### How to set a cron schedule ###
53
+ # Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
54
+ # If you don't have whenever command, install the gem: `$ gem install whenever`.
55
+
56
+ ### How to cancel a schedule ###
57
+ # Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.
File without changes
@@ -0,0 +1,3 @@
1
+ module ApplicationHelper
2
+ # Put custom methods here; they will be available in any spider
3
+ end
File without changes
File without changes
@@ -0,0 +1,11 @@
1
# Template pipeline: the place to persist items. As generated it returns
# every item unchanged.
class Saver < Tanakai::Pipeline
  def process_item(item, options: {})
    # Save the item here: write it to a database, send it to a remote API,
    # or write it to a file with the `save_to` helper.
    #
    # `spider.class.name` gives the name of the current spider:
    # save_to "db/#{spider.class.name}.json", item, format: :pretty_json

    item
  end
end
@@ -0,0 +1,24 @@
1
# Template pipeline: the place to validate items. As generated it returns
# every item unchanged.
class Validator < Tanakai::Pipeline
  def process_item(item, options: {})
    # Validate the item here and raise `DropItemError` when a validation
    # fails. Examples:

    # Check the item sku for uniqueness using the built-in `unique?` helper:
    # unless unique?(:sku, item[:sku])
    #   raise DropItemError, "Item sku is not unique"
    # end

    # Drop the item if its title is shorter than 5 symbols:
    # if item[:title].size < 5
    #   raise DropItemError, "Item title is short"
    # end

    # Drop the item if it doesn't contain any images:
    # unless item[:images].present?
    #   raise DropItemError, "Item images are not present"
    # end

    # Pass the item to the next pipeline (if it wasn't dropped)
    item
  end
end
+ end