tanakai 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +5 -0
- data/CHANGELOG.md +118 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +2038 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/tanakai +6 -0
- data/lib/tanakai/automation/deploy.yml +54 -0
- data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
- data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
- data/lib/tanakai/automation/setup.yml +45 -0
- data/lib/tanakai/base/saver.rb +106 -0
- data/lib/tanakai/base/storage.rb +54 -0
- data/lib/tanakai/base.rb +326 -0
- data/lib/tanakai/base_helper.rb +22 -0
- data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
- data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
- data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
- data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
- data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
- data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
- data/lib/tanakai/browser_builder.rb +20 -0
- data/lib/tanakai/capybara_configuration.rb +10 -0
- data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
- data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
- data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
- data/lib/tanakai/capybara_ext/session/config.rb +22 -0
- data/lib/tanakai/capybara_ext/session.rb +249 -0
- data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
- data/lib/tanakai/cli/generator.rb +57 -0
- data/lib/tanakai/cli.rb +183 -0
- data/lib/tanakai/core_ext/array.rb +14 -0
- data/lib/tanakai/core_ext/hash.rb +5 -0
- data/lib/tanakai/core_ext/numeric.rb +19 -0
- data/lib/tanakai/core_ext/string.rb +7 -0
- data/lib/tanakai/pipeline.rb +33 -0
- data/lib/tanakai/runner.rb +60 -0
- data/lib/tanakai/template/.gitignore +18 -0
- data/lib/tanakai/template/Gemfile +28 -0
- data/lib/tanakai/template/README.md +3 -0
- data/lib/tanakai/template/config/application.rb +37 -0
- data/lib/tanakai/template/config/automation.yml +13 -0
- data/lib/tanakai/template/config/boot.rb +22 -0
- data/lib/tanakai/template/config/initializers/.keep +0 -0
- data/lib/tanakai/template/config/schedule.rb +57 -0
- data/lib/tanakai/template/db/.keep +0 -0
- data/lib/tanakai/template/helpers/application_helper.rb +3 -0
- data/lib/tanakai/template/lib/.keep +0 -0
- data/lib/tanakai/template/log/.keep +0 -0
- data/lib/tanakai/template/pipelines/saver.rb +11 -0
- data/lib/tanakai/template/pipelines/validator.rb +24 -0
- data/lib/tanakai/template/spiders/application_spider.rb +143 -0
- data/lib/tanakai/template/tmp/.keep +0 -0
- data/lib/tanakai/version.rb +3 -0
- data/lib/tanakai.rb +54 -0
- data/tanakai.gemspec +50 -0
- metadata +382 -0
data/lib/tanakai/cli.rb
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
require 'thor'
|
2
|
+
|
3
|
+
module Tanakai
  # Thor-based command line interface for Tanakai.
  #
  # Commands: generate, setup, deploy, crawl, parse, console, list, runner,
  # dashboard and --version/-v. Commands that work on a project expect to be
  # run from the project root (see #inside_project?).
  class CLI < Thor
    map %w[--version -v] => :__print_version

    desc "generate", "Generator, available types: project, spider, schedule"
    # Dispatches to the Generator for the requested type.
    #
    # @param generator_type [String] "project", "spider" or "schedule"
    # @param args [Array<String>] the first element is the project/spider name
    # @raise [RuntimeError] when the name is missing or the type is unknown
    def generate(generator_type, *args)
      case generator_type
      when "project"
        project_name = args.shift
        raise "Provide project name to generate a new project" unless project_name.present?
        Generator.new.generate_project(project_name)
      when "spider"
        spider_name = args.shift
        raise "Provide spider name to generate a spider" unless spider_name.present?
        Generator.new.generate_spider(spider_name, in_project: inside_project?)
      when "schedule"
        Generator.new.generate_schedule
      else
        raise "Don't know this generator type: #{generator_type}"
      end
    end

    ###

    desc "setup", "Setup server"
    option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
    option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
    option "ask-auth-pass", type: :boolean, banner: "Auth using password"
    option "ssh-key-path", type: :string, banner: "Auth using ssh key"
    option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
    # Builds the command for the "setup" playbook with AnsibleCommandBuilder,
    # spawns it and waits for it to finish.
    #
    # @param user_host [String] passed through to AnsibleCommandBuilder
    def setup(user_host)
      command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get
      run_command(command)
    end

    desc "deploy", "Deploy project to the server and update cron schedule"
    option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
    option "ask-auth-pass", type: :boolean, banner: "Auth using password"
    option "ssh-key-path", type: :string, banner: "Auth using ssh key"
    option "repo-url", type: :string, banner: "Repo url"
    option "repo-key-path", type: :string, banner: "SSH key for a git repo"
    option "skip-check", type: :boolean, default: false, banner: "Skip git repository checks"
    # Runs the "deploy" playbook for the current git repository.
    #
    # Unless --skip-check is given, the local git state is verified first.
    # The repo url comes from --repo-url or, failing that, from the `origin`
    # remote; the repo name is the path segment preceding ".git" in that url.
    #
    # @param user_host [String] passed through to AnsibleCommandBuilder
    # @raise [RuntimeError] when one of the git checks fails
    def deploy(user_host)
      verify_git_state! unless options["skip-check"]

      repo_url = options["repo-url"] || `git remote get-url origin`.strip
      # nil when the url contains no "/<name>.git" segment
      repo_name = repo_url[%r{/([^/]*)\.git}i, 1]

      command = AnsibleCommandBuilder.new(
        user_host, options,
        playbook: "deploy",
        vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
      ).get

      run_command(command)
    end

    ###

    desc "crawl", "Run a particular spider by it's name"
    # Boots the project, looks the spider up by name and calls `crawl!` on it.
    #
    # @param spider_name [String]
    # @raise [RuntimeError] outside of a project or when the spider is unknown
    def crawl(spider_name)
      require_project!
      require './config/boot'

      klass = find_spider!(spider_name)

      # Set time_zone if exists
      time_zone = Tanakai.configuration.time_zone
      Tanakai.time_zone = time_zone if time_zone

      klass.crawl!
    end

    desc "parse", "Parse url in the particular spider method"
    option :url, type: :string, required: true, banner: "Url to pass to the method"
    # Boots the project and calls `parse!` on the spider with the given
    # method name and the --url option.
    #
    # @param spider_name [String]
    # @param method_name [String]
    # @raise [RuntimeError] outside of a project or when the spider is unknown
    def parse(spider_name, method_name)
      require_project!
      require './config/boot'

      klass = find_spider!(spider_name)
      klass.parse!(method_name, url: options["url"])
    end

    desc "console", "Start Tanakai console"
    option :engine, type: :string, banner: "Engine to use"
    option :url, type: :string, banner: "Url to process"
    # Opens a console on a spider instance.
    #
    # With a spider name the command must run inside a project. Without one,
    # ApplicationSpider is used inside a project and ::Tanakai::Base elsewhere.
    #
    # @param spider_name [String, nil]
    # @raise [RuntimeError] when a name is given outside of a project or the
    #   spider is unknown
    def console(spider_name = nil)
      require 'pry'
      require './config/boot' if inside_project?

      klass =
        if spider_name
          require_project!
          find_spider!(spider_name)
        else
          inside_project? ? ApplicationSpider : ::Tanakai::Base
        end

      # A leading colon is tolerated: ":mechanize" and "mechanize" both become :mechanize
      engine = options["engine"]&.delete(":")&.to_sym
      url = options["url"]

      if url
        klass.new(engine).request_to(:console, url: url)
      else
        klass.new(engine).public_send(:console)
      end
    end

    desc "list", "List all available spiders in the current project"
    # Prints the names of the project's spiders, sorted, one per line.
    #
    # @raise [RuntimeError] outside of a project
    def list
      require_project!
      require './config/boot'

      Tanakai.list.keys.sort.each { |name| puts name }
    end

    desc "runner", "Run all spiders in the project in queue"
    option :include, type: :array, default: [], banner: "List of spiders to run"
    option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
    option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
    # Runs the selected spiders through Runner.
    #
    # The selection is --include when given, otherwise every spider in the
    # project; --exclude is subtracted from it afterwards.
    #
    # @raise [RuntimeError] outside of a project or when --jobs is 0
    def runner
      require_project!

      jobs = options["jobs"]
      raise "Jobs count can't be 0" if jobs == 0

      require './config/boot'
      require 'tanakai/runner'

      spiders = options["include"].presence || Tanakai.list.keys
      spiders -= options["exclude"]

      Runner.new(spiders, jobs).run!
    end

    desc "--version, -v", "Print the version"
    # Prints the VERSION constant.
    def __print_version
      puts VERSION
    end

    desc "dashboard", "Run dashboard"
    # Boots the project and starts the dashboard app.
    #
    # @raise [RuntimeError] outside of a project or when Tanakai::Dashboard
    #   is not defined after booting
    def dashboard
      require_project!

      require './config/boot'
      raise "Tanakai::Dashboard is not defined" unless Object.const_defined?("Tanakai::Dashboard")

      require 'tanakai/dashboard/app'
      Tanakai::Dashboard::App.run!
    end

    private

    # True when the current directory has a `spiders` directory and a
    # `config/boot.rb` file.
    #
    # Uses `exist?`: the `exists?` aliases were removed in Ruby 3.2.
    def inside_project?
      Dir.exist?("spiders") && File.exist?("./config/boot.rb")
    end

    # Aborts the command when it is not run from a project root.
    def require_project!
      raise "Can't find Tanakai project" unless inside_project?
    end

    # Returns the spider class registered under spider_name, or raises when
    # Tanakai.find_by_name finds nothing.
    def find_spider!(spider_name)
      klass = Tanakai.find_by_name(spider_name)
      return klass if klass

      raise "Can't find spider with name `#{spider_name}` in the project. " \
        "To list all available spiders, run: `$ bundle exec tanakai list`"
    end

    # Checks, in order: no uncommitted changes, at least one remote, and no
    # difference between master and origin/master.
    #
    # NOTE(review): the branch name `master` is hard-coded. On a repository
    # without that branch `git rev-list` presumably prints nothing to stdout,
    # so the last check would pass silently - confirm against deploy.yml
    # before generalizing.
    def verify_git_state!
      raise "Deploy: Please commit your changes first" unless `git status --short`.empty?
      raise "Deploy: Please add remote origin repository to your repo first" if `git remote`.empty?

      unless `git rev-list master...origin/master`.empty?
        raise "Deploy: Please push your commits to the remote origin repo first"
      end
    end

    # Spawns the command (an argument list from AnsibleCommandBuilder#get) and
    # blocks until the child process exits.
    def run_command(command)
      pid = spawn(*command)
      Process.wait(pid)
    end
  end
end
|
181
|
+
|
182
|
+
require_relative 'cli/generator'
|
183
|
+
require_relative 'cli/ansible_command_builder'
|
@@ -0,0 +1,14 @@
|
|
1
|
+
class Array
  # Splits the array into `number` groups, dealing elements out in turn:
  # element 0 goes to group 0, element 1 to group 1, and so on, wrapping
  # around after `number` elements.
  #
  # Relies on `in_groups_of` being available on Array (not defined here).
  #
  # @param number [Integer] how many groups to build
  # @param fill_width [Object] forwarded unchanged as the second argument of
  #   `in_groups_of`
  # @return [Array<Array>] exactly `number` arrays, some possibly empty
  def in_sorted_groups(number, fill_width = nil)
    sorted_groups = Array.new(number) { [] }

    in_groups_of(number, fill_width).each do |group|
      # A group never holds more than `number` elements, so each index maps
      # straight onto a target group; a short last group simply ends early.
      group.each_with_index do |element, i|
        sorted_groups[i] << element
      end
    end

    sorted_groups
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class Numeric
  # Formats a number of seconds as a short string showing the two largest
  # non-zero-leading units: "1d, 1h", "1h, 1m", "1m, 0s" or "45s".
  # Adapted from https://stackoverflow.com/a/1679963
  #
  # The receiver is truncated with `to_int` first.
  #
  # @return [String, nil] nil when the receiver is negative
  def duration
    total = to_int
    minutes, seconds_left = total.divmod(60)
    hours, minutes_left = minutes.divmod(60)
    days, hours_left = hours.divmod(24)

    return "#{days}d, #{hours_left}h" if days.positive?
    return "#{hours}h, #{minutes_left}m" if hours.positive?
    return "#{minutes}m, #{seconds_left}s" if minutes.positive?

    "#{total}s" unless total.negative?
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Tanakai
  # Base class for item pipelines. A pipeline holds a reference to a spider
  # and forwards its helper calls (storage, unique?, save_to, logger) to it.
  class Pipeline
    # Error class a pipeline can raise to drop an item.
    class DropItemError < StandardError; end

    class << self
      # Pipeline identifier: the class name with its leading namespace
      # segment removed, underscored and converted to a symbol.
      def name
        to_s.sub(/.*?::/, "").underscore.to_sym
      end
    end

    include BaseHelper
    attr_accessor :spider

    # Same value as the class-level name.
    def name
      self.class.name
    end

    ###

    # The spider's storage.
    def storage
      spider.storage
    end

    # Forwards the uniqueness check to the spider.
    def unique?(scope, value)
      spider.unique?(scope, value)
    end

    # Forwards to the spider's `save_to` with the same arguments.
    def save_to(path, item, format:, position: true, append: false)
      spider.save_to(path, item, format: format, position: position, append: append)
    end

    # The spider's logger.
    def logger
      spider.logger
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'pmap'
|
2
|
+
|
3
|
+
module Tanakai
  # Runs a list of spiders, each in its own `tanakai crawl` child process,
  # using `peach_with_index` with the configured number of jobs.
  class Runner
    attr_reader :jobs, :spiders, :session_info

    # @param spiders [Array<String>] spider names to crawl
    # @param parallel_jobs [Integer] passed to `peach_with_index`
    def initialize(spiders, parallel_jobs)
      @jobs = parallel_jobs
      @spiders = spiders
      @start_time = Time.now
      session_id = @start_time.to_i

      @session_info = {
        id: session_id,
        status: :processing,
        start_time: @start_time,
        stop_time: nil,
        environment: Tanakai.env,
        concurrent_jobs: @jobs,
        spiders: @spiders
      }

      configured_zone = Tanakai.configuration.time_zone
      Tanakai.time_zone = configured_zone if configured_zone

      # Exported through ENV; spawned crawl processes inherit the environment
      ENV["SESSION_ID"] = session_id.to_s
      ENV["RBCAT_COLORIZER"] = "false"
    end

    # Runs every spider and keeps `session_info` up to date.
    #
    # The configured start callback is called before the spiders run and the
    # stop callback afterwards, whether the run completed or failed.
    #
    # @param exception_on_fail [Boolean] re-raise a caught error when true;
    #   when false, return `[session_info, error]` instead
    # @return [Hash, Array] `session_info` after a completed run
    def run!(exception_on_fail: true)
      puts ">>> Runner: started: #{session_info}"
      start_hook = Tanakai.configuration.runner_at_start_callback
      start_hook.call(session_info) if start_hook

      running = true
      spiders.peach_with_index(jobs) do |spider, index|
        # Once a failure has been recorded, do not start further spiders
        next unless running

        puts "> Runner: started spider: #{spider}, index: #{index}"
        crawl_pid = spawn("bundle", "exec", "tanakai", "crawl", spider, [:out, :err] => "log/#{spider}.log")
        Process.wait(crawl_pid)

        puts "< Runner: stopped spider: #{spider}, index: #{index}"
      end
    rescue StandardError, SignalException, SystemExit => error
      running = false

      session_info.merge!(status: :failed, error: error.inspect, stop_time: Time.now)
      raise(error) if exception_on_fail

      [session_info, error]
    else
      session_info.merge!(status: :completed, stop_time: Time.now)
    ensure
      stop_hook = Tanakai.configuration.runner_at_stop_callback
      stop_hook.call(session_info) if stop_hook

      puts "<<< Runner: stopped: #{session_info}"
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
git_source(:github) { |repo| "https://github.com/#{repo}.git" }
|
3
|
+
|
4
|
+
ruby '>= 2.5'
|
5
|
+
|
6
|
+
# Framework
|
7
|
+
gem 'tanakai'
|
8
|
+
|
9
|
+
# Require files in directory and child directories recursively
|
10
|
+
gem 'require_all'
|
11
|
+
|
12
|
+
# Dotenv
|
13
|
+
gem 'dotenv'
|
14
|
+
|
15
|
+
# To debug spiders:
|
16
|
+
group :development do
|
17
|
+
gem 'byebug', platforms: :mri
|
18
|
+
gem 'pry'
|
19
|
+
end
|
20
|
+
|
21
|
+
# If you want to save items to the database, require one of these gems:
|
22
|
+
# gem 'sqlite3'
|
23
|
+
# gem 'pg'
|
24
|
+
# gem 'mysql2'
|
25
|
+
|
26
|
+
# And use your preferred ORM/database connector:
|
27
|
+
# gem 'activerecord', require: 'active_record'
|
28
|
+
# gem 'sequel'
|
@@ -0,0 +1,37 @@
|
|
1
|
+
Tanakai.configure do |config|
|
2
|
+
# Default logger has colored mode in development.
|
3
|
+
# If you would like to disable it, set `colorize_logger` to false.
|
4
|
+
# config.colorize_logger = false
|
5
|
+
|
6
|
+
# Logger level for default logger:
|
7
|
+
# config.log_level = :info
|
8
|
+
|
9
|
+
# Custom logger:
|
10
|
+
# config.logger = Logger.new(STDOUT)
|
11
|
+
|
12
|
+
# Custom time zone (for logs):
|
13
|
+
# config.time_zone = "UTC"
|
14
|
+
# config.time_zone = "Europe/Moscow"
|
15
|
+
|
16
|
+
# At start callback for a runner. Accepts argument with info as hash with
|
17
|
+
# keys: id, status, start_time, environment, concurrent_jobs, spiders list.
|
18
|
+
# For example, you can use this callback to send notification when runner was started:
|
19
|
+
# config.runner_at_start_callback = lambda do |info|
|
20
|
+
# json = JSON.pretty_generate(info)
|
21
|
+
# Sender.send_notification("Started session: #{json}")
|
22
|
+
# end
|
23
|
+
|
24
|
+
# At stop callback for a runner. Accepts argument with info as hash with
|
25
|
+
# all `runner_at_start_callback` keys plus additional `stop_time` key. Also `status` contains
|
26
|
+
# stop status of a runner (completed or failed).
|
27
|
+
# You can use this callback to send notification when runner has been stopped:
|
28
|
+
# config.runner_at_stop_callback = lambda do |info|
|
29
|
+
# json = JSON.pretty_generate(info)
|
30
|
+
# Sender.send_notification("Stopped session: #{json}")
|
31
|
+
# end
|
32
|
+
|
33
|
+
# Provide custom chrome binary path (default is any available chrome/chromium in the PATH):
|
34
|
+
# config.selenium_chrome_path = "/usr/bin/chromium-browser"
|
35
|
+
# Provide custom selenium chromedriver path (default is "/usr/local/bin/chromedriver"):
|
36
|
+
# config.chromedriver_path = "/usr/local/bin/chromedriver"
|
37
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# software versions to install for `setup` command
|
2
|
+
setup:
|
3
|
+
ruby: 2.5.1
|
4
|
+
# check latest here http://phantomjs.org/download.html
|
5
|
+
phantomjs: 2.1.1
|
6
|
+
# check latest here https://github.com/mozilla/geckodriver/releases/
|
7
|
+
geckodriver: 0.21.0
|
8
|
+
# check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
|
9
|
+
chromedriver: 2.39
|
10
|
+
# settings for deploy command, you can use cli options as well (--repo-url, --repo-key-path)
|
11
|
+
deploy:
|
12
|
+
# repo_url: git@bitbucket.org:username/repo_name.git
|
13
|
+
# repo_key_path: ~/.ssh/id_rsa
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Load the gems declared for the default group and the current Tanakai environment
require 'bundler/setup'
Bundler.require(:default, Tanakai.env)

# Load custom ENV variables located in the .env file
require 'dotenv/load'

# Load initializers, then helpers, then pipelines (top-level *.rb files only)
%w[config/initializers helpers pipelines].each do |directory|
  Dir.glob(File.join(".", directory, "*.rb")).each { |file| require file }
end

# Load the base spider first, then everything under the `spiders/` folder recursively
require_relative '../spiders/application_spider'
require_all "spiders"

# Load the Tanakai configuration
require_relative 'application'
|
File without changes
|
@@ -0,0 +1,57 @@
|
|
1
|
+
### Settings ###
|
2
|
+
require 'tzinfo'
|
3
|
+
|
4
|
+
# Export current PATH to the cron
|
5
|
+
env :PATH, ENV["PATH"]
|
6
|
+
|
7
|
+
# Use 24 hour format when using `at:` option
|
8
|
+
set :chronic_options, hours24: true
|
9
|
+
|
10
|
+
# Use local_to_utc helper to setup execution time using your local timezone instead
|
11
|
+
# of server's timezone (which is probably and should be UTC, to check run `$ timedatectl`).
|
12
|
+
# Also maybe you'll want to set same timezone in tanakai as well (use `Tanakai.configuration.time_zone =` for that),
|
13
|
+
# to have spiders logs in a specific time zone format.
|
14
|
+
# Example usage of helper:
|
15
|
+
# every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
|
16
|
+
# crawl "google_spider.com", output: "log/google_spider.com.log"
|
17
|
+
# end
|
18
|
+
# Parses time_string with Time.parse and passes the result to the
# `local_to_utc` method of the TZInfo timezone named by `zone`.
#
# @param time_string [String] e.g. "7:00"
# @param zone [String] timezone identifier, e.g. "Europe/Moscow"
# @return whatever TZInfo::Timezone#local_to_utc returns
def local_to_utc(time_string, zone:)
  timezone = TZInfo::Timezone.get(zone)
  wall_clock = Time.parse(time_string)
  timezone.local_to_utc(wall_clock)
end
|
21
|
+
|
22
|
+
# Note: by default Whenever exports cron commands with :environment == "production".
|
23
|
+
# Note: Whenever can only append log data to a log file (>>). If you want
|
24
|
+
# to overwrite (>) log file before each run, pass lambda:
|
25
|
+
# crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
|
26
|
+
|
27
|
+
# Project job types
|
28
|
+
job_type :crawl, "cd :path && TANAKAI_ENV=:environment bundle exec tanakai crawl :task :output"
|
29
|
+
job_type :runner, "cd :path && TANAKAI_ENV=:environment bundle exec tanakai runner --jobs :task :output"
|
30
|
+
|
31
|
+
# Single file job type
|
32
|
+
job_type :single, "cd :path && TANAKAI_ENV=:environment ruby :task :output"
|
33
|
+
# Single with bundle exec
|
34
|
+
job_type :single_bundle, "cd :path && TANAKAI_ENV=:environment bundle exec ruby :task :output"
|
35
|
+
|
36
|
+
### Schedule ###
|
37
|
+
# Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
|
38
|
+
# every 1.day do
|
39
|
+
# Example to schedule a single spider in the project:
|
40
|
+
# crawl "google_spider.com", output: "log/google_spider.com.log"
|
41
|
+
|
42
|
+
# Example to schedule all spiders in the project using runner. Each spider will write
|
43
|
+
# its own output to the `log/spider_name.log` file (handled by the runner itself).
|
44
|
+
# Runner output will be written to log/runner.log file.
|
45
|
+
# The numeric argument is the number of concurrent jobs:
|
46
|
+
# runner 3, output:"log/runner.log"
|
47
|
+
|
48
|
+
# Example to schedule single spider (without project):
|
49
|
+
# single "single_spider.rb", output: "single_spider.log"
|
50
|
+
# end
|
51
|
+
|
52
|
+
### How to set a cron schedule ###
|
53
|
+
# Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
|
54
|
+
# If you don't have whenever command, install the gem: `$ gem install whenever`.
|
55
|
+
|
56
|
+
### How to cancel a schedule ###
|
57
|
+
# Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,11 @@
|
|
1
|
+
class Saver < Tanakai::Pipeline
|
2
|
+
def process_item(item, options: {})
|
3
|
+
# Here you can save item to the database, send it to a remote API or
|
4
|
+
# simply save item to a file format using `save_to` helper:
|
5
|
+
|
6
|
+
# To get the name of a current spider: `spider.class.name`
|
7
|
+
# save_to "db/#{spider.class.name}.json", item, format: :pretty_json
|
8
|
+
|
9
|
+
item
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
class Validator < Tanakai::Pipeline
|
2
|
+
def process_item(item, options: {})
|
3
|
+
# Here you can validate item and raise `DropItemError`
|
4
|
+
# if one of the validations failed. Examples:
|
5
|
+
|
6
|
+
# Check item sku for uniqueness using built-in `unique?` helper:
|
7
|
+
# unless unique?(:sku, item[:sku])
|
8
|
+
# raise DropItemError, "Item sku is not unique"
|
9
|
+
# end
|
10
|
+
|
11
|
+
# Drop item if title length shorter than 5 symbols:
|
12
|
+
# if item[:title].size < 5
|
13
|
+
# raise DropItemError, "Item title is short"
|
14
|
+
# end
|
15
|
+
|
16
|
+
# Drop item if it doesn't contain any images:
|
17
|
+
# unless item[:images].present?
|
18
|
+
# raise DropItemError, "Item images are not present"
|
19
|
+
# end
|
20
|
+
|
21
|
+
# Pass item to the next pipeline (if it wasn't dropped)
|
22
|
+
item
|
23
|
+
end
|
24
|
+
end
|