tanakai 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +5 -0
- data/CHANGELOG.md +118 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +2038 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/tanakai +6 -0
- data/lib/tanakai/automation/deploy.yml +54 -0
- data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
- data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
- data/lib/tanakai/automation/setup.yml +45 -0
- data/lib/tanakai/base/saver.rb +106 -0
- data/lib/tanakai/base/storage.rb +54 -0
- data/lib/tanakai/base.rb +326 -0
- data/lib/tanakai/base_helper.rb +22 -0
- data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
- data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
- data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
- data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
- data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
- data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
- data/lib/tanakai/browser_builder.rb +20 -0
- data/lib/tanakai/capybara_configuration.rb +10 -0
- data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
- data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
- data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
- data/lib/tanakai/capybara_ext/session/config.rb +22 -0
- data/lib/tanakai/capybara_ext/session.rb +249 -0
- data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
- data/lib/tanakai/cli/generator.rb +57 -0
- data/lib/tanakai/cli.rb +183 -0
- data/lib/tanakai/core_ext/array.rb +14 -0
- data/lib/tanakai/core_ext/hash.rb +5 -0
- data/lib/tanakai/core_ext/numeric.rb +19 -0
- data/lib/tanakai/core_ext/string.rb +7 -0
- data/lib/tanakai/pipeline.rb +33 -0
- data/lib/tanakai/runner.rb +60 -0
- data/lib/tanakai/template/.gitignore +18 -0
- data/lib/tanakai/template/Gemfile +28 -0
- data/lib/tanakai/template/README.md +3 -0
- data/lib/tanakai/template/config/application.rb +37 -0
- data/lib/tanakai/template/config/automation.yml +13 -0
- data/lib/tanakai/template/config/boot.rb +22 -0
- data/lib/tanakai/template/config/initializers/.keep +0 -0
- data/lib/tanakai/template/config/schedule.rb +57 -0
- data/lib/tanakai/template/db/.keep +0 -0
- data/lib/tanakai/template/helpers/application_helper.rb +3 -0
- data/lib/tanakai/template/lib/.keep +0 -0
- data/lib/tanakai/template/log/.keep +0 -0
- data/lib/tanakai/template/pipelines/saver.rb +11 -0
- data/lib/tanakai/template/pipelines/validator.rb +24 -0
- data/lib/tanakai/template/spiders/application_spider.rb +143 -0
- data/lib/tanakai/template/tmp/.keep +0 -0
- data/lib/tanakai/version.rb +3 -0
- data/lib/tanakai.rb +54 -0
- data/tanakai.gemspec +50 -0
- metadata +382 -0
data/lib/tanakai/cli.rb
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
require 'thor'
|
2
|
+
|
3
|
+
module Tanakai
  # Command-line interface of the `tanakai` executable, built on Thor.
  #
  # Project commands (crawl, parse, list, runner, dashboard) must be run from a
  # project root (see #inside_project?) and load `./config/boot` lazily, so the
  # non-project commands keep working outside of a project.
  class CLI < Thor
    map %w[--version -v] => :__print_version

    # Generates a project, a spider or a schedule file through Generator.
    # Raises RuntimeError when the name argument is missing or the type is unknown.
    desc "generate", "Generator, available types: project, spider, schedule"
    def generate(generator_type, *args)
      case generator_type
      when "project"
        project_name = args.shift
        raise "Provide project name to generate a new project" unless project_name.present?
        Generator.new.generate_project(project_name)
      when "spider"
        spider_name = args.shift
        raise "Provide spider name to generate a spider" unless spider_name.present?
        Generator.new.generate_spider(spider_name, in_project: inside_project?)
      when "schedule"
        Generator.new.generate_schedule
      else
        raise "Don't know this generator type: #{generator_type}"
      end
    end

    ###

    # Builds the "setup" playbook command and runs it, waiting for it to finish.
    desc "setup", "Setup server"
    option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
    option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
    option "ask-auth-pass", type: :boolean, banner: "Auth using password"
    option "ssh-key-path", type: :string, banner: "Auth using ssh key"
    option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
    def setup(user_host)
      command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get
      run_and_wait(command)
    end

    # Builds the "deploy" playbook command and runs it, waiting for it to finish.
    # Unless --skip-check is given, refuses to deploy when the working tree is
    # dirty, no git remote exists, or master differs from origin/master.
    desc "deploy", "Deploy project to the server and update cron schedule"
    option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
    option "ask-auth-pass", type: :boolean, banner: "Auth using password"
    option "ssh-key-path", type: :string, banner: "Auth using ssh key"
    option "repo-url", type: :string, banner: "Repo url"
    option "repo-key-path", type: :string, banner: "SSH key for a git repo"
    option "skip-check", type: :boolean, default: false, banner: "Skip git repository checks"
    def deploy(user_host)
      unless options["skip-check"]
        if !`git status --short`.empty?
          raise "Deploy: Please commit your changes first"
        elsif `git remote`.empty?
          raise "Deploy: Please add remote origin repository to your repo first"
        elsif !`git rev-list master...origin/master`.empty?
          raise "Deploy: Please push your commits to the remote origin repo first"
        end
      end

      repo_url = options["repo-url"] || `git remote get-url origin`.strip
      # Repository name is the last path segment before ".git"
      # (nil when the url has no ".git" suffix).
      repo_name = repo_url[/\/([^\/]*)\.git/i, 1]

      command = AnsibleCommandBuilder.new(
        user_host, options,
        playbook: "deploy",
        vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
      ).get

      run_and_wait(command)
    end

    ###

    # Loads the project and calls `crawl!` on the spider class found by name.
    desc "crawl", "Run a particular spider by it's name"
    def crawl(spider_name)
      raise "Can't find Tanakai project" unless inside_project?
      require './config/boot'

      klass = find_spider!(spider_name)

      # Set time_zone if exists
      if (time_zone = Tanakai.configuration.time_zone)
        Tanakai.time_zone = time_zone
      end

      klass.crawl!
    end

    # Loads the project and calls `parse!` on the spider class with the given
    # method name and the required --url option.
    desc "parse", "Parse url in the particular spider method"
    option :url, type: :string, required: true, banner: "Url to pass to the method"
    def parse(spider_name, method_name)
      raise "Can't find Tanakai project" unless inside_project?
      require './config/boot'

      klass = find_spider!(spider_name)
      klass.parse!(method_name, url: options["url"])
    end

    # Starts a console on a spider instance. Without a spider name it uses
    # ApplicationSpider inside a project and Tanakai::Base elsewhere.
    desc "console", "Start Tanakai console"
    option :engine, type: :string, banner: "Engine to use"
    option :url, type: :string, banner: "Url to process"
    def console(spider_name = nil)
      require 'pry'
      require './config/boot' if inside_project?

      klass =
        if spider_name
          raise "Can't find Tanakai project" unless inside_project?
          find_spider!(spider_name)
        else
          inside_project? ? ApplicationSpider : ::Tanakai::Base
        end

      # Accept both `--engine mechanize` and `--engine :mechanize`.
      engine = options["engine"]&.delete(":")&.to_sym
      spider = klass.new(engine)

      if (url = options["url"])
        spider.request_to(:console, url: url)
      else
        spider.console
      end
    end

    # Prints the sorted names (keys of Tanakai.list), one per line.
    desc "list", "List all available spiders in the current project"
    def list
      raise "Can't find Tanakai project" unless inside_project?
      require './config/boot'

      Tanakai.list.keys.sort.each { |name| puts name }
    end

    # Runs the included spiders (all of them when --include is empty), minus
    # the excluded ones, through Runner with the given number of jobs.
    desc "runner", "Run all spiders in the project in queue"
    option :include, type: :array, default: [], banner: "List of spiders to run"
    option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
    option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
    def runner
      raise "Can't find Tanakai project" unless inside_project?

      jobs = options["jobs"]
      raise "Jobs count can't be 0" if jobs == 0

      require './config/boot'
      require 'tanakai/runner'

      spiders = options["include"].presence || Tanakai.list.keys
      spiders -= options["exclude"]

      Runner.new(spiders, jobs).run!
    end

    desc "--version, -v", "Print the version"
    def __print_version
      puts VERSION
    end

    # Runs the dashboard app when Tanakai::Dashboard is defined after boot,
    # raises RuntimeError otherwise.
    desc "dashboard", "Run dashboard"
    def dashboard
      raise "Can't find Tanakai project" unless inside_project?

      require './config/boot'
      if Object.const_defined?("Tanakai::Dashboard")
        require 'tanakai/dashboard/app'
        Tanakai::Dashboard::App.run!
      else
        raise "Tanakai::Dashboard is not defined"
      end
    end

    private

    # True when the current directory has a `spiders` directory and a
    # `config/boot.rb` file. Uses `exist?` because the `exists?` aliases were
    # removed from Dir and File in Ruby 3.2.
    def inside_project?
      Dir.exist?("spiders") && File.exist?("./config/boot.rb")
    end

    # Returns the spider class registered under spider_name or raises
    # RuntimeError with a hint on how to list the available spiders.
    def find_spider!(spider_name)
      Tanakai.find_by_name(spider_name) ||
        raise("Can't find spider with name `#{spider_name}` in the project. " \
              "To list all available spiders, run: `$ bundle exec tanakai list`")
    end

    # Spawns the command (an array of arguments for Kernel#spawn) and blocks
    # until the child process exits.
    def run_and_wait(command)
      pid = spawn(*command)
      Process.wait(pid)
    end
  end
end
|
181
|
+
|
182
|
+
require_relative 'cli/generator'
|
183
|
+
require_relative 'cli/ansible_command_builder'
|
@@ -0,0 +1,14 @@
|
|
1
|
+
class Array
  # Distributes the elements round-robin over +number+ groups: the element at
  # index k is appended to group k % number, so each group keeps the original
  # relative order of its elements.
  #
  #   [1, 2, 3, 4, 5].in_sorted_groups(2, false) # => [[1, 3, 5], [2, 4]]
  #   [1, 2, 3, 4, 5].in_sorted_groups(2)        # => [[1, 3, 5], [2, 4, nil]]
  #
  # @param number [Integer] number of groups, must be positive
  # @param fill_width [Object, false] value used to pad the receiver up to a
  #   multiple of +number+ before distributing (nil by default, so groups are
  #   padded with nil); pass +false+ to disable padding
  # @return [Array<Array>] exactly +number+ groups
  # @raise [ArgumentError] if +number+ is not positive
  def in_sorted_groups(number, fill_width = nil)
    if number.to_i <= 0
      raise ArgumentError, "Group size must be a positive integer, was #{number.inspect}"
    end

    sorted_groups = Array.new(number) { [] }

    items = self
    unless fill_width == false
      # Pad on a copy so the receiver is never mutated.
      padding = (number - size % number) % number
      items = self + Array.new(padding, fill_width)
    end

    items.each_with_index { |item, index| sorted_groups[index % number] << item }
    sorted_groups
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class Numeric
  # Formats the receiver, taken as a number of seconds, as a short
  # human-readable duration showing the two most significant units:
  #
  #   45.duration    # => "45s"
  #   61.duration    # => "1m, 1s"
  #   3661.duration  # => "1h, 1m"
  #   90000.duration # => "1d, 1h"
  #
  # The value is truncated to an integer first. Negative values are formatted
  # by magnitude with a leading "-" instead of falling through every branch
  # and returning nil.
  #
  # Based on https://stackoverflow.com/a/1679963
  #
  # @return [String]
  def duration
    total = to_int
    return "-#{(-total).duration}" if total.negative?

    mins, secs = total.divmod(60)
    hours, rest_mins = mins.divmod(60)
    days, rest_hours = hours.divmod(24)

    if days > 0
      "#{days}d, #{rest_hours}h"
    elsif hours > 0
      "#{hours}h, #{rest_mins}m"
    elsif mins > 0
      "#{mins}m, #{secs}s"
    else
      "#{secs}s"
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Tanakai
  # Base class for item pipelines. A pipeline holds a reference to a spider
  # and forwards storage, uniqueness, saving and logging calls to it.
  class Pipeline
    # Error class available to pipelines as DropItemError.
    DropItemError = Class.new(StandardError)

    class << self
      # Pipeline name as a symbol: the class name without its first namespace
      # segment, underscored.
      def name
        to_s.sub(/.*?::/, "").underscore.to_sym
      end
    end

    include BaseHelper
    attr_accessor :spider

    # Same value as the class-level name.
    def name
      self.class.name
    end

    ###

    # Storage of the attached spider.
    def storage
      spider.storage
    end

    # Forwards the uniqueness check to the attached spider.
    def unique?(scope, value)
      spider.unique?(scope, value)
    end

    # Forwards to the spider's save_to with the same arguments.
    def save_to(path, item, format:, position: true, append: false)
      spider.save_to(path, item, format: format, position: position, append: append)
    end

    # Logger of the attached spider.
    def logger
      spider.logger
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'pmap'
|
2
|
+
|
3
|
+
module Tanakai
  # Runs a list of spiders, each one as a separate `bundle exec tanakai crawl`
  # child process, using `peach_with_index` with the configured jobs count.
  # Session details are kept in #session_info and passed to the optional
  # runner_at_start_callback / runner_at_stop_callback from the configuration.
  class Runner
    attr_reader :jobs, :spiders, :session_info

    # Stores the arguments, builds the session info hash and exports the
    # SESSION_ID and RBCAT_COLORIZER environment variables.
    def initialize(spiders, parallel_jobs)
      @jobs = parallel_jobs
      @spiders = spiders
      @start_time = Time.now

      @session_info = {
        id: @start_time.to_i,
        status: :processing,
        start_time: @start_time,
        stop_time: nil,
        environment: Tanakai.env,
        concurrent_jobs: @jobs,
        spiders: @spiders
      }

      zone = Tanakai.configuration.time_zone
      Tanakai.time_zone = zone if zone

      ENV["SESSION_ID"] = @start_time.to_i.to_s
      ENV["RBCAT_COLORIZER"] = "false"
    end

    # Runs every spider and waits for all of them.
    #
    # On success the session is marked :completed. On an error, signal or exit
    # the session is marked :failed and the exception is re-raised, unless
    # exception_on_fail is false, in which case [session_info, exception] is
    # returned. The stop callback and the final log line run in both cases.
    def run!(exception_on_fail: true)
      puts ">>> Runner: started: #{session_info}"
      start_hook = Tanakai.configuration.runner_at_start_callback
      start_hook.call(session_info) if start_hook

      # Flipped to false in the rescue clause; the block checks it before
      # starting a spider.
      running = true
      spiders.peach_with_index(jobs) do |spider, index|
        next unless running

        puts "> Runner: started spider: #{spider}, index: #{index}"
        child_pid = spawn("bundle", "exec", "tanakai", "crawl", spider, [:out, :err] => "log/#{spider}.log")
        Process.wait(child_pid)

        puts "< Runner: stopped spider: #{spider}, index: #{index}"
      end
    rescue StandardError, SignalException, SystemExit => error
      running = false

      session_info.merge!(status: :failed, error: error.inspect, stop_time: Time.now)
      raise(error) if exception_on_fail

      [session_info, error]
    else
      session_info.merge!(status: :completed, stop_time: Time.now)
    ensure
      stop_hook = Tanakai.configuration.runner_at_stop_callback
      stop_hook.call(session_info) if stop_hook
      puts "<<< Runner: stopped: #{session_info}"
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
git_source(:github) { |repo| "https://github.com/#{repo}.git" }
|
3
|
+
|
4
|
+
ruby '>= 2.5'
|
5
|
+
|
6
|
+
# Framework
|
7
|
+
gem 'tanakai'
|
8
|
+
|
9
|
+
# Require files in directory and child directories recursively
|
10
|
+
gem 'require_all'
|
11
|
+
|
12
|
+
# Dotenv
|
13
|
+
gem 'dotenv'
|
14
|
+
|
15
|
+
# To debug spiders:
|
16
|
+
group :development do
|
17
|
+
gem 'byebug', platforms: :mri
|
18
|
+
gem 'pry'
|
19
|
+
end
|
20
|
+
|
21
|
+
# If you want to save items to the database, require one of these gems:
|
22
|
+
# gem 'sqlite3'
|
23
|
+
# gem 'pg'
|
24
|
+
# gem 'mysql2'
|
25
|
+
|
26
|
+
# And use your preferred ORM/database connector:
|
27
|
+
# gem 'activerecord', require: 'active_record'
|
28
|
+
# gem 'sequel'
|
@@ -0,0 +1,37 @@
|
|
1
|
+
Tanakai.configure do |config|
|
2
|
+
# Default logger has colored mode in development.
|
3
|
+
# If you would like to disable it, set `colorize_logger` to false.
|
4
|
+
# config.colorize_logger = false
|
5
|
+
|
6
|
+
# Logger level for default logger:
|
7
|
+
# config.log_level = :info
|
8
|
+
|
9
|
+
# Custom logger:
|
10
|
+
# config.logger = Logger.new(STDOUT)
|
11
|
+
|
12
|
+
# Custom time zone (for logs):
|
13
|
+
# config.time_zone = "UTC"
|
14
|
+
# config.time_zone = "Europe/Moscow"
|
15
|
+
|
16
|
+
# At start callback for a runner. Accepts argument with info as hash with
|
17
|
+
# keys: id, status, start_time, environment, concurrent_jobs, spiders list.
|
18
|
+
# For example, you can use this callback to send a notification when the runner has started:
|
19
|
+
# config.runner_at_start_callback = lambda do |info|
|
20
|
+
# json = JSON.pretty_generate(info)
|
21
|
+
# Sender.send_notification("Started session: #{json}")
|
22
|
+
# end
|
23
|
+
|
24
|
+
# At stop callback for a runner. Accepts argument with info as hash with
|
25
|
+
# all `runner_at_start_callback` keys plus additional `stop_time` key. Also `status` contains
|
26
|
+
# stop status of a runner (completed or failed).
|
27
|
+
# You can use this callback to send notification when runner has been stopped:
|
28
|
+
# config.runner_at_stop_callback = lambda do |info|
|
29
|
+
# json = JSON.pretty_generate(info)
|
30
|
+
# Sender.send_notification("Stopped session: #{json}")
|
31
|
+
# end
|
32
|
+
|
33
|
+
# Provide custom chrome binary path (default is any available chrome/chromium in the PATH):
|
34
|
+
# config.selenium_chrome_path = "/usr/bin/chromium-browser"
|
35
|
+
# Provide custom selenium chromedriver path (default is "/usr/local/bin/chromedriver"):
|
36
|
+
# config.chromedriver_path = "/usr/local/bin/chromedriver"
|
37
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# software versions to install for `setup` command
|
2
|
+
setup:
|
3
|
+
ruby: 2.5.1
|
4
|
+
# check latest here http://phantomjs.org/download.html
|
5
|
+
phantomjs: 2.1.1
|
6
|
+
# check latest here https://github.com/mozilla/geckodriver/releases/
|
7
|
+
geckodriver: 0.21.0
|
8
|
+
# check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
|
9
|
+
chromedriver: 2.39
|
10
|
+
# settings for deploy command, you can use cli options as well (--repo-url, --repo-key-path)
|
11
|
+
deploy:
|
12
|
+
# repo_url: git@bitbucket.org:username/repo_name.git
|
13
|
+
# repo_key_path: ~/.ssh/id_rsa
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# require project gems
require 'bundler/setup'
Bundler.require(:default, Tanakai.env)

# require custom ENV variables located in .env file
require 'dotenv/load'

# require initializers, then helpers, then pipelines
["./config/initializers", "./helpers", "./pipelines"].each do |directory|
  Dir.glob(File.join(directory, "*.rb")).each { |file| require file }
end

# require spiders recursively in the `spiders/` folder
# (ApplicationSpider is loaded first, before the rest of the folder)
require_relative '../spiders/application_spider'
require_all "spiders"

# require Tanakai configuration
require_relative 'application'
|
File without changes
|
@@ -0,0 +1,57 @@
|
|
1
|
+
### Settings ###
|
2
|
+
require 'tzinfo'
|
3
|
+
|
4
|
+
# Export current PATH to the cron
|
5
|
+
env :PATH, ENV["PATH"]
|
6
|
+
|
7
|
+
# Use 24 hour format when using `at:` option
|
8
|
+
set :chronic_options, hours24: true
|
9
|
+
|
10
|
+
# Use local_to_utc helper to setup execution time using your local timezone instead
|
11
|
+
# of server's timezone (which is probably and should be UTC, to check run `$ timedatectl`).
|
12
|
+
# You may also want to set the same time zone in Tanakai (use `Tanakai.configuration.time_zone =` for that),
|
13
|
+
# to have spiders logs in a specific time zone format.
|
14
|
+
# Example usage of helper:
|
15
|
+
# every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
|
16
|
+
# crawl "google_spider.com", output: "log/google_spider.com.log"
|
17
|
+
# end
|
18
|
+
# Parses time_string with Time.parse and passes the result to
# `local_to_utc` of the TZInfo timezone named by `zone`.
def local_to_utc(time_string, zone:)
  timezone = TZInfo::Timezone.get(zone)
  timezone.local_to_utc(Time.parse(time_string))
end
|
21
|
+
|
22
|
+
# Note: by default Whenever exports cron commands with :environment == "production".
|
23
|
+
# Note: Whenever can only append log data to a log file (>>). If you want
|
24
|
+
# to overwrite (>) log file before each run, pass lambda:
|
25
|
+
# crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
|
26
|
+
|
27
|
+
# Project job types
|
28
|
+
job_type :crawl, "cd :path && TANAKAI_ENV=:environment bundle exec tanakai crawl :task :output"
|
29
|
+
job_type :runner, "cd :path && TANAKAI_ENV=:environment bundle exec tanakai runner --jobs :task :output"
|
30
|
+
|
31
|
+
# Single file job type
|
32
|
+
job_type :single, "cd :path && TANAKAI_ENV=:environment ruby :task :output"
|
33
|
+
# Single with bundle exec
|
34
|
+
job_type :single_bundle, "cd :path && TANAKAI_ENV=:environment bundle exec ruby :task :output"
|
35
|
+
|
36
|
+
### Schedule ###
|
37
|
+
# Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
|
38
|
+
# every 1.day do
|
39
|
+
# Example to schedule a single spider in the project:
|
40
|
+
# crawl "google_spider.com", output: "log/google_spider.com.log"
|
41
|
+
|
42
|
+
# Example to schedule all spiders in the project using runner. Each spider will write
|
43
|
+
# its own output to the `log/spider_name.log` file (handled by the runner itself).
|
44
|
+
# Runner output will be written to log/runner.log file.
|
45
|
+
# The numeric argument is the number of concurrent jobs:
|
46
|
+
# runner 3, output:"log/runner.log"
|
47
|
+
|
48
|
+
# Example to schedule single spider (without project):
|
49
|
+
# single "single_spider.rb", output: "single_spider.log"
|
50
|
+
# end
|
51
|
+
|
52
|
+
### How to set a cron schedule ###
|
53
|
+
# Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
|
54
|
+
# If you don't have whenever command, install the gem: `$ gem install whenever`.
|
55
|
+
|
56
|
+
### How to cancel a schedule ###
|
57
|
+
# Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,11 @@
|
|
1
|
+
class Saver < Tanakai::Pipeline
|
2
|
+
def process_item(item, options: {})
|
3
|
+
# Here you can save item to the database, send it to a remote API or
|
4
|
+
# simply save item to a file format using `save_to` helper:
|
5
|
+
|
6
|
+
# To get the name of a current spider: `spider.class.name`
|
7
|
+
# save_to "db/#{spider.class.name}.json", item, format: :pretty_json
|
8
|
+
|
9
|
+
item
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
class Validator < Tanakai::Pipeline
|
2
|
+
def process_item(item, options: {})
|
3
|
+
# Here you can validate item and raise `DropItemError`
|
4
|
+
# if one of the validations failed. Examples:
|
5
|
+
|
6
|
+
# Check item sku for uniqueness using built-in `unique?` helper:
|
7
|
+
# unless unique?(:sku, item[:sku])
|
8
|
+
# raise DropItemError, "Item sku is not unique"
|
9
|
+
# end
|
10
|
+
|
11
|
+
# Drop item if the title is shorter than 5 characters:
|
12
|
+
# if item[:title].size < 5
|
13
|
+
# raise DropItemError, "Item title is short"
|
14
|
+
# end
|
15
|
+
|
16
|
+
# Drop item if it doesn't contain any images:
|
17
|
+
# unless item[:images].present?
|
18
|
+
# raise DropItemError, "Item images are not present"
|
19
|
+
# end
|
20
|
+
|
21
|
+
# Pass item to the next pipeline (if it wasn't dropped)
|
22
|
+
item
|
23
|
+
end
|
24
|
+
end
|