kimurai 1.3.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +29 -0
- data/Gemfile +2 -2
- data/README.md +478 -649
- data/Rakefile +6 -6
- data/bin/console +3 -4
- data/exe/kimurai +0 -1
- data/kimurai.gemspec +38 -37
- data/lib/kimurai/base/saver.rb +15 -19
- data/lib/kimurai/base/storage.rb +1 -1
- data/lib/kimurai/base.rb +42 -38
- data/lib/kimurai/base_helper.rb +5 -4
- data/lib/kimurai/browser_builder/mechanize_builder.rb +44 -38
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +63 -51
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +61 -55
- data/lib/kimurai/browser_builder.rb +7 -31
- data/lib/kimurai/capybara_configuration.rb +1 -1
- data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
- data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
- data/lib/kimurai/capybara_ext/session/config.rb +1 -1
- data/lib/kimurai/capybara_ext/session.rb +40 -38
- data/lib/kimurai/cli/generator.rb +15 -15
- data/lib/kimurai/cli.rb +52 -85
- data/lib/kimurai/core_ext/array.rb +2 -2
- data/lib/kimurai/core_ext/hash.rb +1 -1
- data/lib/kimurai/core_ext/numeric.rb +4 -4
- data/lib/kimurai/pipeline.rb +2 -1
- data/lib/kimurai/runner.rb +6 -6
- data/lib/kimurai/template/Gemfile +2 -2
- data/lib/kimurai/template/config/boot.rb +4 -4
- data/lib/kimurai/template/config/schedule.rb +15 -15
- data/lib/kimurai/template/spiders/application_spider.rb +14 -14
- data/lib/kimurai/version.rb +1 -1
- data/lib/kimurai.rb +7 -3
- metadata +58 -65
- data/.travis.yml +0 -5
- data/lib/kimurai/automation/deploy.yml +0 -54
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
- data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
- data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
- data/lib/kimurai/automation/setup.yml +0 -44
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -171
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
- data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
- data/lib/kimurai/template/config/automation.yml +0 -13
data/lib/kimurai/cli.rb
CHANGED
|
@@ -4,18 +4,22 @@ module Kimurai
|
|
|
4
4
|
class CLI < Thor
|
|
5
5
|
map %w[--version -v] => :__print_version
|
|
6
6
|
|
|
7
|
-
desc
|
|
7
|
+
desc 'new PROJECT_NAME', 'Create a new Kimurai project'
|
|
8
|
+
def new(project_name)
|
|
9
|
+
raise 'Provide project name to generate a new project' unless project_name.present?
|
|
10
|
+
|
|
11
|
+
Generator.new.generate_project(project_name)
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
desc 'generate', 'Generator, available types: spider, schedule'
|
|
8
15
|
def generate(generator_type, *args)
|
|
9
16
|
case generator_type
|
|
10
|
-
when
|
|
11
|
-
project_name = args.shift
|
|
12
|
-
raise "Provide project name to generate a new project" unless project_name.present?
|
|
13
|
-
Generator.new.generate_project(project_name)
|
|
14
|
-
when "spider"
|
|
17
|
+
when 'spider'
|
|
15
18
|
spider_name = args.shift
|
|
16
|
-
raise
|
|
19
|
+
raise 'Provide spider name to generate a spider' unless spider_name.present?
|
|
20
|
+
|
|
17
21
|
Generator.new.generate_spider(spider_name, in_project: inside_project?)
|
|
18
|
-
when
|
|
22
|
+
when 'schedule'
|
|
19
23
|
Generator.new.generate_schedule
|
|
20
24
|
else
|
|
21
25
|
raise "Don't know this generator type: #{generator_type}"
|
|
@@ -24,82 +28,43 @@ module Kimurai
|
|
|
24
28
|
|
|
25
29
|
###
|
|
26
30
|
|
|
27
|
-
desc
|
|
28
|
-
option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
|
|
29
|
-
option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
|
|
30
|
-
option "ask-auth-pass", type: :boolean, banner: "Auth using password"
|
|
31
|
-
option "ssh-key-path", type: :string, banner: "Auth using ssh key"
|
|
32
|
-
option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
|
|
33
|
-
def setup(user_host)
|
|
34
|
-
command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get
|
|
35
|
-
|
|
36
|
-
pid = spawn *command
|
|
37
|
-
Process.wait pid
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
desc "deploy", "Deploy project to the server and update cron schedule"
|
|
41
|
-
option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
|
|
42
|
-
option "ask-auth-pass", type: :boolean, banner: "Auth using password"
|
|
43
|
-
option "ssh-key-path", type: :string, banner: "Auth using ssh key"
|
|
44
|
-
option "repo-url", type: :string, banner: "Repo url"
|
|
45
|
-
option "repo-key-path", type: :string, banner: "SSH key for a git repo"
|
|
46
|
-
def deploy(user_host)
|
|
47
|
-
if !`git status --short`.empty?
|
|
48
|
-
raise "Deploy: Please commit your changes first"
|
|
49
|
-
elsif `git remote`.empty?
|
|
50
|
-
raise "Deploy: Please add remote origin repository to your repo first"
|
|
51
|
-
elsif !`git rev-list master...origin/master`.empty?
|
|
52
|
-
raise "Deploy: Please push your commits to the remote origin repo first"
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
repo_url = options["repo-url"] ? options["repo-url"] : `git remote get-url origin`.strip
|
|
56
|
-
repo_name = repo_url[/\/([^\/]*)\.git/i, 1]
|
|
57
|
-
|
|
58
|
-
command = AnsibleCommandBuilder.new(user_host, options, playbook: "deploy",
|
|
59
|
-
vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
|
|
60
|
-
).get
|
|
61
|
-
|
|
62
|
-
pid = spawn *command
|
|
63
|
-
Process.wait pid
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
###
|
|
67
|
-
|
|
68
|
-
desc "crawl", "Run a particular spider by it's name"
|
|
31
|
+
desc 'crawl', "Run a particular spider by it's name"
|
|
69
32
|
def crawl(spider_name)
|
|
70
33
|
raise "Can't find Kimurai project" unless inside_project?
|
|
34
|
+
|
|
71
35
|
require './config/boot'
|
|
72
36
|
|
|
73
|
-
unless klass = Kimurai.find_by_name(spider_name)
|
|
37
|
+
unless (klass = Kimurai.find_by_name(spider_name))
|
|
74
38
|
raise "Can't find spider with name `#{spider_name}` in the project. " \
|
|
75
|
-
|
|
39
|
+
'To list all available spiders, run: `$ bundle exec kimurai list`'
|
|
76
40
|
end
|
|
77
41
|
|
|
78
42
|
# Set time_zone if exists
|
|
79
|
-
if time_zone = Kimurai.configuration.time_zone
|
|
43
|
+
if (time_zone = Kimurai.configuration.time_zone)
|
|
80
44
|
Kimurai.time_zone = time_zone
|
|
81
45
|
end
|
|
82
46
|
|
|
83
47
|
klass.crawl!
|
|
84
48
|
end
|
|
85
49
|
|
|
86
|
-
desc
|
|
87
|
-
option :url, type: :string, required: true, banner:
|
|
50
|
+
desc 'parse', 'Parse url in the particular spider method'
|
|
51
|
+
option :url, type: :string, required: true, banner: 'Url to pass to the method'
|
|
88
52
|
def parse(spider_name, method_name)
|
|
89
53
|
raise "Can't find Kimurai project" unless inside_project?
|
|
54
|
+
|
|
90
55
|
require './config/boot'
|
|
91
56
|
|
|
92
|
-
unless klass = Kimurai.find_by_name(spider_name)
|
|
57
|
+
unless (klass = Kimurai.find_by_name(spider_name))
|
|
93
58
|
raise "Can't find spider with name `#{spider_name}` in the project. " \
|
|
94
|
-
|
|
59
|
+
'To list all available spiders, run: `$ bundle exec kimurai list`'
|
|
95
60
|
end
|
|
96
61
|
|
|
97
|
-
klass.parse!(method_name, url: options[
|
|
62
|
+
klass.parse!(method_name, url: options['url'])
|
|
98
63
|
end
|
|
99
64
|
|
|
100
|
-
desc
|
|
101
|
-
option :engine, type: :string, banner:
|
|
102
|
-
option :url, type: :string, banner:
|
|
65
|
+
desc 'console', 'Start Kimurai console'
|
|
66
|
+
option :engine, type: :string, banner: 'Engine to use'
|
|
67
|
+
option :url, type: :string, banner: 'Url to process'
|
|
103
68
|
def console(spider_name = nil)
|
|
104
69
|
require 'pry'
|
|
105
70
|
require './config/boot' if inside_project?
|
|
@@ -107,70 +72,72 @@ module Kimurai
|
|
|
107
72
|
if spider_name
|
|
108
73
|
raise "Can't find Kimurai project" unless inside_project?
|
|
109
74
|
|
|
110
|
-
unless klass = Kimurai.find_by_name(spider_name)
|
|
75
|
+
unless (klass = Kimurai.find_by_name(spider_name))
|
|
111
76
|
raise "Can't find spider with name `#{spider_name}` in the project. " \
|
|
112
|
-
|
|
77
|
+
'To list all available spiders, run: `$ bundle exec kimurai list`'
|
|
113
78
|
end
|
|
114
79
|
else
|
|
115
80
|
klass = inside_project? ? ApplicationSpider : ::Kimurai::Base
|
|
116
81
|
end
|
|
117
82
|
|
|
118
|
-
engine = options[
|
|
119
|
-
|
|
83
|
+
engine = options['engine']&.delete(':')&.to_sym
|
|
84
|
+
if options['url']
|
|
85
|
+
klass.new(engine).request_to(:console, url: options['url'])
|
|
86
|
+
else
|
|
87
|
+
klass.new(engine).public_send(:console)
|
|
88
|
+
end
|
|
120
89
|
end
|
|
121
90
|
|
|
122
|
-
desc
|
|
91
|
+
desc 'list', 'List all available spiders in the current project'
|
|
123
92
|
def list
|
|
124
93
|
raise "Can't find Kimurai project" unless inside_project?
|
|
94
|
+
|
|
125
95
|
require './config/boot'
|
|
126
96
|
|
|
127
|
-
Kimurai.list.keys.each { |name| puts name }
|
|
97
|
+
Kimurai.list.keys.sort.each { |name| puts name }
|
|
128
98
|
end
|
|
129
99
|
|
|
130
|
-
desc
|
|
131
|
-
option :include, type: :array, default: [], banner:
|
|
132
|
-
option :exclude, type: :array, default: [], banner:
|
|
133
|
-
option :jobs, aliases: :j, type: :numeric, default: 1, banner:
|
|
100
|
+
desc 'runner', 'Run all spiders in the project in queue'
|
|
101
|
+
option :include, type: :array, default: [], banner: 'List of spiders to run'
|
|
102
|
+
option :exclude, type: :array, default: [], banner: 'List of spiders to exclude from run'
|
|
103
|
+
option :jobs, aliases: :j, type: :numeric, default: 1, banner: 'The number of concurrent jobs'
|
|
134
104
|
def runner
|
|
135
105
|
raise "Can't find Kimurai project" unless inside_project?
|
|
136
106
|
|
|
137
|
-
jobs = options[
|
|
138
|
-
raise "Jobs count can't be 0" if jobs
|
|
107
|
+
jobs = options['jobs']
|
|
108
|
+
raise "Jobs count can't be 0" if jobs.zero?
|
|
139
109
|
|
|
140
110
|
require './config/boot'
|
|
141
111
|
require 'kimurai/runner'
|
|
142
112
|
|
|
143
|
-
spiders = options[
|
|
144
|
-
spiders -= options[
|
|
113
|
+
spiders = options['include'].presence || Kimurai.list.keys
|
|
114
|
+
spiders -= options['exclude']
|
|
145
115
|
|
|
146
116
|
Runner.new(spiders, jobs).run!
|
|
147
117
|
end
|
|
148
118
|
|
|
149
|
-
desc
|
|
119
|
+
desc '--version, -v', 'Print the version'
|
|
150
120
|
def __print_version
|
|
151
121
|
puts VERSION
|
|
152
122
|
end
|
|
153
123
|
|
|
154
|
-
desc
|
|
124
|
+
desc 'dashboard', 'Run dashboard'
|
|
155
125
|
def dashboard
|
|
156
126
|
raise "Can't find Kimurai project" unless inside_project?
|
|
157
127
|
|
|
158
128
|
require './config/boot'
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
raise "Kimurai::Dashboard is not defined"
|
|
164
|
-
end
|
|
129
|
+
raise 'Kimurai::Dashboard is not defined' unless Object.const_defined?('Kimurai::Dashboard')
|
|
130
|
+
|
|
131
|
+
require 'kimurai/dashboard/app'
|
|
132
|
+
Kimurai::Dashboard::App.run!
|
|
165
133
|
end
|
|
166
134
|
|
|
167
135
|
private
|
|
168
136
|
|
|
169
137
|
def inside_project?
|
|
170
|
-
Dir.
|
|
138
|
+
Dir.exist?('spiders') && File.exist?('./config/boot.rb')
|
|
171
139
|
end
|
|
172
140
|
end
|
|
173
141
|
end
|
|
174
142
|
|
|
175
143
|
require_relative 'cli/generator'
|
|
176
|
-
require_relative 'cli/ansible_command_builder'
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
class Array
|
|
2
2
|
def in_sorted_groups(number, fill_width = nil)
|
|
3
|
-
sorted_groups = Array.new(number) { |
|
|
3
|
+
sorted_groups = Array.new(number) { |_a| [] }
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
in_groups_of(number, fill_width).each do |group|
|
|
6
6
|
number.times do |i|
|
|
7
7
|
group.fetch(i) rescue next
|
|
8
8
|
sorted_groups[i] << group[i]
|
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
class Numeric
|
|
2
2
|
# https://stackoverflow.com/a/1679963
|
|
3
3
|
def duration
|
|
4
|
-
secs =
|
|
4
|
+
secs = to_int
|
|
5
5
|
mins = secs / 60
|
|
6
6
|
hours = mins / 60
|
|
7
7
|
days = hours / 24
|
|
8
8
|
|
|
9
|
-
if days
|
|
9
|
+
if days.positive?
|
|
10
10
|
"#{days}d, #{hours % 24}h"
|
|
11
|
-
elsif hours
|
|
11
|
+
elsif hours.positive?
|
|
12
12
|
"#{hours}h, #{mins % 60}m"
|
|
13
|
-
elsif mins
|
|
13
|
+
elsif mins.positive?
|
|
14
14
|
"#{mins}m, #{secs % 60}s"
|
|
15
15
|
elsif secs >= 0
|
|
16
16
|
"#{secs}s"
|
data/lib/kimurai/pipeline.rb
CHANGED
data/lib/kimurai/runner.rb
CHANGED
|
@@ -19,17 +19,17 @@ module Kimurai
|
|
|
19
19
|
spiders: @spiders
|
|
20
20
|
}
|
|
21
21
|
|
|
22
|
-
if time_zone = Kimurai.configuration.time_zone
|
|
22
|
+
if (time_zone = Kimurai.configuration.time_zone)
|
|
23
23
|
Kimurai.time_zone = time_zone
|
|
24
24
|
end
|
|
25
25
|
|
|
26
|
-
ENV.store(
|
|
27
|
-
ENV.store(
|
|
26
|
+
ENV.store('SESSION_ID', @start_time.to_i.to_s)
|
|
27
|
+
ENV.store('RBCAT_COLORIZER', 'false')
|
|
28
28
|
end
|
|
29
29
|
|
|
30
30
|
def run!(exception_on_fail: true)
|
|
31
31
|
puts ">>> Runner: started: #{session_info}"
|
|
32
|
-
if at_start_callback = Kimurai.configuration.runner_at_start_callback
|
|
32
|
+
if (at_start_callback = Kimurai.configuration.runner_at_start_callback)
|
|
33
33
|
at_start_callback.call(session_info)
|
|
34
34
|
end
|
|
35
35
|
|
|
@@ -38,7 +38,7 @@ module Kimurai
|
|
|
38
38
|
next unless running
|
|
39
39
|
|
|
40
40
|
puts "> Runner: started spider: #{spider}, index: #{i}"
|
|
41
|
-
pid = spawn(
|
|
41
|
+
pid = spawn('bundle', 'exec', 'kimurai', 'crawl', spider, %i[out err] => "log/#{spider}.log")
|
|
42
42
|
Process.wait pid
|
|
43
43
|
|
|
44
44
|
puts "< Runner: stopped spider: #{spider}, index: #{i}"
|
|
@@ -51,7 +51,7 @@ module Kimurai
|
|
|
51
51
|
else
|
|
52
52
|
session_info.merge!(status: :completed, stop_time: Time.now)
|
|
53
53
|
ensure
|
|
54
|
-
if at_stop_callback = Kimurai.configuration.runner_at_stop_callback
|
|
54
|
+
if (at_stop_callback = Kimurai.configuration.runner_at_stop_callback)
|
|
55
55
|
at_stop_callback.call(session_info)
|
|
56
56
|
end
|
|
57
57
|
puts "<<< Runner: stopped: #{session_info}"
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
source 'https://rubygems.org'
|
|
2
2
|
git_source(:github) { |repo| "https://github.com/#{repo}.git" }
|
|
3
3
|
|
|
4
|
-
ruby '>=
|
|
4
|
+
ruby '>= 3.1'
|
|
5
5
|
|
|
6
6
|
# Framework
|
|
7
|
-
gem 'kimurai', '~>
|
|
7
|
+
gem 'kimurai', '~> 2.0'
|
|
8
8
|
|
|
9
9
|
# Require files in directory and child directories recursively
|
|
10
10
|
gem 'require_all'
|
|
@@ -6,17 +6,17 @@ Bundler.require(:default, Kimurai.env)
|
|
|
6
6
|
require 'dotenv/load'
|
|
7
7
|
|
|
8
8
|
# require initializers
|
|
9
|
-
Dir.glob(File.join(
|
|
9
|
+
Dir.glob(File.join('./config/initializers', '*.rb'), &method(:require))
|
|
10
10
|
|
|
11
11
|
# require helpers
|
|
12
|
-
Dir.glob(File.join(
|
|
12
|
+
Dir.glob(File.join('./helpers', '*.rb'), &method(:require))
|
|
13
13
|
|
|
14
14
|
# require pipelines
|
|
15
|
-
Dir.glob(File.join(
|
|
15
|
+
Dir.glob(File.join('./pipelines', '*.rb'), &method(:require))
|
|
16
16
|
|
|
17
17
|
# require spiders recursively in the `spiders/` folder
|
|
18
18
|
require_relative '../spiders/application_spider'
|
|
19
|
-
require_all
|
|
19
|
+
require_all 'spiders'
|
|
20
20
|
|
|
21
21
|
# require Kimurai configuration
|
|
22
22
|
require_relative 'application'
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
require 'tzinfo'
|
|
3
3
|
|
|
4
4
|
# Export current PATH to the cron
|
|
5
|
-
env :PATH, ENV[
|
|
5
|
+
env :PATH, ENV['PATH']
|
|
6
6
|
|
|
7
7
|
# Use 24 hour format when using `at:` option
|
|
8
8
|
set :chronic_options, hours24: true
|
|
@@ -19,34 +19,34 @@ def local_to_utc(time_string, zone:)
|
|
|
19
19
|
TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
|
|
20
20
|
end
|
|
21
21
|
|
|
22
|
-
#
|
|
22
|
+
# NOTE: by default Whenever exports cron commands with :environment == "production".
|
|
23
23
|
# Note: Whenever can only append log data to a log file (>>). If you want
|
|
24
24
|
# to overwrite (>) log file before each run, pass lambda:
|
|
25
25
|
# crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
|
|
26
26
|
|
|
27
27
|
# Project job types
|
|
28
|
-
job_type :crawl,
|
|
29
|
-
job_type :runner,
|
|
28
|
+
job_type :crawl, 'cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output'
|
|
29
|
+
job_type :runner, 'cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output'
|
|
30
30
|
|
|
31
31
|
# Single file job type
|
|
32
|
-
job_type :single,
|
|
32
|
+
job_type :single, 'cd :path && KIMURAI_ENV=:environment ruby :task :output'
|
|
33
33
|
# Single with bundle exec
|
|
34
|
-
job_type :single_bundle,
|
|
34
|
+
job_type :single_bundle, 'cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output'
|
|
35
35
|
|
|
36
36
|
### Schedule ###
|
|
37
37
|
# Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
|
|
38
38
|
# every 1.day do
|
|
39
|
-
|
|
40
|
-
|
|
39
|
+
# Example to schedule a single spider in the project:
|
|
40
|
+
# crawl "google_spider.com", output: "log/google_spider.com.log"
|
|
41
41
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
42
|
+
# Example to schedule all spiders in the project using runner. Each spider will write
|
|
43
|
+
# it's own output to the `log/spider_name.log` file (handled by a runner itself).
|
|
44
|
+
# Runner output will be written to log/runner.log file.
|
|
45
|
+
# Argument number it's a count of concurrent jobs:
|
|
46
|
+
# runner 3, output:"log/runner.log"
|
|
47
47
|
|
|
48
|
-
|
|
49
|
-
|
|
48
|
+
# Example to schedule single spider (without project):
|
|
49
|
+
# single "single_spider.rb", output: "single_spider.log"
|
|
50
50
|
# end
|
|
51
51
|
|
|
52
52
|
### How to set a cron schedule ###
|
|
@@ -5,19 +5,18 @@
|
|
|
5
5
|
class ApplicationSpider < Kimurai::Base
|
|
6
6
|
include ApplicationHelper
|
|
7
7
|
|
|
8
|
-
# Default engine for spiders (available engines: :mechanize, :
|
|
9
|
-
|
|
10
|
-
@engine = :poltergeist_phantomjs
|
|
8
|
+
# Default engine for spiders (available engines: :mechanize, :selenium_firefox, :selenium_chrome)
|
|
9
|
+
@engine = :selenium_chrome
|
|
11
10
|
|
|
12
11
|
# Pipelines list, by order.
|
|
13
12
|
# To process item through pipelines pass item to the `send_item` method
|
|
14
|
-
@pipelines = [
|
|
13
|
+
@pipelines = %i[validator saver]
|
|
15
14
|
|
|
16
15
|
# Default config. Set here options which are default for all spiders inherited
|
|
17
16
|
# from ApplicationSpider. Child's class config will be deep merged with this one
|
|
18
17
|
@config = {
|
|
19
18
|
# Custom headers, format: hash. Example: { "some header" => "some value", "another header" => "another value" }
|
|
20
|
-
# Works
|
|
19
|
+
# Works for :mechanize engine. Selenium doesn't allow to set/get headers.
|
|
21
20
|
# headers: {},
|
|
22
21
|
|
|
23
22
|
# Custom User Agent, format: string or lambda.
|
|
@@ -49,7 +48,7 @@ class ApplicationSpider < Kimurai::Base
|
|
|
49
48
|
# window_size: [1366, 768],
|
|
50
49
|
|
|
51
50
|
# Skip images downloading if true, works for all engines
|
|
52
|
-
disable_images: true,
|
|
51
|
+
# disable_images: true,
|
|
53
52
|
|
|
54
53
|
# Selenium engines only: headless mode, `:native` or `:virtual_display` (default is :native)
|
|
55
54
|
# Although native mode has a better performance, virtual display mode
|
|
@@ -61,14 +60,9 @@ class ApplicationSpider < Kimurai::Base
|
|
|
61
60
|
# Format: array of strings. Works only for :selenium_firefox and selenium_chrome
|
|
62
61
|
# proxy_bypass_list: [],
|
|
63
62
|
|
|
64
|
-
# Option to provide custom SSL certificate. Works only for :
|
|
63
|
+
# Option to provide custom SSL certificate. Works only for :mechanize
|
|
65
64
|
# ssl_cert_path: "path/to/ssl_cert",
|
|
66
65
|
|
|
67
|
-
# Inject some JavaScript code to the browser.
|
|
68
|
-
# Format: array of strings, where each string is a path to JS file.
|
|
69
|
-
# Works only for poltergeist_phantomjs engine (Selenium doesn't support JS code injection)
|
|
70
|
-
# extensions: ["lib/code_to_inject.js"],
|
|
71
|
-
|
|
72
66
|
# Automatically skip duplicated (already visited) urls when using `request_to` method.
|
|
73
67
|
# Possible values: `true` or `hash` with options.
|
|
74
68
|
# In case of `true`, all visited urls will be added to the storage's scope `:requests_urls`
|
|
@@ -100,6 +94,12 @@ class ApplicationSpider < Kimurai::Base
|
|
|
100
94
|
# Format: same like for `skip_request_errors` option.
|
|
101
95
|
# retry_request_errors: [Net::ReadTimeout],
|
|
102
96
|
|
|
97
|
+
# Handle page encoding while parsing html response using Nokogiri. There are two modes:
|
|
98
|
+
# Auto (`:auto`) (try to fetch correct encoding from <meta http-equiv="Content-Type"> or <meta charset> tags)
|
|
99
|
+
# Set required encoding manually, example: `encoding: "GB2312"` (Set required encoding manually)
|
|
100
|
+
# Default this option is unset.
|
|
101
|
+
# encoding: nil,
|
|
102
|
+
|
|
103
103
|
# Restart browser if one of the options is true:
|
|
104
104
|
restart_if: {
|
|
105
105
|
# Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
|
|
@@ -112,12 +112,12 @@ class ApplicationSpider < Kimurai::Base
|
|
|
112
112
|
# Perform several actions before each request:
|
|
113
113
|
before_request: {
|
|
114
114
|
# Change proxy before each request. The `proxy:` option above should be presented
|
|
115
|
-
# and has lambda format. Works
|
|
115
|
+
# and has lambda format. Works for :mechanize engine.
|
|
116
116
|
# (Selenium doesn't support proxy rotation).
|
|
117
117
|
# change_proxy: true,
|
|
118
118
|
|
|
119
119
|
# Change user agent before each request. The `user_agent:` option above should be presented
|
|
120
|
-
# and has lambda format. Works
|
|
120
|
+
# and has lambda format. Works for :mechanize engine.
|
|
121
121
|
# (selenium doesn't support to get/set headers).
|
|
122
122
|
# change_user_agent: true,
|
|
123
123
|
|
data/lib/kimurai/version.rb
CHANGED
data/lib/kimurai.rb
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
require 'ostruct'
|
|
2
2
|
require 'logger'
|
|
3
3
|
require 'json'
|
|
4
|
+
require 'uri'
|
|
5
|
+
|
|
4
6
|
require 'active_support'
|
|
5
7
|
require 'active_support/core_ext'
|
|
6
8
|
require 'rbcat'
|
|
@@ -28,26 +30,28 @@ module Kimurai
|
|
|
28
30
|
end
|
|
29
31
|
|
|
30
32
|
def env
|
|
31
|
-
ENV.fetch(
|
|
33
|
+
ENV.fetch('KIMURAI_ENV', 'development')
|
|
32
34
|
end
|
|
33
35
|
|
|
34
36
|
def time_zone
|
|
35
|
-
ENV[
|
|
37
|
+
ENV['TZ']
|
|
36
38
|
end
|
|
37
39
|
|
|
38
40
|
def time_zone=(value)
|
|
39
|
-
ENV.store(
|
|
41
|
+
ENV.store('TZ', value)
|
|
40
42
|
end
|
|
41
43
|
|
|
42
44
|
def list
|
|
43
45
|
Base.descendants.map do |klass|
|
|
44
46
|
next unless klass.name
|
|
47
|
+
|
|
45
48
|
[klass.name, klass]
|
|
46
49
|
end.compact.to_h
|
|
47
50
|
end
|
|
48
51
|
|
|
49
52
|
def find_by_name(name)
|
|
50
53
|
return unless name
|
|
54
|
+
|
|
51
55
|
Base.descendants.find { |klass| klass.name == name }
|
|
52
56
|
end
|
|
53
57
|
end
|