kimurai 1.3.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -0
  3. data/CHANGELOG.md +29 -0
  4. data/Gemfile +2 -2
  5. data/README.md +478 -649
  6. data/Rakefile +6 -6
  7. data/bin/console +3 -4
  8. data/exe/kimurai +0 -1
  9. data/kimurai.gemspec +38 -37
  10. data/lib/kimurai/base/saver.rb +15 -19
  11. data/lib/kimurai/base/storage.rb +1 -1
  12. data/lib/kimurai/base.rb +42 -38
  13. data/lib/kimurai/base_helper.rb +5 -4
  14. data/lib/kimurai/browser_builder/mechanize_builder.rb +44 -38
  15. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +63 -51
  16. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +61 -55
  17. data/lib/kimurai/browser_builder.rb +7 -31
  18. data/lib/kimurai/capybara_configuration.rb +1 -1
  19. data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
  20. data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
  21. data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
  22. data/lib/kimurai/capybara_ext/session/config.rb +1 -1
  23. data/lib/kimurai/capybara_ext/session.rb +40 -38
  24. data/lib/kimurai/cli/generator.rb +15 -15
  25. data/lib/kimurai/cli.rb +52 -85
  26. data/lib/kimurai/core_ext/array.rb +2 -2
  27. data/lib/kimurai/core_ext/hash.rb +1 -1
  28. data/lib/kimurai/core_ext/numeric.rb +4 -4
  29. data/lib/kimurai/pipeline.rb +2 -1
  30. data/lib/kimurai/runner.rb +6 -6
  31. data/lib/kimurai/template/Gemfile +2 -2
  32. data/lib/kimurai/template/config/boot.rb +4 -4
  33. data/lib/kimurai/template/config/schedule.rb +15 -15
  34. data/lib/kimurai/template/spiders/application_spider.rb +14 -14
  35. data/lib/kimurai/version.rb +1 -1
  36. data/lib/kimurai.rb +7 -3
  37. metadata +58 -65
  38. data/.travis.yml +0 -5
  39. data/lib/kimurai/automation/deploy.yml +0 -54
  40. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
  41. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
  42. data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
  43. data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
  44. data/lib/kimurai/automation/setup.yml +0 -44
  45. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -171
  46. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
  47. data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
  48. data/lib/kimurai/template/config/automation.yml +0 -13
data/lib/kimurai/cli.rb CHANGED
@@ -4,18 +4,22 @@ module Kimurai
4
4
  class CLI < Thor
5
5
  map %w[--version -v] => :__print_version
6
6
 
7
- desc "generate", "Generator, available types: project, spider, schedule"
7
+ desc 'new PROJECT_NAME', 'Create a new Kimurai project'
8
+ def new(project_name)
9
+ raise 'Provide project name to generate a new project' unless project_name.present?
10
+
11
+ Generator.new.generate_project(project_name)
12
+ end
13
+
14
+ desc 'generate', 'Generator, available types: spider, schedule'
8
15
  def generate(generator_type, *args)
9
16
  case generator_type
10
- when "project"
11
- project_name = args.shift
12
- raise "Provide project name to generate a new project" unless project_name.present?
13
- Generator.new.generate_project(project_name)
14
- when "spider"
17
+ when 'spider'
15
18
  spider_name = args.shift
16
- raise "Provide spider name to generate a spider" unless spider_name.present?
19
+ raise 'Provide spider name to generate a spider' unless spider_name.present?
20
+
17
21
  Generator.new.generate_spider(spider_name, in_project: inside_project?)
18
- when "schedule"
22
+ when 'schedule'
19
23
  Generator.new.generate_schedule
20
24
  else
21
25
  raise "Don't know this generator type: #{generator_type}"
@@ -24,82 +28,43 @@ module Kimurai
24
28
 
25
29
  ###
26
30
 
27
- desc "setup", "Setup server"
28
- option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
29
- option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
30
- option "ask-auth-pass", type: :boolean, banner: "Auth using password"
31
- option "ssh-key-path", type: :string, banner: "Auth using ssh key"
32
- option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
33
- def setup(user_host)
34
- command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get
35
-
36
- pid = spawn *command
37
- Process.wait pid
38
- end
39
-
40
- desc "deploy", "Deploy project to the server and update cron schedule"
41
- option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
42
- option "ask-auth-pass", type: :boolean, banner: "Auth using password"
43
- option "ssh-key-path", type: :string, banner: "Auth using ssh key"
44
- option "repo-url", type: :string, banner: "Repo url"
45
- option "repo-key-path", type: :string, banner: "SSH key for a git repo"
46
- def deploy(user_host)
47
- if !`git status --short`.empty?
48
- raise "Deploy: Please commit your changes first"
49
- elsif `git remote`.empty?
50
- raise "Deploy: Please add remote origin repository to your repo first"
51
- elsif !`git rev-list master...origin/master`.empty?
52
- raise "Deploy: Please push your commits to the remote origin repo first"
53
- end
54
-
55
- repo_url = options["repo-url"] ? options["repo-url"] : `git remote get-url origin`.strip
56
- repo_name = repo_url[/\/([^\/]*)\.git/i, 1]
57
-
58
- command = AnsibleCommandBuilder.new(user_host, options, playbook: "deploy",
59
- vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
60
- ).get
61
-
62
- pid = spawn *command
63
- Process.wait pid
64
- end
65
-
66
- ###
67
-
68
- desc "crawl", "Run a particular spider by it's name"
31
+ desc 'crawl', "Run a particular spider by it's name"
69
32
  def crawl(spider_name)
70
33
  raise "Can't find Kimurai project" unless inside_project?
34
+
71
35
  require './config/boot'
72
36
 
73
- unless klass = Kimurai.find_by_name(spider_name)
37
+ unless (klass = Kimurai.find_by_name(spider_name))
74
38
  raise "Can't find spider with name `#{spider_name}` in the project. " \
75
- "To list all available spiders, run: `$ bundle exec kimurai list`"
39
+ 'To list all available spiders, run: `$ bundle exec kimurai list`'
76
40
  end
77
41
 
78
42
  # Set time_zone if exists
79
- if time_zone = Kimurai.configuration.time_zone
43
+ if (time_zone = Kimurai.configuration.time_zone)
80
44
  Kimurai.time_zone = time_zone
81
45
  end
82
46
 
83
47
  klass.crawl!
84
48
  end
85
49
 
86
- desc "parse", "Parse url in the particular spider method"
87
- option :url, type: :string, required: true, banner: "Url to pass to the method"
50
+ desc 'parse', 'Parse url in the particular spider method'
51
+ option :url, type: :string, required: true, banner: 'Url to pass to the method'
88
52
  def parse(spider_name, method_name)
89
53
  raise "Can't find Kimurai project" unless inside_project?
54
+
90
55
  require './config/boot'
91
56
 
92
- unless klass = Kimurai.find_by_name(spider_name)
57
+ unless (klass = Kimurai.find_by_name(spider_name))
93
58
  raise "Can't find spider with name `#{spider_name}` in the project. " \
94
- "To list all available spiders, run: `$ bundle exec kimurai list`"
59
+ 'To list all available spiders, run: `$ bundle exec kimurai list`'
95
60
  end
96
61
 
97
- klass.parse!(method_name, url: options["url"])
62
+ klass.parse!(method_name, url: options['url'])
98
63
  end
99
64
 
100
- desc "console", "Start Kimurai console"
101
- option :engine, type: :string, banner: "Engine to use"
102
- option :url, type: :string, banner: "Url to process"
65
+ desc 'console', 'Start Kimurai console'
66
+ option :engine, type: :string, banner: 'Engine to use'
67
+ option :url, type: :string, banner: 'Url to process'
103
68
  def console(spider_name = nil)
104
69
  require 'pry'
105
70
  require './config/boot' if inside_project?
@@ -107,70 +72,72 @@ module Kimurai
107
72
  if spider_name
108
73
  raise "Can't find Kimurai project" unless inside_project?
109
74
 
110
- unless klass = Kimurai.find_by_name(spider_name)
75
+ unless (klass = Kimurai.find_by_name(spider_name))
111
76
  raise "Can't find spider with name `#{spider_name}` in the project. " \
112
- "To list all available spiders, run: `$ bundle exec kimurai list`"
77
+ 'To list all available spiders, run: `$ bundle exec kimurai list`'
113
78
  end
114
79
  else
115
80
  klass = inside_project? ? ApplicationSpider : ::Kimurai::Base
116
81
  end
117
82
 
118
- engine = options["engine"]&.delete(":")&.to_sym
119
- klass.parse!(:console, engine, url: options["url"])
83
+ engine = options['engine']&.delete(':')&.to_sym
84
+ if options['url']
85
+ klass.new(engine).request_to(:console, url: options['url'])
86
+ else
87
+ klass.new(engine).public_send(:console)
88
+ end
120
89
  end
121
90
 
122
- desc "list", "List all available spiders in the current project"
91
+ desc 'list', 'List all available spiders in the current project'
123
92
  def list
124
93
  raise "Can't find Kimurai project" unless inside_project?
94
+
125
95
  require './config/boot'
126
96
 
127
- Kimurai.list.keys.each { |name| puts name }
97
+ Kimurai.list.keys.sort.each { |name| puts name }
128
98
  end
129
99
 
130
- desc "runner", "Run all spiders in the project in queue"
131
- option :include, type: :array, default: [], banner: "List of spiders to run"
132
- option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
133
- option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
100
+ desc 'runner', 'Run all spiders in the project in queue'
101
+ option :include, type: :array, default: [], banner: 'List of spiders to run'
102
+ option :exclude, type: :array, default: [], banner: 'List of spiders to exclude from run'
103
+ option :jobs, aliases: :j, type: :numeric, default: 1, banner: 'The number of concurrent jobs'
134
104
  def runner
135
105
  raise "Can't find Kimurai project" unless inside_project?
136
106
 
137
- jobs = options["jobs"]
138
- raise "Jobs count can't be 0" if jobs == 0
107
+ jobs = options['jobs']
108
+ raise "Jobs count can't be 0" if jobs.zero?
139
109
 
140
110
  require './config/boot'
141
111
  require 'kimurai/runner'
142
112
 
143
- spiders = options["include"].presence || Kimurai.list.keys
144
- spiders -= options["exclude"]
113
+ spiders = options['include'].presence || Kimurai.list.keys
114
+ spiders -= options['exclude']
145
115
 
146
116
  Runner.new(spiders, jobs).run!
147
117
  end
148
118
 
149
- desc "--version, -v", "Print the version"
119
+ desc '--version, -v', 'Print the version'
150
120
  def __print_version
151
121
  puts VERSION
152
122
  end
153
123
 
154
- desc "dashboard", "Run dashboard"
124
+ desc 'dashboard', 'Run dashboard'
155
125
  def dashboard
156
126
  raise "Can't find Kimurai project" unless inside_project?
157
127
 
158
128
  require './config/boot'
159
- if Object.const_defined?("Kimurai::Dashboard")
160
- require 'kimurai/dashboard/app'
161
- Kimurai::Dashboard::App.run!
162
- else
163
- raise "Kimurai::Dashboard is not defined"
164
- end
129
+ raise 'Kimurai::Dashboard is not defined' unless Object.const_defined?('Kimurai::Dashboard')
130
+
131
+ require 'kimurai/dashboard/app'
132
+ Kimurai::Dashboard::App.run!
165
133
  end
166
134
 
167
135
  private
168
136
 
169
137
  def inside_project?
170
- Dir.exists?("spiders") && File.exists?("./config/boot.rb")
138
+ Dir.exist?('spiders') && File.exist?('./config/boot.rb')
171
139
  end
172
140
  end
173
141
  end
174
142
 
175
143
  require_relative 'cli/generator'
176
- require_relative 'cli/ansible_command_builder'
data/lib/kimurai/core_ext/array.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  class Array
2
2
  def in_sorted_groups(number, fill_width = nil)
3
- sorted_groups = Array.new(number) { |a| a = [] }
3
+ sorted_groups = Array.new(number) { |_a| [] }
4
4
 
5
- self.in_groups_of(number, fill_width).each do |group|
5
+ in_groups_of(number, fill_width).each do |group|
6
6
  number.times do |i|
7
7
  group.fetch(i) rescue next
8
8
  sorted_groups[i] << group[i]
data/lib/kimurai/core_ext/hash.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  class Hash
2
2
  def deep_merge_excl(second, exclude)
3
- self.merge(second.slice(*exclude)).deep_merge(second.except(*exclude))
3
+ merge(second.slice(*exclude)).deep_merge(second.except(*exclude))
4
4
  end
5
5
  end
data/lib/kimurai/core_ext/numeric.rb CHANGED
@@ -1,16 +1,16 @@
1
1
  class Numeric
2
2
  # https://stackoverflow.com/a/1679963
3
3
  def duration
4
- secs = self.to_int
4
+ secs = to_int
5
5
  mins = secs / 60
6
6
  hours = mins / 60
7
7
  days = hours / 24
8
8
 
9
- if days > 0
9
+ if days.positive?
10
10
  "#{days}d, #{hours % 24}h"
11
- elsif hours > 0
11
+ elsif hours.positive?
12
12
  "#{hours}h, #{mins % 60}m"
13
- elsif mins > 0
13
+ elsif mins.positive?
14
14
  "#{mins}m, #{secs % 60}s"
15
15
  elsif secs >= 0
16
16
  "#{secs}s"
data/lib/kimurai/pipeline.rb CHANGED
@@ -1,8 +1,9 @@
1
1
  module Kimurai
2
2
  class Pipeline
3
3
  class DropItemError < StandardError; end
4
+
4
5
  def self.name
5
- self.to_s.sub(/.*?::/, "").underscore.to_sym
6
+ to_s.sub(/.*?::/, '').underscore.to_sym
6
7
  end
7
8
 
8
9
  include BaseHelper
data/lib/kimurai/runner.rb CHANGED
@@ -19,17 +19,17 @@ module Kimurai
19
19
  spiders: @spiders
20
20
  }
21
21
 
22
- if time_zone = Kimurai.configuration.time_zone
22
+ if (time_zone = Kimurai.configuration.time_zone)
23
23
  Kimurai.time_zone = time_zone
24
24
  end
25
25
 
26
- ENV.store("SESSION_ID", @start_time.to_i.to_s)
27
- ENV.store("RBCAT_COLORIZER", "false")
26
+ ENV.store('SESSION_ID', @start_time.to_i.to_s)
27
+ ENV.store('RBCAT_COLORIZER', 'false')
28
28
  end
29
29
 
30
30
  def run!(exception_on_fail: true)
31
31
  puts ">>> Runner: started: #{session_info}"
32
- if at_start_callback = Kimurai.configuration.runner_at_start_callback
32
+ if (at_start_callback = Kimurai.configuration.runner_at_start_callback)
33
33
  at_start_callback.call(session_info)
34
34
  end
35
35
 
@@ -38,7 +38,7 @@ module Kimurai
38
38
  next unless running
39
39
 
40
40
  puts "> Runner: started spider: #{spider}, index: #{i}"
41
- pid = spawn("bundle", "exec", "kimurai", "crawl", spider, [:out, :err] => "log/#{spider}.log")
41
+ pid = spawn('bundle', 'exec', 'kimurai', 'crawl', spider, %i[out err] => "log/#{spider}.log")
42
42
  Process.wait pid
43
43
 
44
44
  puts "< Runner: stopped spider: #{spider}, index: #{i}"
@@ -51,7 +51,7 @@ module Kimurai
51
51
  else
52
52
  session_info.merge!(status: :completed, stop_time: Time.now)
53
53
  ensure
54
- if at_stop_callback = Kimurai.configuration.runner_at_stop_callback
54
+ if (at_stop_callback = Kimurai.configuration.runner_at_stop_callback)
55
55
  at_stop_callback.call(session_info)
56
56
  end
57
57
  puts "<<< Runner: stopped: #{session_info}"
data/lib/kimurai/template/Gemfile CHANGED
@@ -1,10 +1,10 @@
1
1
  source 'https://rubygems.org'
2
2
  git_source(:github) { |repo| "https://github.com/#{repo}.git" }
3
3
 
4
- ruby '>= 2.5'
4
+ ruby '>= 3.1'
5
5
 
6
6
  # Framework
7
- gem 'kimurai', '~> 1.0'
7
+ gem 'kimurai', '~> 2.0'
8
8
 
9
9
  # Require files in directory and child directories recursively
10
10
  gem 'require_all'
data/lib/kimurai/template/config/boot.rb CHANGED
@@ -6,17 +6,17 @@ Bundler.require(:default, Kimurai.env)
6
6
  require 'dotenv/load'
7
7
 
8
8
  # require initializers
9
- Dir.glob(File.join("./config/initializers", "*.rb"), &method(:require))
9
+ Dir.glob(File.join('./config/initializers', '*.rb'), &method(:require))
10
10
 
11
11
  # require helpers
12
- Dir.glob(File.join("./helpers", "*.rb"), &method(:require))
12
+ Dir.glob(File.join('./helpers', '*.rb'), &method(:require))
13
13
 
14
14
  # require pipelines
15
- Dir.glob(File.join("./pipelines", "*.rb"), &method(:require))
15
+ Dir.glob(File.join('./pipelines', '*.rb'), &method(:require))
16
16
 
17
17
  # require spiders recursively in the `spiders/` folder
18
18
  require_relative '../spiders/application_spider'
19
- require_all "spiders"
19
+ require_all 'spiders'
20
20
 
21
21
  # require Kimurai configuration
22
22
  require_relative 'application'
data/lib/kimurai/template/config/schedule.rb CHANGED
@@ -2,7 +2,7 @@
2
2
  require 'tzinfo'
3
3
 
4
4
  # Export current PATH to the cron
5
- env :PATH, ENV["PATH"]
5
+ env :PATH, ENV['PATH']
6
6
 
7
7
  # Use 24 hour format when using `at:` option
8
8
  set :chronic_options, hours24: true
@@ -19,34 +19,34 @@ def local_to_utc(time_string, zone:)
19
19
  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
20
20
  end
21
21
 
22
- # Note: by default Whenever exports cron commands with :environment == "production".
22
+ # NOTE: by default Whenever exports cron commands with :environment == "production".
23
23
  # Note: Whenever can only append log data to a log file (>>). If you want
24
24
  # to overwrite (>) log file before each run, pass lambda:
25
25
  # crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
26
26
 
27
27
  # Project job types
28
- job_type :crawl, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output"
29
- job_type :runner, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output"
28
+ job_type :crawl, 'cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output'
29
+ job_type :runner, 'cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output'
30
30
 
31
31
  # Single file job type
32
- job_type :single, "cd :path && KIMURAI_ENV=:environment ruby :task :output"
32
+ job_type :single, 'cd :path && KIMURAI_ENV=:environment ruby :task :output'
33
33
  # Single with bundle exec
34
- job_type :single_bundle, "cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output"
34
+ job_type :single_bundle, 'cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output'
35
35
 
36
36
  ### Schedule ###
37
37
  # Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
38
38
  # every 1.day do
39
- # Example to schedule a single spider in the project:
40
- # crawl "google_spider.com", output: "log/google_spider.com.log"
39
+ # Example to schedule a single spider in the project:
40
+ # crawl "google_spider.com", output: "log/google_spider.com.log"
41
41
 
42
- # Example to schedule all spiders in the project using runner. Each spider will write
43
- # it's own output to the `log/spider_name.log` file (handled by a runner itself).
44
- # Runner output will be written to log/runner.log file.
45
- # Argument number it's a count of concurrent jobs:
46
- # runner 3, output:"log/runner.log"
42
+ # Example to schedule all spiders in the project using runner. Each spider will write
43
+ # it's own output to the `log/spider_name.log` file (handled by a runner itself).
44
+ # Runner output will be written to log/runner.log file.
45
+ # Argument number it's a count of concurrent jobs:
46
+ # runner 3, output:"log/runner.log"
47
47
 
48
- # Example to schedule single spider (without project):
49
- # single "single_spider.rb", output: "single_spider.log"
48
+ # Example to schedule single spider (without project):
49
+ # single "single_spider.rb", output: "single_spider.log"
50
50
  # end
51
51
 
52
52
  ### How to set a cron schedule ###
data/lib/kimurai/template/spiders/application_spider.rb CHANGED
@@ -5,19 +5,18 @@
5
5
  class ApplicationSpider < Kimurai::Base
6
6
  include ApplicationHelper
7
7
 
8
- # Default engine for spiders (available engines: :mechanize, :poltergeist_phantomjs,
9
- # :selenium_firefox, :selenium_chrome)
10
- @engine = :poltergeist_phantomjs
8
+ # Default engine for spiders (available engines: :mechanize, :selenium_firefox, :selenium_chrome)
9
+ @engine = :selenium_chrome
11
10
 
12
11
  # Pipelines list, by order.
13
12
  # To process item through pipelines pass item to the `send_item` method
14
- @pipelines = [:validator, :saver]
13
+ @pipelines = %i[validator saver]
15
14
 
16
15
  # Default config. Set here options which are default for all spiders inherited
17
16
  # from ApplicationSpider. Child's class config will be deep merged with this one
18
17
  @config = {
19
18
  # Custom headers, format: hash. Example: { "some header" => "some value", "another header" => "another value" }
20
- # Works only for :mechanize and :poltergeist_phantomjs engines (Selenium doesn't allow to set/get headers)
19
+ # Works for :mechanize engine. Selenium doesn't allow to set/get headers.
21
20
  # headers: {},
22
21
 
23
22
  # Custom User Agent, format: string or lambda.
@@ -49,7 +48,7 @@ class ApplicationSpider < Kimurai::Base
49
48
  # window_size: [1366, 768],
50
49
 
51
50
  # Skip images downloading if true, works for all engines
52
- disable_images: true,
51
+ # disable_images: true,
53
52
 
54
53
  # Selenium engines only: headless mode, `:native` or `:virtual_display` (default is :native)
55
54
  # Although native mode has a better performance, virtual display mode
@@ -61,14 +60,9 @@ class ApplicationSpider < Kimurai::Base
61
60
  # Format: array of strings. Works only for :selenium_firefox and selenium_chrome
62
61
  # proxy_bypass_list: [],
63
62
 
64
- # Option to provide custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
63
+ # Option to provide custom SSL certificate. Works only for :mechanize
65
64
  # ssl_cert_path: "path/to/ssl_cert",
66
65
 
67
- # Inject some JavaScript code to the browser.
68
- # Format: array of strings, where each string is a path to JS file.
69
- # Works only for poltergeist_phantomjs engine (Selenium doesn't support JS code injection)
70
- # extensions: ["lib/code_to_inject.js"],
71
-
72
66
  # Automatically skip duplicated (already visited) urls when using `request_to` method.
73
67
  # Possible values: `true` or `hash` with options.
74
68
  # In case of `true`, all visited urls will be added to the storage's scope `:requests_urls`
@@ -100,6 +94,12 @@ class ApplicationSpider < Kimurai::Base
100
94
  # Format: same like for `skip_request_errors` option.
101
95
  # retry_request_errors: [Net::ReadTimeout],
102
96
 
97
+ # Handle page encoding while parsing html response using Nokogiri. There are two modes:
98
+ # Auto (`:auto`) (try to fetch correct encoding from <meta http-equiv="Content-Type"> or <meta charset> tags)
99
+ # Set required encoding manually, example: `encoding: "GB2312"` (Set required encoding manually)
100
+ # Default this option is unset.
101
+ # encoding: nil,
102
+
103
103
  # Restart browser if one of the options is true:
104
104
  restart_if: {
105
105
  # Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
@@ -112,12 +112,12 @@ class ApplicationSpider < Kimurai::Base
112
112
  # Perform several actions before each request:
113
113
  before_request: {
114
114
  # Change proxy before each request. The `proxy:` option above should be presented
115
- # and has lambda format. Works only for poltergeist and mechanize engines
115
+ # and has lambda format. Works for :mechanize engine.
116
116
  # (Selenium doesn't support proxy rotation).
117
117
  # change_proxy: true,
118
118
 
119
119
  # Change user agent before each request. The `user_agent:` option above should be presented
120
- # and has lambda format. Works only for poltergeist and mechanize engines
120
+ # and has lambda format. Works for :mechanize engine.
121
121
  # (selenium doesn't support to get/set headers).
122
122
  # change_user_agent: true,
123
123
 
data/lib/kimurai/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Kimurai
2
- VERSION = "1.3.2"
2
+ VERSION = '2.0.0'.freeze
3
3
  end
data/lib/kimurai.rb CHANGED
@@ -1,6 +1,8 @@
1
1
  require 'ostruct'
2
2
  require 'logger'
3
3
  require 'json'
4
+ require 'uri'
5
+
4
6
  require 'active_support'
5
7
  require 'active_support/core_ext'
6
8
  require 'rbcat'
@@ -28,26 +30,28 @@ module Kimurai
28
30
  end
29
31
 
30
32
  def env
31
- ENV.fetch("KIMURAI_ENV") { "development" }
33
+ ENV.fetch('KIMURAI_ENV', 'development')
32
34
  end
33
35
 
34
36
  def time_zone
35
- ENV["TZ"]
37
+ ENV['TZ']
36
38
  end
37
39
 
38
40
  def time_zone=(value)
39
- ENV.store("TZ", value)
41
+ ENV.store('TZ', value)
40
42
  end
41
43
 
42
44
  def list
43
45
  Base.descendants.map do |klass|
44
46
  next unless klass.name
47
+
45
48
  [klass.name, klass]
46
49
  end.compact.to_h
47
50
  end
48
51
 
49
52
  def find_by_name(name)
50
53
  return unless name
54
+
51
55
  Base.descendants.find { |klass| klass.name == name }
52
56
  end
53
57
  end