kimurai_dynamic 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CHANGELOG.md +111 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +2038 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai/automation/deploy.yml +54 -0
  14. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  15. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  16. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  17. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  18. data/lib/kimurai/automation/setup.yml +45 -0
  19. data/lib/kimurai/base/saver.rb +106 -0
  20. data/lib/kimurai/base/storage.rb +54 -0
  21. data/lib/kimurai/base.rb +330 -0
  22. data/lib/kimurai/base_helper.rb +22 -0
  23. data/lib/kimurai/browser_builder/mechanize_builder.rb +154 -0
  24. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
  25. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +199 -0
  26. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +204 -0
  27. data/lib/kimurai/browser_builder.rb +20 -0
  28. data/lib/kimurai/capybara_configuration.rb +10 -0
  29. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  30. data/lib/kimurai/capybara_ext/mechanize/driver.rb +71 -0
  31. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  32. data/lib/kimurai/capybara_ext/selenium/driver.rb +34 -0
  33. data/lib/kimurai/capybara_ext/session/config.rb +22 -0
  34. data/lib/kimurai/capybara_ext/session.rb +249 -0
  35. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  36. data/lib/kimurai/cli/generator.rb +57 -0
  37. data/lib/kimurai/cli.rb +183 -0
  38. data/lib/kimurai/core_ext/array.rb +14 -0
  39. data/lib/kimurai/core_ext/hash.rb +5 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +33 -0
  43. data/lib/kimurai/runner.rb +60 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/Gemfile +28 -0
  46. data/lib/kimurai/template/README.md +3 -0
  47. data/lib/kimurai/template/config/application.rb +37 -0
  48. data/lib/kimurai/template/config/automation.yml +13 -0
  49. data/lib/kimurai/template/config/boot.rb +22 -0
  50. data/lib/kimurai/template/config/initializers/.keep +0 -0
  51. data/lib/kimurai/template/config/schedule.rb +57 -0
  52. data/lib/kimurai/template/db/.keep +0 -0
  53. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  54. data/lib/kimurai/template/lib/.keep +0 -0
  55. data/lib/kimurai/template/log/.keep +0 -0
  56. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  57. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  58. data/lib/kimurai/template/spiders/application_spider.rb +143 -0
  59. data/lib/kimurai/template/tmp/.keep +0 -0
  60. data/lib/kimurai/version.rb +3 -0
  61. data/lib/kimurai.rb +54 -0
  62. metadata +349 -0
@@ -0,0 +1,249 @@
1
+ require 'capybara'
2
+ require 'nokogiri'
3
+ require 'json'
4
+ require_relative 'session/config'
5
+
6
module Capybara
  # Extensions to Capybara::Session used by spiders: request/response
  # bookkeeping and retry/skip handling around `visit`, driver lifecycle
  # helpers, response parsing and per-request options (restart limits,
  # cookies, user agent, proxy).
  class Session
    # Spider that owns this session. When it is nil, `visit` behaves exactly
    # like the unpatched Capybara `visit`.
    attr_accessor :spider

    alias_method :original_visit, :visit

    # Visits `visit_uri`, applying the configured delay and request options,
    # and retrying or skipping errors according to
    # `config.retry_request_errors` / `config.skip_request_errors`.
    #
    # visit_uri            - url to open.
    # delay                - Numeric or Range of seconds to sleep before the request.
    # skip_request_options - when true, `check_request_options` is not run.
    # max_retries          - how many times a retryable error is retried.
    #
    # Returns true on success, false when the error matched the skip list,
    # and nil when all retries are used up and the error is configured with
    # `skip_on_failure`. Any other error is re-raised.
    def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
      return original_visit(visit_uri) unless spider

      process_delay(delay) if delay
      retries = 0
      sleep_interval = 0

      begin
        check_request_options(visit_uri) unless skip_request_options
        driver.requests += 1
        logger.info "Browser: started get request to: #{visit_uri}"
        spider.class.update(:visits, :requests) if spider.with_info

        original_visit(visit_uri)
      rescue => e
        if match_error?(e, type: :to_skip)
          logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}"
          spider.add_event(:requests_errors, e.inspect) if spider.with_info
          false
        elsif match_error?(e, type: :to_retry)
          logger.error "Browser: retry request error: #{e.inspect}, url: #{visit_uri}"
          spider.add_event(:requests_errors, e.inspect) if spider.with_info

          if (retries += 1) <= max_retries
            # Each retry waits 15 seconds longer than the previous one.
            sleep_interval += 15
            logger.info "Browser: sleep #{sleep_interval} seconds and process retry № #{retries} to the url: #{visit_uri}"
            sleep sleep_interval
            retry
          else
            logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
            raise e unless skip_error_on_failure?(e)
          end
        else
          raise e
        end
      else
        driver.responses += 1
        logger.info "Browser: finished get request to: #{visit_uri}"
        spider.class.update(:visits, :responses) if spider.with_info
        driver.visited = true unless driver.visited
        true
      ensure
        if spider.with_info
          logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
        end

        if memory = driver.current_memory
          logger.debug "Browser: driver.current_memory: #{memory}"
        end
      end
    end

    # Quits the current driver (if any) and forgets it, so that the next
    # call to `driver` has no cached instance.
    def destroy_driver!
      if @driver
        begin
          @driver.quit
        rescue Net::ReadTimeout
          # Selenium-like drivers may time out on quit; try once more.
          @driver.quit
        end

        @driver = nil
        logger.info "Browser: driver #{mode} has been destroyed"
      else
        logger.warn "Browser: driver #{mode} is not present"
      end
    end

    # Restarts the browser. Poltergeist restarts its browser in place and
    # resets the request/response counters; every other engine destroys the
    # driver and calls `driver` again.
    def restart!
      if mode.match?(/poltergeist/)
        @driver.browser.restart
        @driver.requests, @driver.responses = 0, 0
      else
        destroy_driver!
        driver
      end

      logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
    end

    # Parses the current page body.
    #
    # response_type - :html (default) returns a Nokogiri document, using
    #                 `config.encoding` when set (with :auto the charset is
    #                 taken from the page's <meta ... charset=...> tag);
    #                 :json returns the result of JSON.parse.
    #
    # Returns nil for any other response_type.
    def current_response(response_type = :html)
      case response_type
      when :html
        if config.encoding
          if config.encoding == :auto
            charset = body.force_encoding("ISO-8859-1").encode("UTF-8")[/<meta.*?charset=["]?([\w+\d+\-]*)/i, 1]
            Nokogiri::HTML(body, nil, charset)
          else
            Nokogiri::HTML(body, nil, config.encoding)
          end
        else
          Nokogiri::HTML(body)
        end
      when :json
        JSON.parse(body)
      end
    end

    ###

    # Runs the given block inside a new tab and closes that tab afterwards,
    # even when the block (or the initial visit) raises.
    #
    # Usage (url):
    #   browser.within_new_window_by(url: "https://google.com") do
    #     # do some stuff; the tab is closed automatically afterwards
    #   end
    #
    # Usage (action), when the new tab is opened by some action, for example
    # by clicking on a particular element:
    #   action = -> { browser.find("//some/element/path").click }
    #   browser.within_new_window_by(action: action) do
    #     # do some stuff; the tab is closed automatically afterwards
    #   end
    #
    # Does nothing when neither `action` nor `url` is given.
    def within_new_window_by(action: nil, url: nil)
      if action
        opened_window = window_opened_by { action.call }
        within_window(opened_window) do
          begin
            yield
          ensure
            current_window.close
          end
        end
      elsif url
        within_window(open_new_window) do
          begin
            visit(url)
            yield
          ensure
            current_window.close
          end
        end
      end
    end

    ###

    # Scrolls the page down by 10000 pixels via JavaScript.
    def scroll_to_bottom
      execute_script("window.scrollBy(0,10000)")
    end

    private

    # True when `e` matches a Hash entry of `config.retry_request_errors`
    # that has a truthy :skip_on_failure.
    def skip_error_on_failure?(e)
      config.retry_request_errors.any? do |error|
        error.kind_of?(Hash) && error[:skip_on_failure] && e.class.ancestors.include?(error[:error])
      end
    end

    # True when `e` matches one of the configured errors for `type`
    # (:to_retry or :to_skip). An entry is either an error class or a Hash
    # with :error (class) and an optional :message (String or Regexp) that
    # must also match `e.message`.
    def match_error?(e, type:)
      errors =
        case type
        when :to_retry then config.retry_request_errors
        when :to_skip then config.skip_request_errors
        end

      errors.any? do |error|
        next e.class.ancestors.include?(error) unless error.kind_of?(Hash)

        match_class = e.class.ancestors.include?(error[:error])
        next match_class unless error[:message].present?

        match_message =
          if error[:message].kind_of?(Regexp)
            e.message&.match?(error[:message])
          else
            e.message&.include?(error[:message])
          end

        match_message && match_class
      end
    end

    # Sleeps for `delay` seconds; a Range picks a random value inside it.
    def process_delay(delay)
      interval = delay.is_a?(Range) ? rand(delay) : delay
      logger.debug "Browser: sleep #{interval.round(2)} #{'second'.pluralize(interval)} before request..."
      sleep interval
    end

    # Applies the per-request options before `url_to_visit` is requested:
    # driver restart limits, cookies, user agent and proxy.
    def check_request_options(url_to_visit)
      # restart_if
      if memory_limit = config.restart_if[:memory_limit]
        memory = driver.current_memory
        if memory && memory >= memory_limit
          logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
          restart!
        end
      end

      if requests_limit = config.restart_if[:requests_limit]
        requests = driver.requests
        if requests >= requests_limit
          logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
          restart!
        end
      end

      # cookies
      # (Selenium only) if config.cookies present and browser was just created,
      # visit url_to_visit first and only then set cookies:
      if driver.visited.nil? && config.cookies && mode.match?(/selenium/)
        visit(url_to_visit, skip_request_options: true)
        set_config_cookies
      end

      if config.before_request[:clear_cookies]
        driver.clear_cookies
        logger.debug "Browser: cleared cookies before request"
      end

      if config.before_request[:clear_and_set_cookies]
        driver.clear_cookies

        # (Selenium only) if the browser has not visited any page yet, visit
        # url_to_visit first and then set cookies (needed after a restart):
        if driver.visited.nil? && mode.match?(/selenium/)
          visit(url_to_visit, skip_request_options: true)
        end

        set_config_cookies
        logger.debug "Browser: cleared and set cookies before request"
      end

      # user_agent
      if config.before_request[:change_user_agent]
        driver.add_header("User-Agent", config.user_agent.call)
        logger.debug "Browser: changed user_agent before request"
      end

      # proxy
      if config.before_request[:change_proxy]
        proxy_string = config.proxy.call
        driver.set_proxy(*proxy_string.split(":"))
        logger.debug "Browser: changed proxy before request"
      end
    end

    # Sets every cookie from `config.cookies` on the driver; a nil
    # `config.cookies` is treated as "no cookies".
    def set_config_cookies
      (config.cookies || []).each do |cookie|
        driver.set_cookie(cookie[:name], cookie[:value], cookie)
      end
    end

    # Logger of the owning spider.
    def logger
      spider.logger
    end
  end
end
@@ -0,0 +1,71 @@
1
+ require 'cliver'
2
+
3
module Kimurai
  class CLI
    # Builds the argument list for an `ansible-playbook` run of one of the
    # playbooks stored under the gem's lib/kimurai/automation directory.
    class AnsibleCommandBuilder
      # user_host - "user@host" or just "host".
      # options   - CLI options, read with string keys ("port", "local",
      #             "ask-sudo", "ask-auth-pass", "ssh-key-path").
      # playbook  - playbook name without extension, e.g. "setup".
      # vars      - extra variables passed to the playbook.
      def initialize(user_host, options, playbook:, vars: {})
        @user_host = user_host
        @options = options
        @playbook = playbook
        @vars = vars
      end

      # Returns the command as an Array of Strings.
      #
      # Variables found under the playbook's key in config/automation.yml
      # (when that file exists in the current directory) are used for every
      # variable that was not already given a truthy value in `vars`.
      #
      # Raises RuntimeError when `ansible-playbook` (or `sshpass`, if
      # password authentication was requested) cannot be found.
      def get
        unless Cliver.detect("ansible-playbook")
          raise "Can't find `ansible-playbook` executable, to install: " \
            "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
        end

        user = @user_host[/(.*?)\@/, 1]
        host = @user_host[/\@(.+)/, 1] || @user_host
        inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"

        gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir
        playbook_path = gem_dir + "/lib/kimurai/automation/" + "#{@playbook}.yml"

        command = [
          "ansible-playbook", playbook_path,
          "--inventory", inventory,
          "--ssh-extra-args", "-oForwardAgent=yes",
          "--connection", @options["local"] ? "local" : "smart",
          "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
        ]

        # Work on a copy so the caller's hash is left untouched.
        extra_vars = @vars.dup

        # File.exist? is used because File.exists? was removed in Ruby 3.2.
        if File.exist?("config/automation.yml")
          require 'yaml'
          settings = YAML.load_file("config/automation.yml")
          # An empty or non-mapping file has no per-playbook section.
          config = settings.is_a?(Hash) ? settings[@playbook] : nil
          if config
            config.each { |key, value| extra_vars[key] = value unless extra_vars[key] }
          end
        end

        extra_vars.each do |key, value|
          next unless value.present?
          command.push "--extra-vars", "#{key}=#{value}"
        end

        command.push "--user", user if user
        command.push "--ask-become-pass" if @options["ask-sudo"]

        if @options["ask-auth-pass"]
          unless Cliver.detect("sshpass")
            raise "Can't find `sshpass` executable for password authentication, to install: " \
              "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
          end

          command.push "--ask-pass"
        end

        if ssh_key_path = @options["ssh-key-path"]
          command.push "--private-key", ssh_key_path
        end

        command
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
module Kimurai
  class CLI
    # File generators behind the `generate` command: a new project, a single
    # spider file, or a schedule file.
    class Generator < Thor::Group
      include Thor::Actions

      # Directory that template paths are resolved against: two levels above
      # this file.
      def self.source_root
        File.dirname(File.expand_path('..', __FILE__))
      end

      # Copies the "template" directory to `project_name`, then runs
      # `bundle install` and `git init` inside it.
      def generate_project(project_name)
        directory "template", project_name
        inside(project_name) do
          run "bundle install"
          run "git init"
        end
      end

      # Creates a spider file for `spider_name`.
      #
      # in_project - when truthy the file goes to spiders/<name>.rb and the
      #              class inherits from ApplicationSpider; otherwise a
      #              standalone ./<name>.rb is written that requires kimurai,
      #              uses the :mechanize engine and calls `crawl!` at the end.
      #
      # Raises RuntimeError when the target file already exists.
      def generate_spider(spider_name, in_project:)
        spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
        # File.exist? is used because File.exists? was removed in Ruby 3.2.
        raise "Spider #{spider_path} already exists" if File.exist?(spider_path)

        spider_class = to_spider_class(spider_name)
        create_file spider_path do
          <<~RUBY
            class #{spider_class} < #{in_project ? 'ApplicationSpider' : 'Kimurai::Base'}
              @name = "#{spider_name}"
              @start_urls = []
              @config = {}

              def parse(response, url:, data: {})
              end
            end
          RUBY
        end

        unless in_project
          # Inserted right after the @name line, at the same nesting level.
          insert_into_file spider_path, "  @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
          prepend_to_file spider_path, "require 'kimurai'\n\n"
          append_to_file spider_path, "\n#{spider_class}.crawl!"
        end
      end

      # Copies the template schedule file to ./schedule.rb.
      def generate_schedule
        copy_file "template/config/schedule.rb", "./schedule.rb"
      end

      private

      # Turns a spider name into a class name: the first character and the
      # characters following "_", "-", "." or "/" are capitalized; "_" and
      # "." are dropped, while "-" and "/" each become "Dash".
      def to_spider_class(string)
        string.sub(/^./) { $&.capitalize }
          .gsub(/(?:_|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
          .gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{$2.capitalize}" }
          .gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
      end
    end
  end
end
@@ -0,0 +1,183 @@
1
+ require 'thor'
2
+
3
module Kimurai
  # Command line interface: generators, server setup/deploy through Ansible,
  # and commands to run, inspect and list the spiders of a project.
  class CLI < Thor
    map %w[--version -v] => :__print_version

    # Dispatches to the generator for "project", "spider" or "schedule".
    # Raises RuntimeError for an unknown type or a missing name.
    desc "generate", "Generator, available types: project, spider, schedule"
    def generate(generator_type, *args)
      case generator_type
      when "project"
        project_name = args.shift
        raise "Provide project name to generate a new project" unless project_name.present?
        Generator.new.generate_project(project_name)
      when "spider"
        spider_name = args.shift
        raise "Provide spider name to generate a spider" unless spider_name.present?
        Generator.new.generate_spider(spider_name, in_project: inside_project?)
      when "schedule"
        Generator.new.generate_schedule
      else
        raise "Don't know this generator type: #{generator_type}"
      end
    end

    ###

    # Runs the "setup" playbook against `user_host` and waits for it to finish.
    desc "setup", "Setup server"
    option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
    option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
    option "ask-auth-pass", type: :boolean, banner: "Auth using password"
    option "ssh-key-path", type: :string, banner: "Auth using ssh key"
    option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
    def setup(user_host)
      command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get
      run_and_wait(command)
    end

    # Runs the "deploy" playbook against `user_host`. Unless --skip-check is
    # given, refuses to deploy when the working tree is dirty, no remote is
    # configured, or master and origin/master differ.
    desc "deploy", "Deploy project to the server and update cron schedule"
    option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
    option "ask-auth-pass", type: :boolean, banner: "Auth using password"
    option "ssh-key-path", type: :string, banner: "Auth using ssh key"
    option "repo-url", type: :string, banner: "Repo url"
    option "repo-key-path", type: :string, banner: "SSH key for a git repo"
    option "skip-check", type: :boolean, default: false, banner: "Skip git repository checks"
    def deploy(user_host)
      unless options["skip-check"]
        if !`git status --short`.empty?
          raise "Deploy: Please commit your changes first"
        elsif `git remote`.empty?
          raise "Deploy: Please add remote origin repository to your repo first"
        elsif !`git rev-list master...origin/master`.empty?
          raise "Deploy: Please push your commits to the remote origin repo first"
        end
      end

      repo_url = options["repo-url"] ? options["repo-url"] : `git remote get-url origin`.strip
      repo_name = repo_url[/\/([^\/]*)\.git/i, 1]

      command = AnsibleCommandBuilder.new(user_host, options, playbook: "deploy",
        vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
      ).get

      run_and_wait(command)
    end

    ###

    # Loads the project and runs the spider registered under `spider_name`.
    desc "crawl", "Run a particular spider by it's name"
    def crawl(spider_name)
      raise "Can't find Kimurai project" unless inside_project?
      require './config/boot'

      klass = find_spider!(spider_name)

      # Set time_zone if exists
      if time_zone = Kimurai.configuration.time_zone
        Kimurai.time_zone = time_zone
      end

      klass.crawl!
    end

    # Calls `method_name` of the given spider for the url passed via --url.
    desc "parse", "Parse url in the particular spider method"
    option :url, type: :string, required: true, banner: "Url to pass to the method"
    def parse(spider_name, method_name)
      raise "Can't find Kimurai project" unless inside_project?
      require './config/boot'

      klass = find_spider!(spider_name)
      klass.parse!(method_name, url: options["url"])
    end

    # Opens a console for the given spider; without a name it uses
    # ApplicationSpider inside a project and Kimurai::Base elsewhere.
    desc "console", "Start Kimurai console"
    option :engine, type: :string, banner: "Engine to use"
    option :url, type: :string, banner: "Url to process"
    def console(spider_name = nil)
      require 'pry'
      require './config/boot' if inside_project?

      if spider_name
        raise "Can't find Kimurai project" unless inside_project?
        klass = find_spider!(spider_name)
      else
        klass = inside_project? ? ApplicationSpider : ::Kimurai::Base
      end

      # Accept both "selenium_chrome" and ":selenium_chrome".
      engine = options["engine"]&.delete(":")&.to_sym
      if url = options["url"]
        klass.new(engine).request_to(:console, url: url)
      else
        klass.new(engine).public_send(:console)
      end
    end

    # Prints the names of all spiders of the current project, sorted.
    desc "list", "List all available spiders in the current project"
    def list
      raise "Can't find Kimurai project" unless inside_project?
      require './config/boot'

      Kimurai.list.keys.sort.each { |name| puts name }
    end

    # Runs the selected spiders (all by default, minus --exclude) through
    # Runner with the requested number of jobs.
    desc "runner", "Run all spiders in the project in queue"
    option :include, type: :array, default: [], banner: "List of spiders to run"
    option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
    option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
    def runner
      raise "Can't find Kimurai project" unless inside_project?

      jobs = options["jobs"]
      raise "Jobs count can't be 0" if jobs == 0

      require './config/boot'
      require 'kimurai/runner'

      spiders = options["include"].presence || Kimurai.list.keys
      spiders -= options["exclude"]

      Runner.new(spiders, jobs).run!
    end

    desc "--version, -v", "Print the version"
    def __print_version
      puts VERSION
    end

    # Starts the dashboard app; raises when Kimurai::Dashboard is not defined
    # after the project has been booted.
    desc "dashboard", "Run dashboard"
    def dashboard
      raise "Can't find Kimurai project" unless inside_project?

      require './config/boot'
      if Object.const_defined?("Kimurai::Dashboard")
        require 'kimurai/dashboard/app'
        Kimurai::Dashboard::App.run!
      else
        raise "Kimurai::Dashboard is not defined"
      end
    end

    private

    # True when the current directory has a spiders/ directory and a
    # config/boot.rb file. Uses exist? because Dir.exists? and File.exists?
    # were removed in Ruby 3.2.
    def inside_project?
      Dir.exist?("spiders") && File.exist?("./config/boot.rb")
    end

    # Returns the spider class registered under `spider_name`, or raises
    # RuntimeError when there is none.
    def find_spider!(spider_name)
      klass = Kimurai.find_by_name(spider_name)
      return klass if klass

      raise "Can't find spider with name `#{spider_name}` in the project. " \
        "To list all available spiders, run: `$ bundle exec kimurai list`"
    end

    # Spawns `command` (an argv Array) and blocks until the process exits.
    def run_and_wait(command)
      pid = spawn(*command)
      Process.wait pid
    end
  end
end
181
+
182
+ require_relative 'cli/generator'
183
+ require_relative 'cli/ansible_command_builder'
@@ -0,0 +1,14 @@
1
class Array
  # Distributes the elements round-robin over `number` groups: element 0 goes
  # to group 0, element 1 to group 1, ... element `number` to group 0 again.
  #
  # number     - how many groups to build.
  # fill_width - passed straight through as the second argument of
  #              `in_groups_of`, which is expected to be available on Array
  #              (NOTE(review): presumably ActiveSupport's; confirm how it
  #              pads the last chunk for the value you pass).
  #
  # Returns an Array of `number` Arrays.
  def in_sorted_groups(number, fill_width = nil)
    sorted_groups = Array.new(number) { [] }

    in_groups_of(number, fill_width).each do |group|
      # A short trailing chunk simply contributes to fewer groups.
      group.each_with_index { |item, index| sorted_groups[index] << item }
    end

    sorted_groups
  end
end
@@ -0,0 +1,5 @@
1
class Hash
  # Merges `second` into a copy of the receiver. Keys listed in `exclude`
  # are taken from `second` with a plain `merge` (the value is replaced as a
  # whole); all remaining keys of `second` go through `deep_merge`.
  #
  # second  - Hash to merge in.
  # exclude - Array of keys that must not be deep merged.
  #
  # Returns a new Hash.
  def deep_merge_excl(second, exclude)
    replaced_entries = second.slice(*exclude)
    deep_merged_entries = second.except(*exclude)

    merge(replaced_entries).deep_merge(deep_merged_entries)
  end
end
@@ -0,0 +1,19 @@
1
class Numeric
  # Formats the receiver, taken as a number of seconds (truncated with
  # `to_int`), using its two most significant units:
  # "1d, 1h", "1h, 1m", "1m, 1s" or "59s".
  #
  # Returns nil for negative values.
  #
  # Based on https://stackoverflow.com/a/1679963
  def duration
    total_seconds = to_int
    total_minutes, seconds = total_seconds.divmod(60)
    total_hours, minutes = total_minutes.divmod(60)
    days, hours = total_hours.divmod(24)

    return "#{days}d, #{hours}h" if days > 0
    return "#{total_hours}h, #{minutes}m" if total_hours > 0
    return "#{total_minutes}m, #{seconds}s" if total_minutes > 0

    "#{total_seconds}s" if total_seconds >= 0
  end
end
@@ -0,0 +1,7 @@
1
+ require 'murmurhash3'
2
+
3
class String
  # Returns the value of MurmurHash3::V32.str_hash for the receiver.
  def to_id
    MurmurHash3::V32.str_hash(self)
  end
end
@@ -0,0 +1,33 @@
1
module Kimurai
  # Base class for item pipelines. Storage, uniqueness checks, saving and
  # logging are all forwarded to the spider assigned through `spider=`.
  class Pipeline
    # Error class for dropping an item (not raised in this file).
    class DropItemError < StandardError; end

    class << self
      # Pipeline name as a Symbol: the class name without its first
      # namespace segment, underscored.
      def name
        to_s.sub(/.*?::/, "").underscore.to_sym
      end
    end

    include BaseHelper
    attr_accessor :spider

    # Same value as the class-level `name`.
    def name
      self.class.name
    end

    ###

    # Storage of the current spider.
    def storage
      spider.storage
    end

    # Forwards to the spider's `unique?` with the same scope and value.
    def unique?(scope, value)
      spider.unique?(scope, value)
    end

    # Forwards to the spider's `save_to` with the same arguments.
    def save_to(path, item, format:, position: true, append: false)
      spider.save_to(path, item, format: format, position: position, append: append)
    end

    # Logger of the current spider.
    def logger
      spider.logger
    end
  end
end