kimurai_dynamic 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CHANGELOG.md +111 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +2038 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai/automation/deploy.yml +54 -0
  14. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  15. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  16. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  17. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  18. data/lib/kimurai/automation/setup.yml +45 -0
  19. data/lib/kimurai/base/saver.rb +106 -0
  20. data/lib/kimurai/base/storage.rb +54 -0
  21. data/lib/kimurai/base.rb +330 -0
  22. data/lib/kimurai/base_helper.rb +22 -0
  23. data/lib/kimurai/browser_builder/mechanize_builder.rb +154 -0
  24. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
  25. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +199 -0
  26. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +204 -0
  27. data/lib/kimurai/browser_builder.rb +20 -0
  28. data/lib/kimurai/capybara_configuration.rb +10 -0
  29. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  30. data/lib/kimurai/capybara_ext/mechanize/driver.rb +71 -0
  31. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  32. data/lib/kimurai/capybara_ext/selenium/driver.rb +34 -0
  33. data/lib/kimurai/capybara_ext/session/config.rb +22 -0
  34. data/lib/kimurai/capybara_ext/session.rb +249 -0
  35. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  36. data/lib/kimurai/cli/generator.rb +57 -0
  37. data/lib/kimurai/cli.rb +183 -0
  38. data/lib/kimurai/core_ext/array.rb +14 -0
  39. data/lib/kimurai/core_ext/hash.rb +5 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +33 -0
  43. data/lib/kimurai/runner.rb +60 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/Gemfile +28 -0
  46. data/lib/kimurai/template/README.md +3 -0
  47. data/lib/kimurai/template/config/application.rb +37 -0
  48. data/lib/kimurai/template/config/automation.yml +13 -0
  49. data/lib/kimurai/template/config/boot.rb +22 -0
  50. data/lib/kimurai/template/config/initializers/.keep +0 -0
  51. data/lib/kimurai/template/config/schedule.rb +57 -0
  52. data/lib/kimurai/template/db/.keep +0 -0
  53. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  54. data/lib/kimurai/template/lib/.keep +0 -0
  55. data/lib/kimurai/template/log/.keep +0 -0
  56. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  57. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  58. data/lib/kimurai/template/spiders/application_spider.rb +143 -0
  59. data/lib/kimurai/template/tmp/.keep +0 -0
  60. data/lib/kimurai/version.rb +3 -0
  61. data/lib/kimurai.rb +54 -0
  62. metadata +349 -0
@@ -0,0 +1,249 @@
1
+ require 'capybara'
2
+ require 'nokogiri'
3
+ require 'json'
4
+ require_relative 'session/config'
5
+
6
+ module Capybara
7
+ class Session
8
+ attr_accessor :spider
9
+
10
+ alias_method :original_visit, :visit
11
# Visits +visit_uri+.
#
# Without an attached spider this simply delegates to the aliased
# original_visit. With a spider attached it also:
#   * sleeps via process_delay when +delay+ is truthy,
#   * applies check_request_options unless +skip_request_options+ is set,
#   * bumps driver.requests / driver.responses and logs both events,
#   * handles errors according to match_error? (:to_skip / :to_retry).
#
# Errors matching :to_retry are retried up to +max_retries+ times; the pause
# before each retry grows by 15 seconds (15, 30, 45, ...).
#
# Returns true on success, false when the error matched :to_skip, and nil
# when all retries are used up and skip_error_on_failure? accepts the error.
# Any other error is re-raised. In spider-less mode the return value is
# whatever original_visit returns.
def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
  return original_visit(visit_uri) unless spider

  process_delay(delay) if delay
  attempts = 0
  pause = 0

  # NOTE: `retry` below restarts this inner begin block only, so the initial
  # delay and the attempt counters are not reset between attempts.
  begin
    check_request_options(visit_uri) unless skip_request_options
    driver.requests += 1
    logger.info "Browser: started get request to: #{visit_uri}"
    spider.class.update(:visits, :requests) if spider.with_info

    original_visit(visit_uri)
  rescue => error
    if match_error?(error, type: :to_skip)
      logger.error "Browser: skip request error: #{error.inspect}, url: #{visit_uri}"
      spider.add_event(:requests_errors, error.inspect) if spider.with_info
      false
    elsif match_error?(error, type: :to_retry)
      logger.error "Browser: retry request error: #{error.inspect}, url: #{visit_uri}"
      spider.add_event(:requests_errors, error.inspect) if spider.with_info

      attempts += 1
      if attempts <= max_retries
        pause += 15
        logger.info "Browser: sleep #{pause} seconds and process retry № #{attempts} to the url: #{visit_uri}"
        sleep pause
        retry
      else
        logger.error "Browser: all retries (#{attempts - 1}) to the url #{visit_uri} are gone"
        raise error unless skip_error_on_failure?(error)
      end
    else
      raise error
    end
  else
    driver.responses += 1
    logger.info "Browser: finished get request to: #{visit_uri}"
    spider.class.update(:visits, :responses) if spider.with_info
    driver.visited = true unless driver.visited
    true
  ensure
    # Runs after every attempt, successful or not.
    if spider.with_info
      logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
    end

    memory = driver.current_memory
    logger.debug "Browser: driver.current_memory: #{memory}" if memory
  end
end
59
+
60
# Quits the current driver, if there is one, and forgets it by setting
# @driver to nil. When no driver is present only a warning is logged.
#
# Returns whatever the logger call returns.
def destroy_driver!
  return logger.warn("Browser: driver #{mode} is not present") unless @driver

  begin
    @driver.quit
  rescue Net::ReadTimeout
    # handle Net::ReadTimeout error for Selenium like drivers:
    # the quit is attempted one more time; a second failure propagates.
    @driver.quit
  end

  @driver = nil
  logger.info "Browser: driver #{mode} has been destroyed"
end
75
+
76
# Restarts the browser behind this session.
#
# When the mode name matches /poltergeist/ the existing driver is kept: its
# browser is restarted and the request/response counters are zeroed. For
# every other mode the driver is destroyed and `driver` is called again
# (presumably re-creating it lazily - confirm against Capybara::Session#driver).
def restart!
  if mode.match?(/poltergeist/)
    @driver.browser.restart
    @driver.requests = 0
    @driver.responses = 0
  else
    destroy_driver!
    driver
  end

  message = "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
  logger.info message
end
87
+
88
# Parses the current page body.
#
# response_type - :html (default) builds a Nokogiri::HTML document,
#                 :json runs the body through JSON.parse.
#
# For :html the encoding comes from config.encoding: when unset Nokogiri
# decides on its own; :auto takes the charset from the page's <meta> tag;
# any other value is passed to Nokogiri as is.
#
# Returns the parsed document / JSON value, or nil for any other
# response_type.
def current_response(response_type = :html)
  if response_type == :html
    encoding = config.encoding
    return Nokogiri::HTML(body) unless encoding
    return Nokogiri::HTML(body, nil, encoding) unless encoding == :auto

    # The body is read as ISO-8859-1 only so the regexp can run over
    # arbitrary bytes. NOTE(review): force_encoding mutates the string
    # returned by this first `body` call; `body` is read again below on purpose.
    charset = body.force_encoding("ISO-8859-1").encode("UTF-8")[/<meta.*?charset=["]?([\w+\d+\-]*)/i, 1]
    Nokogiri::HTML(body, nil, charset)
  elsif response_type == :json
    JSON.parse(body)
  end
end
105
+
106
+ ###
107
+
108
# Runs the given block inside a new browser tab, then closes that tab and
# returns to the tab that was active before.
#
# The tab is closed even when the block (or the initial visit) raises, so a
# failing block does not leave a stray tab behind.
#
# action - callable expected to open a new tab (for example by clicking on
#          an element); the tab it opens is used for the block.
# url    - URL to visit in a freshly opened tab. Ignored when +action+ is given.
#
# Usage (url):
#   browser.within_new_window_by(url: "https://google.com") do
#     # do some stuff; the tab is closed afterwards
#   end
#
# Usage (action):
#   action = -> { browser.find("//some/element/path").click }
#   browser.within_new_window_by(action: action) do
#     # do some stuff; the tab is closed afterwards
#   end
#
# Returns whatever within_window returns, or nil (without yielding) when
# neither +action+ nor +url+ is given.
def within_new_window_by(action: nil, url: nil)
  if action
    opened_window = window_opened_by { action.call }
    within_window(opened_window) do
      begin
        yield
      ensure
        current_window.close
      end
    end
  elsif url
    within_window(open_new_window) do
      begin
        visit(url)
        yield
      ensure
        current_window.close
      end
    end
  end
end
136
+
137
+ ###
138
+
139
# Scrolls the page down by 10000 pixels with a JavaScript call.
# NOTE(review): on pages taller than that this stops short of the real bottom.
def scroll_to_bottom
  script = "window.scrollBy(0,10000)"
  execute_script(script)
end
142
+
143
+ private
144
+
145
# Tells whether +e+ may be swallowed once all retries are used up.
#
# Only Hash entries of config.retry_request_errors are considered: the entry
# must have a truthy :skip_on_failure and its :error must be among the
# ancestors of e's class.
def skip_error_on_failure?(e)
  lineage = e.class.ancestors

  config.retry_request_errors.any? do |rule|
    rule.kind_of?(Hash) && rule[:skip_on_failure] && lineage.include?(rule[:error])
  end
end
150
+
151
# Checks +e+ against one of the configured error lists.
#
# type - :to_retry uses config.retry_request_errors,
#        :to_skip uses config.skip_request_errors.
#
# A list entry is either an error class, or a Hash with :error (a class) and
# an optional :message (a Regexp, or a String that must occur in e.message).
# A Hash entry matches when the class matches and, if a message is present,
# the message matches as well.
def match_error?(e, type:)
  rules =
    if type == :to_retry
      config.retry_request_errors
    elsif type == :to_skip
      config.skip_request_errors
    end

  rules.any? do |rule|
    next e.class.ancestors.include?(rule) unless rule.kind_of?(Hash)

    class_matches = e.class.ancestors.include?(rule[:error])
    expected = rule[:message]
    next class_matches unless expected.present?

    message_matches =
      if expected.kind_of?(Regexp)
        e.message&.match?(expected)
      else
        e.message&.include?(expected)
      end

    message_matches && class_matches
  end
end
175
+
176
# Sleeps before a request.
#
# delay - a number of seconds, or a Range from which a random value is
#         picked with rand.
#
# Returns whatever sleep returns.
def process_delay(delay)
  interval = delay.instance_of?(Range) ? rand(delay) : delay

  rounded = interval.round(2)
  unit = 'second'.pluralize(interval)
  logger.debug "Browser: sleep #{rounded} #{unit} before request..."

  sleep interval
end
181
+
182
# Applies the per-request options from config before a visit, in this order:
#   1. restart the browser when a restart_if limit (memory / requests) is hit,
#   2. cookie handling (initial set, clear, clear-and-set),
#   3. user agent rotation,
#   4. proxy rotation.
#
# url_to_visit - the URL about to be requested; Selenium modes visit it once
#                up front on a never-visited driver before cookies are set.
def check_request_options(url_to_visit)
  # restart_if
  memory_limit = config.restart_if[:memory_limit]
  if memory_limit
    memory = driver.current_memory
    if memory && memory >= memory_limit
      logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
      restart!
    end
  end

  requests_limit = config.restart_if[:requests_limit]
  if requests_limit
    requests = driver.requests
    if requests >= requests_limit
      logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
      restart!
    end
  end

  install_cookies = lambda do
    config.cookies.each do |cookie|
      driver.set_cookie(cookie[:name], cookie[:value], cookie)
    end
  end

  # cookies
  # (Selenium only) if config.cookies present and browser was just created,
  # visit url_to_visit first and only then set cookies:
  if driver.visited.nil? && config.cookies && mode.match?(/selenium/)
    visit(url_to_visit, skip_request_options: true)
    install_cookies.call
  end

  if config.before_request[:clear_cookies]
    driver.clear_cookies
    logger.debug "Browser: cleared cookies before request"
  end

  if config.before_request[:clear_and_set_cookies]
    driver.clear_cookies

    # (Selenium only) if the browser has not visited any page yet, visit
    # url_to_visit first and then set cookies (needed after a browser restart):
    visit(url_to_visit, skip_request_options: true) if driver.visited.nil? && mode.match?(/selenium/)

    install_cookies.call
    logger.debug "Browser: cleared and set cookies before request"
  end

  # user_agent
  if config.before_request[:change_user_agent]
    driver.add_header("User-Agent", config.user_agent.call)
    logger.debug "Browser: changed user_agent before request"
  end

  # proxy
  if config.before_request[:change_proxy]
    proxy_string = config.proxy.call
    driver.set_proxy(*proxy_string.split(":"))
    logger.debug "Browser: changed proxy before request"
  end
end
244
+
245
# Logger used by this session: the attached spider's logger.
# Raises NoMethodError when no spider is attached (spider is nil).
def logger
  owner = spider
  owner.logger
end
248
+ end
249
+ end
@@ -0,0 +1,71 @@
1
+ require 'cliver'
2
+
3
+ module Kimurai
4
+ class CLI
5
# Builds the argument list for an `ansible-playbook` run of one of the
# playbooks shipped under lib/kimurai/automation of the kimurai gem.
class AnsibleCommandBuilder
  # user_host - "user@host" or just "host".
  # options   - Hash of CLI options; the keys read here are "port", "local",
  #             "ask-sudo", "ask-auth-pass" and "ssh-key-path".
  # playbook  - playbook name without extension (e.g. "setup", "deploy").
  # vars      - extra variables for the playbook. The Hash is copied, so
  #             the caller's object is never modified.
  def initialize(user_host, options, playbook:, vars: {})
    @user_host = user_host
    @options = options
    @playbook = playbook
    @vars = vars.dup
  end

  # Returns the command as an Array of Strings, ready for spawn/system.
  #
  # Values from the matching section of config/automation.yml (looked up
  # relative to the current directory) are added for keys not already set
  # in the vars. Variables whose value is not present? are left out.
  #
  # Raises RuntimeError when `ansible-playbook` (or `sshpass`, if password
  # authentication was requested) cannot be found.
  def get
    unless Cliver.detect("ansible-playbook")
      raise "Can't find `ansible-playbook` executable, to install: " \
        "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
    end

    user = @user_host[/(.*?)\@/, 1]
    host = @user_host[/\@(.+)/, 1] || @user_host
    # The trailing comma makes ansible treat the value as a host list
    # rather than as a path to an inventory file.
    inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"

    gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir
    playbook_path = gem_dir + "/lib/kimurai/automation/" + "#{@playbook}.yml"

    command = [
      "ansible-playbook", playbook_path,
      "--inventory", inventory,
      "--ssh-extra-args", "-oForwardAgent=yes",
      "--connection", @options["local"] ? "local" : "smart",
      "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
    ]

    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    if File.exist?("config/automation.yml")
      require 'yaml'
      # An empty file does not load as a Hash, and the playbook section may
      # be missing or not a mapping; both cases are simply skipped.
      loaded = YAML.load_file("config/automation.yml")
      section = loaded[@playbook] if loaded.is_a?(Hash)
      if section.is_a?(Hash)
        section.each { |key, value| @vars[key] = value unless @vars[key] }
      end
    end

    @vars.each do |key, value|
      next unless value.present?
      command.push "--extra-vars", "#{key}=#{value}"
    end

    command.push "--user", user if user
    command.push "--ask-become-pass" if @options["ask-sudo"]

    if @options["ask-auth-pass"]
      unless Cliver.detect("sshpass")
        raise "Can't find `sshpass` executable for password authentication, to install: " \
          "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
      end

      command.push "--ask-pass"
    end

    if ssh_key_path = @options["ssh-key-path"]
      command.push "--private-key", ssh_key_path
    end

    command
  end
end
70
+ end
71
+ end
@@ -0,0 +1,57 @@
1
+ module Kimurai
2
+ class CLI
3
+ class Generator < Thor::Group
4
+ include Thor::Actions
5
+
6
# Root directory against which Thor resolves template sources: two levels
# above this file.
def self.source_root
  File.expand_path('../..', __FILE__)
end
9
+
10
# Copies the "template" directory into a new directory named +project_name+,
# then runs `bundle install` and `git init` inside it.
def generate_project(project_name)
  directory("template", project_name)

  inside(project_name) do
    run("bundle install")
    run("git init")
  end
end
17
+
18
# Creates a spider skeleton file.
#
# spider_name - spider name; also used to derive the class name.
# in_project  - true: write spiders/<name>.rb inheriting from
#               ApplicationSpider; false: write ./<name>.rb as a standalone
#               script (requires kimurai, sets the mechanize engine and
#               ends with a crawl! call).
#
# Raises RuntimeError when the target file already exists.
def generate_spider(spider_name, in_project:)
  spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
  # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
  raise "Spider #{spider_path} already exists" if File.exist?(spider_path)

  spider_class = to_spider_class(spider_name)
  create_file spider_path do
    <<~RUBY
      class #{spider_class} < #{in_project ? 'ApplicationSpider' : 'Kimurai::Base'}
        @name = "#{spider_name}"
        @start_urls = []
        @config = {}

        def parse(response, url:, data: {})
        end
      end
    RUBY
  end

  unless in_project
    # The inserted line is indented to line up with the other class-level
    # settings of the generated file.
    insert_into_file spider_path, "  @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
    prepend_to_file spider_path, "require 'kimurai'\n\n"
    append_to_file spider_path, "\n#{spider_class}.crawl!"
  end
end
42
+
43
# Copies the template schedule file into the current directory as schedule.rb.
def generate_schedule
  copy_file("template/config/schedule.rb", "./schedule.rb")
end
46
+
47
+ private
48
+
49
# Derives a class name from a spider name:
#   "example_spider" => "ExampleSpider"
#   "my-spider"      => "MyDashSpider"
#   "example.com"    => "ExampleCom"
#
# The first character is capitalized, then "_", "-" and "." separators are
# removed one kind at a time, capitalizing the [a-z0-9] run after each;
# dashes additionally leave the word "Dash" behind.
def to_spider_class(string)
  result = string.sub(/^./) { |first| first.capitalize }

  result = result.gsub(/(?:_|(\/))([a-z\d]*)/) do
    match = Regexp.last_match
    "#{match[1]}#{match[2].capitalize}"
  end

  result = result.gsub(/(?:-|(\/))([a-z\d]*)/) do
    "Dash#{Regexp.last_match(2).capitalize}"
  end

  result.gsub(/(?:\.|(\/))([a-z\d]*)/) do
    match = Regexp.last_match
    "#{match[1]}#{match[2].capitalize}"
  end
end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,183 @@
1
+ require 'thor'
2
+
3
+ module Kimurai
4
+ class CLI < Thor
5
+ map %w[--version -v] => :__print_version
6
+
7
+ desc "generate", "Generator, available types: project, spider, schedule"
8
# Dispatches `kimurai generate <type> [name]` to the matching generator.
#
# generator_type - "project", "spider" or "schedule".
# args           - remaining CLI arguments; the first one is the project or
#                  spider name where one is required.
#
# Raises RuntimeError for a missing name or an unknown generator type.
def generate(generator_type, *args)
  if generator_type == "project"
    project_name = args.shift
    raise "Provide project name to generate a new project" unless project_name.present?

    Generator.new.generate_project(project_name)
  elsif generator_type == "spider"
    spider_name = args.shift
    raise "Provide spider name to generate a spider" unless spider_name.present?

    Generator.new.generate_spider(spider_name, in_project: inside_project?)
  elsif generator_type == "schedule"
    Generator.new.generate_schedule
  else
    raise "Don't know this generator type: #{generator_type}"
  end
end
24
+
25
+ ###
26
+
27
+ desc "setup", "Setup server"
28
+ option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
29
+ option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
30
+ option "ask-auth-pass", type: :boolean, banner: "Auth using password"
31
+ option "ssh-key-path", type: :string, banner: "Auth using ssh key"
32
+ option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
33
# Runs the "setup" playbook against +user_host+ and waits for the
# ansible-playbook process to finish.
def setup(user_host)
  builder = AnsibleCommandBuilder.new(user_host, options, playbook: "setup")

  pid = spawn(*builder.get)
  Process.wait(pid)
end
39
+
40
+ desc "deploy", "Deploy project to the server and update cron schedule"
41
+ option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
42
+ option "ask-auth-pass", type: :boolean, banner: "Auth using password"
43
+ option "ssh-key-path", type: :string, banner: "Auth using ssh key"
44
+ option "repo-url", type: :string, banner: "Repo url"
45
+ option "repo-key-path", type: :string, banner: "SSH key for a git repo"
46
+ option "skip-check", type: :boolean, default: false, banner: "Skip git repository checks"
47
# Runs the "deploy" playbook against +user_host+ and waits for it to finish.
#
# Unless the "skip-check" option is set, the local git repository must be
# clean, have a remote, and have no commits that differ between master and
# origin/master; otherwise a RuntimeError is raised.
#
# The repository URL comes from the "repo-url" option or, when absent,
# from `git remote get-url origin`. The repository name passed to the
# playbook is the last path segment of that URL without a ".git" suffix.
def deploy(user_host)
  unless options["skip-check"]
    if !`git status --short`.empty?
      raise "Deploy: Please commit your changes first"
    elsif `git remote`.empty?
      raise "Deploy: Please add remote origin repository to your repo first"
    elsif !`git rev-list master...origin/master`.empty?
      raise "Deploy: Please push your commits to the remote origin repo first"
    end
  end

  repo_url = options["repo-url"] || `git remote get-url origin`.strip
  # The first pattern is the historical one ("/name.git"). The fallback
  # covers URLs it misses - no ".git" suffix, a trailing slash, or scp-style
  # "git@host:name.git" - which would otherwise leave repo_name nil.
  repo_name = repo_url[/\/([^\/]*)\.git/i, 1] ||
              repo_url[%r{([^/:]+?)(?:\.git)?/?\z}i, 1]

  command = AnsibleCommandBuilder.new(user_host, options, playbook: "deploy",
    vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
  ).get

  pid = spawn(*command)
  Process.wait pid
end
68
+
69
+ ###
70
+
71
+ desc "crawl", "Run a particular spider by it's name"
72
# Runs the spider registered under +spider_name+.
#
# Must be called from a project directory: ./config/boot is loaded first.
# When Kimurai.configuration.time_zone is set it is applied before the run.
#
# Raises RuntimeError outside a project or for an unknown spider name.
def crawl(spider_name)
  raise "Can't find Kimurai project" unless inside_project?
  require './config/boot'

  klass = Kimurai.find_by_name(spider_name)
  unless klass
    raise "Can't find spider with name `#{spider_name}` in the project. " \
      "To list all available spiders, run: `$ bundle exec kimurai list`"
  end

  # Set time_zone if exists
  time_zone = Kimurai.configuration.time_zone
  Kimurai.time_zone = time_zone if time_zone

  klass.crawl!
end
88
+
89
+ desc "parse", "Parse url in the particular spider method"
90
+ option :url, type: :string, required: true, banner: "Url to pass to the method"
91
# Runs a single spider method against the URL given in the "url" option.
#
# spider_name - name of a spider in the current project.
# method_name - name of the spider method handed to parse!.
#
# Raises RuntimeError outside a project or for an unknown spider name.
def parse(spider_name, method_name)
  raise "Can't find Kimurai project" unless inside_project?
  require './config/boot'

  klass = Kimurai.find_by_name(spider_name)
  unless klass
    raise "Can't find spider with name `#{spider_name}` in the project. " \
      "To list all available spiders, run: `$ bundle exec kimurai list`"
  end

  target_url = options["url"]
  klass.parse!(method_name, url: target_url)
end
102
+
103
+ desc "console", "Start Kimurai console"
104
+ option :engine, type: :string, banner: "Engine to use"
105
+ option :url, type: :string, banner: "Url to process"
106
# Starts an interactive console on a spider instance.
#
# spider_name - optional; when given, the console runs on that project
#               spider. Otherwise ApplicationSpider is used inside a project
#               and ::Kimurai::Base outside of one.
#
# The "engine" option (a leading ":" is tolerated) is passed to the spider
# constructor. With the "url" option the console is opened through
# request_to for that URL.
#
# Raises RuntimeError when a spider name is given outside a project or the
# spider cannot be found.
def console(spider_name = nil)
  require 'pry'
  require './config/boot' if inside_project?

  klass =
    if spider_name
      raise "Can't find Kimurai project" unless inside_project?

      found = Kimurai.find_by_name(spider_name)
      unless found
        raise "Can't find spider with name `#{spider_name}` in the project. " \
          "To list all available spiders, run: `$ bundle exec kimurai list`"
      end
      found
    else
      inside_project? ? ApplicationSpider : ::Kimurai::Base
    end

  engine = options["engine"]&.delete(":")&.to_sym
  url = options["url"]

  if url
    klass.new(engine).request_to(:console, url: url)
  else
    klass.new(engine).public_send(:console)
  end
end
128
+
129
+ desc "list", "List all available spiders in the current project"
130
# Prints the names of all spiders in the current project, sorted, one per
# line.
#
# Raises RuntimeError outside a project.
def list
  raise "Can't find Kimurai project" unless inside_project?
  require './config/boot'

  spider_names = Kimurai.list.keys.sort
  spider_names.each { |spider_name| puts spider_name }
end
136
+
137
+ desc "runner", "Run all spiders in the project in queue"
138
+ option :include, type: :array, default: [], banner: "List of spiders to run"
139
+ option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
140
+ option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
141
# Runs several project spiders through Runner.
#
# Options read: "jobs" (number of concurrent jobs), "include" (spiders to
# run; all project spiders when empty) and "exclude" (spiders to leave out).
#
# Raises RuntimeError outside a project or when "jobs" is 0.
def runner
  raise "Can't find Kimurai project" unless inside_project?

  jobs = options["jobs"]
  raise "Jobs count can't be 0" if jobs == 0

  require './config/boot'
  require 'kimurai/runner'

  selected = options["include"].presence || Kimurai.list.keys
  selected -= options["exclude"]

  Runner.new(selected, jobs).run!
end
155
+
156
+ desc "--version, -v", "Print the version"
157
# Prints the VERSION constant (mapped to the --version / -v flags).
def __print_version
  version = VERSION
  puts version
end
160
+
161
+ desc "dashboard", "Run dashboard"
162
# Starts the dashboard application.
#
# The project's ./config/boot is loaded first; the dashboard is started only
# if that made the Kimurai::Dashboard constant available.
#
# Raises RuntimeError outside a project or when Kimurai::Dashboard is not
# defined.
def dashboard
  raise "Can't find Kimurai project" unless inside_project?

  require './config/boot'
  raise "Kimurai::Dashboard is not defined" unless Object.const_defined?("Kimurai::Dashboard")

  require 'kimurai/dashboard/app'
  Kimurai::Dashboard::App.run!
end
173
+
174
+ private
175
+
176
# True when the current directory looks like a Kimurai project: it has a
# "spiders" directory and a config/boot.rb file.
#
# Uses Dir.exist? / File.exist?; the `exists?` aliases were removed in
# Ruby 3.2 and raise NoMethodError there.
def inside_project?
  Dir.exist?("spiders") && File.exist?("./config/boot.rb")
end
179
+ end
180
+ end
181
+
182
+ require_relative 'cli/generator'
183
+ require_relative 'cli/ansible_command_builder'
@@ -0,0 +1,14 @@
1
class Array
  # Distributes the elements round-robin over +number+ groups: element 0
  # goes to group 0, element 1 to group 1, ..., element +number+ to group 0
  # again, and so on.
  #
  # number     - how many groups to build.
  # fill_width - passed through to in_groups_of as its fill argument.
  #
  # Returns an Array of +number+ Arrays.
  def in_sorted_groups(number, fill_width = nil)
    buckets = Array.new(number) { [] }

    in_groups_of(number, fill_width).each do |group|
      # A group shorter than +number+ simply contributes nothing to the
      # remaining buckets.
      group.each_with_index { |item, index| buckets[index] << item }
    end

    buckets
  end
end
@@ -0,0 +1,5 @@
1
class Hash
  # Merges +second+ into a copy of the receiver. Keys listed in +exclude+
  # are taken from +second+ with a plain merge; all other keys of +second+
  # go through deep_merge.
  def deep_merge_excl(second, exclude)
    shallow_part = second.slice(*exclude)
    deep_part = second.except(*exclude)

    merge(shallow_part).deep_merge(deep_part)
  end
end
@@ -0,0 +1,19 @@
1
class Numeric
  # Formats the receiver, taken as a number of seconds, using its two
  # largest units: "2d, 3h", "1h, 5m", "4m, 10s" or "42s".
  #
  # Returns nil for negative values.
  # See https://stackoverflow.com/a/1679963
  def duration
    total_secs = to_int
    mins, secs_left = total_secs.divmod(60)
    hours, mins_left = mins.divmod(60)
    days, hours_left = hours.divmod(24)

    return "#{days}d, #{hours_left}h" if days > 0
    return "#{hours}h, #{mins_left}m" if hours > 0
    return "#{mins}m, #{secs_left}s" if mins > 0

    "#{total_secs}s" if total_secs >= 0
  end
end
@@ -0,0 +1,7 @@
1
+ require 'murmurhash3'
2
+
3
class String
  # Hash of the string computed by MurmurHash3::V32.str_hash.
  def to_id
    hasher = MurmurHash3::V32
    hasher.str_hash(self)
  end
end
@@ -0,0 +1,33 @@
1
module Kimurai
  # Base class for pipelines. An instance holds a reference to its spider
  # and forwards storage, uniqueness checks, saving and logging to it.
  class Pipeline
    # Error class defined for pipelines.
    # NOTE(review): presumably raised to discard an item - confirm against callers.
    class DropItemError < StandardError; end

    class << self
      # Pipeline name as a Symbol: the class name with everything up to the
      # first "::" removed, underscored.
      def name
        to_s.sub(/.*?::/, "").underscore.to_sym
      end
    end

    include BaseHelper
    attr_accessor :spider

    # Same as the class-level name.
    def name
      self.class.name
    end

    ###

    # The spider's storage.
    def storage
      spider.storage
    end

    # Delegates to spider.unique? with the same arguments.
    def unique?(scope, value)
      spider.unique?(scope, value)
    end

    # Delegates to spider.save_to with the same arguments.
    def save_to(path, item, format:, position: true, append: false)
      spider.save_to(path, item, format: format, position: position, append: append)
    end

    # The spider's logger.
    def logger
      spider.logger
    end
  end
end