kimurai 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +1923 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai.rb +53 -0
  14. data/lib/kimurai/automation/deploy.yml +54 -0
  15. data/lib/kimurai/automation/setup.yml +44 -0
  16. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  17. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  18. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  19. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  20. data/lib/kimurai/base.rb +249 -0
  21. data/lib/kimurai/base/simple_saver.rb +98 -0
  22. data/lib/kimurai/base/uniq_checker.rb +22 -0
  23. data/lib/kimurai/base_helper.rb +22 -0
  24. data/lib/kimurai/browser_builder.rb +32 -0
  25. data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
  26. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
  27. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
  28. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
  29. data/lib/kimurai/capybara_configuration.rb +10 -0
  30. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  31. data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
  32. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  33. data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
  34. data/lib/kimurai/capybara_ext/session.rb +150 -0
  35. data/lib/kimurai/capybara_ext/session/config.rb +18 -0
  36. data/lib/kimurai/cli.rb +157 -0
  37. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  38. data/lib/kimurai/cli/generator.rb +57 -0
  39. data/lib/kimurai/core_ext/array.rb +14 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +25 -0
  43. data/lib/kimurai/runner.rb +72 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/.ruby-version +1 -0
  46. data/lib/kimurai/template/Gemfile +20 -0
  47. data/lib/kimurai/template/README.md +3 -0
  48. data/lib/kimurai/template/config/application.rb +32 -0
  49. data/lib/kimurai/template/config/automation.yml +13 -0
  50. data/lib/kimurai/template/config/boot.rb +22 -0
  51. data/lib/kimurai/template/config/initializers/.keep +0 -0
  52. data/lib/kimurai/template/config/schedule.rb +57 -0
  53. data/lib/kimurai/template/db/.keep +0 -0
  54. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  55. data/lib/kimurai/template/lib/.keep +0 -0
  56. data/lib/kimurai/template/log/.keep +0 -0
  57. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  58. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  59. data/lib/kimurai/template/spiders/application_spider.rb +104 -0
  60. data/lib/kimurai/template/tmp/.keep +0 -0
  61. data/lib/kimurai/version.rb +3 -0
  62. metadata +349 -0
@@ -0,0 +1,249 @@
1
require_relative 'base/simple_saver'
require_relative 'base/uniq_checker'

module Kimurai
  # Base class for all spiders. Holds class-level run state (@run_info),
  # engine/pipeline/config inheritance via superclass lookup, and the
  # per-instance browser session and pipeline plumbing.
  class Base
    # Log line formatter: "S, [datetime#pid] [M|C: thread_id] LEVEL -- progname: msg".
    # "M" marks the main thread, "C" a child (in_parallel) thread.
    LoggerFormatter = proc do |severity, datetime, progname, msg|
      current_thread_id = Thread.current.object_id
      thread_type = Thread.main == Thread.current ? "M" : "C"
      # NOTE(review): `.freeze` is applied to the format string literal on the
      # continuation line before `%` interpolates it — unusual layout, but valid.
      output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n"
        .freeze % [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg]

      # Colorize only in development (unless explicitly disabled in config).
      if Kimurai.configuration.colorize_logger != false && Kimurai.env == "development"
        Rbcat.colorize(output, predefined: [:jsonhash, :logger])
      else
        output
      end
    end

    include BaseHelper

    ###

    class << self
      # Hash with the current crawl's status/counters; nil when not crawling.
      attr_reader :run_info
    end

    # True while crawl! is in progress for this spider class.
    def self.running?
      @run_info && @run_info[:status] == :running
    end

    # True after a crawl finished without error.
    def self.completed?
      @run_info && @run_info[:status] == :completed
    end

    # True after a crawl raised.
    def self.failed?
      @run_info && @run_info[:status] == :failed
    end

    # { requests:, responses: } counters for the current run, or nil.
    def self.visits
      @run_info && @run_info[:visits]
    end

    # { sent:, processed: } counters for the current run, or nil.
    def self.items
      @run_info && @run_info[:items]
    end

    # Thread-safe increment of a run_info counter, e.g. update(:items, :sent).
    # No-op when no crawl is active.
    def self.update(type, subtype)
      return unless @run_info

      (@update_mutex ||= Mutex.new).synchronize do
        @run_info[type][subtype] += 1
      end
    end

    ###

    # Defaults on Base itself; subclasses inherit via the superclass
    # fallbacks in .engine / .pipelines / .config below.
    @engine = :mechanize
    @pipelines = []
    @config = {}

    ###

    # NOTE(review): overrides Class#name to return @name — presumably set by
    # subclasses / the CLI generator; not visible in this file, confirm there.
    def self.name
      @name
    end

    def self.engine
      @engine ||= superclass.engine
    end

    def self.pipelines
      @pipelines ||= superclass.pipelines
    end

    def self.start_urls
      @start_urls
    end

    # Merges config down the inheritance chain (ActiveSupport #deep_merge),
    # stopping at Object.
    def self.config
      superclass.equal?(::Object) ? @config : superclass.config.deep_merge(@config || {})
    end

    ###

    # Memoized logger: a user-provided one from Kimurai.configuration, or a
    # STDOUT Logger using LoggerFormatter. Level comes from ENV["LOG_LEVEL"],
    # then configuration, then DEBUG.
    def self.logger
      @logger ||= Kimurai.configuration.logger || begin
        log_level = (ENV["LOG_LEVEL"] || Kimurai.configuration.log_level || "DEBUG").to_s.upcase
        log_level = "Logger::#{log_level}".constantize
        Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name)
      end
    end

    ###

    # Per-class duplicate tracker shared by all instances (incl. in_parallel threads).
    def self.checker
      @checker ||= UniqChecker.new
    end

    def unique?(scope, value)
      self.class.checker.unique?(scope, value)
    end

    # Per-class item saver shared by all instances.
    def self.saver
      @saver ||= SimpleSaver.new
    end

    def save_to(path, item, format:, position: true)
      self.class.saver.save(path, item, format: format, position: position)
    end

    ###

    # Runs a full crawl: initializes run_info, visits each start_url (or calls
    # #parse when none are defined), and always tears down the browser and
    # logs/clears run state in the ensure block. Returns run_info on success,
    # false if a crawl is already running, and re-raises on failure.
    def self.crawl!
      logger.error "Spider: already running: #{name}" and return false if running?
      @run_info = {
        spider_name: name, status: :running, environment: Kimurai.env,
        start_time: Time.new, stop_time: nil, running_time: nil,
        visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 }, error: nil
      }

      logger.info "Spider: started: #{name}"
      # Optional user hook defined on the spider class.
      open_spider if self.respond_to? :open_spider

      spider = self.new
      spider.with_info = true
      if start_urls
        start_urls.each do |start_url|
          spider.request_to(:parse, url: start_url)
        end
      else
        spider.parse
      end
    rescue StandardError, SignalException => e
      # SignalException included so Ctrl-C still records a :failed run.
      @run_info.merge!(status: :failed, error: e.inspect)
      raise e
    else
      @run_info[:status] = :completed
      @run_info
    ensure
      if spider
        spider.browser.destroy_driver!

        stop_time = Time.now
        total_time = (stop_time - @run_info[:start_time]).round(3)
        @run_info.merge!(stop_time: stop_time, running_time: total_time)

        # Optional user hook defined on the spider class.
        close_spider if self.respond_to? :close_spider
        # NOTE(review): #duration on a Numeric looks like a core_ext helper
        # (see core_ext/numeric.rb in this gem) — formats seconds for humans.
        message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
        failed? ? @logger.fatal(message) : @logger.info(message)

        # Reset class-level crawl state so the class can be crawled again.
        @run_info, @checker, @saver = nil
      end
    end

    # One-off parse for development/CLI: runs a single handler (optionally
    # against a url) and always destroys the browser driver afterwards.
    def self.parse!(handler, engine = nil, url: nil, data: {})
      spider = engine ? self.new(engine) : self.new
      url.present? ? spider.request_to(handler, url: url, data: data) : spider.public_send(handler)
    ensure
      spider.browser.destroy_driver!
    end

    ###

    attr_reader :logger
    # When true, this instance updates the class-level run_info counters.
    attr_accessor :with_info

    def initialize(engine = self.class.engine, config: {})
      @engine = engine
      @config = self.class.config.deep_merge(config)
      # Instantiate each configured pipeline (looked up among Pipeline
      # descendants by name) and hand it a back-reference to this spider.
      @pipelines = self.class.pipelines.map do |pipeline_name|
        klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
        instance = klass.new
        instance.spider = self
        [pipeline_name, instance]
      end.to_h

      @logger = self.class.logger
    end

    # Lazily-built Capybara session for this instance's engine/config.
    def browser
      @browser ||= BrowserBuilder.build(@engine, @config, spider: self)
    end

    # Visits `url` (optionally with a pre-request delay) and passes the
    # response plus { url:, data: } to the given handler method.
    def request_to(handler, delay = nil, url:, data: {})
      request_data = { url: url, data: data }
      delay ? browser.visit(url, delay: delay) : browser.visit(url)
      public_send(handler, browser.current_response, request_data)
    end

    # Interactive debugging entry point (Pry).
    def console(response = nil, url: nil, data: {})
      binding.pry
    end

    private

    # Pushes an item through all pipelines in order; each pipeline may
    # transform the item. Returns true when processed, false when a pipeline
    # raised (the item is "dropped"). Counters updated only when with_info.
    def send_item(item, options = {})
      logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
      self.class.update(:items, :sent) if self.with_info

      @pipelines.each do |name, instance|
        item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
      end
    rescue => e
      logger.error "Pipeline: dropped: #{e.inspect}, item: #{item}"
      false
    else
      self.class.update(:items, :processed) if self.with_info
      logger.info "Pipeline: processed: #{JSON.generate(item)}"
      true
    ensure
      if self.with_info
        logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
      end
    end

    # Splits `urls` into `threads` groups and processes each group in its own
    # thread with its own spider instance (and browser). Blocks until all
    # threads finish. Array entries may be plain urls or {url:, data:} hashes.
    def in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {})
      # NOTE(review): #in_sorted_groups comes from core_ext/array.rb in this gem.
      parts = urls.in_sorted_groups(threads, false)
      urls_count = urls.size

      all = []
      start_time = Time.now
      logger.info "Spider: in_parallel: starting processing #{urls_count} urls within #{threads} threads"

      parts.each do |part|
        all << Thread.new(part) do |part|
          Thread.current.abort_on_exception = true

          spider = self.class.new(engine, config: config)
          spider.with_info = true if self.with_info

          part.each do |url_data|
            if url_data.class == Hash
              spider.request_to(handler, delay, url_data)
            else
              spider.request_to(handler, delay, url: url_data, data: data)
            end
          end
        ensure
          # Block-level ensure (Ruby 2.6+): always release the thread's browser.
          spider.browser.destroy_driver!
        end

        # Stagger thread start-up to avoid a burst of simultaneous browser launches.
        sleep 0.5
      end

      all.each(&:join)
      logger.info "Spider: in_parallel: stopped processing #{urls_count} urls within #{threads} threads, total time: #{(Time.now - start_time).duration}"
    end
  end
end
@@ -0,0 +1,98 @@
1
require 'json'
require 'csv'

module Kimurai
  class Base
    # Thread-safe item writer. Appends each saved item to a file on disk in
    # one of four formats: :json (single array), :pretty_json (indented
    # array), :jsonlines (one object per line) or :csv (header + rows, with
    # nested hashes flattened to dotted column names).
    class SimpleSaver
      def initialize
        # Running count of saved items; also used to decide between
        # "create file" (first item) and "append" (subsequent items).
        @index = 0
        @mutex = Mutex.new
      end

      # Persists `item` to `path`. When `position:` is truthy, the running
      # item index is injected into the item under :position before writing.
      # Raises for an unknown `format:`.
      def save(path, item, format:, position:)
        @mutex.synchronize do
          @index += 1
          item[:position] = @index if position

          case format
          when :json        then save_to_json(item, path)
          when :pretty_json then save_to_pretty_json(item, path)
          when :jsonlines   then save_to_jsonlines(item, path)
          when :csv         then save_to_csv(item, path)
          else raise "SimpleSaver: wrong type of format: #{format}"
          end
        end
      end

      private

      # First item writes a fresh one-element array; later items re-read the
      # file, replace the trailing "}]" with "}," and splice the new element in.
      def save_to_json(item, path)
        serialized = JSON.generate([item])

        if @index > 1
          body = File.read(path).sub(/\}\]\Z/, "\}\,") + serialized.sub(/\A\[/, "")
          File.open(path, "w") { |f| f.write(body) }
        else
          File.open(path, "w") { |f| f.write(serialized) }
        end
      end

      # Same splice strategy as save_to_json, adapted to pretty-printed output.
      def save_to_pretty_json(item, path)
        serialized = JSON.pretty_generate([item])

        if @index > 1
          body = File.read(path).sub(/\}\n\]\Z/, "\}\,\n") + serialized.sub(/\A\[\n/, "")
          File.open(path, "w") { |f| f.write(body) }
        else
          File.open(path, "w") { |f| f.write(serialized) }
        end
      end

      # One JSON object per line; simple append after the first write.
      def save_to_jsonlines(item, path)
        serialized = JSON.generate(item)
        mode, payload = @index > 1 ? ["a", "\n" + serialized] : ["w", serialized]
        File.open(path, mode) { |f| f.write(payload) }
      end

      # Header row is emitted only with the first item; all fields are quoted.
      def save_to_csv(item, path)
        row = flatten_hash(item)

        if @index > 1
          CSV.open(path, "a+", force_quotes: true) { |csv| csv << row.values }
        else
          CSV.open(path, "w", force_quotes: true) do |csv|
            csv << row.keys
            csv << row.values
          end
        end
      end

      # Recursively flattens nested hashes: {a: {b: 1}} => {"a.b" => 1}.
      def flatten_hash(hash)
        hash.each_with_object({}) do |(key, value), memo|
          if value.is_a? Hash
            flatten_hash(value).each { |nested_key, nested_value| memo["#{key}.#{nested_key}"] = nested_value }
          else
            memo[key&.to_s] = value
          end
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module Kimurai
  class Base
    # Thread-safe registry of already-seen values, grouped by scope.
    # Backs Base#unique?, letting spiders skip duplicate urls/ids.
    class UniqChecker
      def initialize
        # scope => Hash used as a set of seen values. A Hash gives O(1)
        # membership checks; the previous Array#include? scan was O(n)
        # per call and degraded noticeably on long crawls.
        @database = {}
        @mutex = Mutex.new
      end

      # Returns true (and records the value) the first time `value` is seen
      # within `scope`; returns false on every subsequent call with the same
      # scope/value pair.
      def unique?(scope, value)
        @mutex.synchronize do
          seen = (@database[scope] ||= {})
          if seen.key?(value)
            false
          else
            seen[value] = true
            true
          end
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module Kimurai
  # URL helpers mixed into Base (private instance methods).
  module BaseHelper
    private

    # Joins a (possibly relative) url with `base:` and returns an absolute,
    # percent-escaped url string. Returns nil when url is nil.
    def absolute_url(url, base:)
      return unless url
      # URI.escape was deprecated in Ruby 2.7 and removed in 3.0;
      # URI::DEFAULT_PARSER.escape is the drop-in replacement.
      URI.join(base, URI::DEFAULT_PARSER.escape(url)).to_s
    end

    # Returns `url` unchanged when it already parses; otherwise tries to
    # percent-escape it, falling back to the raw input if even the escaped
    # form cannot be parsed.
    def escape_url(url)
      URI.parse(url)
    rescue URI::InvalidURIError
      URI.parse(URI::DEFAULT_PARSER.escape(url)).to_s rescue url
    else
      url
    end

    # Absolutizes `url` against `base:`, then ensures the result is a valid
    # (escaped) url. Returns nil when url is nil.
    def normalize_url(url, base:)
      escape_url(absolute_url(url, base: base))
    end
  end
end
@@ -0,0 +1,32 @@
1
module Kimurai
  class BrowserBuilder
    # Engine names this factory accepts.
    AVAILABLE_ENGINES = [
      :mechanize,
      :mechanize_standalone,
      :poltergeist_phantomjs,
      :selenium_firefox,
      :selenium_chrome
    ]

    # Builds and returns a configured Capybara session for `engine` via the
    # matching engine-specific builder. Builder files are loaded lazily so
    # only the chosen engine's dependencies get required. Raises for an
    # unknown engine name.
    def self.build(engine, config = {}, spider:)
      unless AVAILABLE_ENGINES.include? engine
        raise "BrowserBuilder: wrong name of engine, available engines: #{AVAILABLE_ENGINES.join(', ')}"
      end

      builder_class =
        case engine
        when :mechanize
          require_relative 'browser_builder/mechanize_builder'
          MechanizeBuilder
        when :selenium_chrome
          require_relative 'browser_builder/selenium_chrome_builder'
          SeleniumChromeBuilder
        when :poltergeist_phantomjs
          require_relative 'browser_builder/poltergeist_phantomjs_builder'
          PoltergeistPhantomJSBuilder
        when :selenium_firefox
          require_relative 'browser_builder/selenium_firefox_builder'
          SeleniumFirefoxBuilder
        end

      # :mechanize_standalone is allowed but has no dedicated builder here;
      # as in the original flow, the method returns nil for it.
      builder_class.new(config, spider: spider).build if builder_class
    end
  end
end
@@ -0,0 +1,140 @@
1
require 'capybara'
require 'capybara/mechanize'
require_relative '../capybara_configuration'
require_relative '../capybara_ext/mechanize/driver'
require_relative '../capybara_ext/session'

module Kimurai
  class BrowserBuilder
    # Builds a Capybara session backed by the Mechanize driver and applies
    # the spider's config (proxy, SSL, headers, cookies, before_request
    # hooks). Mechanize is a pure-HTTP client: options that require a real
    # browser (socks5 proxy, restart_if) are logged and skipped.
    # NOTE(review): `.presence` / `.present?` are ActiveSupport extensions —
    # presumably loaded elsewhere in the gem; confirm in lib/kimurai.rb.
    class MechanizeBuilder
      attr_reader :logger, :spider

      # config: merged spider config hash; spider: the owning Base instance
      # (provides the logger and receives a back-reference on the session).
      def initialize(config, spider:)
        @config = config
        @spider = spider
        @logger = spider.logger
      end

      # Returns a fully configured Capybara::Session. Each config section
      # below is optional: applied when present, otherwise skipped.
      def build
        # Register driver
        Capybara.register_driver :mechanize do |app|
          driver = Capybara::Mechanize::Driver.new("app")
          # keep the history as small as possible (by default it's unlimited)
          driver.configure { |a| a.history.max_size = 2 }
          driver
        end

        # Create browser instance (Capybara session)
        @browser = Capybara::Session.new(:mechanize)
        @browser.spider = spider
        logger.debug "BrowserBuilder (mechanize): created browser instance"

        # Proxy: accepts "ip:port:type[:user:password]" or a Proc returning it.
        if proxy = @config[:proxy].presence
          proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
          ip, port, type = proxy_string.split(":")

          if type == "socks5"
            logger.error "BrowserBuilder (mechanize): can't set socks5 proxy (not supported), skipped"
          else
            @browser.set_proxy(*proxy_string.split(":"))
            logger.debug "BrowserBuilder (mechanize): enabled #{type} proxy, ip: #{ip}, port: #{port}"
          end
        end

        # SSL: custom CA file and/or disabling certificate verification.
        if ssl_cert_path = @config[:ssl_cert_path].presence
          @browser.driver.browser.agent.http.ca_file = ssl_cert_path
          logger.debug "BrowserBuilder (mechanize): enabled custom ssl_cert"
        end

        if @config[:ignore_ssl_errors].present?
          @browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
          logger.debug "BrowserBuilder (mechanize): enabled ignore_ssl_errors"
        end

        # Headers: whole-hash assignment, then an optional User-Agent
        # (string or Proc) layered on top.
        if headers = @config[:headers].presence
          @browser.driver.headers = headers
          logger.debug "BrowserBuilder (mechanize): enabled custom headers"
        end

        if user_agent = @config[:user_agent].presence
          user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip

          @browser.driver.add_header("User-Agent", user_agent_string)
          logger.debug "BrowserBuilder (mechanize): enabled custom user-agent"
        end

        # Cookies: array of hashes with at least :name and :value keys.
        if cookies = @config[:cookies].presence
          cookies.each do |cookie|
            @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
          end

          logger.debug "BrowserBuilder (mechanize): enabled custom cookies"
        end

        # Browser instance options
        # retry_request_errors: error classes the session should retry on.
        if errors = @config.dig(:browser, :retry_request_errors).presence
          @browser.config.retry_request_errors = errors
          logger.debug "BrowserBuilder (mechanize): enabled `browser retry_request_errors`"
        end

        # restart_if: only meaningful for real browsers with restartable processes.
        if @config.dig(:browser, :restart_if).present?
          logger.error "BrowserBuilder (mechanize): `browser restart_if` options not supported by Mechanize, skipped"
        end

        # before_request clear_cookies: wipe cookies before every request.
        if @config.dig(:browser, :before_request, :clear_cookies)
          @browser.config.before_request[:clear_cookies] = true
          logger.debug "BrowserBuilder (mechanize): enabled `browser before_request clear_cookies`"
        end

        # before_request clear_and_set_cookies: reset to the configured
        # cookie set before every request (requires `cookies` to be present).
        if @config.dig(:browser, :before_request, :clear_and_set_cookies)
          if cookies = @config[:cookies].presence
            @browser.config.cookies = cookies
            @browser.config.before_request[:clear_and_set_cookies] = true
            logger.debug "BrowserBuilder (mechanize): enabled `browser before_request clear_and_set_cookies`"
          else
            logger.error "BrowserBuilder (mechanize): `cookies` should be present to enable `browser before_request clear_and_set_cookies`, skipped"
          end
        end

        # before_request change_user_agent: rotate UA per request — only
        # works when user_agent is a Proc that yields a fresh value.
        if @config.dig(:browser, :before_request, :change_user_agent)
          if @config[:user_agent].present? && @config[:user_agent].class == Proc
            @browser.config.user_agent = @config[:user_agent]
            @browser.config.before_request[:change_user_agent] = true
            logger.debug "BrowserBuilder (mechanize): enabled `browser before_request change_user_agent`"
          else
            logger.error "BrowserBuilder (mechanize): `user_agent` should be present and has lambda format to enable `browser before_request change_user_agent`, skipped"
          end
        end

        # before_request change_proxy: rotate proxy per request — same Proc
        # requirement as change_user_agent above.
        if @config.dig(:browser, :before_request, :change_proxy)
          if @config[:proxy].present? && @config[:proxy].class == Proc
            @browser.config.proxy = @config[:proxy]
            @browser.config.before_request[:change_proxy] = true
            logger.debug "BrowserBuilder (mechanize): enabled `browser before_request change_proxy`"
          else
            logger.error "BrowserBuilder (mechanize): `proxy` should be present and has lambda format to enable `browser before_request change_proxy`, skipped"
          end
        end

        # before_request delay: sleep before each request (seconds or range).
        if delay = @config.dig(:browser, :before_request, :delay).presence
          @browser.config.before_request[:delay] = delay
          logger.debug "BrowserBuilder (mechanize): enabled `browser before_request delay`"
        end

        # return Capybara session instance
        @browser
      end
    end
  end
end