kimurai 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62)
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +1923 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai.rb +53 -0
  14. data/lib/kimurai/automation/deploy.yml +54 -0
  15. data/lib/kimurai/automation/setup.yml +44 -0
  16. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  17. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  18. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  19. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  20. data/lib/kimurai/base.rb +249 -0
  21. data/lib/kimurai/base/simple_saver.rb +98 -0
  22. data/lib/kimurai/base/uniq_checker.rb +22 -0
  23. data/lib/kimurai/base_helper.rb +22 -0
  24. data/lib/kimurai/browser_builder.rb +32 -0
  25. data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
  26. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
  27. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
  28. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
  29. data/lib/kimurai/capybara_configuration.rb +10 -0
  30. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  31. data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
  32. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  33. data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
  34. data/lib/kimurai/capybara_ext/session.rb +150 -0
  35. data/lib/kimurai/capybara_ext/session/config.rb +18 -0
  36. data/lib/kimurai/cli.rb +157 -0
  37. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  38. data/lib/kimurai/cli/generator.rb +57 -0
  39. data/lib/kimurai/core_ext/array.rb +14 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +25 -0
  43. data/lib/kimurai/runner.rb +72 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/.ruby-version +1 -0
  46. data/lib/kimurai/template/Gemfile +20 -0
  47. data/lib/kimurai/template/README.md +3 -0
  48. data/lib/kimurai/template/config/application.rb +32 -0
  49. data/lib/kimurai/template/config/automation.yml +13 -0
  50. data/lib/kimurai/template/config/boot.rb +22 -0
  51. data/lib/kimurai/template/config/initializers/.keep +0 -0
  52. data/lib/kimurai/template/config/schedule.rb +57 -0
  53. data/lib/kimurai/template/db/.keep +0 -0
  54. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  55. data/lib/kimurai/template/lib/.keep +0 -0
  56. data/lib/kimurai/template/log/.keep +0 -0
  57. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  58. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  59. data/lib/kimurai/template/spiders/application_spider.rb +104 -0
  60. data/lib/kimurai/template/tmp/.keep +0 -0
  61. data/lib/kimurai/version.rb +3 -0
  62. metadata +349 -0
@@ -0,0 +1,249 @@
1
+ require_relative 'base/simple_saver'
2
+ require_relative 'base/uniq_checker'
3
+
4
module Kimurai
  # Base class for Kimurai spiders. Concrete spiders subclass it, set
  # class-level settings (@name, @engine, @start_urls, @config, @pipelines)
  # and implement parse methods; `crawl!` drives a full run.
  class Base
    # Log formatter producing lines like
    # "S, [datetime#pid] [M: thread_id] SEVERITY -- progname: msg".
    # "M" marks the main thread, "C" a child thread (see #in_parallel).
    # Output is colorized via Rbcat in the development env unless
    # colorize_logger is explicitly disabled in the configuration.
    LoggerFormatter = proc do |severity, datetime, progname, msg|
      current_thread_id = Thread.current.object_id
      thread_type = Thread.main == Thread.current ? "M" : "C"
      output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n"
        .freeze % [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg]

      if Kimurai.configuration.colorize_logger != false && Kimurai.env == "development"
        Rbcat.colorize(output, predefined: [:jsonhash, :logger])
      else
        output
      end
    end

    include BaseHelper

    ###

    class << self
      # Hash of live stats for the current crawl (set by .crawl!), or nil
      # when no tracked crawl is running.
      attr_reader :run_info
    end

    # True while a crawl started by .crawl! is in progress.
    def self.running?
      @run_info && @run_info[:status] == :running
    end

    # True when the last crawl finished without an exception.
    def self.completed?
      @run_info && @run_info[:status] == :completed
    end

    # True when the last crawl was stopped by an exception or signal.
    def self.failed?
      @run_info && @run_info[:status] == :failed
    end

    # Visit counters ({ requests:, responses: }), or nil outside a crawl.
    def self.visits
      @run_info && @run_info[:visits]
    end

    # Item counters ({ sent:, processed: }), or nil outside a crawl.
    def self.items
      @run_info && @run_info[:items]
    end

    # Thread-safe increment of a run_info counter,
    # e.g. update(:items, :sent). No-op when run_info tracking is off.
    def self.update(type, subtype)
      return unless @run_info

      (@update_mutex ||= Mutex.new).synchronize do
        @run_info[type][subtype] += 1
      end
    end

    ###

    # Defaults on Base itself; subclasses inherit them through the
    # class-method readers below.
    @engine = :mechanize
    @pipelines = []
    @config = {}

    ###

    # Spider name, set by subclasses via @name (nil on Base itself).
    def self.name
      @name
    end

    # Engine falls back to the superclass when a subclass doesn't set @engine.
    def self.engine
      @engine ||= superclass.engine
    end

    # Pipelines list, inherited from the superclass when unset.
    def self.pipelines
      @pipelines ||= superclass.pipelines
    end

    def self.start_urls
      @start_urls
    end

    # Config is deep-merged down the inheritance chain, so a subclass only
    # overrides the keys it sets.
    def self.config
      superclass.equal?(::Object) ? @config : superclass.config.deep_merge(@config || {})
    end

    ###

    # Memoized logger: Kimurai.configuration.logger if provided, otherwise
    # a STDOUT logger using LoggerFormatter. Level comes from the LOG_LEVEL
    # env var, then configuration.log_level, then DEBUG.
    def self.logger
      @logger ||= Kimurai.configuration.logger || begin
        log_level = (ENV["LOG_LEVEL"] || Kimurai.configuration.log_level || "DEBUG").to_s.upcase
        log_level = "Logger::#{log_level}".constantize
        Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name)
      end
    end

    ###

    # Per-class duplicate tracker (see UniqChecker), memoized.
    def self.checker
      @checker ||= UniqChecker.new
    end

    # Returns true the first time `value` is seen within `scope`,
    # false on every later call with the same pair.
    def unique?(scope, value)
      self.class.checker.unique?(scope, value)
    end

    # Per-class item writer (see SimpleSaver), memoized.
    def self.saver
      @saver ||= SimpleSaver.new
    end

    # Append `item` to the file at `path` in the given format
    # (:json, :pretty_json, :jsonlines or :csv); `position: true` adds an
    # incrementing :position key to the item.
    def save_to(path, item, format:, position: true)
      self.class.saver.save(path, item, format: format, position: position)
    end

    ###

    # Run a full crawl: initialize run_info, call #parse for each of
    # start_urls (or #parse directly when start_urls is not set) and record
    # status/timing. Returns run_info on success, false if a crawl is
    # already running; re-raises any error after marking the run failed.
    # The browser driver is always destroyed and the optional
    # open_spider/close_spider hooks fired around the run.
    def self.crawl!
      logger.error "Spider: already running: #{name}" and return false if running?
      @run_info = {
        spider_name: name, status: :running, environment: Kimurai.env,
        start_time: Time.new, stop_time: nil, running_time: nil,
        visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 }, error: nil
      }

      logger.info "Spider: started: #{name}"
      open_spider if self.respond_to? :open_spider

      spider = self.new
      spider.with_info = true
      if start_urls
        start_urls.each do |start_url|
          spider.request_to(:parse, url: start_url)
        end
      else
        spider.parse
      end
    rescue StandardError, SignalException => e
      # SignalException is rescued too, so an interrupt still records a
      # failed run before propagating.
      @run_info.merge!(status: :failed, error: e.inspect)
      raise e
    else
      @run_info[:status] = :completed
      @run_info
    ensure
      if spider
        spider.browser.destroy_driver!

        stop_time = Time.now
        total_time = (stop_time - @run_info[:start_time]).round(3)
        @run_info.merge!(stop_time: stop_time, running_time: total_time)

        close_spider if self.respond_to? :close_spider
        message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
        failed? ? @logger.fatal(message) : @logger.info(message)

        # Reset per-run state so the class can be crawled again.
        @run_info, @checker, @saver = nil
      end
    end

    # Run a single handler once: with a url, performs request_to(handler,...);
    # otherwise calls the handler directly. An explicit `engine` overrides
    # the class default. Always destroys the browser driver afterwards.
    def self.parse!(handler, engine = nil, url: nil, data: {})
      spider = engine ? self.new(engine) : self.new
      url.present? ? spider.request_to(handler, url: url, data: data) : spider.public_send(handler)
    ensure
      spider.browser.destroy_driver!
    end

    ###

    attr_reader :logger
    # True when this instance should report stats into class-level run_info
    # (set by .crawl! and propagated to child spiders in #in_parallel).
    attr_accessor :with_info

    # engine: browser engine symbol (defaults to the class-level engine);
    # config: per-instance overrides deep-merged over the class config.
    # Instantiates one object per configured pipeline (looked up among
    # Pipeline.descendants by name) and wires it back to this spider.
    def initialize(engine = self.class.engine, config: {})
      @engine = engine
      @config = self.class.config.deep_merge(config)
      @pipelines = self.class.pipelines.map do |pipeline_name|
        klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
        instance = klass.new
        instance.spider = self
        [pipeline_name, instance]
      end.to_h

      @logger = self.class.logger
    end

    # Lazily-built Capybara session for this spider's engine/config.
    def browser
      @browser ||= BrowserBuilder.build(@engine, @config, spider: self)
    end

    # Visit `url` (optionally passing `delay` through to browser.visit) and
    # hand the current response plus { url:, data: } to the handler method.
    def request_to(handler, delay = nil, url:, data: {})
      request_data = { url: url, data: data }
      delay ? browser.visit(url, delay: delay) : browser.visit(url)
      public_send(handler, browser.current_response, request_data)
    end

    # Drop into a Pry session; handy as a handler while developing a spider.
    def console(response = nil, url: nil, data: {})
      binding.pry
    end

    private

    # Push `item` through all configured pipelines in order; per-pipeline
    # options come from `options[pipeline_name]`. Returns true when every
    # pipeline processed the item, false when any raised (the item is
    # dropped and the error logged). Counters update when with_info is on.
    def send_item(item, options = {})
      logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
      self.class.update(:items, :sent) if self.with_info

      @pipelines.each do |name, instance|
        item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
      end
    rescue => e
      logger.error "Pipeline: dropped: #{e.inspect}, item: #{item}"
      false
    else
      self.class.update(:items, :processed) if self.with_info
      logger.info "Pipeline: processed: #{JSON.generate(item)}"
      true
    ensure
      if self.with_info
        logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
      end
    end

    # Process `urls` concurrently: split into `threads` groups and run each
    # group in its own thread with its own spider instance (and so its own
    # browser). Each url is fed to `handler` via request_to; an element may
    # also be a Hash of request_to keyword arguments. Blocks until all
    # threads finish.
    def in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {})
      parts = urls.in_sorted_groups(threads, false)
      urls_count = urls.size

      all = []
      start_time = Time.now
      logger.info "Spider: in_parallel: starting processing #{urls_count} urls within #{threads} threads"

      parts.each do |part|
        all << Thread.new(part) do |part|
          Thread.current.abort_on_exception = true

          spider = self.class.new(engine, config: config)
          spider.with_info = true if self.with_info

          part.each do |url_data|
            if url_data.class == Hash
              # NOTE(review): passing a bare Hash as the keyword arguments of
              # request_to relies on pre-Ruby-3 implicit hash-to-keywords
              # conversion — confirm `**url_data` is used if targeting Ruby 3+.
              spider.request_to(handler, delay, url_data)
            else
              spider.request_to(handler, delay, url: url_data, data: data)
            end
          end
        ensure
          spider.browser.destroy_driver!
        end

        # Stagger thread startup so browsers are not all launched at once.
        sleep 0.5
      end

      all.each(&:join)
      logger.info "Spider: in_parallel: stopped processing #{urls_count} urls within #{threads} threads, total time: #{(Time.now - start_time).duration}"
    end
  end
end
@@ -0,0 +1,98 @@
1
+ require 'json'
2
+ require 'csv'
3
+
4
module Kimurai
  class Base
    # Thread-safe writer that accumulates scraped items into a single
    # output file in one of several formats (:json, :pretty_json,
    # :jsonlines or :csv). The first save creates the file; later saves
    # append to it.
    class SimpleSaver
      def initialize
        # Number of items written so far; drives create-vs-append logic.
        @index = 0
        @mutex = Mutex.new
      end

      # Persist `item` (a Hash) to `path` in the given `format`.
      # When `position` is truthy, an incrementing :position key is merged
      # into the item before writing. Raises for an unknown format.
      def save(path, item, format:, position:)
        @mutex.synchronize do
          @index += 1
          item[:position] = @index if position

          case format
          when :json        then save_to_json(item, path)
          when :pretty_json then save_to_pretty_json(item, path)
          when :jsonlines   then save_to_jsonlines(item, path)
          when :csv         then save_to_csv(item, path)
          else raise "SimpleSaver: wrong type of format: #{format}"
          end
        end
      end

      private

      # Maintains one JSON array on disk: the first item writes "[{...}]";
      # each later item splices "}," over the trailing "}]" and appends
      # itself minus the opening "[".
      def save_to_json(item, path)
        serialized = JSON.generate([item])

        if @index > 1
          body = File.read(path).sub(/\}\]\Z/, "},")
          File.write(path, body + serialized.sub(/\A\[/, ""))
        else
          File.write(path, serialized)
        end
      end

      # Same splice-append strategy as save_to_json, but pretty-printed.
      def save_to_pretty_json(item, path)
        serialized = JSON.pretty_generate([item])

        if @index > 1
          body = File.read(path).sub(/\}\n\]\Z/, "},\n")
          File.write(path, body + serialized.sub(/\A\[\n/, ""))
        else
          File.write(path, serialized)
        end
      end

      # One JSON object per line; items after the first are appended on a
      # new line.
      def save_to_jsonlines(item, path)
        serialized = JSON.generate(item)

        if @index > 1
          File.open(path, "a") { |f| f.write("\n" + serialized) }
        else
          File.write(path, serialized)
        end
      end

      # CSV with a header row built from the first item's flattened keys;
      # later items append value rows only.
      def save_to_csv(item, path)
        row = flatten_hash(item)

        if @index > 1
          CSV.open(path, "a+", force_quotes: true) { |csv| csv << row.values }
        else
          CSV.open(path, "w", force_quotes: true) do |csv|
            csv << row.keys
            csv << row.values
          end
        end
      end

      # Collapse nested hashes into a single level, joining key paths with
      # dots: {a: {b: 1}} => {"a.b" => 1}.
      def flatten_hash(hash)
        hash.each_with_object({}) do |(key, value), flat|
          if value.is_a? Hash
            flatten_hash(value).each { |nested_key, nested_value| flat["#{key}.#{nested_key}"] = nested_value }
          else
            flat[key&.to_s] = value
          end
        end
      end
    end
  end
end
97
+
98
+
@@ -0,0 +1,22 @@
1
module Kimurai
  class Base
    # Thread-safe registry used by Base#unique? to detect duplicate values
    # (urls, ids, etc.) within a named scope during a crawl.
    class UniqChecker
      def initialize
        # scope => { value => true }. A Hash is used instead of an Array so
        # each membership check is O(1) rather than an O(n) Array#include?
        # scan, which degraded to O(n^2) over the lifetime of a big crawl.
        @database = {}
        @mutex = Mutex.new
      end

      # Returns true the first time `value` is seen within `scope` and
      # records it; returns false on every later call with the same pair.
      def unique?(scope, value)
        @mutex.synchronize do
          scope_store = (@database[scope] ||= {})
          if scope_store.key?(value)
            false
          else
            scope_store[value] = true
            true
          end
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module Kimurai
  # Url helper methods mixed into spiders (all private).
  module BaseHelper
    private

    # Resolve a possibly-relative `url` against `base` and return an
    # absolute, percent-escaped url string. Returns nil when url is nil.
    def absolute_url(url, base:)
      return unless url
      # URI.escape was removed in Ruby 3.0 (deprecated since 2.7);
      # URI::DEFAULT_PARSER.escape is the drop-in replacement with the
      # same default unsafe-character set.
      URI.join(base, URI::DEFAULT_PARSER.escape(url)).to_s
    end

    # Return `url` unchanged when it already parses cleanly; otherwise try
    # to percent-escape it, falling back to the original string if even the
    # escaped form cannot be parsed.
    def escape_url(url)
      URI.parse(url)
    rescue URI::InvalidURIError
      URI.parse(URI::DEFAULT_PARSER.escape(url)).to_s rescue url
    else
      url
    end

    # absolute_url + escape_url in one call.
    def normalize_url(url, base:)
      escape_url(absolute_url(url, base: base))
    end
  end
end
@@ -0,0 +1,32 @@
1
module Kimurai
  # Factory that maps an engine name to its builder class and returns a
  # configured browser (Capybara session) for a spider.
  class BrowserBuilder
    # Engine names accepted by .build. Frozen so the shared constant can't
    # be mutated at runtime.
    AVAILABLE_ENGINES = [
      :mechanize,
      :mechanize_standalone,
      :poltergeist_phantomjs,
      :selenium_firefox,
      :selenium_chrome
    ].freeze

    # Build and return a browser for `engine`, passing `config` and the
    # owning `spider` to the engine-specific builder. Raises when the
    # engine name is not in AVAILABLE_ENGINES. Builder files are required
    # lazily so only the selected engine's dependencies are loaded.
    def self.build(engine, config = {}, spider:)
      unless AVAILABLE_ENGINES.include? engine
        raise "BrowserBuilder: wrong name of engine, available engines: #{AVAILABLE_ENGINES.join(', ')}"
      end

      case engine
      when :mechanize
        require_relative 'browser_builder/mechanize_builder'
        MechanizeBuilder.new(config, spider: spider).build
      when :selenium_chrome
        require_relative 'browser_builder/selenium_chrome_builder'
        SeleniumChromeBuilder.new(config, spider: spider).build
      when :poltergeist_phantomjs
        require_relative 'browser_builder/poltergeist_phantomjs_builder'
        PoltergeistPhantomJSBuilder.new(config, spider: spider).build
      when :selenium_firefox
        require_relative 'browser_builder/selenium_firefox_builder'
        SeleniumFirefoxBuilder.new(config, spider: spider).build
        # NOTE(review): :mechanize_standalone passes the AVAILABLE_ENGINES
        # check but has no branch here, so .build silently returns nil for
        # it — confirm whether a standalone builder was intended.
      end
    end
  end
end
@@ -0,0 +1,140 @@
1
+ require 'capybara'
2
+ require 'capybara/mechanize'
3
+ require_relative '../capybara_configuration'
4
+ require_relative '../capybara_ext/mechanize/driver'
5
+ require_relative '../capybara_ext/session'
6
+
7
module Kimurai
  class BrowserBuilder
    # Builds a Capybara session driven by capybara-mechanize (plain HTTP,
    # no real browser process) and applies the spider's config to it.
    class MechanizeBuilder
      attr_reader :logger, :spider

      # config: spider config hash (proxy, headers, cookies, browser: {...});
      # spider: the owning spider instance, used for logging and wired into
      # the session via `@browser.spider =`.
      def initialize(config, spider:)
        @config = config
        @spider = spider
        @logger = spider.logger
      end

      # Create, configure and return the Capybara::Session. Every config
      # section below is optional; options Mechanize can't support are
      # logged as errors and skipped instead of raising.
      def build
        # Register driver
        Capybara.register_driver :mechanize do |app|
          driver = Capybara::Mechanize::Driver.new("app")
          # keep the history as small as possible (by default it's unlimited)
          driver.configure { |a| a.history.max_size = 2 }
          driver
        end

        # Create browser instance (Capybara session)
        @browser = Capybara::Session.new(:mechanize)
        @browser.spider = spider
        logger.debug "BrowserBuilder (mechanize): created browser instance"

        # Proxy: a colon-separated string (split here into ip, port, type)
        # or a Proc returning one; socks5 is not supported by Mechanize.
        if proxy = @config[:proxy].presence
          proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
          ip, port, type = proxy_string.split(":")

          if type == "socks5"
            logger.error "BrowserBuilder (mechanize): can't set socks5 proxy (not supported), skipped"
          else
            @browser.set_proxy(*proxy_string.split(":"))
            logger.debug "BrowserBuilder (mechanize): enabled #{type} proxy, ip: #{ip}, port: #{port}"
          end
        end

        # SSL: custom CA certificate file and/or disabling verification
        if ssl_cert_path = @config[:ssl_cert_path].presence
          @browser.driver.browser.agent.http.ca_file = ssl_cert_path
          logger.debug "BrowserBuilder (mechanize): enabled custom ssl_cert"
        end

        if @config[:ignore_ssl_errors].present?
          @browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
          logger.debug "BrowserBuilder (mechanize): enabled ignore_ssl_errors"
        end

        # Headers: hash applied to every request
        if headers = @config[:headers].presence
          @browser.driver.headers = headers
          logger.debug "BrowserBuilder (mechanize): enabled custom headers"
        end

        # User-Agent: a string or a Proc returning one
        if user_agent = @config[:user_agent].presence
          user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip

          @browser.driver.add_header("User-Agent", user_agent_string)
          logger.debug "BrowserBuilder (mechanize): enabled custom user-agent"
        end

        # Cookies: array of hashes; :name and :value are read explicitly,
        # the whole hash is passed along for any extra attributes
        if cookies = @config[:cookies].presence
          cookies.each do |cookie|
            @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
          end

          logger.debug "BrowserBuilder (mechanize): enabled custom cookies"
        end

        # Browser instance options
        # retry_request_errors: error classes the session should retry on
        if errors = @config.dig(:browser, :retry_request_errors).presence
          @browser.config.retry_request_errors = errors
          logger.debug "BrowserBuilder (mechanize): enabled `browser retry_request_errors`"
        end

        # restart_if: not applicable — Mechanize has no browser process
        # to restart, so the option is only reported and skipped
        if @config.dig(:browser, :restart_if).present?
          logger.error "BrowserBuilder (mechanize): `browser restart_if` options not supported by Mechanize, skipped"
        end

        # before_request clear_cookies: wipe cookies before each request
        if @config.dig(:browser, :before_request, :clear_cookies)
          @browser.config.before_request[:clear_cookies] = true
          logger.debug "BrowserBuilder (mechanize): enabled `browser before_request clear_cookies`"
        end

        # before_request clear_and_set_cookies: reset cookies to the
        # configured set before each request (requires `cookies` config)
        if @config.dig(:browser, :before_request, :clear_and_set_cookies)
          if cookies = @config[:cookies].presence
            @browser.config.cookies = cookies
            @browser.config.before_request[:clear_and_set_cookies] = true
            logger.debug "BrowserBuilder (mechanize): enabled `browser before_request clear_and_set_cookies`"
          else
            logger.error "BrowserBuilder (mechanize): `cookies` should be present to enable `browser before_request clear_and_set_cookies`, skipped"
          end
        end

        # before_request change_user_agent: rotate the user-agent per
        # request (requires `user_agent` to be a Proc)
        if @config.dig(:browser, :before_request, :change_user_agent)
          if @config[:user_agent].present? && @config[:user_agent].class == Proc
            @browser.config.user_agent = @config[:user_agent]
            @browser.config.before_request[:change_user_agent] = true
            logger.debug "BrowserBuilder (mechanize): enabled `browser before_request change_user_agent`"
          else
            logger.error "BrowserBuilder (mechanize): `user_agent` should be present and has lambda format to enable `browser before_request change_user_agent`, skipped"
          end
        end

        # before_request change_proxy: rotate the proxy per request
        # (requires `proxy` to be a Proc)
        if @config.dig(:browser, :before_request, :change_proxy)
          if @config[:proxy].present? && @config[:proxy].class == Proc
            @browser.config.proxy = @config[:proxy]
            @browser.config.before_request[:change_proxy] = true
            logger.debug "BrowserBuilder (mechanize): enabled `browser before_request change_proxy`"
          else
            logger.error "BrowserBuilder (mechanize): `proxy` should be present and has lambda format to enable `browser before_request change_proxy`, skipped"
          end
        end

        # before_request delay: pause between requests
        if delay = @config.dig(:browser, :before_request, :delay).presence
          @browser.config.before_request[:delay] = delay
          logger.debug "BrowserBuilder (mechanize): enabled `browser before_request delay`"
        end

        # return Capybara session instance
        @browser
      end
    end
  end
end