kimurai 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +1923 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai.rb +53 -0
  14. data/lib/kimurai/automation/deploy.yml +54 -0
  15. data/lib/kimurai/automation/setup.yml +44 -0
  16. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  17. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  18. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  19. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  20. data/lib/kimurai/base.rb +249 -0
  21. data/lib/kimurai/base/simple_saver.rb +98 -0
  22. data/lib/kimurai/base/uniq_checker.rb +22 -0
  23. data/lib/kimurai/base_helper.rb +22 -0
  24. data/lib/kimurai/browser_builder.rb +32 -0
  25. data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
  26. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
  27. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
  28. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
  29. data/lib/kimurai/capybara_configuration.rb +10 -0
  30. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  31. data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
  32. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  33. data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
  34. data/lib/kimurai/capybara_ext/session.rb +150 -0
  35. data/lib/kimurai/capybara_ext/session/config.rb +18 -0
  36. data/lib/kimurai/cli.rb +157 -0
  37. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  38. data/lib/kimurai/cli/generator.rb +57 -0
  39. data/lib/kimurai/core_ext/array.rb +14 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +25 -0
  43. data/lib/kimurai/runner.rb +72 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/.ruby-version +1 -0
  46. data/lib/kimurai/template/Gemfile +20 -0
  47. data/lib/kimurai/template/README.md +3 -0
  48. data/lib/kimurai/template/config/application.rb +32 -0
  49. data/lib/kimurai/template/config/automation.yml +13 -0
  50. data/lib/kimurai/template/config/boot.rb +22 -0
  51. data/lib/kimurai/template/config/initializers/.keep +0 -0
  52. data/lib/kimurai/template/config/schedule.rb +57 -0
  53. data/lib/kimurai/template/db/.keep +0 -0
  54. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  55. data/lib/kimurai/template/lib/.keep +0 -0
  56. data/lib/kimurai/template/log/.keep +0 -0
  57. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  58. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  59. data/lib/kimurai/template/spiders/application_spider.rb +104 -0
  60. data/lib/kimurai/template/tmp/.keep +0 -0
  61. data/lib/kimurai/version.rb +3 -0
  62. metadata +349 -0
File without changes
@@ -0,0 +1,3 @@
1
+ module ApplicationHelper
2
+ # Put here custom methods which will be available for any spider
3
+ end
File without changes
File without changes
@@ -0,0 +1,11 @@
1
+ class Saver < Kimurai::Pipeline
2
+ def process_item(item, options: {})
3
+ # Here you can save item to the database, send it to a remote API or
4
+ # simply save item to a file format using `save_to` helper:
5
+
6
+ # To get the name of a current spider: `spider.class.name`
7
+ # save_to "db/#{spider.class.name}.json", item, format: :pretty_json
8
+
9
+ item
10
+ end
11
+ end
@@ -0,0 +1,24 @@
1
+ class Validator < Kimurai::Pipeline
2
+ def process_item(item, options: {})
3
+ # Here you can validate item and raise `DropItemError`
4
+ # if one of the validations failed. Examples:
5
+
6
+ # Check item sku for uniqueness using built-in `unique?` helper:
7
+ # unless unique?(:sku, item[:sku])
8
+ # raise DropItemError, "Item sku is not unique"
9
+ # end
10
+
11
+ # Drop item if title length is shorter than 5 symbols:
12
+ # if item[:title].size < 5
13
+ # raise DropItemError, "Item title is short"
14
+ # end
15
+
16
+ # Drop item if it doesn't contain any images:
17
+ # unless item[:images].present?
18
+ # raise DropItemError, "Item images are not present"
19
+ # end
20
+
21
+ # Pass item to the next pipeline (if it wasn't dropped)
22
+ item
23
+ end
24
+ end
@@ -0,0 +1,104 @@
1
+ # ApplicationSpider is a default base spider class. You can set here
2
+ # default settings for all spiders inherited from ApplicationSpider.
3
+ # To generate a new spider, run: `$ kimurai generate spider spider_name`
4
+
5
+ class ApplicationSpider < Kimurai::Base
6
+ include ApplicationHelper
7
+
8
+ # Default engine for spiders (available engines: :mechanize, :poltergeist_phantomjs,
9
+ # :selenium_firefox, :selenium_chrome)
10
+ @engine = :poltergeist_phantomjs
11
+
12
+ # Pipelines list, by order.
13
+ # To process item through pipelines pass item to the `send_item` method
14
+ @pipelines = [:validator, :saver]
15
+
16
+ # Default config. Set here options which are default for all spiders inherited
17
+ # from ApplicationSpider. Child's class config will be deep merged with this one
18
+ @config = {
19
+ # Custom headers, format: hash. Example: { "some header" => "some value", "another header" => "another value" }
20
+ # Works only for :mechanize and :poltergeist_phantomjs engines (Selenium doesn't allow setting/getting headers)
21
+ # headers: {},
22
+
23
+ # Custom User Agent, format: string or lambda.
24
+ # Use lambda if you want to rotate user agents before each run:
25
+ # user_agent: -> { ARRAY_OF_USER_AGENTS.sample }
26
+ # Works for all engines
27
+ # user_agent: "Mozilla/5.0 Firefox/61.0",
28
+
29
+ # Custom cookies, format: array of hashes.
30
+ # Format for a single cookie: { name: "cookie name", value: "cookie value", domain: ".example.com" }
31
+ # Works for all engines
32
+ # cookies: [],
33
+
34
+ # Proxy, format: string or lambda. Format of a proxy string: "ip:port:protocol:user:password"
35
+ # `protocol` can be http or socks5. User and password are optional.
36
+ # Use lambda if you want to rotate proxies before each run:
37
+ # proxy: -> { ARRAY_OF_PROXIES.sample }
38
+ # Works for all engines, but keep in mind that Selenium drivers don't support proxies
39
+ # with authorization. Also, Mechanize doesn't support socks5 proxy format (only http)
40
+ # proxy: "3.4.5.6:3128:http:user:pass",
41
+
42
+ # If enabled, browser will ignore any https errors. It's handy while using a proxy
43
+ # with self-signed SSL cert (for example Crawlera or Mitmproxy)
44
+ # Also, it allows visiting webpages with an expired SSL certificate.
45
+ # Works for all engines
46
+ ignore_ssl_errors: true,
47
+
48
+ # Custom window size, works for all engines
49
+ # window_size: [1366, 768],
50
+
51
+ # Skip images downloading if true, works for all engines
52
+ disable_images: true,
53
+
54
+ # Selenium engines only: headless mode, `:native` or `:virtual_display` (default is :native)
55
+ # Although native mode has a better performance, virtual display mode
56
+ # sometimes can be useful. For example, some websites can detect (and block)
57
+ # headless chrome, so you can use virtual_display mode instead
58
+ # headless_mode: :native,
59
+
60
+ # This option tells the browser not to use a proxy for the provided list of domains or IP addresses.
61
+ # Format: array of strings. Works only for :selenium_firefox and :selenium_chrome
62
+ # proxy_bypass_list: [],
63
+
64
+ # Option to provide custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
65
+ # ssl_cert_path: "path/to/ssl_cert",
66
+
67
+ # Browser (Capybara session instance) options:
68
+ browser: {
69
+ # Array of errors to retry while processing a request
70
+ # retry_request_errors: [Net::ReadTimeout],
71
+ # Restart browser if one of the options is true:
72
+ restart_if: {
73
+ # Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
74
+ # memory_limit: 350_000,
75
+
76
+ # Restart browser if provided requests limit is exceeded (works for all engines)
77
+ # requests_limit: 100
78
+ },
79
+ before_request: {
80
+ # Change proxy before each request. The `proxy:` option above must be present
81
+ # and be a lambda. Works only for poltergeist and mechanize engines
82
+ # (Selenium doesn't support proxy rotation).
83
+ # change_proxy: true,
84
+
85
+ # Change user agent before each request. The `user_agent:` option above must be present
86
+ # and be a lambda. Works only for poltergeist and mechanize engines
87
+ # (Selenium doesn't support getting/setting headers).
88
+ # change_user_agent: true,
89
+
90
+ # Clear all cookies before each request, works for all engines
91
+ # clear_cookies: true,
92
+
93
+ # If you want to clear all cookies + set custom cookies (`cookies:` option above must be present)
94
+ # use this option instead (works for all engines)
95
+ # clear_and_set_cookies: true,
96
+
97
+ # Global option to set delay between requests.
98
+ # Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range,
99
+ # delay number will be chosen randomly for each request: `rand (2..5) # => 3`
100
+ # delay: 1..3
101
+ }
102
+ }
103
+ }
104
+ end
File without changes
@@ -0,0 +1,3 @@
1
+ module Kimurai
2
+ VERSION = "1.0.0"
3
+ end
metadata ADDED
@@ -0,0 +1,349 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kimurai
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Victor Afanasev
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-08-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: thor
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: cliver
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: activesupport
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: murmurhash3
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: capybara
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '2.15'
90
+ - - "<"
91
+ - !ruby/object:Gem::Version
92
+ version: '4.0'
93
+ type: :runtime
94
+ prerelease: false
95
+ version_requirements: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: '2.15'
100
+ - - "<"
101
+ - !ruby/object:Gem::Version
102
+ version: '4.0'
103
+ - !ruby/object:Gem::Dependency
104
+ name: capybara-mechanize
105
+ requirement: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ type: :runtime
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ - !ruby/object:Gem::Dependency
118
+ name: poltergeist
119
+ requirement: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ type: :runtime
125
+ prerelease: false
126
+ version_requirements: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ - !ruby/object:Gem::Dependency
132
+ name: selenium-webdriver
133
+ requirement: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: '0'
138
+ type: :runtime
139
+ prerelease: false
140
+ version_requirements: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ">="
143
+ - !ruby/object:Gem::Version
144
+ version: '0'
145
+ - !ruby/object:Gem::Dependency
146
+ name: headless
147
+ requirement: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - ">="
150
+ - !ruby/object:Gem::Version
151
+ version: '0'
152
+ type: :runtime
153
+ prerelease: false
154
+ version_requirements: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - ">="
157
+ - !ruby/object:Gem::Version
158
+ version: '0'
159
+ - !ruby/object:Gem::Dependency
160
+ name: pmap
161
+ requirement: !ruby/object:Gem::Requirement
162
+ requirements:
163
+ - - ">="
164
+ - !ruby/object:Gem::Version
165
+ version: '0'
166
+ type: :runtime
167
+ prerelease: false
168
+ version_requirements: !ruby/object:Gem::Requirement
169
+ requirements:
170
+ - - ">="
171
+ - !ruby/object:Gem::Version
172
+ version: '0'
173
+ - !ruby/object:Gem::Dependency
174
+ name: whenever
175
+ requirement: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
180
+ type: :runtime
181
+ prerelease: false
182
+ version_requirements: !ruby/object:Gem::Requirement
183
+ requirements:
184
+ - - ">="
185
+ - !ruby/object:Gem::Version
186
+ version: '0'
187
+ - !ruby/object:Gem::Dependency
188
+ name: rbcat
189
+ requirement: !ruby/object:Gem::Requirement
190
+ requirements:
191
+ - - "~>"
192
+ - !ruby/object:Gem::Version
193
+ version: '0.2'
194
+ type: :runtime
195
+ prerelease: false
196
+ version_requirements: !ruby/object:Gem::Requirement
197
+ requirements:
198
+ - - "~>"
199
+ - !ruby/object:Gem::Version
200
+ version: '0.2'
201
+ - !ruby/object:Gem::Dependency
202
+ name: pry
203
+ requirement: !ruby/object:Gem::Requirement
204
+ requirements:
205
+ - - ">="
206
+ - !ruby/object:Gem::Version
207
+ version: '0'
208
+ type: :runtime
209
+ prerelease: false
210
+ version_requirements: !ruby/object:Gem::Requirement
211
+ requirements:
212
+ - - ">="
213
+ - !ruby/object:Gem::Version
214
+ version: '0'
215
+ - !ruby/object:Gem::Dependency
216
+ name: bundler
217
+ requirement: !ruby/object:Gem::Requirement
218
+ requirements:
219
+ - - "~>"
220
+ - !ruby/object:Gem::Version
221
+ version: '1.16'
222
+ type: :development
223
+ prerelease: false
224
+ version_requirements: !ruby/object:Gem::Requirement
225
+ requirements:
226
+ - - "~>"
227
+ - !ruby/object:Gem::Version
228
+ version: '1.16'
229
+ - !ruby/object:Gem::Dependency
230
+ name: rake
231
+ requirement: !ruby/object:Gem::Requirement
232
+ requirements:
233
+ - - "~>"
234
+ - !ruby/object:Gem::Version
235
+ version: '10.0'
236
+ type: :development
237
+ prerelease: false
238
+ version_requirements: !ruby/object:Gem::Requirement
239
+ requirements:
240
+ - - "~>"
241
+ - !ruby/object:Gem::Version
242
+ version: '10.0'
243
+ - !ruby/object:Gem::Dependency
244
+ name: minitest
245
+ requirement: !ruby/object:Gem::Requirement
246
+ requirements:
247
+ - - "~>"
248
+ - !ruby/object:Gem::Version
249
+ version: '5.0'
250
+ type: :development
251
+ prerelease: false
252
+ version_requirements: !ruby/object:Gem::Requirement
253
+ requirements:
254
+ - - "~>"
255
+ - !ruby/object:Gem::Version
256
+ version: '5.0'
257
+ description:
258
+ email:
259
+ - vicfreefly@gmail.com
260
+ executables:
261
+ - kimurai
262
+ extensions: []
263
+ extra_rdoc_files: []
264
+ files:
265
+ - ".gitignore"
266
+ - ".travis.yml"
267
+ - CODE_OF_CONDUCT.md
268
+ - Gemfile
269
+ - LICENSE.txt
270
+ - README.md
271
+ - Rakefile
272
+ - bin/console
273
+ - bin/setup
274
+ - exe/kimurai
275
+ - kimurai.gemspec
276
+ - lib/kimurai.rb
277
+ - lib/kimurai/automation/deploy.yml
278
+ - lib/kimurai/automation/setup.yml
279
+ - lib/kimurai/automation/setup/chromium_chromedriver.yml
280
+ - lib/kimurai/automation/setup/firefox_geckodriver.yml
281
+ - lib/kimurai/automation/setup/phantomjs.yml
282
+ - lib/kimurai/automation/setup/ruby_environment.yml
283
+ - lib/kimurai/base.rb
284
+ - lib/kimurai/base/simple_saver.rb
285
+ - lib/kimurai/base/uniq_checker.rb
286
+ - lib/kimurai/base_helper.rb
287
+ - lib/kimurai/browser_builder.rb
288
+ - lib/kimurai/browser_builder/mechanize_builder.rb
289
+ - lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb
290
+ - lib/kimurai/browser_builder/selenium_chrome_builder.rb
291
+ - lib/kimurai/browser_builder/selenium_firefox_builder.rb
292
+ - lib/kimurai/capybara_configuration.rb
293
+ - lib/kimurai/capybara_ext/driver/base.rb
294
+ - lib/kimurai/capybara_ext/mechanize/driver.rb
295
+ - lib/kimurai/capybara_ext/poltergeist/driver.rb
296
+ - lib/kimurai/capybara_ext/selenium/driver.rb
297
+ - lib/kimurai/capybara_ext/session.rb
298
+ - lib/kimurai/capybara_ext/session/config.rb
299
+ - lib/kimurai/cli.rb
300
+ - lib/kimurai/cli/ansible_command_builder.rb
301
+ - lib/kimurai/cli/generator.rb
302
+ - lib/kimurai/core_ext/array.rb
303
+ - lib/kimurai/core_ext/numeric.rb
304
+ - lib/kimurai/core_ext/string.rb
305
+ - lib/kimurai/pipeline.rb
306
+ - lib/kimurai/runner.rb
307
+ - lib/kimurai/template/.gitignore
308
+ - lib/kimurai/template/.ruby-version
309
+ - lib/kimurai/template/Gemfile
310
+ - lib/kimurai/template/README.md
311
+ - lib/kimurai/template/config/application.rb
312
+ - lib/kimurai/template/config/automation.yml
313
+ - lib/kimurai/template/config/boot.rb
314
+ - lib/kimurai/template/config/initializers/.keep
315
+ - lib/kimurai/template/config/schedule.rb
316
+ - lib/kimurai/template/db/.keep
317
+ - lib/kimurai/template/helpers/application_helper.rb
318
+ - lib/kimurai/template/lib/.keep
319
+ - lib/kimurai/template/log/.keep
320
+ - lib/kimurai/template/pipelines/saver.rb
321
+ - lib/kimurai/template/pipelines/validator.rb
322
+ - lib/kimurai/template/spiders/application_spider.rb
323
+ - lib/kimurai/template/tmp/.keep
324
+ - lib/kimurai/version.rb
325
+ homepage: https://github.com/vifreefly/kimurai
326
+ licenses:
327
+ - MIT
328
+ metadata: {}
329
+ post_install_message:
330
+ rdoc_options: []
331
+ require_paths:
332
+ - lib
333
+ required_ruby_version: !ruby/object:Gem::Requirement
334
+ requirements:
335
+ - - ">="
336
+ - !ruby/object:Gem::Version
337
+ version: 2.5.0
338
+ required_rubygems_version: !ruby/object:Gem::Requirement
339
+ requirements:
340
+ - - ">="
341
+ - !ruby/object:Gem::Version
342
+ version: '0'
343
+ requirements: []
344
+ rubyforge_project:
345
+ rubygems_version: 2.7.6
346
+ signing_key:
347
+ specification_version: 4
348
+ summary: Modern web scraping framework written in Ruby and based on Capybara/Nokogiri
349
+ test_files: []