kimurai 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +1923 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai.rb +53 -0
  14. data/lib/kimurai/automation/deploy.yml +54 -0
  15. data/lib/kimurai/automation/setup.yml +44 -0
  16. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  17. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  18. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  19. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  20. data/lib/kimurai/base.rb +249 -0
  21. data/lib/kimurai/base/simple_saver.rb +98 -0
  22. data/lib/kimurai/base/uniq_checker.rb +22 -0
  23. data/lib/kimurai/base_helper.rb +22 -0
  24. data/lib/kimurai/browser_builder.rb +32 -0
  25. data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
  26. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
  27. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
  28. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
  29. data/lib/kimurai/capybara_configuration.rb +10 -0
  30. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  31. data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
  32. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  33. data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
  34. data/lib/kimurai/capybara_ext/session.rb +150 -0
  35. data/lib/kimurai/capybara_ext/session/config.rb +18 -0
  36. data/lib/kimurai/cli.rb +157 -0
  37. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  38. data/lib/kimurai/cli/generator.rb +57 -0
  39. data/lib/kimurai/core_ext/array.rb +14 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +25 -0
  43. data/lib/kimurai/runner.rb +72 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/.ruby-version +1 -0
  46. data/lib/kimurai/template/Gemfile +20 -0
  47. data/lib/kimurai/template/README.md +3 -0
  48. data/lib/kimurai/template/config/application.rb +32 -0
  49. data/lib/kimurai/template/config/automation.yml +13 -0
  50. data/lib/kimurai/template/config/boot.rb +22 -0
  51. data/lib/kimurai/template/config/initializers/.keep +0 -0
  52. data/lib/kimurai/template/config/schedule.rb +57 -0
  53. data/lib/kimurai/template/db/.keep +0 -0
  54. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  55. data/lib/kimurai/template/lib/.keep +0 -0
  56. data/lib/kimurai/template/log/.keep +0 -0
  57. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  58. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  59. data/lib/kimurai/template/spiders/application_spider.rb +104 -0
  60. data/lib/kimurai/template/tmp/.keep +0 -0
  61. data/lib/kimurai/version.rb +3 -0
  62. metadata +349 -0
File without changes
@@ -0,0 +1,3 @@
1
+ module ApplicationHelper
2
+ # Put custom methods here; they will be available to every spider
3
+ end
File without changes
File without changes
@@ -0,0 +1,11 @@
1
+ class Saver < Kimurai::Pipeline
2
+ def process_item(item, options: {})
3
+ # Here you can save item to the database, send it to a remote API or
4
+ # simply save item to a file format using `save_to` helper:
5
+
6
+ # To get the name of a current spider: `spider.class.name`
7
+ # save_to "db/#{spider.class.name}.json", item, format: :pretty_json
8
+
9
+ item
10
+ end
11
+ end
@@ -0,0 +1,24 @@
1
+ class Validator < Kimurai::Pipeline
2
+ def process_item(item, options: {})
3
+ # Here you can validate item and raise `DropItemError`
4
+ # if one of the validations failed. Examples:
5
+
6
+ # Check item sku for uniqueness using the built-in `unique?` helper:
7
+ # unless unique?(:sku, item[:sku])
8
+ # raise DropItemError, "Item sku is not unique"
9
+ # end
10
+
11
+ # Drop item if title length shorter than 5 symbols:
12
+ # if item[:title].size < 5
13
+ # raise DropItemError, "Item title is short"
14
+ # end
15
+
16
+ # Drop item if it doesn't contain any images:
17
+ # unless item[:images].present?
18
+ # raise DropItemError, "Item images are not present"
19
+ # end
20
+
21
+ # Pass item to the next pipeline (if it wasn't dropped)
22
+ item
23
+ end
24
+ end
@@ -0,0 +1,104 @@
1
+ # ApplicationSpider is a default base spider class. You can set here
2
+ # default settings for all spiders inherited from ApplicationSpider.
3
+ # To generate a new spider, run: `$ kimurai generate spider spider_name`
4
+
5
+ class ApplicationSpider < Kimurai::Base
6
+ include ApplicationHelper
7
+
8
+ # Default engine for spiders (available engines: :mechanize, :poltergeist_phantomjs,
9
+ # :selenium_firefox, :selenium_chrome)
10
+ @engine = :poltergeist_phantomjs
11
+
12
+ # Pipelines list, by order.
13
+ # To process item through pipelines pass item to the `send_item` method
14
+ @pipelines = [:validator, :saver]
15
+
16
+ # Default config. Set here options which are default for all spiders inherited
17
+ # from ApplicationSpider. Child's class config will be deep merged with this one
18
+ @config = {
19
+ # Custom headers, format: hash. Example: { "some header" => "some value", "another header" => "another value" }
20
+ # Works only for :mechanize and :poltergeist_phantomjs engines (Selenium doesn't allow setting/getting headers)
21
+ # headers: {},
22
+
23
+ # Custom User Agent, format: string or lambda.
24
+ # Use lambda if you want to rotate user agents before each run:
25
+ # user_agent: -> { ARRAY_OF_USER_AGENTS.sample }
26
+ # Works for all engines
27
+ # user_agent: "Mozilla/5.0 Firefox/61.0",
28
+
29
+ # Custom cookies, format: array of hashes.
30
+ # Format for a single cookie: { name: "cookie name", value: "cookie value", domain: ".example.com" }
31
+ # Works for all engines
32
+ # cookies: [],
33
+
34
+ # Proxy, format: string or lambda. Format of a proxy string: "ip:port:protocol:user:password"
35
+ # `protocol` can be http or socks5. User and password are optional.
36
+ # Use lambda if you want to rotate proxies before each run:
37
+ # proxy: -> { ARRAY_OF_PROXIES.sample }
38
+ # Works for all engines, but keep in mind that Selenium drivers don't support proxies
39
+ # with authorization. Also, Mechanize doesn't support socks5 proxy format (only http)
40
+ # proxy: "3.4.5.6:3128:http:user:pass",
41
+
42
+ # If enabled, browser will ignore any https errors. It's handy while using a proxy
43
+ # with self-signed SSL cert (for example Crawlera or Mitmproxy)
44
+ # Also, it allows visiting webpages with an expired SSL certificate.
45
+ # Works for all engines
46
+ ignore_ssl_errors: true,
47
+
48
+ # Custom window size, works for all engines
49
+ # window_size: [1366, 768],
50
+
51
+ # Skip images downloading if true, works for all engines
52
+ disable_images: true,
53
+
54
+ # Selenium engines only: headless mode, `:native` or `:virtual_display` (default is :native)
55
+ # Although native mode has a better performance, virtual display mode
56
+ # sometimes can be useful. For example, some websites can detect (and block)
57
+ # headless chrome, so you can use virtual_display mode instead
58
+ # headless_mode: :native,
59
+
60
+ # This option tells the browser not to use a proxy for the provided list of domains or IP addresses.
61
+ # Format: array of strings. Works only for :selenium_firefox and :selenium_chrome
62
+ # proxy_bypass_list: [],
63
+
64
+ # Option to provide custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
65
+ # ssl_cert_path: "path/to/ssl_cert",
66
+
67
+ # Browser (Capybara session instance) options:
68
+ browser: {
69
+ # Array of errors to retry while processing a request
70
+ # retry_request_errors: [Net::ReadTimeout],
71
+ # Restart browser if one of the options is true:
72
+ restart_if: {
73
+ # Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
74
+ # memory_limit: 350_000,
75
+
76
+ # Restart browser if provided requests limit is exceeded (works for all engines)
77
+ # requests_limit: 100
78
+ },
79
+ before_request: {
80
+ # Change proxy before each request. The `proxy:` option above must be present
81
+ # and be a lambda. Works only for poltergeist and mechanize engines
82
+ # (Selenium doesn't support proxy rotation).
83
+ # change_proxy: true,
84
+
85
+ # Change user agent before each request. The `user_agent:` option above must be present
86
+ # and be a lambda. Works only for poltergeist and mechanize engines
87
+ # (Selenium doesn't support getting/setting headers).
88
+ # change_user_agent: true,
89
+
90
+ # Clear all cookies before each request, works for all engines
91
+ # clear_cookies: true,
92
+
93
+ # If you want to clear all cookies + set custom cookies (`cookies:` option above must be present)
94
+ # use this option instead (works for all engines)
95
+ # clear_and_set_cookies: true,
96
+
97
+ # Global option to set delay between requests.
98
+ # Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range,
99
+ # delay number will be chosen randomly for each request: `rand (2..5) # => 3`
100
+ # delay: 1..3
101
+ }
102
+ }
103
+ }
104
+ end
File without changes
@@ -0,0 +1,3 @@
1
+ module Kimurai
2
+ VERSION = "1.0.0"
3
+ end
metadata ADDED
@@ -0,0 +1,349 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kimurai
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Victor Afanasev
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-08-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: thor
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: cliver
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: activesupport
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: murmurhash3
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: capybara
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '2.15'
90
+ - - "<"
91
+ - !ruby/object:Gem::Version
92
+ version: '4.0'
93
+ type: :runtime
94
+ prerelease: false
95
+ version_requirements: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: '2.15'
100
+ - - "<"
101
+ - !ruby/object:Gem::Version
102
+ version: '4.0'
103
+ - !ruby/object:Gem::Dependency
104
+ name: capybara-mechanize
105
+ requirement: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ type: :runtime
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ - !ruby/object:Gem::Dependency
118
+ name: poltergeist
119
+ requirement: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ type: :runtime
125
+ prerelease: false
126
+ version_requirements: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ - !ruby/object:Gem::Dependency
132
+ name: selenium-webdriver
133
+ requirement: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: '0'
138
+ type: :runtime
139
+ prerelease: false
140
+ version_requirements: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ">="
143
+ - !ruby/object:Gem::Version
144
+ version: '0'
145
+ - !ruby/object:Gem::Dependency
146
+ name: headless
147
+ requirement: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - ">="
150
+ - !ruby/object:Gem::Version
151
+ version: '0'
152
+ type: :runtime
153
+ prerelease: false
154
+ version_requirements: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - ">="
157
+ - !ruby/object:Gem::Version
158
+ version: '0'
159
+ - !ruby/object:Gem::Dependency
160
+ name: pmap
161
+ requirement: !ruby/object:Gem::Requirement
162
+ requirements:
163
+ - - ">="
164
+ - !ruby/object:Gem::Version
165
+ version: '0'
166
+ type: :runtime
167
+ prerelease: false
168
+ version_requirements: !ruby/object:Gem::Requirement
169
+ requirements:
170
+ - - ">="
171
+ - !ruby/object:Gem::Version
172
+ version: '0'
173
+ - !ruby/object:Gem::Dependency
174
+ name: whenever
175
+ requirement: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
180
+ type: :runtime
181
+ prerelease: false
182
+ version_requirements: !ruby/object:Gem::Requirement
183
+ requirements:
184
+ - - ">="
185
+ - !ruby/object:Gem::Version
186
+ version: '0'
187
+ - !ruby/object:Gem::Dependency
188
+ name: rbcat
189
+ requirement: !ruby/object:Gem::Requirement
190
+ requirements:
191
+ - - "~>"
192
+ - !ruby/object:Gem::Version
193
+ version: '0.2'
194
+ type: :runtime
195
+ prerelease: false
196
+ version_requirements: !ruby/object:Gem::Requirement
197
+ requirements:
198
+ - - "~>"
199
+ - !ruby/object:Gem::Version
200
+ version: '0.2'
201
+ - !ruby/object:Gem::Dependency
202
+ name: pry
203
+ requirement: !ruby/object:Gem::Requirement
204
+ requirements:
205
+ - - ">="
206
+ - !ruby/object:Gem::Version
207
+ version: '0'
208
+ type: :runtime
209
+ prerelease: false
210
+ version_requirements: !ruby/object:Gem::Requirement
211
+ requirements:
212
+ - - ">="
213
+ - !ruby/object:Gem::Version
214
+ version: '0'
215
+ - !ruby/object:Gem::Dependency
216
+ name: bundler
217
+ requirement: !ruby/object:Gem::Requirement
218
+ requirements:
219
+ - - "~>"
220
+ - !ruby/object:Gem::Version
221
+ version: '1.16'
222
+ type: :development
223
+ prerelease: false
224
+ version_requirements: !ruby/object:Gem::Requirement
225
+ requirements:
226
+ - - "~>"
227
+ - !ruby/object:Gem::Version
228
+ version: '1.16'
229
+ - !ruby/object:Gem::Dependency
230
+ name: rake
231
+ requirement: !ruby/object:Gem::Requirement
232
+ requirements:
233
+ - - "~>"
234
+ - !ruby/object:Gem::Version
235
+ version: '10.0'
236
+ type: :development
237
+ prerelease: false
238
+ version_requirements: !ruby/object:Gem::Requirement
239
+ requirements:
240
+ - - "~>"
241
+ - !ruby/object:Gem::Version
242
+ version: '10.0'
243
+ - !ruby/object:Gem::Dependency
244
+ name: minitest
245
+ requirement: !ruby/object:Gem::Requirement
246
+ requirements:
247
+ - - "~>"
248
+ - !ruby/object:Gem::Version
249
+ version: '5.0'
250
+ type: :development
251
+ prerelease: false
252
+ version_requirements: !ruby/object:Gem::Requirement
253
+ requirements:
254
+ - - "~>"
255
+ - !ruby/object:Gem::Version
256
+ version: '5.0'
257
+ description:
258
+ email:
259
+ - vicfreefly@gmail.com
260
+ executables:
261
+ - kimurai
262
+ extensions: []
263
+ extra_rdoc_files: []
264
+ files:
265
+ - ".gitignore"
266
+ - ".travis.yml"
267
+ - CODE_OF_CONDUCT.md
268
+ - Gemfile
269
+ - LICENSE.txt
270
+ - README.md
271
+ - Rakefile
272
+ - bin/console
273
+ - bin/setup
274
+ - exe/kimurai
275
+ - kimurai.gemspec
276
+ - lib/kimurai.rb
277
+ - lib/kimurai/automation/deploy.yml
278
+ - lib/kimurai/automation/setup.yml
279
+ - lib/kimurai/automation/setup/chromium_chromedriver.yml
280
+ - lib/kimurai/automation/setup/firefox_geckodriver.yml
281
+ - lib/kimurai/automation/setup/phantomjs.yml
282
+ - lib/kimurai/automation/setup/ruby_environment.yml
283
+ - lib/kimurai/base.rb
284
+ - lib/kimurai/base/simple_saver.rb
285
+ - lib/kimurai/base/uniq_checker.rb
286
+ - lib/kimurai/base_helper.rb
287
+ - lib/kimurai/browser_builder.rb
288
+ - lib/kimurai/browser_builder/mechanize_builder.rb
289
+ - lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb
290
+ - lib/kimurai/browser_builder/selenium_chrome_builder.rb
291
+ - lib/kimurai/browser_builder/selenium_firefox_builder.rb
292
+ - lib/kimurai/capybara_configuration.rb
293
+ - lib/kimurai/capybara_ext/driver/base.rb
294
+ - lib/kimurai/capybara_ext/mechanize/driver.rb
295
+ - lib/kimurai/capybara_ext/poltergeist/driver.rb
296
+ - lib/kimurai/capybara_ext/selenium/driver.rb
297
+ - lib/kimurai/capybara_ext/session.rb
298
+ - lib/kimurai/capybara_ext/session/config.rb
299
+ - lib/kimurai/cli.rb
300
+ - lib/kimurai/cli/ansible_command_builder.rb
301
+ - lib/kimurai/cli/generator.rb
302
+ - lib/kimurai/core_ext/array.rb
303
+ - lib/kimurai/core_ext/numeric.rb
304
+ - lib/kimurai/core_ext/string.rb
305
+ - lib/kimurai/pipeline.rb
306
+ - lib/kimurai/runner.rb
307
+ - lib/kimurai/template/.gitignore
308
+ - lib/kimurai/template/.ruby-version
309
+ - lib/kimurai/template/Gemfile
310
+ - lib/kimurai/template/README.md
311
+ - lib/kimurai/template/config/application.rb
312
+ - lib/kimurai/template/config/automation.yml
313
+ - lib/kimurai/template/config/boot.rb
314
+ - lib/kimurai/template/config/initializers/.keep
315
+ - lib/kimurai/template/config/schedule.rb
316
+ - lib/kimurai/template/db/.keep
317
+ - lib/kimurai/template/helpers/application_helper.rb
318
+ - lib/kimurai/template/lib/.keep
319
+ - lib/kimurai/template/log/.keep
320
+ - lib/kimurai/template/pipelines/saver.rb
321
+ - lib/kimurai/template/pipelines/validator.rb
322
+ - lib/kimurai/template/spiders/application_spider.rb
323
+ - lib/kimurai/template/tmp/.keep
324
+ - lib/kimurai/version.rb
325
+ homepage: https://github.com/vifreefly/kimurai
326
+ licenses:
327
+ - MIT
328
+ metadata: {}
329
+ post_install_message:
330
+ rdoc_options: []
331
+ require_paths:
332
+ - lib
333
+ required_ruby_version: !ruby/object:Gem::Requirement
334
+ requirements:
335
+ - - ">="
336
+ - !ruby/object:Gem::Version
337
+ version: 2.5.0
338
+ required_rubygems_version: !ruby/object:Gem::Requirement
339
+ requirements:
340
+ - - ">="
341
+ - !ruby/object:Gem::Version
342
+ version: '0'
343
+ requirements: []
344
+ rubyforge_project:
345
+ rubygems_version: 2.7.6
346
+ signing_key:
347
+ specification_version: 4
348
+ summary: Modern web scraping framework written in Ruby and based on Capybara/Nokogiri
349
+ test_files: []