kimurai 1.2.0 → 1.3.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +58 -99
- data/kimurai.gemspec +1 -1
- data/lib/kimurai/automation/setup.yml +3 -3
- data/lib/kimurai/base.rb +17 -19
- data/lib/kimurai/base/storage.rb +14 -61
- data/lib/kimurai/browser_builder/mechanize_builder.rb +4 -4
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +6 -2
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +8 -4
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +6 -1
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +16 -0
- data/lib/kimurai/capybara_ext/selenium/driver.rb +10 -0
- data/lib/kimurai/capybara_ext/session.rb +26 -12
- data/lib/kimurai/cli.rb +8 -3
- data/lib/kimurai/runner.rb +7 -18
- data/lib/kimurai/template/config/schedule.rb +1 -1
- data/lib/kimurai/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 
-  data.tar.gz: 
+  metadata.gz: 9fb75ab8965adb212964eee4b319642c3b83bbb4e3b957b739b6eba28b64546d
+  data.tar.gz: b03efc58a57222fdbb51e5ab6aa389f352de1f94fbd02358ccb613d7a15a5774
 SHA512:
-  metadata.gz: 
-  data.tar.gz: 
+  metadata.gz: 737f467deef60eb2983d86286c040d1b899732ff3e52fdab50ce07e8893070c0cd9f3f3020fe57e32ab1d9140299a9112db698e8d72112934b4fd90ee10a7644
+  data.tar.gz: de89bc1ef85f7f29986b7033d883a49c804592a21f78b409bf325f2e0e578dc78bbf67cdca4ca719f34e43f305dd99f8f227cbd8b545e42f06ca7f49d24e1e35

data/CHANGELOG.md
CHANGED
@@ -1,4 +1,21 @@
 # CHANGELOG
+## 1.3.0
+### Breaking changes 1.3.0
+* Remove persistence database feature (because it's slow and makes things complicated)
+
+### New
+* Add `--include` and `--exclude` options to CLI#runner
+* Add Base `#create_browser` method to easily create additional browser instances
+* Add Capybara::Session `#scroll_to_bottom`
+* Add skip_on_failure feature to `retry_request_errors` config option
+* Add info about `add_event` method to the README
+
+### Fixes and improvements
+* Improve Runner
+* Fix time helper in schedule.rb
+* Add proxy validation to browser builders
+* Allow to pass different arguments to the `Base.parse` method
+
 ## 1.2.0
 ### New
 * Add possibility to add array of values to the storage (`Base::Storage#add`)
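To make the `skip_on_failure` entry concrete, here is a minimal config sketch; the spider name and engine are illustrative, and the option itself is documented in the README changes below:

```ruby
class ExampleSpider < Kimurai::Base
  @name = "example_spider"
  @engine = :mechanize
  @config = {
    # If a RuntimeError is still raised after all retry attempts,
    # skip the request instead of failing the whole crawl:
    retry_request_errors: [{ error: RuntimeError, skip_on_failure: true }]
  }
end
```
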
data/README.md
CHANGED
@@ -18,7 +18,7 @@
 
 <br>
 
-> Note: this readme is for `1.
+> Note: this readme is for `1.3.0` gem version. CHANGELOG [here](CHANGELOG.md).
 
 Kimurai is a modern web scraping framework written in Ruby which **works out of box with Headless Chromium/Firefox, PhantomJS**, or simple HTTP requests and **allows to scrape and interact with JavaScript rendered websites.**
 
@@ -241,10 +241,10 @@ I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scrol
 * [Skip duplicates](#skip-duplicates)
 * [Automatically skip all duplicated requests urls](#automatically-skip-all-duplicated-requests-urls)
 * [Storage object](#storage-object)
-* [Persistence database for the storage](#persistence-database-for-the-storage)
 * [Handle request errors](#handle-request-errors)
 * [skip_request_errors](#skip_request_errors)
 * [retry_request_errors](#retry_request_errors)
+* [Logging custom events](#logging-custom-events)
 * [open_spider and close_spider callbacks](#open_spider-and-close_spider-callbacks)
 * [KIMURAI_ENV](#kimurai_env)
 * [Parallel crawling using in_parallel](#parallel-crawling-using-in_parallel)
@@ -297,8 +297,8 @@ echo 'export PATH="$HOME/.rbenv/plugins/ruby-build/bin:$PATH"' >> ~/.bashrc
 exec $SHELL
 
 # Install latest Ruby
-rbenv install 2.5.
-rbenv global 2.5.
+rbenv install 2.5.3
+rbenv global 2.5.3
 
 gem install bundler
 ```
@@ -317,8 +317,8 @@ echo 'if which rbenv > /dev/null; then eval "$(rbenv init -)"; fi' >> ~/.bash_pr
 source ~/.bash_profile
 
 # Install latest Ruby
-rbenv install 2.5.
-rbenv global 2.5.
+rbenv install 2.5.3
+rbenv global 2.5.3
 
 gem install bundler
 ```
@@ -349,17 +349,17 @@ sudo apt install -q -y xvfb
 # Install chromium-browser and firefox
 sudo apt install -q -y chromium-browser firefox
 
-# Instal chromedriver (2.
+# Install chromedriver (2.44 version)
 # All versions located here https://sites.google.com/a/chromium.org/chromedriver/downloads
-cd /tmp && wget https://chromedriver.storage.googleapis.com/2.
+cd /tmp && wget https://chromedriver.storage.googleapis.com/2.44/chromedriver_linux64.zip
 sudo unzip chromedriver_linux64.zip -d /usr/local/bin
 rm -f chromedriver_linux64.zip
 
-# Install geckodriver (0.
+# Install geckodriver (0.23.0 version)
 # All versions located here https://github.com/mozilla/geckodriver/releases/
-cd /tmp && wget https://github.com/mozilla/geckodriver/releases/download/v0.
-sudo tar -xvzf geckodriver-v0.
-rm -f geckodriver-v0.
+cd /tmp && wget https://github.com/mozilla/geckodriver/releases/download/v0.23.0/geckodriver-v0.23.0-linux64.tar.gz
+sudo tar -xvzf geckodriver-v0.23.0-linux64.tar.gz -C /usr/local/bin
+rm -f geckodriver-v0.23.0-linux64.tar.gz
 
 # Install PhantomJS (2.1.1)
 # All versions located here http://phantomjs.org/download.html
@@ -815,113 +815,60 @@ It is possible to automatically skip all already visited urls while calling `request_to`
 * `#all` - display storage hash where keys are existing scopes.
 * `#include?(scope, value)` - return `true` if value in the scope exists, and `false` if not
 * `#add(scope, value)` - add value to the scope
-* 
-* 
+* `#unique?(scope, value)` - method already described above, will return `false` if value in the scope exists, or return `true` + add value to the scope if value in the scope not exists.
+* `#clear!` - reset the whole storage by deleting all values from all scopes.
 
-#### Persistence database for the storage
-
-
-Kimurai can use persistence database for a `storage` using Ruby built-in [PStore](https://ruby-doc.org/stdlib-2.5.1/libdoc/pstore/rdoc/PStore.html) database. With this option, you can automatically skip already visited urls in the next run _if previous run was failed_, otherwise _(if run was successful)_ storage database will be removed before spider stops.
-
-Also, with persistence storage enabled, [save_to](#save_to-helper) method will keep adding items to an existing file (it will not be cleared before each run).
+### Handle request errors
+It is quite common that some pages of crawling website can return different response code than `200 ok`. In such cases, method `request_to` (or `browser.visit`) can raise an exception. Kimurai provides `skip_request_errors` and `retry_request_errors` [config](#spider-config) options to handle such errors:
 
+#### skip_request_errors
+You can automatically skip some of errors while requesting a page using `skip_request_errors` [config](#spider-config) option. If raised error matches one of the errors in the list, then this error will be caught, and request will be skipped. It is a good idea to skip errors like NotFound(404), etc.
 
+Format for the option: array where elements are error classes or/and hashes. You can use _hash_ format for more flexibility:
 
-```
-
-
+```
+@config = {
+  skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }]
+}
+```
+In this case, provided `message:` will be compared with a full error message using `String#include?`. Also you can use regex instead: `{ error: RuntimeError, message: /404|403/ }`.
 
-
-    request_to :parse_category, url: category[:href]
-  end
-end
+#### retry_request_errors
+You can automatically retry some of errors with a few attempts while requesting a page using `retry_request_errors` [config](#spider-config) option. If raised error matches one of the errors in the list, then this error will be caught and the request will be processed again within a delay.
 
-  response.xpath("//products/path").each do |product|
-    # check if product url already contains in the scope `:product_urls`, if so, skip the request:
-    next if storage.contains?(:product_urls, product[:href])
-    # Otherwise process it:
-    request_to :parse_product, url: product[:href]
-  end
-end
+There are 3 attempts: first: delay _15 sec_, second: delay _30 sec_, third: delay _45 sec_. If after 3 attempts there is still an exception, then the exception will be raised. It is a good idea to try to retry errors like `ReadTimeout`, `HTTPBadGateway`, etc.
 
-  # Add visited item to the storage:
-  storage.add(:product_urls, url)
+Format for the option: same like for `skip_request_errors` option.
 
-end
-end
+If you would like to skip (not raise) error after all retries gone, you can specify `skip_on_failure: true` option:
 
+```ruby
+@config = {
+  retry_request_errors: [{ error: RuntimeError, skip_on_failure: true }]
+}
 ```
 
-
+### Logging custom events
 
-<summary>Check the code</summary>
+It is possible to save custom messages to the [run_info](#open_spider-and-close_spider-callbacks) hash using `add_event('Some message')` method. This feature helps you to keep track on important things which happened during crawling without checking the whole spider log (in case if you're logging these messages using `logger`). Example:
 
 ```ruby
-
-
-
-  # Configure skip_duplicate_requests option:
-  skip_duplicate_requests: { scope: :product_urls, check_only: true }
-}
-
-def parse(response, url:, data: {})
-  response.xpath("//categories/path").each do |category|
-    request_to :parse_category, url: category[:href]
-  end
+def parse_product(response, url:, data: {})
+  unless response.at_xpath("//path/to/add_to_card_button")
+    add_event("Product is sold") and return
   end
 
-
-  response.xpath("//products/path").each do |product|
-    # Before visiting the url, `request_to` will check if it already contains
-    # in the storage scope `:product_urls`, if so, request will be skipped:
-    request_to :parse_product, url: product[:href]
-  end
-end
-
-def parse_product(response, url:, data: {})
-  # Add visited item url to the storage scope `:product_urls`:
-  storage.add(:product_urls, url)
-
-  # ...
-end
+  # ...
 end
-
-# Run the spider with persistence database option:
-ProductsSpider.crawl!(continue: true)
 ```
-</details>
-
-### Handle request errors
-It is quite common that some pages of crawling website can return different response code than `200 ok`. In such cases, method `request_to` (or `browser.visit`) can raise an exception. Kimurai provides `skip_request_errors` and `retry_request_errors` [config](#spider-config) options to handle such errors:
-
-#### skip_request_errors
-You can automatically skip some of errors while requesting a page using `skip_request_errors` [config](#spider-config) option. If raised error matches one of the errors in the list, then this error will be caught, and request will be skipped. It is a good idea to skip errors like NotFound(404), etc.
 
-Format for the option: array where elements are error classes or/and hashes. You can use _hash_ format for more flexibility:
-
-```ruby
-@config = {
-  skip_request_errors: [{ error: "RuntimeError", message: "404 => Net::HTTPNotFound" }]
-}
 ```
-
-
-
-
-
-There are 3 attempts: first: delay _15 sec_, second: delay _30 sec_, third: delay _45 sec_. If after 3 attempts there is still an exception, then the exception will be raised. It is a good idea to try to retry errros like `ReadTimeout`, `HTTPBadGateway`, etc.
-
-Format for the option: same like for `skip_request_errors` option.
+...
+I, [2018-11-28 22:20:19 +0400#7402] [M: 47156576560640] INFO -- example_spider: Spider: new event (scope: custom): Product is sold
+...
+I, [2018-11-28 22:20:19 +0400#7402] [M: 47156576560640] INFO -- example_spider: Spider: stopped: {:events=>{:custom=>{"Product is sold"=>1}}}
+```
 
 ### `open_spider` and `close_spider` callbacks
 
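A short usage sketch of the storage methods listed above (scope name and urls are illustrative):

```ruby
# Inside a spider method, `storage` is the built-in Base::Storage instance:
storage.add(:product_urls, "https://example.com/product_1")
storage.include?(:product_urls, "https://example.com/product_1") # => true
storage.unique?(:product_urls, "https://example.com/product_1")  # => false, value already in the scope
storage.unique?(:product_urls, "https://example.com/product_2")  # => true, and the value is added
storage.clear!                                                   # reset all scopes
```
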
@@ -1296,7 +1243,7 @@ set :chronic_options, hours24: true
 # crawl "google_spider.com", output: "log/google_spider.com.log"
 # end
 def local_to_utc(time_string, zone:)
-  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(
+  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
 end
 
 # Note: by default Whenever exports cron commands with :environment == "production".
@@ -1483,6 +1430,8 @@ You can automatically setup [required environment](#installation) for Kimurai on
 
 > To perform remote server setup, [Ansible](https://github.com/ansible/ansible) is required **on the desktop** machine (to install: Ubuntu: `$ sudo apt install ansible`, Mac OS X: `$ brew install ansible`)
 
+> It's recommended to use a regular user to set up the server, not `root`. To create a new user, log in to the server with `$ ssh root@your_server_ip`, run `$ adduser username` to create the user, and `$ gpasswd -a username sudo` to add it to the sudo group.
+
 Example:
 
 ```bash
@@ -2081,6 +2030,16 @@ $ bundle exec kimurai runner -j 3
 
 Each spider runs in a separate process. Spiders logs available at `log/` folder. Pass `-j` option to specify how many spiders should be processed at the same time (default is 1).
 
+You can provide additional arguments like `--include` or `--exclude` to specify which spiders to run:
+
+```bash
+# Run only custom_spider and example_spider:
+$ bundle exec kimurai runner --include custom_spider example_spider
+
+# Run all except github_spider:
+$ bundle exec kimurai runner --exclude github_spider
+```
+
 #### Runner callbacks
 
 You can perform custom actions before runner starts and after runner stops using `config.runner_at_start_callback` and `config.runner_at_stop_callback`. Check [config/application.rb](lib/kimurai/template/config/application.rb) to see example.

data/kimurai.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
     `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
   end
   spec.bindir = "exe"
-  spec.executables = 
+  spec.executables = "kimurai"
   spec.require_paths = ["lib"]
   spec.required_ruby_version = ">= 2.5.0"
 
data/lib/kimurai/automation/setup.yml
CHANGED
@@ -1,16 +1,16 @@
 ---
 - hosts: all
   vars:
-    ruby: 2.5.
+    ruby: 2.5.3
     rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
    rbenv_shims_path: "{{ rbenv_root_path }}/shims"
     ruby_versions_path: "{{ rbenv_root_path }}/versions"
     # check latest here http://phantomjs.org/download.html
     phantomjs: 2.1.1
     # check latest here https://github.com/mozilla/geckodriver/releases/
-    geckodriver: 0.
+    geckodriver: 0.23.0
     # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
-    chromedriver: 2.
+    chromedriver: 2.44
 
   tasks:
     - name: Update apt cache

data/lib/kimurai/base.rb
CHANGED
@@ -97,15 +97,10 @@ module Kimurai
       end
     end
 
-    def self.crawl!(continue: false, exception_on_fail: true)
+    def self.crawl!(exception_on_fail: true)
       logger.error "Spider: already running: #{name}" and return false if running?
 
-      storage_path =
-        if continue
-          Dir.exists?("tmp") ? "tmp/#{name}.pstore" : "#{name}.pstore"
-        end
-
-      @storage = Storage.new(storage_path)
+      @storage = Storage.new
       @savers = {}
       @update_mutex = Mutex.new
 
@@ -149,15 +144,6 @@ module Kimurai
 
       close_spider if self.respond_to? :close_spider
 
-      if @storage.path
-        if completed?
-          @storage.delete!
-          logger.info "Spider: storage: persistence database #{@storage.path} was removed (successful run)"
-        else
-          logger.info "Spider: storage: persistence database #{@storage.path} wasn't removed (failed run)"
-        end
-      end
-
       message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
       failed? ? logger.fatal(message) : logger.info(message)
 
@@ -165,9 +151,16 @@ module Kimurai
       end
     end
 
-    def self.parse!(handler,
-      spider =
-
+    def self.parse!(handler, *args, **request)
+      spider = self.new
+
+      if args.present?
+        spider.public_send(handler, *args)
+      elsif request.present?
+        spider.request_to(handler, request)
+      else
+        spider.public_send(handler)
+      end
     ensure
       spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
     end
@@ -244,12 +237,17 @@ module Kimurai
     end
 
       self.class.add_event(scope, event)
+      logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
     end
 
     ###
 
     private
 
+    def create_browser(engine, config = {})
+      Kimurai::BrowserBuilder.build(engine, config, spider: self)
+    end
+
     def unique_request?(url)
       options = @config[:skip_duplicate_requests]
       if options.class == Hash
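A sketch of the two additions above in use; the spider class, method names and urls are illustrative:

```ruby
class ExampleSpider < Kimurai::Base
  @name = "example_spider"
  @engine = :mechanize

  def parse(response, url:, data: {})
    # The new private #create_browser builds an extra, independent browser instance:
    browser_2 = create_browser(:selenium_chrome)
    browser_2.visit("https://example.com")
  end

  def parse_product(response, url:, data: {})
    # ...
  end
end

# The reworked Base.parse! now routes keyword arguments through request_to
# and plain positional arguments through public_send:
ExampleSpider.parse!(:parse_product, url: "https://example.com/product_1")
```
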
data/lib/kimurai/base/storage.rb
CHANGED
@@ -1,60 +1,34 @@
-require 'pstore'
-
 module Kimurai
   class Base
     class Storage
-      attr_reader :path, :database
+      attr_reader :database
 
-      def initialize(path = nil)
-        @path = path
+      def initialize
         @mutex = Mutex.new
-        @database = path ? PStore.new(path) : {}
+        @database = {}
       end
 
       def all(scope = nil)
         @mutex.synchronize do
-          if path
-            database.transaction { scope ? database.fetch(scope, []) : database }
-          else
-            scope ? database.fetch(scope, []) : database
-          end
+          scope ? database.fetch(scope, []) : database
         end
       end
 
       def include?(scope, value)
         @mutex.synchronize do
-          if path
-            database.transaction do
-              database[scope] ||= []
-              database[scope].include?(value)
-            end
-          else
-            database[scope] ||= []
-            database[scope].include?(value)
-          end
+          database[scope] ||= []
+          database[scope].include?(value)
         end
       end
 
       def add(scope, value)
         @mutex.synchronize do
-          if path
-            database.transaction do
-              database[scope] ||= []
-              if value.class == Array
-                database[scope] += value
-                database[scope].uniq!
-              else
-                database[scope].push(value) unless database[scope].include?(value)
-              end
-            end
+          database[scope] ||= []
+          if value.kind_of?(Array)
+            database[scope] += value
+            database[scope].uniq!
           else
-            database[scope] ||= []
-            if value.class == Array
-              database[scope] += value
-              database[scope].uniq!
-            else
-              database[scope].push(value) unless database[scope].include?(value)
-            end
+            database[scope].push(value) unless database[scope].include?(value)
           end
         end
       end
@@ -63,15 +37,8 @@ module Kimurai
 
       def unique?(scope, value)
         @mutex.synchronize do
-          if path
-            database.transaction do
-              database[scope] ||= []
-              database[scope].include?(value) ? false : database[scope].push(value) and true
-            end
-          else
-            database[scope] ||= []
-            database[scope].include?(value) ? false : database[scope].push(value) and true
-          end
+          database[scope] ||= []
+          database[scope].include?(value) ? false : database[scope].push(value) and true
         end
       end
 
@@ -79,21 +46,7 @@ module Kimurai
 
       def clear!
         @mutex.synchronize do
-          if path
-            database.transaction do
-              database.roots.each { |key| database.delete key }
-            end
-          else
-            database = {}
-          end
-        end
-      end
-
-      def delete!
-        @mutex.synchronize do
-          if path
-            File.delete path if File.exists? path
-          end
+          @database = {}
         end
       end
     end
data/lib/kimurai/browser_builder/mechanize_builder.rb
CHANGED
@@ -38,11 +38,11 @@ module Kimurai
         proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
         ip, port, type = proxy_string.split(":")
 
-        if type == "socks5"
-          logger.error "BrowserBuilder (mechanize): can't set socks5 proxy (not supported), skipped"
-        else
+        if type == "http"
           @browser.driver.set_proxy(*proxy_string.split(":"))
-          logger.debug "BrowserBuilder (mechanize): enabled proxy, ip: #{ip}, port: #{port}"
+          logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}"
+        else
+          logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped"
         end
       end
 
data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb
CHANGED
@@ -64,8 +64,12 @@ module Kimurai
         proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
         ip, port, type = proxy_string.split(":")
 
-        @browser.driver.set_proxy(*proxy_string.split(":"))
-        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}"
+        if %w(http socks5).include?(type)
+          @browser.driver.set_proxy(*proxy_string.split(":"))
+          logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}"
+        else
+          logger.error "BrowserBuilder (poltergeist_phantomjs): wrong type of proxy: #{type}, skipped"
+        end
       end
 
       # Headers
data/lib/kimurai/browser_builder/selenium_chrome_builder.rb
CHANGED
@@ -44,11 +44,15 @@ module Kimurai
         proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
         ip, port, type, user, password = proxy_string.split(":")
 
-        if user.nil? && password.nil?
-          driver_options.args << "--proxy-server=#{type}://#{ip}:#{port}"
-          logger.debug "BrowserBuilder (selenium_chrome): enabled #{type} proxy, ip: #{ip}, port: #{port}"
+        if %w(http socks5).include?(type)
+          if user.nil? && password.nil?
+            driver_options.args << "--proxy-server=#{type}://#{ip}:#{port}"
+            logger.debug "BrowserBuilder (selenium_chrome): enabled #{type} proxy, ip: #{ip}, port: #{port}"
+          else
+            logger.error "BrowserBuilder (selenium_chrome): proxy with authentication doesn't supported by selenium, skipped"
+          end
         else
-          logger.error "BrowserBuilder (selenium_chrome): proxy with authentication doesn't supported by selenium, skipped"
+          logger.error "BrowserBuilder (selenium_chrome): wrong type of proxy: #{type}, skipped"
         end
       end
 
data/lib/kimurai/browser_builder/selenium_firefox_builder.rb
CHANGED
@@ -40,13 +40,18 @@ module Kimurai
           driver_options.profile["network.proxy.http_port"] = port.to_i
           driver_options.profile["network.proxy.ssl"] = ip
           driver_options.profile["network.proxy.ssl_port"] = port.to_i
+
+          logger.debug "BrowserBuilder (selenium_firefox): enabled http proxy, ip: #{ip}, port: #{port}"
         elsif type == "socks5"
           driver_options.profile["network.proxy.socks"] = ip
           driver_options.profile["network.proxy.socks_port"] = port.to_i
           driver_options.profile["network.proxy.socks_version"] = 5
           driver_options.profile["network.proxy.socks_remote_dns"] = true
+
+          logger.debug "BrowserBuilder (selenium_firefox): enabled socks5 proxy, ip: #{ip}, port: #{port}"
+        else
+          logger.error "BrowserBuilder (selenium_firefox): wrong type of proxy: #{type}, skipped"
         end
-        logger.debug "BrowserBuilder (selenium_firefox): enabled #{type} proxy, ip: #{ip}, port: #{port}"
       else
         logger.error "BrowserBuilder (selenium_firefox): proxy with authentication doesn't supported by selenium, skipped"
       end
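All four builders now validate the same colon-separated proxy string before applying it. A config sketch, assuming the `ip:port:type` format implied by the `split(":")` calls above (addresses are placeholders; the selenium builders additionally parse `:user:password` but reject authenticated proxies):

```ruby
@config = {
  # type must be "http" or "socks5", otherwise the builder logs an error and skips the proxy:
  proxy: "101.102.103.104:3128:http"
}

# A proc also works; it is called and stripped before parsing:
@config = {
  proxy: -> { "101.102.103.104:9050:socks5" }
}
```
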
data/lib/kimurai/capybara_ext/mechanize/driver.rb
CHANGED
@@ -10,6 +10,8 @@ class Capybara::Mechanize::Driver
     browser.agent.set_proxy(ip, port, user, password)
   end
 
+  ###
+
   def headers
     browser.agent.request_headers
   end
@@ -22,6 +24,12 @@ class Capybara::Mechanize::Driver
     browser.agent.request_headers[name] = value
   end
 
+  ###
+
+  def get_cookies
+    browser.agent.cookies
+  end
+
   def set_cookie(name, value, options = {})
     options[:name] ||= name
     options[:value] ||= value
@@ -30,10 +38,18 @@ class Capybara::Mechanize::Driver
     browser.agent.cookie_jar << cookie
   end
 
+  def set_cookies(cookies)
+    cookies.each do |cookie|
+      set_cookie(cookie[:name], cookie[:value], cookie)
+    end
+  end
+
   def clear_cookies
     browser.agent.cookie_jar.clear!
   end
 
+  ###
+
   def quit
     browser.agent.shutdown
   end
data/lib/kimurai/capybara_ext/selenium/driver.rb
CHANGED
@@ -1,6 +1,10 @@
 require_relative '../driver/base'
 
 class Capybara::Selenium::Driver
+  def get_cookies
+    browser.manage.all_cookies
+  end
+
   def set_cookie(name, value, options = {})
     options[:name] ||= name
     options[:value] ||= value
@@ -8,6 +12,12 @@ class Capybara::Selenium::Driver
     browser.manage.add_cookie(options)
   end
 
+  def set_cookies(cookies)
+    cookies.each do |cookie|
+      set_cookie(cookie[:name], cookie[:value], cookie)
+    end
+  end
+
   def clear_cookies
     browser.manage.delete_all_cookies
   end
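Both patched drivers now expose the same `#get_cookies`/`#set_cookies` pair, so an array of cookie hashes can be applied through the spider's Capybara session. A sketch (cookie names and values are illustrative):

```ruby
# `browser` is the spider's Capybara session, `browser.driver` the patched driver:
browser.driver.set_cookies([
  { name: "session_id", value: "abc123" },
  { name: "locale", value: "en" }
])

browser.driver.get_cookies   # => all cookies currently held by the driver
browser.driver.clear_cookies # remove them again
```
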
data/lib/kimurai/capybara_ext/session.rb
CHANGED
@@ -33,7 +33,7 @@ module Capybara
           sleep sleep_interval and retry
         else
           logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
-          raise e
+          raise e unless skip_error_on_failure?(e)
         end
       else
         raise e
@@ -127,26 +127,40 @@ module Capybara
 
     ###
 
+    def scroll_to_bottom
+      execute_script("window.scrollBy(0,10000)")
+    end
+
     private
 
+    def skip_error_on_failure?(e)
+      config.retry_request_errors.any? do |error|
+        error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.kind_of?(Hash)
+      end
+    end
+
     def match_error?(e, type:)
-      errors =
-
-
-
-
+      errors =
+        case type
+        when :to_retry then config.retry_request_errors
+        when :to_skip then config.skip_request_errors
+        end
+
+      errors.any? do |error|
+        if error.kind_of?(Hash)
+          match_class = e.class.ancestors.include?(error[:error])
+          if error[:message].present?
+            if error[:message].kind_of?(Regexp)
               e.message&.match?(error[:message])
             else
               e.message&.include?(error[:message])
-            end
-
-          e.class == error[:error] && match
+            end && match_class
           else
-
+            match_class
           end
+        else
+          e.class.ancestors.include?(error)
         end
-        else
-          false
       end
     end
 
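A sketch of the new `#scroll_to_bottom` helper on an infinite-scroll page; the XPath and delay are illustrative:

```ruby
def parse(response, url:, data: {})
  loop do
    before = browser.all(:xpath, "//div[@class='product']").count
    browser.scroll_to_bottom # runs window.scrollBy(0,10000) in the page
    sleep 2                  # give the page time to load the next batch
    break if browser.all(:xpath, "//div[@class='product']").count == before
  end
end
```
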
data/lib/kimurai/cli.rb
CHANGED
@@ -66,7 +66,6 @@ module Kimurai
     ###
 
     desc "crawl", "Run a particular spider by it's name"
-    option :continue, aliases: :c, type: :boolean, default: false, banner: "Continue previous crawling"
     def crawl(spider_name)
       raise "Can't find Kimurai project" unless inside_project?
       require './config/boot'
@@ -81,7 +80,7 @@ module Kimurai
         Kimurai.time_zone = time_zone
       end
 
-      klass.crawl!(continue: options["continue"])
+      klass.crawl!
     end
 
     desc "parse", "Parse url in the particular spider method"
@@ -129,6 +128,8 @@ module Kimurai
     end
 
     desc "runner", "Run all spiders in the project in queue"
+    option :include, type: :array, default: [], banner: "List of spiders to run"
+    option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
     option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
     def runner
       raise "Can't find Kimurai project" unless inside_project?
@@ -138,7 +139,7 @@ module Kimurai
 
       require './config/boot'
       require 'kimurai/runner'
-      Runner.new(jobs).run!
+
+      spiders = options["include"].presence || Kimurai.list.keys
+      spiders -= options["exclude"]
+
+      Runner.new(spiders, jobs).run!
     end
 
     desc "--version, -v", "Print the version"
data/lib/kimurai/runner.rb
CHANGED
@@ -4,9 +4,9 @@ module Kimurai
   class Runner
     attr_reader :jobs, :spiders, :session_info
 
-    def initialize(parallel_jobs)
+    def initialize(spiders, parallel_jobs)
       @jobs = parallel_jobs
-      @spiders = Kimurai.list
+      @spiders = spiders
       @start_time = Time.now
 
       @session_info = {
@@ -16,7 +16,7 @@ module Kimurai
         stop_time: nil,
         environment: Kimurai.env,
         concurrent_jobs: @jobs,
-        spiders: @spiders.keys
+        spiders: @spiders
       }
 
       if time_zone = Kimurai.configuration.time_zone
@@ -28,8 +28,6 @@ module Kimurai
     end
 
     def run!(exception_on_fail: true)
-      running_pids = []
-
       puts ">>> Runner: started: #{session_info}"
       if at_start_callback = Kimurai.configuration.runner_at_start_callback
         at_start_callback.call(session_info)
@@ -39,29 +37,20 @@ module Kimurai
       spiders.peach_with_index(jobs) do |spider, i|
         next unless running
 
-        spider_name = spider[0]
-        puts "> Runner: started spider: #{spider_name}, index: #{i}"
-
-        pid = spawn("bundle", "exec", "kimurai", "crawl", spider_name, [:out, :err] => "log/#{spider_name}.log")
-        running_pids << pid
+        puts "> Runner: started spider: #{spider}, index: #{i}"
+        pid = spawn("bundle", "exec", "kimurai", "crawl", spider, [:out, :err] => "log/#{spider}.log")
         Process.wait pid
 
-
-        puts "< Runner: stopped spider: #{spider_name}, index: #{i}"
+        puts "< Runner: stopped spider: #{spider}, index: #{i}"
       end
     rescue StandardError, SignalException, SystemExit => e
       running = false
+
       session_info.merge!(status: :failed, error: e.inspect, stop_time: Time.now)
       exception_on_fail ? raise(e) : [session_info, e]
     else
       session_info.merge!(status: :completed, stop_time: Time.now)
     ensure
-      running = false
-      Thread.list.each { |t| t.kill if t != Thread.main }
-
-      # Kill currently running spiders (if any, in case of fail)
-      running_pids.each { |pid| Process.kill("INT", pid) }
-
       if at_stop_callback = Kimurai.configuration.runner_at_stop_callback
         at_stop_callback.call(session_info)
       end
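With the new signature, Runner takes an explicit list of spider names plus the job count, as the CLI change above shows. A direct-usage sketch (spider names are illustrative):

```ruby
require 'kimurai/runner'

# Crawl two spiders, at most two processes at a time:
Kimurai::Runner.new(["custom_spider", "example_spider"], 2).run!
```
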
data/lib/kimurai/template/config/schedule.rb
CHANGED
@@ -16,7 +16,7 @@ set :chronic_options, hours24: true
 # crawl "google_spider.com", output: "log/google_spider.com.log"
 # end
 def local_to_utc(time_string, zone:)
-  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(
+  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
 end
 
 # Note: by default Whenever exports cron commands with :environment == "production".
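With the fixed helper, a Whenever entry can pass a zone-aware time, reusing the commented example above (the zone and schedule are illustrative):

```ruby
every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
  crawl "google_spider.com", output: "log/google_spider.com.log"
end
```
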
data/lib/kimurai/version.rb
CHANGED
@@ -1,3 +1,3 @@
 module Kimurai
-  VERSION = "1.2.0"
+  VERSION = "1.3.0"
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kimurai
 version: !ruby/object:Gem::Version
-  version: 1.2.0
+  version: 1.3.0
 platform: ruby
 authors:
 - Victor Afanasev
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-
+date: 2018-11-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor
|