kimurai 1.2.0 → 1.3.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +58 -99
- data/kimurai.gemspec +1 -1
- data/lib/kimurai/automation/setup.yml +3 -3
- data/lib/kimurai/base.rb +17 -19
- data/lib/kimurai/base/storage.rb +14 -61
- data/lib/kimurai/browser_builder/mechanize_builder.rb +4 -4
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +6 -2
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +8 -4
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +6 -1
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +16 -0
- data/lib/kimurai/capybara_ext/selenium/driver.rb +10 -0
- data/lib/kimurai/capybara_ext/session.rb +26 -12
- data/lib/kimurai/cli.rb +8 -3
- data/lib/kimurai/runner.rb +7 -18
- data/lib/kimurai/template/config/schedule.rb +1 -1
- data/lib/kimurai/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9fb75ab8965adb212964eee4b319642c3b83bbb4e3b957b739b6eba28b64546d
+  data.tar.gz: b03efc58a57222fdbb51e5ab6aa389f352de1f94fbd02358ccb613d7a15a5774
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 737f467deef60eb2983d86286c040d1b899732ff3e52fdab50ce07e8893070c0cd9f3f3020fe57e32ab1d9140299a9112db698e8d72112934b4fd90ee10a7644
+  data.tar.gz: de89bc1ef85f7f29986b7033d883a49c804592a21f78b409bf325f2e0e578dc78bbf67cdca4ca719f34e43f305dd99f8f227cbd8b545e42f06ca7f49d24e1e35
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,21 @@
 # CHANGELOG
+## 1.3.0
+### Breaking changes 1.3.0
+* Remove persistence database feature (because it's slow and makes things complicated)
+
+### New
+* Add `--include` and `--exclude` options to CLI#runner
+* Add Base `#create_browser` method to easily create additional browser instances
+* Add Capybara::Session `#scroll_to_bottom`
+* Add skip_on_failure feature to `retry_request_errors` config option
+* Add info about `add_event` method to the README
+
+### Fixes and improvements
+* Improve Runner
+* Fix time helper in schedule.rb
+* Add proxy validation to browser builders
+* Allow to pass different arguments to the `Base.parse` method
+
 ## 1.2.0
 ### New
 * Add possibility to add array of values to the storage (`Base::Storage#add`)
data/README.md
CHANGED
@@ -18,7 +18,7 @@
 
 <br>
 
-> Note: this readme is for `1.
+> Note: this readme is for `1.3.0` gem version. CHANGELOG [here](CHANGELOG.md).
 
 Kimurai is a modern web scraping framework written in Ruby which **works out of box with Headless Chromium/Firefox, PhantomJS**, or simple HTTP requests and **allows to scrape and interact with JavaScript rendered websites.**
 
@@ -241,10 +241,10 @@ I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scrol
 * [Skip duplicates](#skip-duplicates)
   * [Automatically skip all duplicated requests urls](#automatically-skip-all-duplicated-requests-urls)
   * [Storage object](#storage-object)
-  * [Persistence database for the storage](#persistence-database-for-the-storage)
 * [Handle request errors](#handle-request-errors)
   * [skip_request_errors](#skip_request_errors)
   * [retry_request_errors](#retry_request_errors)
+* [Logging custom events](#logging-custom-events)
 * [open_spider and close_spider callbacks](#open_spider-and-close_spider-callbacks)
 * [KIMURAI_ENV](#kimurai_env)
 * [Parallel crawling using in_parallel](#parallel-crawling-using-in_parallel)
@@ -297,8 +297,8 @@ echo 'export PATH="$HOME/.rbenv/plugins/ruby-build/bin:$PATH"' >> ~/.bashrc
 exec $SHELL
 
 # Install latest Ruby
-rbenv install 2.5.
-rbenv global 2.5.
+rbenv install 2.5.3
+rbenv global 2.5.3
 
 gem install bundler
 ```
@@ -317,8 +317,8 @@ echo 'if which rbenv > /dev/null; then eval "$(rbenv init -)"; fi' >> ~/.bash_pr
 source ~/.bash_profile
 
 # Install latest Ruby
-rbenv install 2.5.
-rbenv global 2.5.
+rbenv install 2.5.3
+rbenv global 2.5.3
 
 gem install bundler
 ```
@@ -349,17 +349,17 @@ sudo apt install -q -y xvfb
 # Install chromium-browser and firefox
 sudo apt install -q -y chromium-browser firefox
 
-# Instal chromedriver (2.
+# Instal chromedriver (2.44 version)
 # All versions located here https://sites.google.com/a/chromium.org/chromedriver/downloads
-cd /tmp && wget https://chromedriver.storage.googleapis.com/2.
+cd /tmp && wget https://chromedriver.storage.googleapis.com/2.44/chromedriver_linux64.zip
 sudo unzip chromedriver_linux64.zip -d /usr/local/bin
 rm -f chromedriver_linux64.zip
 
-# Install geckodriver (0.
+# Install geckodriver (0.23.0 version)
 # All versions located here https://github.com/mozilla/geckodriver/releases/
-cd /tmp && wget https://github.com/mozilla/geckodriver/releases/download/v0.
-sudo tar -xvzf geckodriver-v0.
-rm -f geckodriver-v0.
+cd /tmp && wget https://github.com/mozilla/geckodriver/releases/download/v0.23.0/geckodriver-v0.23.0-linux64.tar.gz
+sudo tar -xvzf geckodriver-v0.23.0-linux64.tar.gz -C /usr/local/bin
+rm -f geckodriver-v0.23.0-linux64.tar.gz
 
 # Install PhantomJS (2.1.1)
 # All versions located here http://phantomjs.org/download.html
@@ -815,113 +815,60 @@ It is possible to automatically skip all already visited urls while calling `request_to`
 * `#all` - display storage hash where keys are existing scopes.
 * `#include?(scope, value)` - return `true` if value in the scope exists, and `false` if not
 * `#add(scope, value)` - add value to the scope
-* 
-* 
+* `#unique?(scope, value)` - method already described above, will return `false` if value in the scope exists, or return `true` + add value to the scope if value in the scope not exists.
+* `#clear!` - reset the whole storage by deleting all values from all scopes.
 
-#### Persistence database for the storage
 
-
-
-Kimurai can use persistence database for a `storage` using Ruby built-in [PStore](https://ruby-doc.org/stdlib-2.5.1/libdoc/pstore/rdoc/PStore.html) database. With this option, you can automatically skip already visited urls in the next run _if previous run was failed_, otherwise _(if run was successful)_ storage database will be removed before spider stops.
-
-Also, with persistence storage enabled, [save_to](#save_to-helper) method will keep adding items to an existing file (it will not be cleared before each run).
+### Handle request errors
+It is quite common that some pages of crawling website can return different response code than `200 ok`. In such cases, method `request_to` (or `browser.visit`) can raise an exception. Kimurai provides `skip_request_errors` and `retry_request_errors` [config](#spider-config) options to handle such errors:
 
-
+#### skip_request_errors
+You can automatically skip some of errors while requesting a page using `skip_request_errors` [config](#spider-config) option. If raised error matches one of the errors in the list, then this error will be caught, and request will be skipped. It is a good idea to skip errors like NotFound(404), etc.
 
-
+Format for the option: array where elements are error classes or/and hashes. You can use _hash_ format for more flexibility:
 
-```
-
-
+```
+@config = {
+  skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }]
+}
+```
+In this case, provided `message:` will be compared with a full error message using `String#include?`. Also you can use regex instead: `{ error: RuntimeError, message: /404|403/ }`.
 
-
-
-      request_to :parse_category, url: category[:href]
-    end
-  end
+#### retry_request_errors
+You can automatically retry some of errors with a few attempts while requesting a page using `retry_request_errors` [config](#spider-config) option. If raised error matches one of the errors in the list, then this error will be caught and the request will be processed again within a delay.
 
-
-    response.xpath("//products/path").each do |product|
-      # check if product url already contains in the scope `:product_urls`, if so, skip the request:
-      next if storage.contains?(:product_urls, product[:href])
-      # Otherwise process it:
-      request_to :parse_product, url: product[:href]
-    end
-  end
+There are 3 attempts: first: delay _15 sec_, second: delay _30 sec_, third: delay _45 sec_. If after 3 attempts there is still an exception, then the exception will be raised. It is a good idea to try to retry errros like `ReadTimeout`, `HTTPBadGateway`, etc.
 
-
-    # Add visited item to the storage:
-    storage.add(:product_urls, url)
+Format for the option: same like for `skip_request_errors` option.
 
-
-  end
-end
+If you would like to skip (not raise) error after all retries gone, you can specify `skip_on_failure: true` option:
 
-
-
+```ruby
+@config = {
+  retry_request_errors: [{ error: RuntimeError, skip_on_failure: true }]
+}
 ```
 
-
+### Logging custom events
 
-
-<summary>Check the code</summary>
+It is possible to save custom messages to the [run_info](#open_spider-and-close_spider-callbacks) hash using `add_event('Some message')` method. This feature helps you to keep track on important things which happened during crawling without checking the whole spider log (in case if you're logging these messages using `logger`). Example:
 
 ```ruby
-
-
-
-    # Configure skip_duplicate_requests option:
-    skip_duplicate_requests: { scope: :product_urls, check_only: true }
-  }
-
-  def parse(response, url:, data: {})
-    response.xpath("//categories/path").each do |category|
-      request_to :parse_category, url: category[:href]
-    end
+def parse_product(response, url:, data: {})
+  unless response.at_xpath("//path/to/add_to_card_button")
+    add_event("Product is sold") and return
   end
 
-
-    response.xpath("//products/path").each do |product|
-      # Before visiting the url, `request_to` will check if it already contains
-      # in the storage scope `:product_urls`, if so, request will be skipped:
-      request_to :parse_product, url: product[:href]
-    end
-  end
-
-  def parse_product(response, url:, data: {})
-    # Add visited item url to the storage scope `:product_urls`:
-    storage.add(:product_urls, url)
-
-    # ...
-  end
+  # ...
 end
-
-# Run the spider with persistence database option:
-ProductsSpider.crawl!(continue: true)
 ```
-</details>
-
-### Handle request errors
-It is quite common that some pages of crawling website can return different response code than `200 ok`. In such cases, method `request_to` (or `browser.visit`) can raise an exception. Kimurai provides `skip_request_errors` and `retry_request_errors` [config](#spider-config) options to handle such errors:
-
-#### skip_request_errors
-You can automatically skip some of errors while requesting a page using `skip_request_errors` [config](#spider-config) option. If raised error matches one of the errors in the list, then this error will be caught, and request will be skipped. It is a good idea to skip errors like NotFound(404), etc.
 
-Format for the option: array where elements are error classes or/and hashes. You can use _hash_ format for more flexibility:
-
-```ruby
-@config = {
-  skip_request_errors: [{ error: "RuntimeError", message: "404 => Net::HTTPNotFound" }]
-}
 ```
-
-
-
-
-
-There are 3 attempts: first: delay _15 sec_, second: delay _30 sec_, third: delay _45 sec_. If after 3 attempts there is still an exception, then the exception will be raised. It is a good idea to try to retry errros like `ReadTimeout`, `HTTPBadGateway`, etc.
-
-Format for the option: same like for `skip_request_errors` option.
+...
+I, [2018-11-28 22:20:19 +0400#7402] [M: 47156576560640] INFO -- example_spider: Spider: new event (scope: custom): Product is sold
+...
+I, [2018-11-28 22:20:19 +0400#7402] [M: 47156576560640] INFO -- example_spider: Spider: stopped: {:events=>{:custom=>{"Product is sold"=>1}}}
+```
 
 ### `open_spider` and `close_spider` callbacks
 
@@ -1296,7 +1243,7 @@ set :chronic_options, hours24: true
 # crawl "google_spider.com", output: "log/google_spider.com.log"
 # end
 def local_to_utc(time_string, zone:)
-  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(
+  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
 end
 
 # Note: by default Whenever exports cron commands with :environment == "production".
@@ -1483,6 +1430,8 @@ You can automatically setup [required environment](#installation) for Kimurai on
 
 > To perform remote server setup, [Ansible](https://github.com/ansible/ansible) is required **on the desktop** machine (to install: Ubuntu: `$ sudo apt install ansible`, Mac OS X: `$ brew install ansible`)
 
+> It's recommended to use regular user to setup the server, not `root`. To create a new user, login to the server `$ ssh root@your_server_ip`, type `$ adduser username` to create a user, and `$ gpasswd -a username sudo` to add new user to a sudo group.
+
 Example:
 
 ```bash
@@ -2081,6 +2030,16 @@ $ bundle exec kimurai runner -j 3
 
 Each spider runs in a separate process. Spiders logs available at `log/` folder. Pass `-j` option to specify how many spiders should be processed at the same time (default is 1).
 
+You can provide additional arguments like `--include` or `--exclude` to specify which spiders to run:
+
+```bash
+# Run only custom_spider and example_spider:
+$ bundle exec kimurai runner --include custom_spider example_spider
+
+# Run all except github_spider:
+$ bundle exec kimurai runner --exclude github_spider
+```
+
 #### Runner callbacks
 
 You can perform custom actions before runner starts and after runner stops using `config.runner_at_start_callback` and `config.runner_at_stop_callback`. Check [config/application.rb](lib/kimurai/template/config/application.rb) to see example.
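A minimal spider sketch tying the 1.3.0 README options above together (the class name, URL and XPath are hypothetical; the option and method names come from the diff itself):

```ruby
require 'kimurai'

class ExampleSpider < Kimurai::Base
  @name = "example_spider"
  @engine = :mechanize
  @start_urls = ["https://example.com/"]
  @config = {
    # Skip 404s instead of failing the run:
    skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }],
    # Retry flaky errors with 3 attempts, then skip rather than raise:
    retry_request_errors: [{ error: RuntimeError, skip_on_failure: true }]
  }

  def parse(response, url:, data: {})
    # Custom events end up in run_info[:events][:custom] and in the spider log:
    add_event("Parsed start page") if response.at_xpath("//body")
  end
end

ExampleSpider.crawl!
```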
data/kimurai.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
     `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
   end
   spec.bindir = "exe"
-  spec.executables =
+  spec.executables = "kimurai"
   spec.require_paths = ["lib"]
   spec.required_ruby_version = ">= 2.5.0"
data/lib/kimurai/automation/setup.yml
CHANGED
@@ -1,16 +1,16 @@
 ---
 - hosts: all
   vars:
-    ruby: 2.5.
+    ruby: 2.5.3
     rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
     rbenv_shims_path: "{{ rbenv_root_path }}/shims"
     ruby_versions_path: "{{ rbenv_root_path }}/versions"
     # check latest here http://phantomjs.org/download.html
     phantomjs: 2.1.1
     # check latest here https://github.com/mozilla/geckodriver/releases/
-    geckodriver: 0.
+    geckodriver: 0.23.0
     # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
-    chromedriver: 2.
+    chromedriver: 2.44
 
   tasks:
     - name: Update apt cache
data/lib/kimurai/base.rb
CHANGED
@@ -97,15 +97,10 @@ module Kimurai
     end
   end
 
-  def self.crawl!(
+  def self.crawl!(exception_on_fail: true)
     logger.error "Spider: already running: #{name}" and return false if running?
 
-
-    if continue
-      Dir.exists?("tmp") ? "tmp/#{name}.pstore" : "#{name}.pstore"
-    end
-
-    @storage = Storage.new(storage_path)
+    @storage = Storage.new
     @savers = {}
     @update_mutex = Mutex.new
 
@@ -149,15 +144,6 @@ module Kimurai
 
     close_spider if self.respond_to? :close_spider
 
-    if @storage.path
-      if completed?
-        @storage.delete!
-        logger.info "Spider: storage: persistence database #{@storage.path} was removed (successful run)"
-      else
-        logger.info "Spider: storage: persistence database #{@storage.path} wasn't removed (failed run)"
-      end
-    end
-
     message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
     failed? ? logger.fatal(message) : logger.info(message)
 
@@ -165,9 +151,16 @@ module Kimurai
     end
   end
 
-  def self.parse!(handler,
-    spider =
-
+  def self.parse!(handler, *args, **request)
+    spider = self.new
+
+    if args.present?
+      spider.public_send(handler, *args)
+    elsif request.present?
+      spider.request_to(handler, request)
+    else
+      spider.public_send(handler)
+    end
   ensure
     spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
   end
@@ -244,12 +237,17 @@ module Kimurai
     end
 
     self.class.add_event(scope, event)
+    logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
   end
 
   ###
 
   private
 
+  def create_browser(engine, config = {})
+    Kimurai::BrowserBuilder.build(engine, config, spider: self)
+  end
+
   def unique_request?(url)
     options = @config[:skip_duplicate_requests]
     if options.class == Hash
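A sketch of how the reworked `Base.parse!` dispatch and the new private `#create_browser` helper can be used (`ExampleSpider`, `:parse_product` and `:print_stats` are hypothetical names):

```ruby
# Keyword arguments are treated as a request and routed through #request_to:
ExampleSpider.parse!(:parse_product, url: "https://example.com/product/1")

# Plain positional arguments are passed straight to the handler:
ExampleSpider.parse!(:print_stats, 2018, 11)

# Inside a spider, create_browser builds an extra, independent browser:
class ExampleSpider < Kimurai::Base
  @name = "example_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://example.com/"]

  def parse(response, url:, data: {})
    plain = create_browser(:mechanize)  # a second engine just for plain HTTP requests
    plain.visit("https://example.com/robots.txt")
  end
end
```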
data/lib/kimurai/base/storage.rb
CHANGED
@@ -1,60 +1,34 @@
-require 'pstore'
-
 module Kimurai
   class Base
     class Storage
-      attr_reader :database
+      attr_reader :database
 
-      def initialize
-        @path = path
+      def initialize
         @mutex = Mutex.new
-        @database =
+        @database = {}
       end
 
       def all(scope = nil)
         @mutex.synchronize do
-
-          database.transaction { scope ? database.fetch(scope, []) : database }
-        else
-          scope ? database.fetch(scope, []) : database
-        end
+          scope ? database.fetch(scope, []) : database
         end
       end
 
       def include?(scope, value)
         @mutex.synchronize do
-
-
-          database[scope] ||= []
-          database[scope].include?(value)
-          end
-        else
-          database[scope] ||= []
-          database[scope].include?(value)
-        end
+          database[scope] ||= []
+          database[scope].include?(value)
         end
       end
 
       def add(scope, value)
         @mutex.synchronize do
-
-
-
-
-          database[scope] += value
-          database[scope].uniq!
-          else
-          database[scope].push(value) unless database[scope].include?(value)
-          end
-        end
+          database[scope] ||= []
+          if value.kind_of?(Array)
+            database[scope] += value
+            database[scope].uniq!
           else
-          database[scope]
-          if value.class == Array
-            database[scope] += value
-            database[scope].uniq!
-          else
-            database[scope].push(value) unless database[scope].include?(value)
-          end
+            database[scope].push(value) unless database[scope].include?(value)
           end
         end
       end
@@ -63,15 +37,8 @@ module Kimurai
 
      def unique?(scope, value)
        @mutex.synchronize do
-
-
-          database[scope] ||= []
-          database[scope].include?(value) ? false : database[scope].push(value) and true
-          end
-        else
-          database[scope] ||= []
-          database[scope].include?(value) ? false : database[scope].push(value) and true
-        end
+          database[scope] ||= []
+          database[scope].include?(value) ? false : database[scope].push(value) and true
        end
      end
 
@@ -79,21 +46,7 @@ module Kimurai
 
      def clear!
        @mutex.synchronize do
-
-          database.transaction do
-            database.roots.each { |key| database.delete key }
-          end
-        else
-          database = {}
-        end
-        end
-      end
-
-      def delete!
-        @mutex.synchronize do
-          if path
-            File.delete path if File.exists? path
-          end
+          @database = {}
        end
      end
    end
data/lib/kimurai/browser_builder/mechanize_builder.rb
CHANGED
@@ -38,11 +38,11 @@ module Kimurai
       proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
       ip, port, type = proxy_string.split(":")
 
-      if type == "
-        logger.error "BrowserBuilder (mechanize): can't set socks5 proxy (not supported), skipped"
-      else
+      if type == "http"
         @browser.driver.set_proxy(*proxy_string.split(":"))
-        logger.debug "BrowserBuilder (mechanize): enabled
+        logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}"
+      else
+        logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped"
       end
     end
 
data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb
CHANGED
@@ -64,8 +64,12 @@ module Kimurai
       proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
       ip, port, type = proxy_string.split(":")
 
-
-
+      if %w(http socks5).include?(type)
+        @browser.driver.set_proxy(*proxy_string.split(":"))
+        logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}"
+      else
+        logger.error "BrowserBuilder (poltergeist_phantomjs): wrong type of proxy: #{type}, skipped"
+      end
     end
 
     # Headers
data/lib/kimurai/browser_builder/selenium_chrome_builder.rb
CHANGED
@@ -44,11 +44,15 @@ module Kimurai
       proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
       ip, port, type, user, password = proxy_string.split(":")
 
-      if
-
-
+      if %w(http socks5).include?(type)
+        if user.nil? && password.nil?
+          driver_options.args << "--proxy-server=#{type}://#{ip}:#{port}"
+          logger.debug "BrowserBuilder (selenium_chrome): enabled #{type} proxy, ip: #{ip}, port: #{port}"
+        else
+          logger.error "BrowserBuilder (selenium_chrome): proxy with authentication doesn't supported by selenium, skipped"
+        end
       else
-        logger.error "BrowserBuilder (selenium_chrome):
+        logger.error "BrowserBuilder (selenium_chrome): wrong type of proxy: #{type}, skipped"
       end
     end
 
data/lib/kimurai/browser_builder/selenium_firefox_builder.rb
CHANGED
@@ -40,13 +40,18 @@ module Kimurai
         driver_options.profile["network.proxy.http_port"] = port.to_i
         driver_options.profile["network.proxy.ssl"] = ip
         driver_options.profile["network.proxy.ssl_port"] = port.to_i
+
+        logger.debug "BrowserBuilder (selenium_firefox): enabled http proxy, ip: #{ip}, port: #{port}"
       elsif type == "socks5"
         driver_options.profile["network.proxy.socks"] = ip
         driver_options.profile["network.proxy.socks_port"] = port.to_i
         driver_options.profile["network.proxy.socks_version"] = 5
         driver_options.profile["network.proxy.socks_remote_dns"] = true
+
+        logger.debug "BrowserBuilder (selenium_firefox): enabled socks5 proxy, ip: #{ip}, port: #{port}"
+      else
+        logger.error "BrowserBuilder (selenium_firefox): wrong type of proxy: #{type}, skipped"
       end
-      logger.debug "BrowserBuilder (selenium_firefox): enabled #{type} proxy, ip: #{ip}, port: #{port}"
     else
       logger.error "BrowserBuilder (selenium_firefox): proxy with authentication doesn't supported by selenium, skipped"
     end
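All four builders now validate the same proxy-string convention, `ip:port:type` with optional `:user:password` (values below are illustrative). A sketch of configs that pass or fail the new validation:

```ruby
@config = {
  # Accepted by every engine:
  proxy: "3.4.5.6:3128:http",

  # Rejected by mechanize ("can't set socks5 proxy (not supported), skipped"):
  # proxy: "3.4.5.6:3128:socks5",

  # Rejected by both selenium builders ("proxy with authentication doesn't supported by selenium"):
  # proxy: "3.4.5.6:3128:http:user:password",

  # Logged as "wrong type of proxy" by the validating builders:
  # proxy: "3.4.5.6:3128:ftp"
}
```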
data/lib/kimurai/capybara_ext/mechanize/driver.rb
CHANGED
@@ -10,6 +10,8 @@ class Capybara::Mechanize::Driver
     browser.agent.set_proxy(ip, port, user, password)
   end
 
+  ###
+
   def headers
     browser.agent.request_headers
   end
@@ -22,6 +24,12 @@ class Capybara::Mechanize::Driver
     browser.agent.request_headers[name] = value
   end
 
+  ###
+
+  def get_cookies
+    browser.agent.cookies
+  end
+
   def set_cookie(name, value, options = {})
     options[:name] ||= name
     options[:value] ||= value
@@ -30,10 +38,18 @@ class Capybara::Mechanize::Driver
     browser.agent.cookie_jar << cookie
   end
 
+  def set_cookies(cookies)
+    cookies.each do |cookie|
+      set_cookie(cookie[:name], cookie[:value], cookie)
+    end
+  end
+
   def clear_cookies
     browser.agent.cookie_jar.clear!
   end
 
+  ###
+
   def quit
     browser.agent.shutdown
   end
data/lib/kimurai/capybara_ext/selenium/driver.rb
CHANGED
@@ -1,6 +1,10 @@
 require_relative '../driver/base'
 
 class Capybara::Selenium::Driver
+  def get_cookies
+    browser.manage.all_cookies
+  end
+
   def set_cookie(name, value, options = {})
     options[:name] ||= name
     options[:value] ||= value
@@ -8,6 +12,12 @@ class Capybara::Selenium::Driver
     browser.manage.add_cookie(options)
   end
 
+  def set_cookies(cookies)
+    cookies.each do |cookie|
+      set_cookie(cookie[:name], cookie[:value], cookie)
+    end
+  end
+
   def clear_cookies
     browser.manage.delete_all_cookies
   end
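With `#get_cookies`/`#set_cookies` on both drivers, a session can be exported from one browser and replayed in another. A sketch inside a spider method (assumes the selenium driver, where cookies come back as plain hashes with `:name`/`:value` keys):

```ruby
def parse(response, url:, data: {})
  cookies = browser.driver.get_cookies     # export the current session

  other = create_browser(:selenium_chrome) # new Base#create_browser helper
  other.visit(url)                         # selenium cookies need a matching domain
  other.driver.set_cookies(cookies)        # replays each via set_cookie
end
```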
data/lib/kimurai/capybara_ext/session.rb
CHANGED
@@ -33,7 +33,7 @@ module Capybara
           sleep sleep_interval and retry
         else
           logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
-          raise e
+          raise e unless skip_error_on_failure?(e)
         end
       else
         raise e
@@ -127,26 +127,40 @@ module Capybara
 
     ###
 
+    def scroll_to_bottom
+      execute_script("window.scrollBy(0,10000)")
+    end
+
     private
 
+    def skip_error_on_failure?(e)
+      config.retry_request_errors.any? do |error|
+        error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.kind_of?(Hash)
+      end
+    end
+
     def match_error?(e, type:)
-      errors =
-
-
-
-
+      errors =
+        case type
+        when :to_retry then config.retry_request_errors
+        when :to_skip then config.skip_request_errors
+        end
+
+      errors.any? do |error|
+        if error.kind_of?(Hash)
+          match_class = e.class.ancestors.include?(error[:error])
+          if error[:message].present?
+            if error[:message].kind_of?(Regexp)
               e.message&.match?(error[:message])
             else
               e.message&.include?(error[:message])
-            end
-
-        e.class == error[:error] && match
+            end && match_class
           else
-
+            match_class
           end
+        else
+          e.class.ancestors.include?(error)
         end
-      else
-        false
       end
     end
 
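The new `#scroll_to_bottom` helper pairs naturally with infinite-scroll pages. A sketch of a spider method using it (the loop count and sleep are illustrative):

```ruby
def parse(response, url:, data: {})
  10.times do
    browser.scroll_to_bottom            # runs window.scrollBy(0,10000) via JS
    sleep 2                             # give lazy-loaded content time to render
  end

  response = browser.current_response   # re-parse the page after scrolling
  response.xpath("//div[@class='item']").size
end
```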
data/lib/kimurai/cli.rb
CHANGED
@@ -66,7 +66,6 @@ module Kimurai
     ###
 
     desc "crawl", "Run a particular spider by it's name"
-    option :continue, aliases: :c, type: :boolean, default: false, banner: "Continue previous crawling"
     def crawl(spider_name)
       raise "Can't find Kimurai project" unless inside_project?
       require './config/boot'
@@ -81,7 +80,7 @@ module Kimurai
         Kimurai.time_zone = time_zone
       end
 
-      klass.crawl!
+      klass.crawl!
     end
 
     desc "parse", "Parse url in the particular spider method"
@@ -129,6 +128,8 @@ module Kimurai
     end
 
     desc "runner", "Run all spiders in the project in queue"
+    option :include, type: :array, default: [], banner: "List of spiders to run"
+    option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
     option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
     def runner
       raise "Can't find Kimurai project" unless inside_project?
@@ -138,7 +139,11 @@ module Kimurai
 
       require './config/boot'
       require 'kimurai/runner'
-
+
+      spiders = options["include"].presence || Kimurai.list.keys
+      spiders -= options["exclude"]
+
+      Runner.new(spiders, jobs).run!
     end
 
     desc "--version, -v", "Print the version"
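The include/exclude filtering reduces to plain array arithmetic over the project's spider list. A standalone sketch (spider names are hypothetical; the real code uses `Kimurai.list.keys` and ActiveSupport's `presence`):

```ruby
all_spiders = ["github_spider", "custom_spider", "example_spider"]
include_opt = ["custom_spider", "example_spider"]
exclude_opt = ["github_spider"]

spiders = include_opt.empty? ? all_spiders : include_opt
spiders -= exclude_opt
# => ["custom_spider", "example_spider"]
```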
data/lib/kimurai/runner.rb
CHANGED
@@ -4,9 +4,9 @@ module Kimurai
   class Runner
     attr_reader :jobs, :spiders, :session_info
 
-    def initialize(parallel_jobs
+    def initialize(spiders, parallel_jobs)
       @jobs = parallel_jobs
-      @spiders =
+      @spiders = spiders
       @start_time = Time.now
 
       @session_info = {
@@ -16,7 +16,7 @@ module Kimurai
         stop_time: nil,
         environment: Kimurai.env,
         concurrent_jobs: @jobs,
-        spiders: @spiders
+        spiders: @spiders
       }
 
       if time_zone = Kimurai.configuration.time_zone
@@ -28,8 +28,6 @@ module Kimurai
     end
 
     def run!(exception_on_fail: true)
-      running_pids = []
-
       puts ">>> Runner: started: #{session_info}"
       if at_start_callback = Kimurai.configuration.runner_at_start_callback
         at_start_callback.call(session_info)
@@ -39,29 +37,20 @@ module Kimurai
       spiders.peach_with_index(jobs) do |spider, i|
         next unless running
 
-
-
-
-        pid = spawn("bundle", "exec", "kimurai", "crawl", spider_name, [:out, :err] => "log/#{spider_name}.log")
-        running_pids << pid
+        puts "> Runner: started spider: #{spider}, index: #{i}"
+        pid = spawn("bundle", "exec", "kimurai", "crawl", spider, [:out, :err] => "log/#{spider}.log")
         Process.wait pid
 
-
-        puts "< Runner: stopped spider: #{spider_name}, index: #{i}"
+        puts "< Runner: stopped spider: #{spider}, index: #{i}"
       end
     rescue StandardError, SignalException, SystemExit => e
       running = false
+
       session_info.merge!(status: :failed, error: e.inspect, stop_time: Time.now)
       exception_on_fail ? raise(e) : [session_info, e]
     else
       session_info.merge!(status: :completed, stop_time: Time.now)
     ensure
-      running = false
-      Thread.list.each { |t| t.kill if t != Thread.main }
-
-      # Kill currently running spiders (if any, in case of fail)
-      running_pids.each { |pid| Process.kill("INT", pid) }
-
       if at_stop_callback = Kimurai.configuration.runner_at_stop_callback
         at_stop_callback.call(session_info)
       end
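Runner's constructor now takes the spider list explicitly instead of discovering it internally. A usage sketch (spider names are illustrative; assumes the project's `config/boot` is already loaded):

```ruby
require 'kimurai/runner'

spiders = ["custom_spider", "example_spider"] # names as registered in Kimurai.list
Kimurai::Runner.new(spiders, 2).run!          # run with 2 concurrent jobs
```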
data/lib/kimurai/template/config/schedule.rb
CHANGED
@@ -16,7 +16,7 @@ set :chronic_options, hours24: true
 # crawl "google_spider.com", output: "log/google_spider.com.log"
 # end
 def local_to_utc(time_string, zone:)
-  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(
+  TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
 end
 
 # Note: by default Whenever exports cron commands with :environment == "production".
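The fix makes the helper actually parse the string it receives (the old line dropped the `time_string` argument). A usage sketch inside schedule.rb, following the commented `crawl` example above (time and zone values are illustrative):

```ruby
every :day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
  crawl "example_spider", output: "log/example_spider.log"
end
```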
data/lib/kimurai/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kimurai
 version: !ruby/object:Gem::Version
-  version: 1.
+  version: 1.3.0
 platform: ruby
 authors:
 - Victor Afanasev
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-
+date: 2018-11-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: thor