kimurai 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +56 -1
- data/README.md +183 -69
- data/kimurai.gemspec +1 -1
- data/lib/kimurai/base.rb +96 -36
- data/lib/kimurai/base/{simple_saver.rb → saver.rb} +25 -17
- data/lib/kimurai/base/storage.rb +91 -0
- data/lib/kimurai/browser_builder.rb +6 -0
- data/lib/kimurai/browser_builder/mechanize_builder.rb +22 -18
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +25 -20
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +21 -23
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +22 -18
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +1 -1
- data/lib/kimurai/capybara_ext/session.rb +47 -7
- data/lib/kimurai/cli.rb +2 -1
- data/lib/kimurai/pipeline.rb +6 -2
- data/lib/kimurai/template/Gemfile +8 -0
- data/lib/kimurai/template/spiders/application_spider.rb +50 -35
- data/lib/kimurai/version.rb +1 -1
- metadata +5 -5
- data/lib/kimurai/base/uniq_checker.rb +0 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 67ee49692e64813bc980eb7562b711d5b5d2c47b50a995acb4759709703da0f9
+  data.tar.gz: baba361bc5039d303ae4a6c9a1dd2109368f8e4c7a641d0a782cfc6a7776ade4
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0173d3859b5f8776371fad454ff5575fdc453aa3c6038d8a8399651c46c5eaae789273772227ea014b6ce39b13586e6805bd7f69156eafeacf653804f954003c
+  data.tar.gz: b05889c0cb030aed06fe1df5cc5411154d24019667e1f00f9f4248d598fc93990f86a4aae78430af3140f3dc3989e856cfc3e2316f455984c898442fccad15db
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,60 @@
 # CHANGELOG
-##
+## 1.1.0
+### Breaking changes 1.1.0
+The `browser` config option is deprecated. All sub-options that used to sit inside `browser` should now be placed directly in the `@config` hash, without the `browser` parent key. Example:
+
+```ruby
+# Was:
+@config = {
+  browser: {
+    retry_request_errors: [Net::ReadTimeout],
+    restart_if: {
+      memory_limit: 350_000,
+      requests_limit: 100
+    },
+    before_request: {
+      change_proxy: true,
+      change_user_agent: true,
+      clear_cookies: true,
+      clear_and_set_cookies: true,
+      delay: 1..3
+    }
+  }
+}
+
+# Now:
+@config = {
+  retry_request_errors: [Net::ReadTimeout],
+  restart_if: {
+    memory_limit: 350_000,
+    requests_limit: 100
+  },
+  before_request: {
+    change_proxy: true,
+    change_user_agent: true,
+    clear_cookies: true,
+    clear_and_set_cookies: true,
+    delay: 1..3
+  }
+}
+```
+
+### New
+* Add `storage` object with additional methods and a persistence database feature
+* Add events feature to `run_info`
+* Add `skip_duplicate_requests` config option to automatically skip already visited urls when using `request_to`
+* Add `extensions` config option to allow injecting JS code into the browser (supported only by the poltergeist_phantomjs engine)
+* Add `Capybara::Session#within_new_window_by` method
+
+### Improvements
+* Add the last backtrace line to the pipeline output when an item is dropped
+* Do not destroy the driver if it doesn't exist (for the `Base.parse!` method)
+* Handle a possible `Net::ReadTimeout` error while trying to `#quit` the driver
+
+### Fixes
+* Fix `Mechanize::Driver#proxy` (there was a bug when using a proxy with the mechanize engine without authorization)
+* Fix request retries logic
+
 ## 1.0.1
 * Add missing `logger` method to pipeline
data/README.md
CHANGED
@@ -1,9 +1,9 @@
 <div align="center">
-  <a href="https://github.com/
+  <a href="https://github.com/vifreefly/kimuraframework">
     <img width="312" height="200" src="https://hsto.org/webt/_v/mt/tp/_vmttpbpzbt-y2aook642d9wpz0.png">
   </a>
 
-  <h1>
+  <h1>Kimurai Scraping Framework</h1>
 </div>
 
 > **Note about v1.0.0 version:**
@@ -18,6 +18,8 @@
 
 <br>
 
+> Note: this readme is for `1.1.0` gem version. CHANGELOG [here](CHANGELOG.md).
+
 Kimurai is a modern web scraping framework written in Ruby which **works out of box with Headless Chromium/Firefox, PhantomJS**, or simple HTTP requests and **allows to scrape and interact with JavaScript rendered websites.**
 
 Kimurai based on well-known [Capybara](https://github.com/teamcapybara/capybara) and [Nokogiri](https://github.com/sparklemotion/nokogiri) gems, so you don't have to learn anything new. Lets see:
@@ -32,9 +34,7 @@ class GithubSpider < Kimurai::Base
   @start_urls = ["https://github.com/search?q=Ruby%20Web%20Scraping"]
   @config = {
     user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36",
-
-      before_request: { delay: 4..7 }
-    }
+    before_request: { delay: 4..7 }
   }
 
   def parse(response, url:, data: {})
@@ -238,7 +238,10 @@ I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scrol
 * [browser object](#browser-object)
 * [request_to method](#request_to-method)
 * [save_to helper](#save_to-helper)
-* [Skip duplicates
+* [Skip duplicates](#skip-duplicates)
+  * [Automatically skip all duplicated requests urls](#automatically-skip-all-duplicated-requests-urls)
+* [Storage object](#storage-object)
+  * [Persistence database for the storage](#persistence-database-for-the-storage)
 * [open_spider and close_spider callbacks](#open_spider-and-close_spider-callbacks)
 * [KIMURAI_ENV](#kimurai_env)
 * [Parallel crawling using in_parallel](#parallel-crawling-using-in_parallel)
@@ -451,19 +454,19 @@ brew install mongodb
 Before you get to know all Kimurai features, there is `$ kimurai console` command which is an interactive console where you can try and debug your scraping code very quickly, without having to run any spider (yes, it's like [Scrapy shell](https://doc.scrapy.org/en/latest/topics/shell.html#topics-shell)).
 
 ```bash
-$ kimurai console --engine selenium_chrome --url https://github.com/
+$ kimurai console --engine selenium_chrome --url https://github.com/vifreefly/kimuraframework
 ```
 
 <details/>
 <summary>Show output</summary>
 
 ```
-$ kimurai console --engine selenium_chrome --url https://github.com/
+$ kimurai console --engine selenium_chrome --url https://github.com/vifreefly/kimuraframework
 
 D, [2018-08-22 13:42:32 +0400#26079] [M: 47461994677760] DEBUG -- : BrowserBuilder (selenium_chrome): created browser instance
 D, [2018-08-22 13:42:32 +0400#26079] [M: 47461994677760] DEBUG -- : BrowserBuilder (selenium_chrome): enabled native headless_mode
-I, [2018-08-22 13:42:32 +0400#26079] [M: 47461994677760] INFO -- : Browser: started get request to: https://github.com/
-I, [2018-08-22 13:42:35 +0400#26079] [M: 47461994677760] INFO -- : Browser: finished get request to: https://github.com/
+I, [2018-08-22 13:42:32 +0400#26079] [M: 47461994677760] INFO -- : Browser: started get request to: https://github.com/vifreefly/kimuraframework
+I, [2018-08-22 13:42:35 +0400#26079] [M: 47461994677760] INFO -- : Browser: finished get request to: https://github.com/vifreefly/kimuraframework
 D, [2018-08-22 13:42:35 +0400#26079] [M: 47461994677760] DEBUG -- : Browser: driver.current_memory: 201701
 
 From: /home/victor/code/kimurai/lib/kimurai/base.rb @ line 189 Kimurai::Base#console:
@@ -473,7 +476,7 @@ From: /home/victor/code/kimurai/lib/kimurai/base.rb @ line 189 Kimurai::Base#con
     190: end
 
 [1] pry(#<Kimurai::Base>)> response.xpath("//title").text
-=> "GitHub -
+=> "GitHub - vifreefly/kimuraframework: Modern web scraping framework written in Ruby which works out of box with Headless Chromium/Firefox, PhantomJS, or simple HTTP requests and allows to scrape and interact with JavaScript rendered websites"
 
 [2] pry(#<Kimurai::Base>)> ls
 Kimurai::Base#methods: browser console logger request_to save_to unique?
@@ -733,9 +736,11 @@ By default `save_to` add position key to an item hash. You can disable it with `
 
 Until spider stops, each new item will be appended to a file. At the next run, helper will clear the content of a file first, and then start again appending items to it.
 
-
+> If you don't want the file to be cleared before each run, add the option `append: true`: `save_to "scraped_products.json", item, format: :json, append: true`
+
+### Skip duplicates
 
-It's pretty common when websites have duplicated pages. For example when an e-commerce shop has the same products in different categories. To skip duplicates, there is `unique?` helper:
+It's pretty common when websites have duplicated pages. For example when an e-commerce shop has the same products in different categories. To skip duplicates, there is a simple `unique?` helper:
 
 ```ruby
 class ProductsSpider < Kimurai::Base
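As a quick illustration of the `append: true` option mentioned in the note above, here is a minimal sketch; the spider name, url and item fields are hypothetical and only show the shape of the call:

```ruby
# Sketch of save_to with append: true (spider, url and item fields are illustrative).
class ExampleProductsSpider < Kimurai::Base
  @name = "example_products_spider"
  @start_urls = ["https://example.com/products"]

  def parse(response, url:, data: {})
    item = { title: response.xpath("//title").text.strip, url: url }

    # With append: true the output file is not cleared at the start of the
    # next run; new items are simply appended to it.
    save_to "scraped_products.json", item, format: :json, append: true
  end
end
```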
@@ -796,6 +801,100 @@ unique?(:id, 324234232)
 unique?(:custom, "Lorem Ipsum")
 ```
 
+#### Automatically skip all duplicated requests urls
+
+It is possible to automatically skip all already visited urls when calling the `request_to` method, using the [@config](#all-available-config-options) option `skip_duplicate_requests: true`. With this option, all already visited urls will be skipped automatically. Also check [@config](#all-available-config-options) for the additional options of this setting.
+
+#### `storage` object
+
+The `unique?` method is just an alias for `storage#unique?`. Storage has several methods:
+
+* `#all` - returns the storage hash, where keys are the existing scopes.
+* `#include?(scope, value)` - returns `true` if the value exists in the scope, and `false` if not.
+* `#add(scope, value)` - adds a value to the scope.
+* `#unique?(scope, value)` - the method described above: returns `false` if the value already exists in the scope, otherwise adds the value to the scope and returns `true`.
+* `#clear!` - resets the whole storage by deleting all values from all scopes.
+
+#### Persistence database for the storage
+
+It's pretty common for a spider to fail (IP blocking, etc.) while crawling a huge website with 5k+ listings. In this case, it's not convenient to start everything over again.
+
+Kimurai can use a persistent database for the `storage`, built on Ruby's built-in [PStore](https://ruby-doc.org/stdlib-2.5.1/libdoc/pstore/rdoc/PStore.html) database. With this option, you can automatically skip already visited urls in the next run _if the previous run failed_; otherwise _(if the run was successful)_ the storage database will be removed before the spider stops.
+
+Also, with persistent storage enabled, the [save_to](#save_to-helper) method will keep adding items to an existing file (it will not be cleared before each run).
+
+To use persistent storage, provide the `continue: true` option to the `.crawl!` method: `SomeSpider.crawl!(continue: true)`.
+
+There are two approaches to using the persistent storage to skip already processed item pages. The first is to manually add the required urls to the storage:
+
+```ruby
+class ProductsSpider < Kimurai::Base
+  @start_urls = ["https://example-shop.com/"]
+
+  def parse(response, url:, data: {})
+    response.xpath("//categories/path").each do |category|
+      request_to :parse_category, url: category[:href]
+    end
+  end
+
+  def parse_category(response, url:, data: {})
+    response.xpath("//products/path").each do |product|
+      # Check if the product url is already in the scope `:product_urls`; if so, skip the request:
+      next if storage.include?(:product_urls, product[:href])
+      # Otherwise process it:
+      request_to :parse_product, url: product[:href]
+    end
+  end
+
+  def parse_product(response, url:, data: {})
+    # Add the visited item to the storage:
+    storage.add(:product_urls, url)
+
+    # ...
+  end
+end
+
+# Run the spider with the persistence database option:
+ProductsSpider.crawl!(continue: true)
+```
+
+The second approach is to automatically skip already processed item urls using the `@config` option `skip_duplicate_requests:`:
+
+```ruby
+class ProductsSpider < Kimurai::Base
+  @start_urls = ["https://example-shop.com/"]
+  @config = {
+    # Configure the skip_duplicate_requests option:
+    skip_duplicate_requests: { scope: :product_urls, check_only: true }
+  }
+
+  def parse(response, url:, data: {})
+    response.xpath("//categories/path").each do |category|
+      request_to :parse_category, url: category[:href]
+    end
+  end
+
+  def parse_category(response, url:, data: {})
+    response.xpath("//products/path").each do |product|
+      # Before visiting the url, `request_to` will check if it is already
+      # in the storage scope `:product_urls`; if so, the request will be skipped:
+      request_to :parse_product, url: product[:href]
+    end
+  end
+
+  def parse_product(response, url:, data: {})
+    # Add the visited item url to the storage scope `:product_urls`:
+    storage.add(:product_urls, url)
+
+    # ...
+  end
+end
+
+# Run the spider with the persistence database option:
+ProductsSpider.crawl!(continue: true)
+```
+
+
 ### `open_spider` and `close_spider` callbacks
 
 You can define `.open_spider` and `.close_spider` callbacks (class methods) to perform some action before spider started or after spider has been stopped:
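To make the `storage` method list above concrete, here is a brief sketch of those calls inside a spider's `parse` method; the spider name, scope and urls are made up purely for illustration:

```ruby
# Illustrative-only walkthrough of the storage API described above.
class StorageDemoSpider < Kimurai::Base
  @name = "storage_demo_spider"
  @start_urls = ["https://example.com/"]

  def parse(response, url:, data: {})
    storage.add(:seen_urls, url)           # add a value to a scope
    storage.include?(:seen_urls, url)      # => true, the value is now in the scope
    storage.unique?(:seen_urls, url)       # => false, the value already exists
    storage.unique?(:seen_urls, "https://example.com/other") # => true, and the value is added

    storage.all     # the storage hash, keyed by scope (here :seen_urls)
    storage.clear!  # remove all values from all scopes
  end
end
```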
@@ -1316,6 +1415,8 @@ end # =>
 # "reddit: the front page of the internetHotHot"
 ```
 
+Keep in mind that the [save_to](#save_to-helper) and [unique?](#skip-duplicates) helpers are not thread-safe when used with the `.parse!` method.
+
 #### `Kimurai.list` and `Kimurai.find_by_name()`
 
 ```ruby
|
|
1399
1500
|
proxy: -> { PROXIES.sample },
|
1400
1501
|
window_size: [1366, 768],
|
1401
1502
|
disable_images: true,
|
1402
|
-
|
1403
|
-
|
1404
|
-
|
1405
|
-
|
1406
|
-
|
1407
|
-
|
1408
|
-
|
1409
|
-
|
1410
|
-
|
1411
|
-
|
1412
|
-
|
1413
|
-
|
1414
|
-
|
1415
|
-
delay: 1..3
|
1416
|
-
}
|
1503
|
+
restart_if: {
|
1504
|
+
# Restart browser if provided memory limit (in kilobytes) is exceeded:
|
1505
|
+
memory_limit: 350_000
|
1506
|
+
},
|
1507
|
+
before_request: {
|
1508
|
+
# Change user agent before each request:
|
1509
|
+
change_user_agent: true,
|
1510
|
+
# Change proxy before each request:
|
1511
|
+
change_proxy: true,
|
1512
|
+
# Clear all cookies and set default cookies (if provided) before each request:
|
1513
|
+
clear_and_set_cookies: true,
|
1514
|
+
# Process delay before each request:
|
1515
|
+
delay: 1..3
|
1417
1516
|
}
|
1418
1517
|
}
|
1419
1518
|
|
@@ -1475,41 +1574,56 @@ end
     # Option to provide custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
     ssl_cert_path: "path/to/ssl_cert",
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Inject some JavaScript code into the browser.
+    # Format: array of strings, where each string is a path to a JS file.
+    # Works only for the poltergeist_phantomjs engine (Selenium doesn't support JS code injection).
+    extensions: ["lib/code_to_inject.js"],
+
+    # Automatically skip duplicated (already visited) urls when using the `request_to` method.
+    # Possible values: `true` or a hash with options.
+    # In case of `true`, all visited urls will be added to the storage scope `:requests_urls`,
+    # and if a url is already contained in this scope, the request will be skipped.
+    # You can configure this setting by providing additional options as a hash:
+    # `skip_duplicate_requests: { scope: :custom_scope, check_only: true }`, where:
+    # `scope:` - use a custom scope instead of `:requests_urls`
+    # `check_only:` - if true, the scope will only be checked for the url; the url will
+    # not be added to the scope automatically.
+    # Works for all drivers.
+    skip_duplicate_requests: true,
+
+    # Array of possible errors to retry while processing a request:
+    retry_request_errors: [Net::ReadTimeout],
+
+    # Restart browser if one of the options is true:
+    restart_if: {
+      # Restart browser if the provided memory limit (in kilobytes) is exceeded (works for all engines)
+      memory_limit: 350_000,
+
+      # Restart browser if the provided requests limit is exceeded (works for all engines)
+      requests_limit: 100
+    },
+    before_request: {
+      # Change proxy before each request. The `proxy:` option above should be present
+      # and have lambda format. Works only for the poltergeist and mechanize engines
+      # (Selenium doesn't support proxy rotation).
+      change_proxy: true,
+
+      # Change user agent before each request. The `user_agent:` option above should be present
+      # and have lambda format. Works only for the poltergeist and mechanize engines
+      # (Selenium doesn't support getting/setting headers).
+      change_user_agent: true,
+
+      # Clear all cookies before each request, works for all engines
+      clear_cookies: true,
+
+      # If you want to clear all cookies and also set custom cookies (the `cookies:` option above should be present),
+      # use this option instead (works for all engines)
+      clear_and_set_cookies: true,
+
+      # Global option to set a delay between requests.
+      # Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range,
+      # the delay number will be chosen randomly for each request: `rand(2..5) # => 3`
+      delay: 1..3
     }
   }
 ```
@@ -1525,10 +1639,8 @@ class ApplicationSpider < Kimurai::Base
   @config = {
     user_agent: "Firefox",
     disable_images: true,
-
-
-    before_request: { delay: 1..2 }
-  }
+    restart_if: { memory_limit: 350_000 },
+    before_request: { delay: 1..2 }
   }
 end
 
@@ -1536,7 +1648,7 @@ class CustomSpider < ApplicationSpider
   @name = "custom_spider"
   @start_urls = ["https://example.com/"]
   @config = {
-
+    before_request: { delay: 4..6 }
   }
 
   def parse(response, url:, data: {})
@@ -1628,6 +1740,8 @@ end
 ### Crawl
 To run a particular spider in the project, run: `$ bundle exec kimurai crawl example_spider`. Don't forget to add `bundle exec` before command to load required environment.
 
+You can provide an additional option `--continue` to use the [persistence storage database](#persistence-database-for-the-storage) feature.
+
 ### List
 To list all project spiders, run: `$ bundle exec kimurai list`
 
@@ -1786,7 +1900,7 @@ class GithubSpider < ApplicationSpider
   @start_urls = ["https://github.com/search?q=Ruby%20Web%20Scraping"]
   @config = {
     user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36",
-
+    before_request: { delay: 4..7 }
   }
 
   def parse(response, url:, data: {})
data/kimurai.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
   spec.email = ["vicfreefly@gmail.com"]
 
   spec.summary = "Modern web scraping framework written in Ruby and based on Capybara/Nokogiri"
-  spec.homepage = "https://github.com/vifreefly/
+  spec.homepage = "https://github.com/vifreefly/kimuraframework"
   spec.license = "MIT"
 
   # Specify which files should be added to the gem when it is released.
data/lib/kimurai/base.rb
CHANGED
@@ -1,5 +1,5 @@
-require_relative 'base/
-require_relative 'base/
+require_relative 'base/saver'
+require_relative 'base/storage'
 
 module Kimurai
   class Base
@@ -21,7 +21,7 @@ module Kimurai
     ###
 
     class << self
-      attr_reader :run_info
+      attr_reader :run_info, :savers, :storage
    end
 
     def self.running?
@@ -46,10 +46,12 @@ module Kimurai
 
     def self.update(type, subtype)
       return unless @run_info
+      @update_mutex.synchronize { @run_info[type][subtype] += 1 }
+    end
 
-
-
-
+    def self.add_event(scope, event)
+      return unless @run_info
+      @update_mutex.synchronize { @run_info[:events][scope][event] += 1 }
     end
 
     ###
@@ -58,8 +60,6 @@ module Kimurai
     @pipelines = []
     @config = {}
 
-    ###
-
     def self.name
       @name
     end
@@ -90,34 +90,27 @@ module Kimurai
       end
     end
 
-
-
-    def self.checker
-      @checker ||= UniqChecker.new
-    end
-
-    def unique?(scope, value)
-      self.class.checker.unique?(scope, value)
-    end
-
-    def self.saver
-      @saver ||= SimpleSaver.new
-    end
+    def self.crawl!(continue: false)
+      logger.error "Spider: already running: #{name}" and return false if running?
 
-
-
-
+      storage_path =
+        if continue
+          Dir.exists?("tmp") ? "tmp/#{name}.pstore" : "#{name}.pstore"
+        end
 
-
+      @storage = Storage.new(storage_path)
+      @savers = {}
+      @update_mutex = Mutex.new
 
-    def self.crawl!
-      logger.error "Spider: already running: #{name}" and return false if running?
       @run_info = {
-        spider_name: name, status: :running, environment: Kimurai.env,
+        spider_name: name, status: :running, error: nil, environment: Kimurai.env,
         start_time: Time.new, stop_time: nil, running_time: nil,
-        visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
+        visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
+        events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
       }
 
+      ###
+
       logger.info "Spider: started: #{name}"
       open_spider if self.respond_to? :open_spider
 
@@ -130,12 +123,11 @@ module Kimurai
       else
         spider.parse
       end
-    rescue StandardError, SignalException => e
+    rescue StandardError, SignalException, SystemExit => e
      @run_info.merge!(status: :failed, error: e.inspect)
       raise e
     else
-      @run_info
-      @run_info
+      @run_info.merge!(status: :completed)
     ensure
       if spider
        spider.browser.destroy_driver!
@@ -145,10 +137,20 @@ module Kimurai
         @run_info.merge!(stop_time: stop_time, running_time: total_time)
 
         close_spider if self.respond_to? :close_spider
+
+        if @storage.path
+          if completed?
+            @storage.delete!
+            logger.info "Spider: storage: persistence database #{@storage.path} was removed (successful run)"
+          else
+            logger.info "Spider: storage: persistence database #{@storage.path} wasn't removed (failed run)"
+          end
+        end
+
         message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
-        failed? ?
+        failed? ? logger.fatal(message) : logger.info(message)
 
-        @run_info, @
+        @run_info, @storage, @savers, @update_mutex = nil
       end
     end
 
@@ -156,7 +158,7 @@ module Kimurai
       spider = engine ? self.new(engine) : self.new
       url.present? ? spider.request_to(handler, url: url, data: data) : spider.public_send(handler)
     ensure
-      spider.browser.destroy_driver!
+      spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
     end
 
     ###
@@ -175,6 +177,7 @@ module Kimurai
       end.to_h
 
       @logger = self.class.logger
+      @savers = {}
     end
 
     def browser
@@ -182,6 +185,11 @@ module Kimurai
     end
 
     def request_to(handler, delay = nil, url:, data: {})
+      if @config[:skip_duplicate_requests] && !unique_request?(url)
+        add_event(:duplicate_requests) if self.with_info
+        logger.warn "Spider: request_to: url is not unique: #{url}, skipped" and return
+      end
+
       request_data = { url: url, data: data }
       delay ? browser.visit(url, delay: delay) : browser.visit(url)
       public_send(handler, browser.current_response, request_data)
@@ -191,8 +199,59 @@ module Kimurai
       binding.pry
     end
 
+    ###
+
+    def storage
+      # Note: for `.crawl!` uses shared thread safe Storage instance,
+      # otherwise, each spider instance will have it's own Storage
+      @storage ||= self.with_info ? self.class.storage : Storage.new
+    end
+
+    def unique?(scope, value)
+      storage.unique?(scope, value)
+    end
+
+    def save_to(path, item, format:, position: true, append: false)
+      @savers[path] ||= begin
+        options = { format: format, position: position, append: storage.path ? true : append }
+        if self.with_info
+          self.class.savers[path] ||= Saver.new(path, options)
+        else
+          Saver.new(path, options)
+        end
+      end
+
+      @savers[path].save(item)
+    end
+
+    ###
+
+    def add_event(scope = :custom, event)
+      unless self.with_info
+        raise "It's allowed to use `add_event` only while performing a full run (`.crawl!` method)"
+      end
+
+      self.class.add_event(scope, event)
+    end
+
+    ###
+
     private
 
+    def unique_request?(url)
+      options = @config[:skip_duplicate_requests]
+      if options.class == Hash
+        scope = options[:scope] || :requests_urls
+        if options[:check_only]
+          storage.include?(scope, url) ? false : true
+        else
+          storage.unique?(scope, url) ? true : false
+        end
+      else
+        storage.unique?(:requests_urls, url) ? true : false
+      end
+    end
+
    def send_item(item, options = {})
       logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
       self.class.update(:items, :sent) if self.with_info
@@ -201,7 +260,8 @@ module Kimurai
         item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
       end
     rescue => e
-      logger.error "Pipeline: dropped: #{e.inspect}, item: #{item}"
+      logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}"
+      add_event(:drop_items_errors, e.inspect) if self.with_info
       false
     else
       self.class.update(:items, :processed) if self.with_info