snapcrawl 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -0
- data/lib/snapcrawl/config.rb +2 -0
- data/lib/snapcrawl/page.rb +5 -1
- data/lib/snapcrawl/screenshot.rb +20 -12
- data/lib/snapcrawl/templates/config.yml +8 -0
- data/lib/snapcrawl/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1238ab663146a888fc002379efaae0abaa72f02fc9ef7954bffb79e7dad4b07d
|
4
|
+
data.tar.gz: c340cf0b5d1675158077257007f0d58883e9d3e0418f4c0551039fa2e09c6df8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5e1c2ca21bbfa5471d58fce0e20d61237a87e14157b156fec0e0ee0b07d8c2a169dd38bffda1c99292bc737fc5cf25c64811e2650de958756c29fd630ca87721
|
7
|
+
data.tar.gz: fd7d8e18393e00274f3c145c103e03c0572f7113fab98b7cf3572e99f77d16b88628be0b3050bf98fba311fb05db974453377cb96e8bbd567794426b2a83c099
|
data/README.md
CHANGED
@@ -112,6 +112,14 @@ url_blacklist:
|
|
112
112
|
|
113
113
|
# take a screenshot of this CSS selector only
|
114
114
|
css_selector:
|
115
|
+
|
116
|
+
# when true, ignore SSL related errors
|
117
|
+
skip_ssl_verification: false
|
118
|
+
|
119
|
+
# set to any number of seconds to wait for the page to load before taking
|
120
|
+
# a screenshot, leave empty to not wait at all (only needed for pages with
|
121
|
+
# animations or other post-load events).
|
122
|
+
screenshot_delay:
|
115
123
|
```
|
116
124
|
|
117
125
|
## Contributing / Support
|
data/lib/snapcrawl/config.rb
CHANGED
data/lib/snapcrawl/page.rb
CHANGED
@@ -52,7 +52,7 @@ module Snapcrawl
|
|
52
52
|
end
|
53
53
|
|
54
54
|
def http_response!
|
55
|
-
response = cache.get(url) { HTTParty.get url }
|
55
|
+
response = cache.get(url) { HTTParty.get url, httparty_options }
|
56
56
|
|
57
57
|
if !response.success?
|
58
58
|
$logger.warn "http error on !undpur!#{url}!txtrst!, code: !txtylw!#{response.code}!txtrst!, message: #{response.message.strip}"
|
@@ -66,6 +66,10 @@ module Snapcrawl
|
|
66
66
|
|
67
67
|
end
|
68
68
|
|
69
|
+
def httparty_options
|
70
|
+
Config.skip_ssl_verification ? { verify: false } : {}
|
71
|
+
end
|
72
|
+
|
69
73
|
def normalize_links(links)
|
70
74
|
result = []
|
71
75
|
|
data/lib/snapcrawl/screenshot.rb
CHANGED
@@ -12,27 +12,20 @@ module Snapcrawl
|
|
12
12
|
|
13
13
|
def save(outfile = nil)
|
14
14
|
outfile ||= "#{url.to_slug}.png"
|
15
|
-
|
16
|
-
fetch_opts = { allowed_status_codes: [404, 401, 403] }
|
17
|
-
if Config.selector
|
18
|
-
fetch_opts[:selector] = Config.selector
|
19
|
-
fetch_opts[:full] = false
|
20
|
-
end
|
21
|
-
|
22
|
-
webshot_capture url, outfile, fetch_opts
|
15
|
+
webshot_capture url, outfile
|
23
16
|
end
|
24
17
|
|
25
18
|
private
|
26
19
|
|
27
|
-
def webshot_capture(url, image_path, fetch_opts)
|
28
|
-
webshot_capture! url, image_path, fetch_opts
|
20
|
+
def webshot_capture(url, image_path)
|
21
|
+
webshot_capture! url, image_path
|
29
22
|
rescue => e
|
30
23
|
raise ScreenshotError, "#{e.class} #{e.message}"
|
31
24
|
end
|
32
25
|
|
33
|
-
def webshot_capture!(url, image_path, fetch_opts)
|
26
|
+
def webshot_capture!(url, image_path)
|
34
27
|
hide_output do
|
35
|
-
webshot.capture url, image_path, fetch_opts do |magick|
|
28
|
+
webshot.capture url, image_path, webshot_options do |magick|
|
36
29
|
magick.combine_options do |c|
|
37
30
|
c.background "white"
|
38
31
|
c.gravity 'north'
|
@@ -43,6 +36,21 @@ module Snapcrawl
|
|
43
36
|
end
|
44
37
|
end
|
45
38
|
|
39
|
+
def webshot_options
|
40
|
+
result = { allowed_status_codes: [404, 401, 403] }
|
41
|
+
|
42
|
+
if Config.selector
|
43
|
+
result[:selector] = Config.selector
|
44
|
+
result[:full] = false
|
45
|
+
end
|
46
|
+
|
47
|
+
if Config.screenshot_delay
|
48
|
+
result[:timeout] = Config.screenshot_delay
|
49
|
+
end
|
50
|
+
|
51
|
+
result
|
52
|
+
end
|
53
|
+
|
46
54
|
def webshot
|
47
55
|
@webshot ||= Webshot::Screenshot.instance
|
48
56
|
end
|
data/lib/snapcrawl/templates/config.yml
CHANGED
@@ -39,3 +39,11 @@ url_blacklist:
|
|
39
39
|
|
40
40
|
# take a screenshot of this CSS selector only
|
41
41
|
css_selector:
|
42
|
+
|
43
|
+
# when true, ignore SSL related errors
|
44
|
+
skip_ssl_verification: false
|
45
|
+
|
46
|
+
# set to any number of seconds to wait for the page to load before taking
|
47
|
+
# a screenshot, leave empty to not wait at all (only needed for pages with
|
48
|
+
# animations or other post-load events).
|
49
|
+
screenshot_delay:
|
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snapcrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.5.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danny Ben Shitrit
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-03-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: colsole
|