snapcrawl 0.5.2 → 0.5.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +8 -0
- data/lib/snapcrawl/config.rb +2 -0
- data/lib/snapcrawl/page.rb +5 -1
- data/lib/snapcrawl/screenshot.rb +20 -12
- data/lib/snapcrawl/templates/config.yml +8 -0
- data/lib/snapcrawl/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1238ab663146a888fc002379efaae0abaa72f02fc9ef7954bffb79e7dad4b07d
|
4
|
+
data.tar.gz: c340cf0b5d1675158077257007f0d58883e9d3e0418f4c0551039fa2e09c6df8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5e1c2ca21bbfa5471d58fce0e20d61237a87e14157b156fec0e0ee0b07d8c2a169dd38bffda1c99292bc737fc5cf25c64811e2650de958756c29fd630ca87721
|
7
|
+
data.tar.gz: fd7d8e18393e00274f3c145c103e03c0572f7113fab98b7cf3572e99f77d16b88628be0b3050bf98fba311fb05db974453377cb96e8bbd567794426b2a83c099
|
data/README.md
CHANGED
@@ -112,6 +112,14 @@ url_blacklist:
|
|
112
112
|
|
113
113
|
# take a screenshot of this CSS selector only
|
114
114
|
css_selector:
|
115
|
+
|
116
|
+
# when true, ignore SSL related errors
|
117
|
+
skip_ssl_verification: false
|
118
|
+
|
119
|
+
# set to any number of seconds to wait for the page to load before taking
|
120
|
+
# a screenshot, leave empty to not wait at all (only needed for pages with
|
121
|
+
# animations or other post-load events).
|
122
|
+
screenshot_delay:
|
115
123
|
```
|
116
124
|
|
117
125
|
## Contributing / Support
|
data/lib/snapcrawl/config.rb
CHANGED
data/lib/snapcrawl/page.rb
CHANGED
@@ -52,7 +52,7 @@ module Snapcrawl
|
|
52
52
|
end
|
53
53
|
|
54
54
|
def http_response!
|
55
|
-
response = cache.get(url) { HTTParty.get url }
|
55
|
+
response = cache.get(url) { HTTParty.get url, httparty_options }
|
56
56
|
|
57
57
|
if !response.success?
|
58
58
|
$logger.warn "http error on !undpur!#{url}!txtrst!, code: !txtylw!#{response.code}!txtrst!, message: #{response.message.strip}"
|
@@ -66,6 +66,10 @@ module Snapcrawl
|
|
66
66
|
|
67
67
|
end
|
68
68
|
|
69
|
+
def httparty_options
|
70
|
+
Config.skip_ssl_verification ? { verify: false } : {}
|
71
|
+
end
|
72
|
+
|
69
73
|
def normalize_links(links)
|
70
74
|
result = []
|
71
75
|
|
data/lib/snapcrawl/screenshot.rb
CHANGED
@@ -12,27 +12,20 @@ module Snapcrawl
|
|
12
12
|
|
13
13
|
def save(outfile = nil)
|
14
14
|
outfile ||= "#{url.to_slug}.png"
|
15
|
-
|
16
|
-
fetch_opts = { allowed_status_codes: [404, 401, 403] }
|
17
|
-
if Config.selector
|
18
|
-
fetch_opts[:selector] = Config.selector
|
19
|
-
fetch_opts[:full] = false
|
20
|
-
end
|
21
|
-
|
22
|
-
webshot_capture url, outfile, fetch_opts
|
15
|
+
webshot_capture url, outfile
|
23
16
|
end
|
24
17
|
|
25
18
|
private
|
26
19
|
|
27
|
-
def webshot_capture(url, image_path, fetch_opts)
|
28
|
-
webshot_capture! url, image_path, fetch_opts
|
20
|
+
def webshot_capture(url, image_path)
|
21
|
+
webshot_capture! url, image_path
|
29
22
|
rescue => e
|
30
23
|
raise ScreenshotError, "#{e.class} #{e.message}"
|
31
24
|
end
|
32
25
|
|
33
|
-
def webshot_capture!(url, image_path, fetch_opts)
|
26
|
+
def webshot_capture!(url, image_path)
|
34
27
|
hide_output do
|
35
|
-
webshot.capture url, image_path, fetch_opts do |magick|
|
28
|
+
webshot.capture url, image_path, webshot_options do |magick|
|
36
29
|
magick.combine_options do |c|
|
37
30
|
c.background "white"
|
38
31
|
c.gravity 'north'
|
@@ -43,6 +36,21 @@ module Snapcrawl
|
|
43
36
|
end
|
44
37
|
end
|
45
38
|
|
39
|
+
def webshot_options
|
40
|
+
result = { allowed_status_codes: [404, 401, 403] }
|
41
|
+
|
42
|
+
if Config.selector
|
43
|
+
result[:selector] = Config.selector
|
44
|
+
result[:full] = false
|
45
|
+
end
|
46
|
+
|
47
|
+
if Config.screenshot_delay
|
48
|
+
result[:timeout] = Config.screenshot_delay
|
49
|
+
end
|
50
|
+
|
51
|
+
result
|
52
|
+
end
|
53
|
+
|
46
54
|
def webshot
|
47
55
|
@webshot ||= Webshot::Screenshot.instance
|
48
56
|
end
|
data/lib/snapcrawl/templates/config.yml
CHANGED
@@ -39,3 +39,11 @@ url_blacklist:
|
|
39
39
|
|
40
40
|
# take a screenshot of this CSS selector only
|
41
41
|
css_selector:
|
42
|
+
|
43
|
+
# when true, ignore SSL related errors
|
44
|
+
skip_ssl_verification: false
|
45
|
+
|
46
|
+
# set to any number of seconds to wait for the page to load before taking
|
47
|
+
# a screenshot, leave empty to not wait at all (only needed for pages with
|
48
|
+
# animations or other post-load events).
|
49
|
+
screenshot_delay:
|
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snapcrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.5.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danny Ben Shitrit
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-03-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: colsole
|