snapcrawl 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/snapcrawl/config.rb +2 -2
- data/lib/snapcrawl/crawler.rb +7 -7
- data/lib/snapcrawl/dependencies.rb +2 -2
- data/lib/snapcrawl/log_helpers.rb +17 -39
- data/lib/snapcrawl/page.rb +3 -3
- data/lib/snapcrawl/pretty_logger.rb +1 -1
- data/lib/snapcrawl/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 62a293da259afce5690315f27f2bbcd881e495a3d1b5344eb9ed9e2c46bd4a4d
|
4
|
+
data.tar.gz: d600fdbcd2344e5a19f853cbea67a0d8ad0c365a38d00aa4de8d02dd6e52e5b0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3ebdb2355480bacd7f7a6faba264a31086e68c1864c692607fdb6fbc11df210eee17af936ab63305484ee46ac473d50b4033be11e995b51b9050b359c81dd906
|
7
|
+
data.tar.gz: 42a0a9f048fe9b5b1b04426d444710a256ccc8e9a914e3277f062c4ebf760d50a018c1f189e7b0cebced1c236f5d13ca56ab4abbf808a5ec4812bf9a754a9343
|
data/README.md
CHANGED
@@ -27,7 +27,7 @@ You can run Snapcrawl by using this docker image (which contains all the
|
|
27
27
|
necessary prerequisites):
|
28
28
|
|
29
29
|
```shell
|
30
|
-
$ alias snapcrawl='docker run --rm -it --volume $PWD:/app dannyben/snapcrawl'
|
30
|
+
$ alias snapcrawl='docker run --rm -it --network host --volume "$PWD:/app" dannyben/snapcrawl'
|
31
31
|
```
|
32
32
|
|
33
33
|
For more information on the Docker image, refer to the [docker-snapcrawl][3] repository.
|
data/lib/snapcrawl/config.rb
CHANGED
@@ -16,10 +16,10 @@ module Snapcrawl
|
|
16
16
|
# Config. The $logger is available, but it was not yet fully
|
17
17
|
# configured with log_level etc.
|
18
18
|
if File.exist? file
|
19
|
-
# $logger.debug "loading config file
|
19
|
+
# $logger.debug "loading config file !txtgrn!#{file}"
|
20
20
|
push file
|
21
21
|
else
|
22
|
-
# $logger.debug "creating config file
|
22
|
+
# $logger.debug "creating config file !txtgrn!#{file}"
|
23
23
|
create_config file
|
24
24
|
end
|
25
25
|
end
|
data/lib/snapcrawl/crawler.rb
CHANGED
@@ -7,7 +7,7 @@ module Snapcrawl
|
|
7
7
|
attr_reader :url
|
8
8
|
|
9
9
|
def initialize(url)
|
10
|
-
$logger.debug "initializing crawler with
|
10
|
+
$logger.debug "initializing crawler with !txtgrn!#{url}"
|
11
11
|
|
12
12
|
config_for_display = Config.settings.dup
|
13
13
|
config_for_display['name_template'] = '%%{url}'
|
@@ -25,7 +25,7 @@ module Snapcrawl
|
|
25
25
|
private
|
26
26
|
|
27
27
|
def process_todo
|
28
|
-
$logger.debug "processing queue:
|
28
|
+
$logger.debug "processing queue: !txtgrn!#{todo.count} remaining"
|
29
29
|
|
30
30
|
url, page = todo.shift
|
31
31
|
done.push url
|
@@ -40,12 +40,12 @@ module Snapcrawl
|
|
40
40
|
next if todo.has_key?(sub_page) or done.include?(sub_page)
|
41
41
|
|
42
42
|
if Config.url_whitelist and sub_page.path !~ /#{Config.url_whitelist}/
|
43
|
-
$logger.debug "ignoring
|
43
|
+
$logger.debug "ignoring !undpur!#{sub_page.url}!txtrst!, reason: whitelist"
|
44
44
|
next
|
45
45
|
end
|
46
46
|
|
47
47
|
if Config.url_blacklist and sub_page.path =~ /#{Config.url_blacklist}/
|
48
|
-
$logger.debug "ignoring
|
48
|
+
$logger.debug "ignoring !undpur!#{sub_page.url}!txtrst!, reason: blacklist"
|
49
49
|
next
|
50
50
|
end
|
51
51
|
|
@@ -56,7 +56,7 @@ module Snapcrawl
|
|
56
56
|
def process_page(page)
|
57
57
|
outfile = "#{Config.snaps_dir}/#{Config.name_template}.png" % { url: page.url.to_slug }
|
58
58
|
|
59
|
-
$logger.info "processing
|
59
|
+
$logger.info "processing !undpur!#{page.url}!txtrst!, depth: #{page.depth}"
|
60
60
|
|
61
61
|
if !page.valid?
|
62
62
|
$logger.debug "page #{page.path} is invalid, aborting process"
|
@@ -66,7 +66,7 @@ module Snapcrawl
|
|
66
66
|
if file_fresh? outfile
|
67
67
|
$logger.info "screenshot for #{page.path} already exists"
|
68
68
|
else
|
69
|
-
$logger.info "
|
69
|
+
$logger.info "!bldgrn!capturing screenshot for #{page.path}"
|
70
70
|
save_screenshot page, outfile
|
71
71
|
end
|
72
72
|
|
@@ -76,7 +76,7 @@ module Snapcrawl
|
|
76
76
|
def save_screenshot(page, outfile)
|
77
77
|
page.save_screenshot outfile
|
78
78
|
rescue => e
|
79
|
-
$logger.error "screenshot error on
|
79
|
+
$logger.error "screenshot error on !undpur!#{page.path}!txtrst! - !txtred!#{e.class}!txtrst!: #{e.message}"
|
80
80
|
end
|
81
81
|
|
82
82
|
def file_fresh?(file)
|
data/lib/snapcrawl/dependencies.rb
CHANGED
@@ -8,10 +8,10 @@ module Snapcrawl
|
|
8
8
|
def verify
|
9
9
|
return if @verified
|
10
10
|
|
11
|
-
$logger.debug 'verifying
|
11
|
+
$logger.debug 'verifying !txtgrn!phantomjs!txtrst! is present'
|
12
12
|
raise MissingPhantomJS unless command_exist? "phantomjs"
|
13
13
|
|
14
|
-
$logger.debug 'verifying
|
14
|
+
$logger.debug 'verifying !txtgrn!imagemagick!txtrst! is present'
|
15
15
|
raise MissingImageMagick unless command_exist? "convert"
|
16
16
|
|
17
17
|
@verified = true
|
data/lib/snapcrawl/log_helpers.rb
CHANGED
@@ -1,57 +1,35 @@
|
|
1
|
+
require 'colsole'
|
2
|
+
|
1
3
|
module Snapcrawl
|
2
4
|
module LogHelpers
|
5
|
+
include Colsole
|
6
|
+
|
3
7
|
SEVERITY_COLORS = {
|
4
|
-
'INFO' => :
|
5
|
-
'WARN' => :
|
6
|
-
'ERROR' => :
|
7
|
-
'FATAL' => :
|
8
|
-
'DEBUG' => :
|
8
|
+
'INFO' => :txtblu,
|
9
|
+
'WARN' => :txtylw,
|
10
|
+
'ERROR' => :txtred,
|
11
|
+
'FATAL' => :txtred,
|
12
|
+
'DEBUG' => :txtcyn
|
9
13
|
}
|
10
14
|
|
11
15
|
def log_formatter
|
12
16
|
proc do |severity, _time, _prog, message|
|
13
17
|
severity_color = SEVERITY_COLORS[severity]
|
14
|
-
|
15
|
-
|
18
|
+
line = "!#{severity_color}!#{severity.rjust 5}!txtrst! : #{message}\n"
|
19
|
+
use_colors? ? colorize(line) : strip_color_markers(line)
|
16
20
|
end
|
17
21
|
end
|
18
22
|
|
19
|
-
def
|
20
|
-
@
|
21
|
-
end
|
22
|
-
|
23
|
-
def log_colors!
|
24
|
-
colors? ? actual_colors : empty_colors
|
25
|
-
end
|
26
|
-
|
27
|
-
def actual_colors
|
28
|
-
{
|
29
|
-
red: "\e[31m", green: "\e[32m", yellow: "\e[33m",
|
30
|
-
blue: "\e[34m", purple: "\e[35m", cyan: "\e[36m",
|
31
|
-
underlined: "\e[4m", bold: "\e[1m",
|
32
|
-
none: "", reset: "\e[0m"
|
33
|
-
}
|
34
|
-
end
|
35
|
-
|
36
|
-
def empty_colors
|
37
|
-
{
|
38
|
-
red: "", green: "", yellow: "",
|
39
|
-
blue: "", purple: "", cyan: "",
|
40
|
-
underlined: "", bold: "",
|
41
|
-
none: "", reset: ""
|
42
|
-
}
|
43
|
-
end
|
44
|
-
|
45
|
-
def colors?
|
46
|
-
if Config.log_color == 'auto'
|
47
|
-
tty?
|
48
|
-
else
|
49
|
-
Config.log_color
|
50
|
-
end
|
23
|
+
def use_colors?
|
24
|
+
@use_colors ||= (Config.log_color == 'auto' ? tty? : Config.log_color)
|
51
25
|
end
|
52
26
|
|
53
27
|
def tty?
|
54
28
|
ENV['TTY'] == 'on' ? true : ENV['TTY'] == 'off' ? false : $stdout.tty?
|
55
29
|
end
|
30
|
+
|
31
|
+
def strip_color_markers(text)
|
32
|
+
text.gsub(/\!([a-z]{6})\!/, '')
|
33
|
+
end
|
56
34
|
end
|
57
35
|
end
|
data/lib/snapcrawl/page.rb
CHANGED
@@ -55,13 +55,13 @@ module Snapcrawl
|
|
55
55
|
response = cache.get(url) { HTTParty.get url }
|
56
56
|
|
57
57
|
if !response.success?
|
58
|
-
$logger.warn "http error on
|
58
|
+
$logger.warn "http error on !undpur!#{url}!txtrst!, code: !txtylw!#{response.code}!txtrst!, message: #{response.message.strip}"
|
59
59
|
end
|
60
60
|
|
61
61
|
response
|
62
62
|
|
63
63
|
rescue => e
|
64
|
-
$logger.error "http error on
|
64
|
+
$logger.error "http error on !undpur!#{url}!txtrst! - !txtred!#{e.class}!txtrst!: #{e.message}"
|
65
65
|
nil
|
66
66
|
|
67
67
|
end
|
@@ -95,7 +95,7 @@ module Snapcrawl
|
|
95
95
|
begin
|
96
96
|
link = Addressable::URI.join(url, link).to_s.dup
|
97
97
|
rescue => e
|
98
|
-
$logger.warn "
|
98
|
+
$logger.warn "!txtred!#{e.class}!txtrst!: #{e.message} on #{path} (link: #{link})"
|
99
99
|
return nil
|
100
100
|
end
|
101
101
|
|
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snapcrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danny Ben Shitrit
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: colsole
|
@@ -166,7 +166,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
166
|
- !ruby/object:Gem::Version
|
167
167
|
version: '0'
|
168
168
|
requirements: []
|
169
|
-
rubygems_version: 3.
|
169
|
+
rubygems_version: 3.2.3
|
170
170
|
signing_key:
|
171
171
|
specification_version: 4
|
172
172
|
summary: Crawl a website and take screenshots (CLI + Library)
|