snapcrawl 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/snapcrawl/config.rb +2 -2
- data/lib/snapcrawl/crawler.rb +7 -7
- data/lib/snapcrawl/dependencies.rb +2 -2
- data/lib/snapcrawl/log_helpers.rb +17 -39
- data/lib/snapcrawl/page.rb +3 -3
- data/lib/snapcrawl/pretty_logger.rb +1 -1
- data/lib/snapcrawl/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 62a293da259afce5690315f27f2bbcd881e495a3d1b5344eb9ed9e2c46bd4a4d
|
4
|
+
data.tar.gz: d600fdbcd2344e5a19f853cbea67a0d8ad0c365a38d00aa4de8d02dd6e52e5b0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3ebdb2355480bacd7f7a6faba264a31086e68c1864c692607fdb6fbc11df210eee17af936ab63305484ee46ac473d50b4033be11e995b51b9050b359c81dd906
|
7
|
+
data.tar.gz: 42a0a9f048fe9b5b1b04426d444710a256ccc8e9a914e3277f062c4ebf760d50a018c1f189e7b0cebced1c236f5d13ca56ab4abbf808a5ec4812bf9a754a9343
|
data/README.md
CHANGED
@@ -27,7 +27,7 @@ You can run Snapcrawl by using this docker image (which contains all the
|
|
27
27
|
necessary prerequisites):
|
28
28
|
|
29
29
|
```shell
|
30
|
-
$ alias snapcrawl='docker run --rm -it --volume $PWD:/app dannyben/snapcrawl'
|
30
|
+
$ alias snapcrawl='docker run --rm -it --network host --volume "$PWD:/app" dannyben/snapcrawl'
|
31
31
|
```
|
32
32
|
|
33
33
|
For more information on the Docker image, refer to the [docker-snapcrawl][3] repository.
|
data/lib/snapcrawl/config.rb
CHANGED
@@ -16,10 +16,10 @@ module Snapcrawl
|
|
16
16
|
# Config. The $logger is available, but it was not yet fully
|
17
17
|
# configured with log_level etc.
|
18
18
|
if File.exist? file
|
19
|
-
# $logger.debug "loading config file
|
19
|
+
# $logger.debug "loading config file !txtgrn!#{file}"
|
20
20
|
push file
|
21
21
|
else
|
22
|
-
# $logger.debug "creating config file
|
22
|
+
# $logger.debug "creating config file !txtgrn!#{file}"
|
23
23
|
create_config file
|
24
24
|
end
|
25
25
|
end
|
data/lib/snapcrawl/crawler.rb
CHANGED
@@ -7,7 +7,7 @@ module Snapcrawl
|
|
7
7
|
attr_reader :url
|
8
8
|
|
9
9
|
def initialize(url)
|
10
|
-
$logger.debug "initializing crawler with
|
10
|
+
$logger.debug "initializing crawler with !txtgrn!#{url}"
|
11
11
|
|
12
12
|
config_for_display = Config.settings.dup
|
13
13
|
config_for_display['name_template'] = '%%{url}'
|
@@ -25,7 +25,7 @@ module Snapcrawl
|
|
25
25
|
private
|
26
26
|
|
27
27
|
def process_todo
|
28
|
-
$logger.debug "processing queue:
|
28
|
+
$logger.debug "processing queue: !txtgrn!#{todo.count} remaining"
|
29
29
|
|
30
30
|
url, page = todo.shift
|
31
31
|
done.push url
|
@@ -40,12 +40,12 @@ module Snapcrawl
|
|
40
40
|
next if todo.has_key?(sub_page) or done.include?(sub_page)
|
41
41
|
|
42
42
|
if Config.url_whitelist and sub_page.path !~ /#{Config.url_whitelist}/
|
43
|
-
$logger.debug "ignoring
|
43
|
+
$logger.debug "ignoring !undpur!#{sub_page.url}!txtrst!, reason: whitelist"
|
44
44
|
next
|
45
45
|
end
|
46
46
|
|
47
47
|
if Config.url_blacklist and sub_page.path =~ /#{Config.url_blacklist}/
|
48
|
-
$logger.debug "ignoring
|
48
|
+
$logger.debug "ignoring !undpur!#{sub_page.url}!txtrst!, reason: blacklist"
|
49
49
|
next
|
50
50
|
end
|
51
51
|
|
@@ -56,7 +56,7 @@ module Snapcrawl
|
|
56
56
|
def process_page(page)
|
57
57
|
outfile = "#{Config.snaps_dir}/#{Config.name_template}.png" % { url: page.url.to_slug }
|
58
58
|
|
59
|
-
$logger.info "processing
|
59
|
+
$logger.info "processing !undpur!#{page.url}!txtrst!, depth: #{page.depth}"
|
60
60
|
|
61
61
|
if !page.valid?
|
62
62
|
$logger.debug "page #{page.path} is invalid, aborting process"
|
@@ -66,7 +66,7 @@ module Snapcrawl
|
|
66
66
|
if file_fresh? outfile
|
67
67
|
$logger.info "screenshot for #{page.path} already exists"
|
68
68
|
else
|
69
|
-
$logger.info "
|
69
|
+
$logger.info "!bldgrn!capturing screenshot for #{page.path}"
|
70
70
|
save_screenshot page, outfile
|
71
71
|
end
|
72
72
|
|
@@ -76,7 +76,7 @@ module Snapcrawl
|
|
76
76
|
def save_screenshot(page, outfile)
|
77
77
|
page.save_screenshot outfile
|
78
78
|
rescue => e
|
79
|
-
$logger.error "screenshot error on
|
79
|
+
$logger.error "screenshot error on !undpur!#{page.path}!txtrst! - !txtred!#{e.class}!txtrst!: #{e.message}"
|
80
80
|
end
|
81
81
|
|
82
82
|
def file_fresh?(file)
|
data/lib/snapcrawl/dependencies.rb
CHANGED
@@ -8,10 +8,10 @@ module Snapcrawl
|
|
8
8
|
def verify
|
9
9
|
return if @verified
|
10
10
|
|
11
|
-
$logger.debug 'verifying
|
11
|
+
$logger.debug 'verifying !txtgrn!phantomjs!txtrst! is present'
|
12
12
|
raise MissingPhantomJS unless command_exist? "phantomjs"
|
13
13
|
|
14
|
-
$logger.debug 'verifying
|
14
|
+
$logger.debug 'verifying !txtgrn!imagemagick!txtrst! is present'
|
15
15
|
raise MissingImageMagick unless command_exist? "convert"
|
16
16
|
|
17
17
|
@verified = true
|
data/lib/snapcrawl/log_helpers.rb
CHANGED
@@ -1,57 +1,35 @@
|
|
1
|
+
require 'colsole'
|
2
|
+
|
1
3
|
module Snapcrawl
|
2
4
|
module LogHelpers
|
5
|
+
include Colsole
|
6
|
+
|
3
7
|
SEVERITY_COLORS = {
|
4
|
-
'INFO' => :
|
5
|
-
'WARN' => :
|
6
|
-
'ERROR' => :
|
7
|
-
'FATAL' => :
|
8
|
-
'DEBUG' => :
|
8
|
+
'INFO' => :txtblu,
|
9
|
+
'WARN' => :txtylw,
|
10
|
+
'ERROR' => :txtred,
|
11
|
+
'FATAL' => :txtred,
|
12
|
+
'DEBUG' => :txtcyn
|
9
13
|
}
|
10
14
|
|
11
15
|
def log_formatter
|
12
16
|
proc do |severity, _time, _prog, message|
|
13
17
|
severity_color = SEVERITY_COLORS[severity]
|
14
|
-
|
15
|
-
|
18
|
+
line = "!#{severity_color}!#{severity.rjust 5}!txtrst! : #{message}\n"
|
19
|
+
use_colors? ? colorize(line) : strip_color_markers(line)
|
16
20
|
end
|
17
21
|
end
|
18
22
|
|
19
|
-
def
|
20
|
-
@
|
21
|
-
end
|
22
|
-
|
23
|
-
def log_colors!
|
24
|
-
colors? ? actual_colors : empty_colors
|
25
|
-
end
|
26
|
-
|
27
|
-
def actual_colors
|
28
|
-
{
|
29
|
-
red: "\e[31m", green: "\e[32m", yellow: "\e[33m",
|
30
|
-
blue: "\e[34m", purple: "\e[35m", cyan: "\e[36m",
|
31
|
-
underlined: "\e[4m", bold: "\e[1m",
|
32
|
-
none: "", reset: "\e[0m"
|
33
|
-
}
|
34
|
-
end
|
35
|
-
|
36
|
-
def empty_colors
|
37
|
-
{
|
38
|
-
red: "", green: "", yellow: "",
|
39
|
-
blue: "", purple: "", cyan: "",
|
40
|
-
underlined: "", bold: "",
|
41
|
-
none: "", reset: ""
|
42
|
-
}
|
43
|
-
end
|
44
|
-
|
45
|
-
def colors?
|
46
|
-
if Config.log_color == 'auto'
|
47
|
-
tty?
|
48
|
-
else
|
49
|
-
Config.log_color
|
50
|
-
end
|
23
|
+
def use_colors?
|
24
|
+
@use_colors ||= (Config.log_color == 'auto' ? tty? : Config.log_color)
|
51
25
|
end
|
52
26
|
|
53
27
|
def tty?
|
54
28
|
ENV['TTY'] == 'on' ? true : ENV['TTY'] == 'off' ? false : $stdout.tty?
|
55
29
|
end
|
30
|
+
|
31
|
+
def strip_color_markers(text)
|
32
|
+
text.gsub(/\!([a-z]{6})\!/, '')
|
33
|
+
end
|
56
34
|
end
|
57
35
|
end
|
data/lib/snapcrawl/page.rb
CHANGED
@@ -55,13 +55,13 @@ module Snapcrawl
|
|
55
55
|
response = cache.get(url) { HTTParty.get url }
|
56
56
|
|
57
57
|
if !response.success?
|
58
|
-
$logger.warn "http error on
|
58
|
+
$logger.warn "http error on !undpur!#{url}!txtrst!, code: !txtylw!#{response.code}!txtrst!, message: #{response.message.strip}"
|
59
59
|
end
|
60
60
|
|
61
61
|
response
|
62
62
|
|
63
63
|
rescue => e
|
64
|
-
$logger.error "http error on
|
64
|
+
$logger.error "http error on !undpur!#{url}!txtrst! - !txtred!#{e.class}!txtrst!: #{e.message}"
|
65
65
|
nil
|
66
66
|
|
67
67
|
end
|
@@ -95,7 +95,7 @@ module Snapcrawl
|
|
95
95
|
begin
|
96
96
|
link = Addressable::URI.join(url, link).to_s.dup
|
97
97
|
rescue => e
|
98
|
-
$logger.warn "
|
98
|
+
$logger.warn "!txtred!#{e.class}!txtrst!: #{e.message} on #{path} (link: #{link})"
|
99
99
|
return nil
|
100
100
|
end
|
101
101
|
|
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snapcrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danny Ben Shitrit
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: colsole
|
@@ -166,7 +166,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
166
|
- !ruby/object:Gem::Version
|
167
167
|
version: '0'
|
168
168
|
requirements: []
|
169
|
-
rubygems_version: 3.
|
169
|
+
rubygems_version: 3.2.3
|
170
170
|
signing_key:
|
171
171
|
specification_version: 4
|
172
172
|
summary: Crawl a website and take screenshots (CLI + Library)
|