snapcrawl 0.4.3 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +69 -62
- data/bin/snapcrawl +10 -2
- data/lib/snapcrawl.rb +15 -1
- data/lib/snapcrawl/cli.rb +55 -0
- data/lib/snapcrawl/config.rb +59 -0
- data/lib/snapcrawl/crawler.rb +54 -212
- data/lib/snapcrawl/dependencies.rb +21 -0
- data/lib/snapcrawl/exceptions.rb +1 -0
- data/lib/snapcrawl/log_helpers.rb +35 -0
- data/lib/snapcrawl/page.rb +111 -0
- data/lib/snapcrawl/pretty_logger.rb +11 -0
- data/lib/snapcrawl/refinements/pair_split.rb +23 -0
- data/lib/snapcrawl/refinements/string_refinements.rb +13 -0
- data/lib/snapcrawl/screenshot.rb +62 -0
- data/lib/snapcrawl/templates/config.yml +41 -0
- data/lib/snapcrawl/templates/docopt.txt +26 -0
- data/lib/snapcrawl/version.rb +1 -1
- metadata +50 -18
- data/lib/snapcrawl/docopt.txt +0 -48
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 62a293da259afce5690315f27f2bbcd881e495a3d1b5344eb9ed9e2c46bd4a4d
+  data.tar.gz: d600fdbcd2344e5a19f853cbea67a0d8ad0c365a38d00aa4de8d02dd6e52e5b0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3ebdb2355480bacd7f7a6faba264a31086e68c1864c692607fdb6fbc11df210eee17af936ab63305484ee46ac473d50b4033be11e995b51b9050b359c81dd906
+  data.tar.gz: 42a0a9f048fe9b5b1b04426d444710a256ccc8e9a914e3277f062c4ebf760d50a018c1f189e7b0cebced1c236f5d13ca56ab4abbf808a5ec4812bf9a754a9343
data/README.md
CHANGED
@@ -1,8 +1,7 @@
-Snapcrawl - crawl a website and take screenshots
-==================================================
+# Snapcrawl - crawl a website and take screenshots
 
-[](https://travis-ci.com/DannyBen/snapcrawl)
 [](http://badge.fury.io/rb/snapcrawl)
+[](https://github.com/DannyBen/snapcrawl/actions?query=workflow%3ATest)
 [](https://codeclimate.com/github/DannyBen/snapcrawl)
 
 ---
@@ -11,8 +10,7 @@ Snapcrawl is a command line utility for crawling a website and saving
 screenshots.
 
 
-Features
---------------------------------------------------
+## Features
 
 - Crawls a website to any given depth and saves screenshots
 - Can capture the full length of the page
@@ -21,100 +19,109 @@ Features
 - Uses local caching to avoid expensive crawl operations if not needed
 - Reports broken links
 
+## Install
 
-
---------------------------------------------------
-
-Snapcrawl requires [PhantomJS][1] and [ImageMagick][2].
-
-
-Docker Image
---------------------------------------------------
+**Using Docker**
 
 You can run Snapcrawl by using this docker image (which contains all the
 necessary prerequisites):
 
-```
-$ docker
+```shell
+$ alias snapcrawl='docker run --rm -it --network host --volume "$PWD:/app" dannyben/snapcrawl'
 ```
 
-
+For more information on the Docker image, refer to the [docker-snapcrawl][3] repository.
 
-
-
+**Using Ruby**
+
+```shell
+$ gem install snapcrawl
 ```
 
-
+Note that Snapcrawl requires [PhantomJS][1] and [ImageMagick][2].
 
+## Usage
 
-
---------------------------------------------------
+Snapcrawl can be configured either through a configuration file (YAML), or by specifying options in the command line.
 
+```shell
+$ snapcrawl
+Usage:
+  snapcrawl URL [--config FILE] [SETTINGS...]
+  snapcrawl -h | --help
+  snapcrawl -v | --version
 ```
-
+
+The default configuration filename is `snapcrawl.yml`.
+
+Using the `--config` flag will create a template configuration file if it is not present:
+
+```shell
+$ snapcrawl example.com --config snapcrawl
 ```
 
+### Specifying options in the command line
 
-
---------------------------------------------------
+All configuration options can be specified in the command line as `key=value` pairs:
 
+```shell
+$ snapcrawl example.com log_level=0 depth=2 width=1024
 ```
-$ snapcrawl --help
 
-
+### Sample configuration file
 
-
-
-
-
+```yaml
+# All values below are the default values
+
+# log level (0-4) 0=DEBUG 1=INFO 2=WARN 3=ERROR 4=FATAL
+log_level: 1
 
-
-
-
+# log_color (yes, no, auto)
+# yes = always show log color
+# no = never use colors
+# auto = only use colors when running in an interactive terminal
+log_color: auto
 
-
-
-    use the captured URL in the filename [default: %{url}]
+# number of levels to crawl, 0 means capture only the root URL
+depth: 1
 
-
-
+# screenshot width in pixels
+width: 1280
 
-
-
+# screenshot height in pixels, 0 means the entire height
+height: 0
 
-
-
+# number of seconds to consider the page cache and its screenshot fresh
+cache_life: 86400
 
-
-
+# where to store the HTML page cache
+cache_dir: cache
 
-
-
+# where to store screenshots
+snaps_dir: snaps
 
-
-
+# screenshot filename template, where '%{url}' will be replaced with a
+# slug version of the URL (no need to include the .png extension)
+name_template: '%{url}'
 
-
-
+# urls not matching this regular expression will be ignored
+url_whitelist:
 
-
-
+# urls matching this regular expression will be ignored
+url_blacklist:
 
-
-
-  snapcrawl example.com -d2 -fscreens
-  snapcrawl example.com -d2 > out.txt 2> err.txt &
-  snapcrawl example.com -W360 -H480
-  snapcrawl example.com --selector "#main-content"
-  snapcrawl example.com --only "products|collections"
-  snapcrawl example.com --name "screenshot-%{url}"
-  snapcrawl example.com --name "`date +%Y%m%d`_%{url}"
+# take a screenshot of this CSS selector only
+css_selector:
 ```
 
+## Contributing / Support
+If you experience any issue, have a question or a suggestion, or if you wish
+to contribute, feel free to [open an issue][issues].
+
 ---
 
 [1]: http://phantomjs.org/download.html
 [2]: https://imagemagick.org/script/download.php
 [3]: https://github.com/DannyBen/docker-snapcrawl
-
+[issues]: https://github.com/DannyBen/snapcrawl/issues
 
data/bin/snapcrawl
CHANGED
@@ -1,22 +1,30 @@
 #!/usr/bin/env ruby
 
 require 'snapcrawl'
+require 'colsole'
+
 trap(:INT) { abort "\r\nGoodbye" }
+
 include Snapcrawl
+include Colsole
 
 begin
-
+  CLI.new.call ARGV
+
 rescue MissingPhantomJS => e
   message = "Cannot find phantomjs executable in the path, please install it first."
   say! "\n\n!undred!#{e.class}!txtrst!\n#{message}"
   exit 2
+
 rescue MissingImageMagick => e
   message = "Cannot find convert (ImageMagick) executable in the path, please install it first."
   say! "\n\n!undred!#{e.class}!txtrst!\n#{message}"
   exit 3
+
 rescue => e
   puts e.backtrace.reverse if ENV['DEBUG']
-  say! "\n
+  say! "\n!undred!#{e.class}!txtrst!\n#{e.message}"
   exit 1
+
 end
 
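The executable is now a thin wrapper: all of the work happens in `CLI#call`, and the script itself only maps the two missing-dependency errors to distinct exit codes (2 and 3), printing a backtrace only when `DEBUG` is set. The same entry point can also be driven from Ruby; a minimal sketch, assuming the gem and its PhantomJS/ImageMagick prerequisites are installed:

```ruby
# Sketch: driving the new entry point programmatically.
require 'snapcrawl'   # loading the gem also runs Config.load and sets up $logger

Snapcrawl::CLI.new.call ['example.com', 'depth=0']   # same argv the binary passes
```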
data/lib/snapcrawl.rb
CHANGED
@@ -1,6 +1,20 @@
 require 'snapcrawl/version'
 require 'snapcrawl/exceptions'
+require 'snapcrawl/refinements/pair_split'
+require 'snapcrawl/refinements/string_refinements'
+require 'snapcrawl/log_helpers'
+require 'snapcrawl/pretty_logger'
+require 'snapcrawl/dependencies'
+require 'snapcrawl/config'
+require 'snapcrawl/screenshot'
+require 'snapcrawl/page'
 require 'snapcrawl/crawler'
+require 'snapcrawl/cli'
 
-
+if ENV['BYEBUG']
+  require 'byebug'
+  require 'lp'
+end
 
+Snapcrawl::Config.load
+$logger = Snapcrawl::PrettyLogger.new
data/lib/snapcrawl/cli.rb
ADDED
@@ -0,0 +1,55 @@
+require 'colsole'
+require 'docopt'
+require 'fileutils'
+
+module Snapcrawl
+  class CLI
+    include Colsole
+    using StringRefinements
+    using PairSplit
+
+    def call(args = [])
+      begin
+        execute Docopt::docopt(docopt, version: VERSION, argv: args)
+      rescue Docopt::Exit => e
+        puts e.message
+      end
+    end
+
+    private
+
+    def execute(args)
+      config_file = args['--config']
+      Config.load config_file if config_file
+
+      tweaks = args['SETTINGS'].pair_split
+      apply_tweaks tweaks if tweaks
+
+      Dependencies.verify
+
+      $logger.debug 'initializing cli'
+      FileUtils.mkdir_p Config.snaps_dir
+
+      url = args['URL'].protocolize
+      crawler = Crawler.new url
+
+      crawler.crawl
+    end
+
+    def docopt
+      @doc ||= File.read docopt_path
+    end
+
+    def docopt_path
+      File.expand_path "templates/docopt.txt", __dir__
+    end
+
+    def apply_tweaks(tweaks)
+      tweaks.each do |key, value|
+        Config.settings[key] = value
+        $logger.level = value if key == 'log_level'
+      end
+    end
+
+  end
+end
data/lib/snapcrawl/config.rb
ADDED
@@ -0,0 +1,59 @@
+require 'sting'
+require 'fileutils'
+
+module Snapcrawl
+  class Config < Sting
+    class << self
+      def load(file = nil)
+        reset!
+        push defaults
+
+        return unless file
+
+        file = "#{file}.yml" unless file =~ /\.ya?ml$/
+
+        # FIXME: Cannot use logger here due to the "chicken and egg" with
+        # Config. The $logger is available, but it was not yet fully
+        # configured with log_level etc.
+        if File.exist? file
+          # $logger.debug "loading config file !txtgrn!#{file}"
+          push file
+        else
+          # $logger.debug "creating config file !txtgrn!#{file}"
+          create_config file
+        end
+      end
+
+      private
+
+      def defaults
+        {
+          depth: 1,
+          width: 1280,
+          height: 0,
+          cache_life: 86400,
+          cache_dir: 'cache',
+          snaps_dir: 'snaps',
+          name_template: '%{url}',
+          url_whitelist: nil,
+          url_blacklist: nil,
+          css_selector: nil,
+          log_level: 1,
+          log_color: 'auto',
+        }
+      end
+
+      def create_config(file)
+        content = File.read config_template
+        dir = File.dirname file
+        FileUtils.mkdir_p dir
+        File.write file, content
+      end
+
+      def config_template
+        File.expand_path 'templates/config.yml', __dir__
+      end
+
+    end
+  end
+end
data/lib/snapcrawl/crawler.rb
CHANGED
@@ -1,257 +1,99 @@
-require 'colsole'
-require 'docopt'
 require 'fileutils'
-require 'httparty'
-require 'nokogiri'
-require 'ostruct'
-require 'pstore'
-require 'addressable/uri'
-require 'webshot'
 
 module Snapcrawl
-  include Colsole
-
   class Crawler
-
-
-    def initialize
-      @storefile = "snapcrawl.pstore"
-      @store = PStore.new(@storefile)
-    end
+    using StringRefinements
 
-
-      @done = []
-      begin
-        execute Docopt::docopt(doc, version: VERSION, argv: args)
-      rescue Docopt::Exit => e
-        puts e.message
-      end
-    end
+    attr_reader :url
 
-    def
-
-
-
+    def initialize(url)
+      $logger.debug "initializing crawler with !txtgrn!#{url}"
+
+      config_for_display = Config.settings.dup
+      config_for_display['name_template'] = '%%{url}'
+
+      $logger.debug "config #{config_for_display}"
+      @url = url
     end
 
-    def
-
+    def crawl
+      Dependencies.verify
+      todo[url] = Page.new url
+      process_todo while todo.any?
     end
 
     private
 
-    def
-
-      defaults = {
-        width: 1280,
-        height: 0,
-        depth: 1,
-        age: 86400,
-        folder: 'snaps',
-        name: '%{url}',
-        base: url,
-      }
-      urls = [url]
-
-      @opts = OpenStruct.new defaults.merge(opts)
+    def process_todo
+      $logger.debug "processing queue: !txtgrn!#{todo.count} remaining"
 
-
+      url, page = todo.shift
+      done.push url
 
-
-
+      if process_page page
+        register_sub_pages page.pages if page.depth < Config.depth
       end
     end
 
-    def
-
-
-
-
-
-
-        say "  Snap: Skipping. Does not match regex"
-      else
-        snap url
-      end
-      new_urls += extract_urls_from url
-      end
-      new_urls
-    end
-
-    # Take a screenshot of a URL, unless we already did so recently
-    def snap(url)
-      file = image_path_for(url)
-      if file_fresh? file
-        say "  Snap: Skipping. File exists and seems fresh"
-      else
-        snap!(url)
-      end
-    end
-
-
-
-
-      image_path = image_path_for url
-
-      fetch_opts = { allowed_status_codes: [404, 401, 403] }
-      if @opts.selector
-        fetch_opts[:selector] = @opts.selector
-        fetch_opts[:full] = false
-      end
-
-      hide_output do
-        webshot.capture url, image_path, fetch_opts do |magick|
-          magick.combine_options do |c|
-            c.background "white"
-            c.gravity 'north'
-            c.quality 100
-            c.extent @opts.height > 0 ? "#{@opts.width}x#{@opts.height}" : "#{@opts.width}x"
-          end
-        end
-      end
-
-    end
-
-    def extract_urls_from(url)
-      cached = nil
-      @store.transaction { cached = @store[url] }
-      if cached
-        say "  Crawl: Page was cached. Reading subsequent URLs from cache"
-        return cached
-      else
-        return extract_urls_from! url
-      end
-    end
-
-    def
-
-
-      response = HTTParty.get url
-      if response.success?
-        doc = Nokogiri::HTML response.body
-        links = doc.css('a')
-        links, warnings = normalize_links links
-        @store.transaction { @store[url] = links }
-        say "done"
-        warnings.each do |warning|
-          say "!txtylw!  Warn: #{warning[:link]}"
-          say word_wrap "        #{warning[:message]}"
-        end
-      else
-        links = []
-        say "!txtred!FAILED"
-        say "!txtred!  ! HTTP Error: #{response.code} #{response.message.strip} at #{url}"
-      end
-      end
-      links
-    end
+    def register_sub_pages(pages)
+      pages.each do |sub_page|
+        next if todo.has_key?(sub_page) or done.include?(sub_page)
+
+        if Config.url_whitelist and sub_page.path !~ /#{Config.url_whitelist}/
+          $logger.debug "ignoring !undpur!#{sub_page.url}!txtrst!, reason: whitelist"
+          next
+        end
+
+        if Config.url_blacklist and sub_page.path =~ /#{Config.url_blacklist}/
+          $logger.debug "ignoring !undpur!#{sub_page.url}!txtrst!, reason: blacklist"
+          next
+        end
+
+        todo[sub_page.url] = sub_page
+      end
+    end
 
-
-
-
-
+    def process_page(page)
+      outfile = "#{Config.snaps_dir}/#{Config.name_template}.png" % { url: page.url.to_slug }
 
-
-
-
-
+      $logger.info "processing !undpur!#{page.url}!txtrst!, depth: #{page.depth}"
 
-
-    def image_path_for(url)
-      "#{@opts.folder}/#{@opts.name}.png" % { url: handelize(url) }
-    end
+      if !page.valid?
+        $logger.debug "page #{page.path} is invalid, aborting process"
+        return false
+      end
 
-
-
-
-    end
+      if file_fresh? outfile
+        $logger.info "screenshot for #{page.path} already exists"
+      else
+        $logger.info "!bldgrn!capturing screenshot for #{page.path}"
+        save_screenshot page, outfile
+      end
+
+      true
+    end
+
+    def save_screenshot(page, outfile)
+      page.save_screenshot outfile
+    rescue => e
+      $logger.error "screenshot error on !undpur!#{page.path}!txtrst! - !txtred!#{e.class}!txtrst!: #{e.message}"
+    end
 
-    # Return true if the file exists and is not too old
     def file_fresh?(file)
-
+      Config.cache_life > 0 and File.exist?(file) and file_age(file) < Config.cache_life
     end
 
-    # Return file age in seconds
     def file_age(file)
       (Time.now - File.stat(file).mtime).to_i
     end
 
-
-
-      extensions = "png|gif|jpg|pdf|zip"
-      beginnings = "mailto|tel"
-
-      links_array = []
-      warnings = []
-
-      links.each do |link|
-        link = link.attribute('href').to_s.dup
-
-        # Remove #hash
-        link.gsub!(/#.+$/, '')
-        next if link.empty?
-
-        # Remove links to specific extensions and protocols
-        next if link =~ /\.(#{extensions})(\?.*)?$/
-        next if link =~ /^(#{beginnings})/
-
-        # Strip spaces
-        link.strip!
-
-        # Convert relative links to absolute
-        begin
-          link = Addressable::URI.join( @opts.base, link ).to_s.dup
-        rescue => e
-          warnings << { link: link, message: "#{e.class} #{e.message}" }
-          next
-        end
-
-        # Keep only links in our base domain
-        next unless link.include? @opts.base
-
-        links_array << link
-      end
-
-      [links_array.uniq, warnings]
-    end
-
-    def doc
-      @doc ||= File.read docopt
-    end
-
-    def docopt
-      File.expand_path "docopt.txt", __dir__
-    end
-
-    def opts_from_args(args)
-      opts = {}
-      %w[folder name selector only].each do |opt|
-        opts[opt.to_sym] = args["--#{opt}"] if args["--#{opt}"]
-      end
-
-      %w[age depth width height].each do |opt|
-        opts[opt.to_sym] = args["--#{opt}"].to_i if args["--#{opt}"]
-      end
-
-      opts
+    def todo
+      @todo ||= {}
     end
 
-    def
-      @
+    def done
+      @done ||= []
     end
 
-    # The webshot gem messes with stdout/stderr streams so we keep it in
-    # check by using this method. Also, in some sites (e.g. uown.co) it
-    # prints some output to stdout, this is why we override $stdout for
-    # the duration of the run.
-    def hide_output
-      keep_stdout, keep_stderr = $stdout, $stderr
-      $stdout, $stderr = StringIO.new, StringIO.new
-      yield
-    ensure
-      $stdout, $stderr = keep_stdout, keep_stderr
-    end
   end
 end
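The recursive crawl-and-snap code of 0.4.x is replaced by a flat work queue: `todo` maps URLs to `Page` objects, `done` remembers visited URLs, and children are only enqueued while `page.depth < Config.depth`. The loop in isolation looks roughly like this sketch (stripped of the whitelist/blacklist filters, logging, and screenshot capture that the real class performs):

```ruby
# Sketch of the crawler's queue discipline.
todo = { 'https://example.com' => Snapcrawl::Page.new('https://example.com') }
done = []

while todo.any?
  url, page = todo.shift
  done.push url
  next unless page.valid?

  (page.pages || []).each do |sub|
    next if todo.key?(sub.url) || done.include?(sub.url)
    todo[sub.url] = sub if page.depth < Snapcrawl::Config.depth
  end
end
```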
data/lib/snapcrawl/dependencies.rb
ADDED
@@ -0,0 +1,21 @@
+require 'colsole'
+
+module Snapcrawl
+  class Dependencies
+    class << self
+      include Colsole
+
+      def verify
+        return if @verified
+
+        $logger.debug 'verifying !txtgrn!phantomjs!txtrst! is present'
+        raise MissingPhantomJS unless command_exist? "phantomjs"
+
+        $logger.debug 'verifying !txtgrn!imagemagick!txtrst! is present'
+        raise MissingImageMagick unless command_exist? "convert"
+
+        @verified = true
+      end
+    end
+  end
+end
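`verify` memoizes its result in `@verified`, so although it is called from both `CLI#execute` and `Crawler#crawl`, the PATH is only probed once per run. Callers are expected to rescue the two dedicated exceptions, as `bin/snapcrawl` does:

```ruby
# Sketch: consuming Dependencies.verify (see bin/snapcrawl for the real handler).
begin
  Snapcrawl::Dependencies.verify
rescue Snapcrawl::MissingPhantomJS
  warn 'phantomjs was not found in the PATH'
rescue Snapcrawl::MissingImageMagick
  warn 'convert (ImageMagick) was not found in the PATH'
end
```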
data/lib/snapcrawl/exceptions.rb
CHANGED

data/lib/snapcrawl/log_helpers.rb
ADDED
@@ -0,0 +1,35 @@
+require 'colsole'
+
+module Snapcrawl
+  module LogHelpers
+    include Colsole
+
+    SEVERITY_COLORS = {
+      'INFO'  => :txtblu,
+      'WARN'  => :txtylw,
+      'ERROR' => :txtred,
+      'FATAL' => :txtred,
+      'DEBUG' => :txtcyn
+    }
+
+    def log_formatter
+      proc do |severity, _time, _prog, message|
+        severity_color = SEVERITY_COLORS[severity]
+        line = "!#{severity_color}!#{severity.rjust 5}!txtrst! : #{message}\n"
+        use_colors? ? colorize(line) : strip_color_markers(line)
+      end
+    end
+
+    def use_colors?
+      @use_colors ||= (Config.log_color == 'auto' ? tty? : Config.log_color)
+    end
+
+    def tty?
+      ENV['TTY'] == 'on' ? true : ENV['TTY'] == 'off' ? false : $stdout.tty?
+    end
+
+    def strip_color_markers(text)
+      text.gsub(/\!([a-z]{6})\!/, '')
+    end
+  end
+end
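`PrettyLogger` itself is not included in this diff, but `log_formatter` returns a proc with the signature the stdlib `Logger` expects from its `formatter` attribute, so the wiring presumably looks something like this sketch (the `DemoLogger` class is illustrative, not the gem's actual code):

```ruby
# Sketch: plugging log_formatter into Ruby's stdlib Logger.
require 'logger'

class DemoLogger < Logger
  include Snapcrawl::LogHelpers

  def initialize
    super $stdout
    self.formatter = log_formatter              # colorizes or strips !txtxxx! markers
    self.level = Snapcrawl::Config.log_level    # 0=DEBUG .. 4=FATAL
  end
end
```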
data/lib/snapcrawl/page.rb
ADDED
@@ -0,0 +1,111 @@
+require 'addressable/uri'
+require 'fileutils'
+require 'httparty'
+require 'lightly'
+require 'nokogiri'
+
+module Snapcrawl
+  class Page
+    using StringRefinements
+
+    attr_reader :url, :depth
+
+    EXTENSION_BLACKLIST = "png|gif|jpg|pdf|zip"
+    PROTOCOL_BLACKLIST = "mailto|tel"
+
+    def initialize(url, depth: 0)
+      @url, @depth = url.protocolize, depth
+    end
+
+    def valid?
+      http_response&.success?
+    end
+
+    def site
+      @site ||= Addressable::URI.parse(url).site
+    end
+
+    def path
+      @path ||= Addressable::URI.parse(url).request_uri
+    end
+
+    def links
+      return nil unless valid?
+      doc = Nokogiri::HTML http_response.body
+      normalize_links doc.css('a')
+    end
+
+    def pages
+      return nil unless valid?
+      links.map { |link| Page.new link, depth: depth+1 }
+    end
+
+    def save_screenshot(outfile)
+      return false unless valid?
+      Screenshot.new(url).save "#{outfile}"
+    end
+
+    private
+
+    def http_response
+      @http_response ||= http_response!
+    end
+
+    def http_response!
+      response = cache.get(url) { HTTParty.get url }
+
+      if !response.success?
+        $logger.warn "http error on !undpur!#{url}!txtrst!, code: !txtylw!#{response.code}!txtrst!, message: #{response.message.strip}"
+      end
+
+      response
+
+    rescue => e
+      $logger.error "http error on !undpur!#{url}!txtrst! - !txtred!#{e.class}!txtrst!: #{e.message}"
+      nil
+
+    end
+
+    def normalize_links(links)
+      result = []
+
+      links.each do |link|
+        valid_link = normalize_link link
+        result << valid_link if valid_link
+      end
+
+      result.uniq
+    end
+
+    def normalize_link(link)
+      link = link.attribute('href').to_s.dup
+
+      # Remove #hash
+      link.gsub!(/#.+$/, '')
+      return nil if link.empty?
+
+      # Remove links to specific extensions and protocols
+      return nil if link =~ /\.(#{EXTENSION_BLACKLIST})(\?.*)?$/
+      return nil if link =~ /^(#{PROTOCOL_BLACKLIST}):/
+
+      # Strip spaces
+      link.strip!
+
+      # Convert relative links to absolute
+      begin
+        link = Addressable::URI.join(url, link).to_s.dup
+      rescue => e
+        $logger.warn "!txtred!#{e.class}!txtrst!: #{e.message} on #{path} (link: #{link})"
+        return nil
+      end
+
+      # Keep only links in our base domain
+      return nil unless link.include? site
+      link
+    end
+
+    def cache
+      Lightly.new life: Config.cache_life
+    end
+  end
+end
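`Page` bundles everything the old crawler did inline: it fetches HTML through a `Lightly` cache keyed by URL, treats HTTP failures as invalid (with a logged warning), and normalizes every `<a href>` into an absolute, same-site link. Standalone use would look roughly like this sketch (network access and a loaded `Config` assumed; `protocolize` is a string refinement not shown in this diff, presumably adding a missing `http://`):

```ruby
# Sketch: using Page directly as a small link extractor.
page = Snapcrawl::Page.new 'example.com', depth: 0

if page.valid?                               # false on HTTP or network errors
  puts page.path                             # request URI, e.g. "/"
  puts page.links                            # absolute, same-site, de-duplicated
  page.pages.each { |sub| p sub.depth }      # children are created with depth + 1
end
```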
data/lib/snapcrawl/refinements/pair_split.rb
ADDED
@@ -0,0 +1,23 @@
+module Snapcrawl
+  module PairSplit
+    refine Array do
+      def pair_split
+        map do |pair|
+          key, value = pair.split '='
+
+          value = if value =~ /^\d+$/
+            value.to_i
+          elsif ['no', 'false'].include? value
+            false
+          elsif ['yes', 'true'].include? value
+            true
+          else
+            value
+          end
+
+          [key, value]
+        end.to_h
+      end
+    end
+  end
+end
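This refinement is what gives the CLI's `key=value` settings their types: digit-only values become integers, `yes`/`true` and `no`/`false` become booleans, and anything else stays a string. For example:

```ruby
# Sketch: PairSplit in action (refinements take effect only after `using`).
using Snapcrawl::PairSplit

['depth=2', 'log_color=no', 'cache_dir=tmp'].pair_split
# => { "depth" => 2, "log_color" => false, "cache_dir" => "tmp" }
```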
data/lib/snapcrawl/screenshot.rb
ADDED
@@ -0,0 +1,62 @@
+require 'webshot'
+
+module Snapcrawl
+  class Screenshot
+    using StringRefinements
+
+    attr_reader :url
+
+    def initialize(url)
+      @url = url
+    end
+
+    def save(outfile = nil)
+      outfile ||= "#{url.to_slug}.png"
+
+      fetch_opts = { allowed_status_codes: [404, 401, 403] }
+      if Config.selector
+        fetch_opts[:selector] = Config.selector
+        fetch_opts[:full] = false
+      end
+
+      webshot_capture url, outfile, fetch_opts
+    end
+
+    private
+
+    def webshot_capture(url, image_path, fetch_opts)
+      webshot_capture! url, image_path, fetch_opts
+    rescue => e
+      raise ScreenshotError, "#{e.class} #{e.message}"
+    end
+
+    def webshot_capture!(url, image_path, fetch_opts)
+      hide_output do
+        webshot.capture url, image_path, fetch_opts do |magick|
+          magick.combine_options do |c|
+            c.background "white"
+            c.gravity 'north'
+            c.quality 100
+            c.extent Config.height > 0 ? "#{Config.width}x#{Config.height}" : "#{Config.width}x"
+          end
+        end
+      end
+    end
+
+    def webshot
+      @webshot ||= Webshot::Screenshot.instance
+    end
+
+    # The webshot gem messes with stdout/stderr streams so we keep it in
+    # check by using this method. Also, in some sites (e.g. uown.co) it
+    # prints some output to stdout, this is why we override $stdout for
+    # the duration of the run.
+    def hide_output
+      keep_stdout, keep_stderr = $stdout, $stderr
+      $stdout, $stderr = StringIO.new, StringIO.new
+      yield
+    ensure
+      $stdout, $stderr = keep_stdout, keep_stderr
+    end
+  end
+end
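`Screenshot` is now the only class that talks to the webshot gem, and any capture failure is re-raised as `ScreenshotError` (`Page#save_screenshot` is the usual caller). Direct use would look roughly like this sketch, assuming PhantomJS/ImageMagick are installed and `Config` is loaded:

```ruby
# Sketch: capturing a single URL without the crawler.
shot = Snapcrawl::Screenshot.new 'https://example.com'
begin
  shot.save 'snaps/example.png'   # a nil outfile defaults to "<url-slug>.png"
rescue Snapcrawl::ScreenshotError => e
  $logger.error e.message
end
```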
data/lib/snapcrawl/templates/config.yml
ADDED
@@ -0,0 +1,41 @@
+# All values below are the default values
+
+# log level (0-4) 0=DEBUG 1=INFO 2=WARN 3=ERROR 4=FATAL
+log_level: 1
+
+# log_color (yes, no, auto)
+# yes = always show log color
+# no = never use colors
+# auto = only use colors when running in an interactive terminal
+log_color: auto
+
+# number of levels to crawl, 0 means capture only the root URL
+depth: 1
+
+# screenshot width in pixels
+width: 1280
+
+# screenshot height in pixels, 0 means the entire height
+height: 0
+
+# number of seconds to consider the page cache and its screenshot fresh
+cache_life: 86400
+
+# where to store the HTML page cache
+cache_dir: cache
+
+# where to store screenshots
+snaps_dir: snaps
+
+# screenshot filename template, where '%{url}' will be replaced with a
+# slug version of the URL (no need to include the .png extension)
+name_template: '%{url}'
+
+# urls not matching this regular expression will be ignored
+url_whitelist:
+
+# urls matching this regular expression will be ignored
+url_blacklist:
+
+# take a screenshot of this CSS selector only
+css_selector:
data/lib/snapcrawl/templates/docopt.txt
ADDED
@@ -0,0 +1,26 @@
+Snapcrawl
+
+Usage:
+  snapcrawl URL [--config FILE] [SETTINGS...]
+  snapcrawl -h | --help
+  snapcrawl -v | --version
+
+Options:
+  -c, --config FILE
+    Path to config file, with or without the .yml extension.
+    A sample file will be created if not found.
+    The default filename is 'snapcrawl.yml'.
+
+  -h, --help
+    Show this screen
+
+  -v, --version
+    Show version number
+
+Settings:
+  Provide any of the options available in the config as 'key=value'.
+
+Examples:
+  snapcrawl example.com
+  snapcrawl example.com --config simple
+  snapcrawl example.com depth=1 log_level=2 width=768
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: snapcrawl
 version: !ruby/object:Gem::Version
-  version: 0.4.3
+  version: 0.5.2
 platform: ruby
 authors:
 - Danny Ben Shitrit
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2021-02-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: colsole
@@ -16,48 +16,42 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 0.5.4
+        version: '0.7'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '0.
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 0.5.4
+        version: '0.7'
 - !ruby/object:Gem::Dependency
   name: docopt
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.6'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.6'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.10'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '1.
+        version: '1.10'
 - !ruby/object:Gem::Dependency
   name: webshot
   requirement: !ruby/object:Gem::Requirement
@@ -78,14 +72,14 @@ dependencies:
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.18'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
      - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.18'
 - !ruby/object:Gem::Dependency
   name: addressable
   requirement: !ruby/object:Gem::Requirement
@@ -100,6 +94,34 @@ dependencies:
     - - "~>"
      - !ruby/object:Gem::Version
        version: '2.7'
+- !ruby/object:Gem::Dependency
+  name: lightly
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.3'
+- !ruby/object:Gem::Dependency
+  name: sting
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.4'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.4'
 description: Snapcrawl is a command line utility for crawling a website and saving
   screenshots.
 email: db@dannyben.com
@@ -111,9 +133,19 @@ files:
 - README.md
 - bin/snapcrawl
 - lib/snapcrawl.rb
+- lib/snapcrawl/cli.rb
+- lib/snapcrawl/config.rb
 - lib/snapcrawl/crawler.rb
-- lib/snapcrawl/
+- lib/snapcrawl/dependencies.rb
 - lib/snapcrawl/exceptions.rb
+- lib/snapcrawl/log_helpers.rb
+- lib/snapcrawl/page.rb
+- lib/snapcrawl/pretty_logger.rb
+- lib/snapcrawl/refinements/pair_split.rb
+- lib/snapcrawl/refinements/string_refinements.rb
+- lib/snapcrawl/screenshot.rb
+- lib/snapcrawl/templates/config.yml
+- lib/snapcrawl/templates/docopt.txt
 - lib/snapcrawl/version.rb
 homepage: https://github.com/DannyBen/snapcrawl
 licenses:
@@ -134,7 +166,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.2.3
 signing_key:
 specification_version: 4
 summary: Crawl a website and take screenshots (CLI + Library)
data/lib/snapcrawl/docopt.txt
DELETED
@@ -1,48 +0,0 @@
-Snapcrawl
-
-Usage:
-  snapcrawl URL [options]
-  snapcrawl -h | --help
-  snapcrawl -v | --version
-
-Options:
-  -f, --folder PATH
-    Where to save screenshots [default: snaps]
-
-  -n, --name TEMPLATE
-    Filename template. Include the string '%{url}' anywhere in the name to
-    use the captured URL in the filename [default: %{url}]
-
-  -a, --age SECONDS
-    Number of seconds to consider screenshots fresh [default: 86400]
-
-  -d, --depth LEVELS
-    Number of levels to crawl [default: 1]
-
-  -W, --width PIXELS
-    Screen width in pixels [default: 1280]
-
-  -H, --height PIXELS
-    Screen height in pixels. Use 0 to capture the full page [default: 0]
-
-  -s, --selector SELECTOR
-    CSS selector to capture
-
-  -o, --only REGEX
-    Include only URLs that match REGEX
-
-  -h, --help
-    Show this screen
-
-  -v, --version
-    Show version number
-
-Examples:
-  snapcrawl example.com
-  snapcrawl example.com -d2 -fscreens
-  snapcrawl example.com -d2 > out.txt 2> err.txt &
-  snapcrawl example.com -W360 -H480
-  snapcrawl example.com --selector "#main-content"
-  snapcrawl example.com --only "products|collections"
-  snapcrawl example.com --name "screenshot-%{url}"
-  snapcrawl example.com --name "`date +%Y%m%d`_%{url}"