snapcrawl 0.4.4 → 0.5.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +68 -61
- data/bin/snapcrawl +10 -2
- data/lib/snapcrawl.rb +15 -1
- data/lib/snapcrawl/cli.rb +55 -0
- data/lib/snapcrawl/config.rb +54 -0
- data/lib/snapcrawl/crawler.rb +49 -223
- data/lib/snapcrawl/dependencies.rb +21 -0
- data/lib/snapcrawl/exceptions.rb +1 -0
- data/lib/snapcrawl/log_helpers.rb +57 -0
- data/lib/snapcrawl/page.rb +111 -0
- data/lib/snapcrawl/pretty_logger.rb +11 -0
- data/lib/snapcrawl/refinements/pair_split.rb +23 -0
- data/lib/snapcrawl/refinements/string_refinements.rb +13 -0
- data/lib/snapcrawl/screenshot.rb +62 -0
- data/lib/snapcrawl/templates/config.yml +41 -0
- data/lib/snapcrawl/templates/docopt.txt +26 -0
- data/lib/snapcrawl/version.rb +1 -1
- metadata +51 -19
- data/lib/snapcrawl/docopt.txt +0 -48
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ced7afea220ea7c23c7207037cb32d02625fc3278e8e2347c0c9327fc0f0e509
|
4
|
+
data.tar.gz: 12e7a758a10cba960027ce2152187aed99cfec0c0ea2a434431a34a11a1e2f04
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 117c0157a09a7e040c3c487c6f0d51fa20ad9c9a6be965cb8083eb32c6201effa406d0cbbd428190e1ffc41b1097347113e3ce03a88eae273a5d4d6fd2a8c85d
|
7
|
+
data.tar.gz: 5261d94ef0a0a2223963b70fd0bd8cc6c822e31a693d5bbcc8f452e51f92ef519df20decea90351c3e85f3bfaf30e725be1d6a4d76b4d2748663de44a7772e88
|
data/README.md
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
Snapcrawl - crawl a website and take screenshots
|
2
|
-
==================================================
|
1
|
+
# Snapcrawl - crawl a website and take screenshots
|
3
2
|
|
4
3
|
[](http://badge.fury.io/rb/snapcrawl)
|
5
4
|
[](https://github.com/DannyBen/snapcrawl/actions?query=workflow%3ATest)
|
@@ -11,8 +10,7 @@ Snapcrawl is a command line utility for crawling a website and saving
|
|
11
10
|
screenshots.
|
12
11
|
|
13
12
|
|
14
|
-
Features
|
15
|
-
--------------------------------------------------
|
13
|
+
## Features
|
16
14
|
|
17
15
|
- Crawls a website to any given depth and saves screenshots
|
18
16
|
- Can capture the full length of the page
|
@@ -21,100 +19,109 @@ Features
|
|
21
19
|
- Uses local caching to avoid expensive crawl operations if not needed
|
22
20
|
- Reports broken links
|
23
21
|
|
22
|
+
## Install
|
24
23
|
|
25
|
-
|
26
|
-
--------------------------------------------------
|
27
|
-
|
28
|
-
Snapcrawl requires [PhantomJS][1] and [ImageMagick][2].
|
29
|
-
|
30
|
-
|
31
|
-
Docker Image
|
32
|
-
--------------------------------------------------
|
24
|
+
**Using Docker**
|
33
25
|
|
34
26
|
You can run Snapcrawl by using this docker image (which contains all the
|
35
27
|
necessary prerequisites):
|
36
28
|
|
37
|
-
```
|
38
|
-
$ docker
|
29
|
+
```shell
|
30
|
+
$ alias snapcrawl="docker run --rm -it --volume $PWD:/app dannyben/snapcrawl"
|
39
31
|
```
|
40
32
|
|
41
|
-
|
33
|
+
For more information on the Docker image, refer to the [docker-snapcrawl][3] repository.
|
42
34
|
|
43
|
-
|
44
|
-
|
35
|
+
**Using Ruby**
|
36
|
+
|
37
|
+
```shell
|
38
|
+
$ gem install snapcrawl
|
45
39
|
```
|
46
40
|
|
47
|
-
|
41
|
+
Note that Snapcrawl requires [PhantomJS][1] and [ImageMagick][2].
|
48
42
|
|
43
|
+
## Usage
|
49
44
|
|
50
|
-
|
51
|
-
--------------------------------------------------
|
45
|
+
Snapcrawl can be configured either through a configuration file (YAML), or by specifying options in the command line.
|
52
46
|
|
47
|
+
```shell
|
48
|
+
$ snapcrawl
|
49
|
+
Usage:
|
50
|
+
snapcrawl URL [--config FILE] [SETTINGS...]
|
51
|
+
snapcrawl -h | --help
|
52
|
+
snapcrawl -v | --version
|
53
53
|
```
|
54
|
-
|
54
|
+
|
55
|
+
The default configuration filename is `snapcrawl.yml`.
|
56
|
+
|
57
|
+
Using the `--config` flag will create a template configuration file if it is not present:
|
58
|
+
|
59
|
+
```shell
|
60
|
+
$ snapcrawl example.com --config snapcrawl
|
55
61
|
```
|
56
62
|
|
63
|
+
### Specifying options in the command line
|
57
64
|
|
58
|
-
|
59
|
-
--------------------------------------------------
|
65
|
+
All configuration options can be specified in the command line as `key=value` pairs:
|
60
66
|
|
67
|
+
```shell
|
68
|
+
$ snapcrawl example.com log_level=0 depth=2 width=1024
|
61
69
|
```
|
62
|
-
$ snapcrawl --help
|
63
70
|
|
64
|
-
|
71
|
+
### Sample configuration file
|
65
72
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
73
|
+
```yaml
|
74
|
+
# All values below are the default values
|
75
|
+
|
76
|
+
# log level (0-4) 0=DEBUG 1=INFO 2=WARN 3=ERROR 4=FATAL
|
77
|
+
log_level: 1
|
70
78
|
|
71
|
-
|
72
|
-
|
73
|
-
|
79
|
+
# log_color (yes, no, auto)
|
80
|
+
# yes = always show log color
|
81
|
+
# no = never use colors
|
82
|
+
# auto = only use colors when running in an interactive terminal
|
83
|
+
log_color: auto
|
74
84
|
|
75
|
-
|
76
|
-
|
77
|
-
use the captured URL in the filename [default: %{url}]
|
85
|
+
# number of levels to crawl, 0 means capture only the root URL
|
86
|
+
depth: 1
|
78
87
|
|
79
|
-
|
80
|
-
|
88
|
+
# screenshot width in pixels
|
89
|
+
width: 1280
|
81
90
|
|
82
|
-
|
83
|
-
|
91
|
+
# screenshot height in pixels, 0 means the entire height
|
92
|
+
height: 0
|
84
93
|
|
85
|
-
|
86
|
-
|
94
|
+
# number of seconds to consider the page cache and its screenshot fresh
|
95
|
+
cache_life: 86400
|
87
96
|
|
88
|
-
|
89
|
-
|
97
|
+
# where to store the HTML page cache
|
98
|
+
cache_dir: cache
|
90
99
|
|
91
|
-
|
92
|
-
|
100
|
+
# where to store screenshots
|
101
|
+
snaps_dir: snaps
|
93
102
|
|
94
|
-
|
95
|
-
|
103
|
+
# screenshot filename template, where '%{url}' will be replaced with a
|
104
|
+
# slug version of the URL (no need to include the .png extension)
|
105
|
+
name_template: '%{url}'
|
96
106
|
|
97
|
-
|
98
|
-
|
107
|
+
# urls not matching this regular expression will be ignored
|
108
|
+
url_whitelist:
|
99
109
|
|
100
|
-
|
101
|
-
|
110
|
+
# urls matching this regular expression will be ignored
|
111
|
+
url_blacklist:
|
102
112
|
|
103
|
-
|
104
|
-
|
105
|
-
snapcrawl example.com -d2 -fscreens
|
106
|
-
snapcrawl example.com -d2 > out.txt 2> err.txt &
|
107
|
-
snapcrawl example.com -W360 -H480
|
108
|
-
snapcrawl example.com --selector "#main-content"
|
109
|
-
snapcrawl example.com --only "products|collections"
|
110
|
-
snapcrawl example.com --name "screenshot-%{url}"
|
111
|
-
snapcrawl example.com --name "`date +%Y%m%d`_%{url}"
|
113
|
+
# take a screenshot of this CSS selector only
|
114
|
+
css_selector:
|
112
115
|
```
|
113
116
|
|
117
|
+
## Contributing / Support
|
118
|
+
If you experience any issue, have a question or a suggestion, or if you wish
|
119
|
+
to contribute, feel free to [open an issue][issues].
|
120
|
+
|
114
121
|
---
|
115
122
|
|
116
123
|
[1]: http://phantomjs.org/download.html
|
117
124
|
[2]: https://imagemagick.org/script/download.php
|
118
125
|
[3]: https://github.com/DannyBen/docker-snapcrawl
|
119
|
-
|
126
|
+
[issues]: https://github.com/DannyBen/snapcrawl/issues
|
120
127
|
|
data/bin/snapcrawl
CHANGED
@@ -1,22 +1,30 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require 'snapcrawl'
|
4
|
+
require 'colsole'
|
5
|
+
|
4
6
|
trap(:INT) { abort "\r\nGoodbye" }
|
7
|
+
|
5
8
|
include Snapcrawl
|
9
|
+
include Colsole
|
6
10
|
|
7
11
|
begin
|
8
|
-
|
12
|
+
CLI.new.call ARGV
|
13
|
+
|
9
14
|
rescue MissingPhantomJS => e
|
10
15
|
message = "Cannot find phantomjs executable in the path, please install it first."
|
11
16
|
say! "\n\n!undred!#{e.class}!txtrst!\n#{message}"
|
12
17
|
exit 2
|
18
|
+
|
13
19
|
rescue MissingImageMagick=> e
|
14
20
|
message = "Cannot find convert (ImageMagick) executable in the path, please install it first."
|
15
21
|
say! "\n\n!undred!#{e.class}!txtrst!\n#{message}"
|
16
22
|
exit 3
|
23
|
+
|
17
24
|
rescue => e
|
18
25
|
puts e.backtrace.reverse if ENV['DEBUG']
|
19
|
-
say! "\n
|
26
|
+
say! "\n!undred!#{e.class}!txtrst!\n#{e.message}"
|
20
27
|
exit 1
|
28
|
+
|
21
29
|
end
|
22
30
|
|
data/lib/snapcrawl.rb
CHANGED
@@ -1,6 +1,20 @@
|
|
1
1
|
require 'snapcrawl/version'
|
2
2
|
require 'snapcrawl/exceptions'
|
3
|
+
require 'snapcrawl/refinements/pair_split'
|
4
|
+
require 'snapcrawl/refinements/string_refinements'
|
5
|
+
require 'snapcrawl/log_helpers'
|
6
|
+
require 'snapcrawl/pretty_logger'
|
7
|
+
require 'snapcrawl/dependencies'
|
8
|
+
require 'snapcrawl/config'
|
9
|
+
require 'snapcrawl/screenshot'
|
10
|
+
require 'snapcrawl/page'
|
3
11
|
require 'snapcrawl/crawler'
|
12
|
+
require 'snapcrawl/cli'
|
4
13
|
|
5
|
-
|
14
|
+
if ENV['BYEBUG']
|
15
|
+
require 'byebug'
|
16
|
+
require 'lp'
|
17
|
+
end
|
6
18
|
|
19
|
+
Snapcrawl::Config.load
|
20
|
+
$logger = Snapcrawl::PrettyLogger.new
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'colsole'
|
2
|
+
require 'docopt'
|
3
|
+
require 'fileutils'
|
4
|
+
|
5
|
+
module Snapcrawl
|
6
|
+
class CLI
|
7
|
+
include Colsole
|
8
|
+
using StringRefinements
|
9
|
+
using PairSplit
|
10
|
+
|
11
|
+
def call(args = [])
|
12
|
+
begin
|
13
|
+
execute Docopt::docopt(docopt, version: VERSION, argv: args)
|
14
|
+
rescue Docopt::Exit => e
|
15
|
+
puts e.message
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
def execute(args)
|
22
|
+
status = Config.load args['--config']
|
23
|
+
$logger.debug 'config file created' if status == :created
|
24
|
+
|
25
|
+
tweaks = args['SETTINGS'].pair_split
|
26
|
+
apply_tweaks tweaks if tweaks
|
27
|
+
|
28
|
+
Dependencies.verify
|
29
|
+
|
30
|
+
$logger.debug 'initializing cli'
|
31
|
+
FileUtils.mkdir_p Config.snaps_dir
|
32
|
+
|
33
|
+
url = args['URL'].protocolize
|
34
|
+
crawler = Crawler.new url
|
35
|
+
|
36
|
+
crawler.crawl
|
37
|
+
end
|
38
|
+
|
39
|
+
def docopt
|
40
|
+
@doc ||= File.read docopt_path
|
41
|
+
end
|
42
|
+
|
43
|
+
def docopt_path
|
44
|
+
File.expand_path "templates/docopt.txt", __dir__
|
45
|
+
end
|
46
|
+
|
47
|
+
def apply_tweaks(tweaks)
|
48
|
+
tweaks.each do |key, value|
|
49
|
+
Config.settings[key] = value
|
50
|
+
$logger.level = value if key == 'log_level'
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'sting'
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
module Snapcrawl
|
5
|
+
class Config < Sting
|
6
|
+
class << self
|
7
|
+
def load(file = nil)
|
8
|
+
reset!
|
9
|
+
push defaults
|
10
|
+
|
11
|
+
return unless file
|
12
|
+
|
13
|
+
file = "#{file}.yml" unless file =~ /\.ya?ml$/
|
14
|
+
|
15
|
+
if File.exist? file
|
16
|
+
push file
|
17
|
+
else
|
18
|
+
create_config file
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def defaults
|
25
|
+
{
|
26
|
+
depth: 1,
|
27
|
+
width: 1280,
|
28
|
+
height: 0,
|
29
|
+
cache_life: 86400,
|
30
|
+
cache_dir: 'cache',
|
31
|
+
snaps_dir: 'snaps',
|
32
|
+
name_template: '%{url}',
|
33
|
+
url_whitelist: nil,
|
34
|
+
css_selector: nil,
|
35
|
+
log_level: 1,
|
36
|
+
log_color: 'auto',
|
37
|
+
}
|
38
|
+
end
|
39
|
+
|
40
|
+
def create_config(file)
|
41
|
+
$logger.debug "creating config file %{green}#{file}%{reset}"
|
42
|
+
content = File.read config_template
|
43
|
+
dir = File.dirname file
|
44
|
+
FileUtils.mkdir_p dir
|
45
|
+
File.write file, content
|
46
|
+
end
|
47
|
+
|
48
|
+
def config_template
|
49
|
+
File.expand_path 'templates/config.yml', __dir__
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
data/lib/snapcrawl/crawler.rb
CHANGED
@@ -1,267 +1,93 @@
|
|
1
|
-
require 'colsole'
|
2
|
-
require 'docopt'
|
3
1
|
require 'fileutils'
|
4
|
-
require 'httparty'
|
5
|
-
require 'nokogiri'
|
6
|
-
require 'ostruct'
|
7
|
-
require 'pstore'
|
8
|
-
require 'addressable/uri'
|
9
|
-
require 'webshot'
|
10
2
|
|
11
3
|
module Snapcrawl
|
12
|
-
include Colsole
|
13
|
-
|
14
4
|
class Crawler
|
15
|
-
|
16
|
-
|
17
|
-
def initialize
|
18
|
-
@storefile = "snapcrawl.pstore"
|
19
|
-
@store = PStore.new(@storefile)
|
20
|
-
end
|
5
|
+
using StringRefinements
|
21
6
|
|
22
|
-
|
23
|
-
@done = []
|
24
|
-
begin
|
25
|
-
execute Docopt::docopt(doc, version: VERSION, argv: args)
|
26
|
-
rescue Docopt::Exit => e
|
27
|
-
puts e.message
|
28
|
-
end
|
29
|
-
end
|
7
|
+
attr_reader :url
|
30
8
|
|
31
|
-
def
|
32
|
-
|
33
|
-
|
34
|
-
|
9
|
+
def initialize(url)
|
10
|
+
$logger.debug "initializing crawler with %{green}#{url}%{reset}"
|
11
|
+
|
12
|
+
config_for_display = Config.settings.dup
|
13
|
+
config_for_display['name_template'] = '%%{url}'
|
14
|
+
|
15
|
+
$logger.debug "config #{config_for_display}"
|
16
|
+
@url = url
|
35
17
|
end
|
36
18
|
|
37
|
-
def
|
38
|
-
|
19
|
+
def crawl
|
20
|
+
Dependencies.verify
|
21
|
+
todo[url] = Page.new url
|
22
|
+
process_todo while todo.any?
|
39
23
|
end
|
40
24
|
|
41
25
|
private
|
42
26
|
|
43
|
-
def
|
44
|
-
|
45
|
-
defaults = {
|
46
|
-
width: 1280,
|
47
|
-
height: 0,
|
48
|
-
depth: 1,
|
49
|
-
age: 86400,
|
50
|
-
folder: 'snaps',
|
51
|
-
name: '%{url}',
|
52
|
-
base: url,
|
53
|
-
}
|
54
|
-
urls = [url]
|
55
|
-
|
56
|
-
@opts = OpenStruct.new defaults.merge(opts)
|
27
|
+
def process_todo
|
28
|
+
$logger.debug "processing queue: %{green}#{todo.count} remaining%{reset}"
|
57
29
|
|
58
|
-
|
30
|
+
url, page = todo.shift
|
31
|
+
done.push url
|
59
32
|
|
60
|
-
|
61
|
-
|
33
|
+
if process_page page
|
34
|
+
register_sub_pages page.pages if page.depth < Config.depth
|
62
35
|
end
|
63
36
|
end
|
64
37
|
|
65
|
-
def
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
say " Snap: Skipping. Does not match regex"
|
73
|
-
else
|
74
|
-
snap url
|
38
|
+
def register_sub_pages(pages)
|
39
|
+
pages.each do |sub_page|
|
40
|
+
next if todo.has_key?(sub_page) or done.include?(sub_page)
|
41
|
+
|
42
|
+
if Config.url_whitelist and sub_page.path !~ /#{Config.url_whitelist}/
|
43
|
+
$logger.debug "ignoring %{purple}%{underlined}#{sub_page.url}%{reset}, reason: whitelist"
|
44
|
+
next
|
75
45
|
end
|
76
|
-
new_urls += extract_urls_from url
|
77
|
-
end
|
78
|
-
new_urls
|
79
|
-
end
|
80
46
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
say " Snap: Skipping. File exists and seems fresh"
|
86
|
-
else
|
87
|
-
snap!(url)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
# Take a screenshot of the URL, even if file exists
|
92
|
-
def snap!(url)
|
93
|
-
say " !txtblu!Snap!!txtrst! Snapping picture... "
|
94
|
-
image_path = image_path_for url
|
47
|
+
if Config.url_blacklist and sub_page.path =~ /#{Config.url_blacklist}/
|
48
|
+
$logger.debug "ignoring %{purple}%{underlined}#{sub_page.url}%{reset}, reason: blacklist"
|
49
|
+
next
|
50
|
+
end
|
95
51
|
|
96
|
-
|
97
|
-
if @opts.selector
|
98
|
-
fetch_opts[:selector] = @opts.selector
|
99
|
-
fetch_opts[:full] = false
|
52
|
+
todo[sub_page.url] = sub_page
|
100
53
|
end
|
101
|
-
|
102
|
-
webshot_capture url, image_path, fetch_opts
|
103
|
-
say "done"
|
104
54
|
end
|
105
55
|
|
106
|
-
def
|
107
|
-
|
108
|
-
rescue => e
|
109
|
-
say "!txtred!FAILED"
|
110
|
-
say "!txtred! ! #{e.class}: #{e.message.strip}"
|
111
|
-
end
|
56
|
+
def process_page(page)
|
57
|
+
outfile = "#{Config.snaps_dir}/#{Config.name_template}.png" % { url: page.url.to_slug }
|
112
58
|
|
113
|
-
|
114
|
-
hide_output do
|
115
|
-
webshot.capture url, image_path, fetch_opts do |magick|
|
116
|
-
magick.combine_options do |c|
|
117
|
-
c.background "white"
|
118
|
-
c.gravity 'north'
|
119
|
-
c.quality 100
|
120
|
-
c.extent @opts.height > 0 ? "#{@opts.width}x#{@opts.height}" : "#{@opts.width}x"
|
121
|
-
end
|
122
|
-
end
|
123
|
-
end
|
124
|
-
end
|
59
|
+
$logger.info "processing %{purple}%{underlined}#{page.url}%{reset}, depth: #{page.depth}"
|
125
60
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
if cached
|
130
|
-
say " Crawl: Page was cached. Reading subsequent URLs from cache"
|
131
|
-
return cached
|
132
|
-
else
|
133
|
-
return extract_urls_from! url
|
61
|
+
if !page.valid?
|
62
|
+
$logger.debug "page #{page.path} is invalid, aborting process"
|
63
|
+
return false
|
134
64
|
end
|
135
|
-
end
|
136
|
-
|
137
|
-
def extract_urls_from!(url)
|
138
|
-
say " !txtblu!Crawl!!txtrst! Extracting links... "
|
139
65
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
links, warnings = normalize_links links
|
146
|
-
@store.transaction { @store[url] = links }
|
147
|
-
say "done"
|
148
|
-
warnings.each do |warning|
|
149
|
-
say "!txtylw! Warn: #{warning[:link]}"
|
150
|
-
say word_wrap " #{warning[:message]}"
|
151
|
-
end
|
152
|
-
else
|
153
|
-
links = []
|
154
|
-
say "!txtred!FAILED"
|
155
|
-
say "!txtred! ! HTTP Error: #{response.code} #{response.message.strip} at #{url}"
|
156
|
-
end
|
66
|
+
if file_fresh? outfile
|
67
|
+
$logger.info "screenshot for #{page.path} already exists"
|
68
|
+
else
|
69
|
+
$logger.info "%{bold}capturing screenshot for #{page.path}%{reset}"
|
70
|
+
page.save_screenshot outfile
|
157
71
|
end
|
158
|
-
links
|
159
|
-
end
|
160
|
-
|
161
|
-
# mkdir the screenshots folder, if needed
|
162
|
-
def make_screenshot_dir(dir)
|
163
|
-
Dir.exist? dir or FileUtils.mkdir_p dir
|
164
|
-
end
|
165
72
|
|
166
|
-
|
167
|
-
def handelize(str)
|
168
|
-
str.downcase.gsub(/[^a-z0-9]+/, '-')
|
73
|
+
true
|
169
74
|
end
|
170
75
|
|
171
|
-
# Return proper image path for a UR
|
172
|
-
def image_path_for(url)
|
173
|
-
"#{@opts.folder}/#{@opts.name}.png" % { url: handelize(url) }
|
174
|
-
end
|
175
|
-
|
176
|
-
# Add protocol to a URL if neeed
|
177
|
-
def protocolize(url)
|
178
|
-
url =~ /^http/ ? url : "http://#{url}"
|
179
|
-
end
|
180
|
-
|
181
|
-
# Return true if the file exists and is not too old
|
182
76
|
def file_fresh?(file)
|
183
|
-
|
77
|
+
Config.cache_life > 0 and File.exist?(file) and file_age(file) < Config.cache_life
|
184
78
|
end
|
185
79
|
|
186
|
-
# Return file age in seconds
|
187
80
|
def file_age(file)
|
188
81
|
(Time.now - File.stat(file).mtime).to_i
|
189
82
|
end
|
190
83
|
|
191
|
-
|
192
|
-
|
193
|
-
extensions = "png|gif|jpg|pdf|zip"
|
194
|
-
beginnings = "mailto|tel"
|
195
|
-
|
196
|
-
links_array = []
|
197
|
-
warnings = []
|
198
|
-
|
199
|
-
links.each do |link|
|
200
|
-
link = link.attribute('href').to_s.dup
|
201
|
-
|
202
|
-
# Remove #hash
|
203
|
-
link.gsub!(/#.+$/, '')
|
204
|
-
next if link.empty?
|
205
|
-
|
206
|
-
# Remove links to specific extensions and protocols
|
207
|
-
next if link =~ /\.(#{extensions})(\?.*)?$/
|
208
|
-
next if link =~ /^(#{beginnings})/
|
209
|
-
|
210
|
-
# Strip spaces
|
211
|
-
link.strip!
|
212
|
-
|
213
|
-
# Convert relative links to absolute
|
214
|
-
begin
|
215
|
-
link = Addressable::URI.join( @opts.base, link ).to_s.dup
|
216
|
-
rescue => e
|
217
|
-
warnings << { link: link, message: "#{e.class} #{e.message}" }
|
218
|
-
next
|
219
|
-
end
|
220
|
-
|
221
|
-
# Keep only links in our base domain
|
222
|
-
next unless link.include? @opts.base
|
223
|
-
|
224
|
-
links_array << link
|
225
|
-
end
|
226
|
-
|
227
|
-
[links_array.uniq, warnings]
|
228
|
-
end
|
229
|
-
|
230
|
-
def doc
|
231
|
-
@doc ||= File.read docopt
|
84
|
+
def todo
|
85
|
+
@todo ||= {}
|
232
86
|
end
|
233
87
|
|
234
|
-
def
|
235
|
-
|
88
|
+
def done
|
89
|
+
@done ||= []
|
236
90
|
end
|
237
91
|
|
238
|
-
def opts_from_args(args)
|
239
|
-
opts = {}
|
240
|
-
%w[folder name selector only].each do |opt|
|
241
|
-
opts[opt.to_sym] = args["--#{opt}"] if args["--#{opt}"]
|
242
|
-
end
|
243
|
-
|
244
|
-
%w[age depth width height].each do |opt|
|
245
|
-
opts[opt.to_sym] = args["--#{opt}"].to_i if args["--#{opt}"]
|
246
|
-
end
|
247
|
-
|
248
|
-
opts
|
249
|
-
end
|
250
|
-
|
251
|
-
def webshot
|
252
|
-
@webshot ||= Webshot::Screenshot.instance
|
253
|
-
end
|
254
|
-
|
255
|
-
# The webshot gem messes with stdout/stderr streams so we keep it in
|
256
|
-
# check by using this method. Also, in some sites (e.g. uown.co) it
|
257
|
-
# prints some output to stdout, this is why we override $stdout for
|
258
|
-
# the duration of the run.
|
259
|
-
def hide_output
|
260
|
-
keep_stdout, keep_stderr = $stdout, $stderr
|
261
|
-
$stdout, $stderr = StringIO.new, StringIO.new
|
262
|
-
yield
|
263
|
-
ensure
|
264
|
-
$stdout, $stderr = keep_stdout, keep_stderr
|
265
|
-
end
|
266
92
|
end
|
267
93
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'colsole'
|
2
|
+
|
3
|
+
module Snapcrawl
|
4
|
+
class Dependencies
|
5
|
+
class << self
|
6
|
+
include Colsole
|
7
|
+
|
8
|
+
def verify
|
9
|
+
return if @verified
|
10
|
+
|
11
|
+
$logger.debug 'verifying %{green}phantomjs%{reset} is present'
|
12
|
+
raise MissingPhantomJS unless command_exist? "phantomjs"
|
13
|
+
|
14
|
+
$logger.debug 'verifying %{green}imagemagick%{reset} is present'
|
15
|
+
raise MissingImageMagick unless command_exist? "convert"
|
16
|
+
|
17
|
+
@verified = true
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/snapcrawl/exceptions.rb
CHANGED
@@ -0,0 +1,57 @@
|
|
1
|
+
module Snapcrawl
|
2
|
+
module LogHelpers
|
3
|
+
SEVERITY_COLORS = {
|
4
|
+
'INFO' => :blue,
|
5
|
+
'WARN' => :yellow,
|
6
|
+
'ERROR' => :red,
|
7
|
+
'FATAL' => :red,
|
8
|
+
'DEBUG' => :cyan
|
9
|
+
}
|
10
|
+
|
11
|
+
def log_formatter
|
12
|
+
proc do |severity, _time, _prog, message|
|
13
|
+
severity_color = SEVERITY_COLORS[severity]
|
14
|
+
|
15
|
+
"%{#{severity_color}}#{severity.rjust 5}%{reset} : #{message}\n" % log_colors
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def log_colors
|
20
|
+
@log_colors ||= log_colors!
|
21
|
+
end
|
22
|
+
|
23
|
+
def log_colors!
|
24
|
+
colors? ? actual_colors : empty_colors
|
25
|
+
end
|
26
|
+
|
27
|
+
def actual_colors
|
28
|
+
{
|
29
|
+
red: "\e[31m", green: "\e[32m", yellow: "\e[33m",
|
30
|
+
blue: "\e[34m", purple: "\e[35m", cyan: "\e[36m",
|
31
|
+
underlined: "\e[4m", bold: "\e[1m",
|
32
|
+
none: "", reset: "\e[0m"
|
33
|
+
}
|
34
|
+
end
|
35
|
+
|
36
|
+
def empty_colors
|
37
|
+
{
|
38
|
+
red: "", green: "", yellow: "",
|
39
|
+
blue: "", purple: "", cyan: "",
|
40
|
+
underlined: "", bold: "",
|
41
|
+
none: "", reset: ""
|
42
|
+
}
|
43
|
+
end
|
44
|
+
|
45
|
+
def colors?
|
46
|
+
if Config.log_color == 'auto'
|
47
|
+
tty?
|
48
|
+
else
|
49
|
+
Config.log_color
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def tty?
|
54
|
+
ENV['TTY'] == 'on' ? true : ENV['TTY'] == 'off' ? false : $stdout.tty?
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'httparty'
|
4
|
+
require 'lightly'
|
5
|
+
require 'nokogiri'
|
6
|
+
|
7
|
+
module Snapcrawl
|
8
|
+
class Page
|
9
|
+
using StringRefinements
|
10
|
+
|
11
|
+
attr_reader :url, :depth
|
12
|
+
|
13
|
+
EXTENSION_BLACKLIST = "png|gif|jpg|pdf|zip"
|
14
|
+
PROTOCOL_BLACKLIST = "mailto|tel"
|
15
|
+
|
16
|
+
def initialize(url, depth: 0)
|
17
|
+
@url, @depth = url.protocolize, depth
|
18
|
+
end
|
19
|
+
|
20
|
+
def valid?
|
21
|
+
http_response&.success?
|
22
|
+
end
|
23
|
+
|
24
|
+
def site
|
25
|
+
@site ||= Addressable::URI.parse(url).site
|
26
|
+
end
|
27
|
+
|
28
|
+
def path
|
29
|
+
@path ||= Addressable::URI.parse(url).request_uri
|
30
|
+
end
|
31
|
+
|
32
|
+
def links
|
33
|
+
return nil unless valid?
|
34
|
+
doc = Nokogiri::HTML http_response.body
|
35
|
+
normalize_links doc.css('a')
|
36
|
+
end
|
37
|
+
|
38
|
+
def pages
|
39
|
+
return nil unless valid?
|
40
|
+
links.map { |link| Page.new link, depth: depth+1 }
|
41
|
+
end
|
42
|
+
|
43
|
+
def save_screenshot(outfile)
|
44
|
+
return false unless valid?
|
45
|
+
Screenshot.new(url).save "#{outfile}"
|
46
|
+
end
|
47
|
+
|
48
|
+
private
|
49
|
+
|
50
|
+
def http_response
|
51
|
+
@http_response ||= http_response!
|
52
|
+
end
|
53
|
+
|
54
|
+
def http_response!
|
55
|
+
response = cache.get(url) { HTTParty.get url }
|
56
|
+
|
57
|
+
if !response.success?
|
58
|
+
$logger.warn "http error on %{purple}%{underlined}#{url}%{reset}, code: %{yellow}#{response.code}%{reset}, message: #{response.message.strip}"
|
59
|
+
end
|
60
|
+
|
61
|
+
response
|
62
|
+
|
63
|
+
rescue => e
|
64
|
+
$logger.error "http error on %{purple}%{underlined}#{url}%{reset} - %{red}#{e.class}%{reset}: #{e.message}"
|
65
|
+
nil
|
66
|
+
|
67
|
+
end
|
68
|
+
|
69
|
+
def normalize_links(links)
|
70
|
+
result = []
|
71
|
+
|
72
|
+
links.each do |link|
|
73
|
+
valid_link = normalize_link link
|
74
|
+
result << valid_link if valid_link
|
75
|
+
end
|
76
|
+
|
77
|
+
result.uniq
|
78
|
+
end
|
79
|
+
|
80
|
+
def normalize_link(link)
|
81
|
+
link = link.attribute('href').to_s.dup
|
82
|
+
|
83
|
+
# Remove #hash
|
84
|
+
link.gsub!(/#.+$/, '')
|
85
|
+
return nil if link.empty?
|
86
|
+
|
87
|
+
# Remove links to specific extensions and protocols
|
88
|
+
return nil if link =~ /\.(#{EXTENSION_BLACKLIST})(\?.*)?$/
|
89
|
+
return nil if link =~ /^(#{PROTOCOL_BLACKLIST}):/
|
90
|
+
|
91
|
+
# Strip spaces
|
92
|
+
link.strip!
|
93
|
+
|
94
|
+
# Convert relative links to absolute
|
95
|
+
begin
|
96
|
+
link = Addressable::URI.join(url, link).to_s.dup
|
97
|
+
rescue => e
|
98
|
+
$logger.warn "%{red}#{e.class}%{reset}: #{e.message} on #{path} (link: #{link})"
|
99
|
+
return nil
|
100
|
+
end
|
101
|
+
|
102
|
+
# Keep only links in our base domain
|
103
|
+
return nil unless link.include? site
|
104
|
+
link
|
105
|
+
end
|
106
|
+
|
107
|
+
def cache
|
108
|
+
Lightly.new life: Config.cache_life
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Snapcrawl
|
2
|
+
module PairSplit
|
3
|
+
refine Array do
|
4
|
+
def pair_split
|
5
|
+
map do |pair|
|
6
|
+
key, value = pair.split '='
|
7
|
+
|
8
|
+
value = if value =~ /^\d+$/
|
9
|
+
value.to_i
|
10
|
+
elsif ['no', 'false'].include? value
|
11
|
+
false
|
12
|
+
elsif ['yes', 'true'].include? value
|
13
|
+
true
|
14
|
+
else
|
15
|
+
value
|
16
|
+
end
|
17
|
+
|
18
|
+
[key, value]
|
19
|
+
end.to_h
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'webshot'
|
2
|
+
|
3
|
+
module Snapcrawl
|
4
|
+
class Screenshot
|
5
|
+
using StringRefinements
|
6
|
+
|
7
|
+
attr_reader :url
|
8
|
+
|
9
|
+
def initialize(url)
|
10
|
+
@url = url
|
11
|
+
end
|
12
|
+
|
13
|
+
def save(outfile = nil)
|
14
|
+
outfile ||= "#{url.to_slug}.png"
|
15
|
+
|
16
|
+
fetch_opts = { allowed_status_codes: [404, 401, 403] }
|
17
|
+
if Config.selector
|
18
|
+
fetch_opts[:selector] = Config.selector
|
19
|
+
fetch_opts[:full] = false
|
20
|
+
end
|
21
|
+
|
22
|
+
webshot_capture url, outfile, fetch_opts
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
def webshot_capture(url, image_path, fetch_opts)
|
28
|
+
webshot_capture! url, image_path, fetch_opts
|
29
|
+
rescue => e
|
30
|
+
raise ScreenshotError, "#{e.class} #{e.message}"
|
31
|
+
end
|
32
|
+
|
33
|
+
def webshot_capture!(url, image_path, fetch_opts)
|
34
|
+
hide_output do
|
35
|
+
webshot.capture url, image_path, fetch_opts do |magick|
|
36
|
+
magick.combine_options do |c|
|
37
|
+
c.background "white"
|
38
|
+
c.gravity 'north'
|
39
|
+
c.quality 100
|
40
|
+
c.extent Config.height > 0 ? "#{Config.width}x#{Config.height}" : "#{Config.width}x"
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def webshot
|
47
|
+
@webshot ||= Webshot::Screenshot.instance
|
48
|
+
end
|
49
|
+
|
50
|
+
# The webshot gem messes with stdout/stderr streams so we keep it in
|
51
|
+
# check by using this method. Also, in some sites (e.g. uown.co) it
|
52
|
+
# prints some output to stdout, this is why we override $stdout for
|
53
|
+
# the duration of the run.
|
54
|
+
def hide_output
|
55
|
+
keep_stdout, keep_stderr = $stdout, $stderr
|
56
|
+
$stdout, $stderr = StringIO.new, StringIO.new
|
57
|
+
yield
|
58
|
+
ensure
|
59
|
+
$stdout, $stderr = keep_stdout, keep_stderr
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# All values below are the default values
|
2
|
+
|
3
|
+
# log level (0-4) 0=DEBUG 1=INFO 2=WARN 3=ERROR 4=FATAL
|
4
|
+
log_level: 1
|
5
|
+
|
6
|
+
# log_color (yes, no, auto)
|
7
|
+
# yes = always show log color
|
8
|
+
# no = never use colors
|
9
|
+
# auto = only use colors when running in an interactive terminal
|
10
|
+
log_color: auto
|
11
|
+
|
12
|
+
# number of levels to crawl, 0 means capture only the root URL
|
13
|
+
depth: 1
|
14
|
+
|
15
|
+
# screenshot width in pixels
|
16
|
+
width: 1280
|
17
|
+
|
18
|
+
# screenshot height in pixels, 0 means the entire height
|
19
|
+
height: 0
|
20
|
+
|
21
|
+
# number of seconds to consider the page cache and its screenshot fresh
|
22
|
+
cache_life: 86400
|
23
|
+
|
24
|
+
# where to store the HTML page cache
|
25
|
+
cache_dir: cache
|
26
|
+
|
27
|
+
# where to store screenshots
|
28
|
+
snaps_dir: snaps
|
29
|
+
|
30
|
+
# screenshot filename template, where '%{url}' will be replaced with a
|
31
|
+
# slug version of the URL (no need to include the .png extension)
|
32
|
+
name_template: '%{url}'
|
33
|
+
|
34
|
+
# urls not matching this regular expression will be ignored
|
35
|
+
url_whitelist:
|
36
|
+
|
37
|
+
# urls matching this regular expression will be ignored
|
38
|
+
url_blacklist:
|
39
|
+
|
40
|
+
# take a screenshot of this CSS selector only
|
41
|
+
css_selector:
|
@@ -0,0 +1,26 @@
|
|
1
|
+
Snapcrawl
|
2
|
+
|
3
|
+
Usage:
|
4
|
+
snapcrawl URL [--config FILE] [SETTINGS...]
|
5
|
+
snapcrawl -h | --help
|
6
|
+
snapcrawl -v | --version
|
7
|
+
|
8
|
+
Options:
|
9
|
+
-c, --config FILE
|
10
|
+
Path to config file, with or without the .yml extension
|
11
|
+
A sample file will be created if not found
|
12
|
+
[default: snapcrawl.yml]
|
13
|
+
|
14
|
+
-h, --help
|
15
|
+
Show this screen
|
16
|
+
|
17
|
+
-v, --version
|
18
|
+
Show version number
|
19
|
+
|
20
|
+
Settings:
|
21
|
+
You may provide any of the options available in the config as 'key=value'.
|
22
|
+
|
23
|
+
Examples:
|
24
|
+
snapcrawl example.com
|
25
|
+
snapcrawl example.com --config simple
|
26
|
+
snapcrawl example.com depth=1 log_level=2
|
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snapcrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0.rc1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danny Ben Shitrit
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-03-
|
11
|
+
date: 2020-03-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: colsole
|
@@ -16,48 +16,42 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '0.
|
20
|
-
- - ">="
|
21
|
-
- !ruby/object:Gem::Version
|
22
|
-
version: 0.5.4
|
19
|
+
version: '0.7'
|
23
20
|
type: :runtime
|
24
21
|
prerelease: false
|
25
22
|
version_requirements: !ruby/object:Gem::Requirement
|
26
23
|
requirements:
|
27
24
|
- - "~>"
|
28
25
|
- !ruby/object:Gem::Version
|
29
|
-
version: '0.
|
30
|
-
- - ">="
|
31
|
-
- !ruby/object:Gem::Version
|
32
|
-
version: 0.5.4
|
26
|
+
version: '0.7'
|
33
27
|
- !ruby/object:Gem::Dependency
|
34
28
|
name: docopt
|
35
29
|
requirement: !ruby/object:Gem::Requirement
|
36
30
|
requirements:
|
37
31
|
- - "~>"
|
38
32
|
- !ruby/object:Gem::Version
|
39
|
-
version: '0.
|
33
|
+
version: '0.6'
|
40
34
|
type: :runtime
|
41
35
|
prerelease: false
|
42
36
|
version_requirements: !ruby/object:Gem::Requirement
|
43
37
|
requirements:
|
44
38
|
- - "~>"
|
45
39
|
- !ruby/object:Gem::Version
|
46
|
-
version: '0.
|
40
|
+
version: '0.6'
|
47
41
|
- !ruby/object:Gem::Dependency
|
48
42
|
name: nokogiri
|
49
43
|
requirement: !ruby/object:Gem::Requirement
|
50
44
|
requirements:
|
51
45
|
- - "~>"
|
52
46
|
- !ruby/object:Gem::Version
|
53
|
-
version: '1.
|
47
|
+
version: '1.10'
|
54
48
|
type: :runtime
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
51
|
requirements:
|
58
52
|
- - "~>"
|
59
53
|
- !ruby/object:Gem::Version
|
60
|
-
version: '1.
|
54
|
+
version: '1.10'
|
61
55
|
- !ruby/object:Gem::Dependency
|
62
56
|
name: webshot
|
63
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -78,14 +72,14 @@ dependencies:
|
|
78
72
|
requirements:
|
79
73
|
- - "~>"
|
80
74
|
- !ruby/object:Gem::Version
|
81
|
-
version: '0.
|
75
|
+
version: '0.18'
|
82
76
|
type: :runtime
|
83
77
|
prerelease: false
|
84
78
|
version_requirements: !ruby/object:Gem::Requirement
|
85
79
|
requirements:
|
86
80
|
- - "~>"
|
87
81
|
- !ruby/object:Gem::Version
|
88
|
-
version: '0.
|
82
|
+
version: '0.18'
|
89
83
|
- !ruby/object:Gem::Dependency
|
90
84
|
name: addressable
|
91
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -100,6 +94,34 @@ dependencies:
|
|
100
94
|
- - "~>"
|
101
95
|
- !ruby/object:Gem::Version
|
102
96
|
version: '2.7'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: lightly
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0.3'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0.3'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: sting
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0.4'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0.4'
|
103
125
|
description: Snapcrawl is a command line utility for crawling a website and saving
|
104
126
|
screenshots.
|
105
127
|
email: db@dannyben.com
|
@@ -111,9 +133,19 @@ files:
|
|
111
133
|
- README.md
|
112
134
|
- bin/snapcrawl
|
113
135
|
- lib/snapcrawl.rb
|
136
|
+
- lib/snapcrawl/cli.rb
|
137
|
+
- lib/snapcrawl/config.rb
|
114
138
|
- lib/snapcrawl/crawler.rb
|
115
|
-
- lib/snapcrawl/docopt.txt
|
139
|
+
- lib/snapcrawl/dependencies.rb
|
116
140
|
- lib/snapcrawl/exceptions.rb
|
141
|
+
- lib/snapcrawl/log_helpers.rb
|
142
|
+
- lib/snapcrawl/page.rb
|
143
|
+
- lib/snapcrawl/pretty_logger.rb
|
144
|
+
- lib/snapcrawl/refinements/pair_split.rb
|
145
|
+
- lib/snapcrawl/refinements/string_refinements.rb
|
146
|
+
- lib/snapcrawl/screenshot.rb
|
147
|
+
- lib/snapcrawl/templates/config.yml
|
148
|
+
- lib/snapcrawl/templates/docopt.txt
|
117
149
|
- lib/snapcrawl/version.rb
|
118
150
|
homepage: https://github.com/DannyBen/snapcrawl
|
119
151
|
licenses:
|
@@ -130,9 +162,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
130
162
|
version: '2.3'
|
131
163
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
132
164
|
requirements:
|
133
|
-
- - ">="
|
165
|
+
- - ">"
|
134
166
|
- !ruby/object:Gem::Version
|
135
|
-
version: '0'
|
167
|
+
version: 1.3.1
|
136
168
|
requirements: []
|
137
169
|
rubygems_version: 3.0.3
|
138
170
|
signing_key:
|
data/lib/snapcrawl/docopt.txt
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
Snapcrawl
|
2
|
-
|
3
|
-
Usage:
|
4
|
-
snapcrawl URL [options]
|
5
|
-
snapcrawl -h | --help
|
6
|
-
snapcrawl -v | --version
|
7
|
-
|
8
|
-
Options:
|
9
|
-
-f, --folder PATH
|
10
|
-
Where to save screenshots [default: snaps]
|
11
|
-
|
12
|
-
-n, --name TEMPLATE
|
13
|
-
Filename template. Include the string '%{url}' anywhere in the name to
|
14
|
-
use the captured URL in the filename [default: %{url}]
|
15
|
-
|
16
|
-
-a, --age SECONDS
|
17
|
-
Number of seconds to consider screenshots fresh [default: 86400]
|
18
|
-
|
19
|
-
-d, --depth LEVELS
|
20
|
-
Number of levels to crawl [default: 1]
|
21
|
-
|
22
|
-
-W, --width PIXELS
|
23
|
-
Screen width in pixels [default: 1280]
|
24
|
-
|
25
|
-
-H, --height PIXELS
|
26
|
-
Screen height in pixels. Use 0 to capture the full page [default: 0]
|
27
|
-
|
28
|
-
-s, --selector SELECTOR
|
29
|
-
CSS selector to capture
|
30
|
-
|
31
|
-
-o, --only REGEX
|
32
|
-
Include only URLs that match REGEX
|
33
|
-
|
34
|
-
-h, --help
|
35
|
-
Show this screen
|
36
|
-
|
37
|
-
-v, --version
|
38
|
-
Show version number
|
39
|
-
|
40
|
-
Examples:
|
41
|
-
snapcrawl example.com
|
42
|
-
snapcrawl example.com -d2 -fscreens
|
43
|
-
snapcrawl example.com -d2 > out.txt 2> err.txt &
|
44
|
-
snapcrawl example.com -W360 -H480
|
45
|
-
snapcrawl example.com --selector "#main-content"
|
46
|
-
snapcrawl example.com --only "products|collections"
|
47
|
-
snapcrawl example.com --name "screenshot-%{url}"
|
48
|
-
snapcrawl example.com --name "`date +%Y%m%d`_%{url}"
|