snapcrawl 0.4.4 → 0.5.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +68 -61
- data/bin/snapcrawl +10 -2
- data/lib/snapcrawl.rb +15 -1
- data/lib/snapcrawl/cli.rb +55 -0
- data/lib/snapcrawl/config.rb +54 -0
- data/lib/snapcrawl/crawler.rb +49 -223
- data/lib/snapcrawl/dependencies.rb +21 -0
- data/lib/snapcrawl/exceptions.rb +1 -0
- data/lib/snapcrawl/log_helpers.rb +57 -0
- data/lib/snapcrawl/page.rb +111 -0
- data/lib/snapcrawl/pretty_logger.rb +11 -0
- data/lib/snapcrawl/refinements/pair_split.rb +23 -0
- data/lib/snapcrawl/refinements/string_refinements.rb +13 -0
- data/lib/snapcrawl/screenshot.rb +62 -0
- data/lib/snapcrawl/templates/config.yml +41 -0
- data/lib/snapcrawl/templates/docopt.txt +26 -0
- data/lib/snapcrawl/version.rb +1 -1
- metadata +51 -19
- data/lib/snapcrawl/docopt.txt +0 -48
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ced7afea220ea7c23c7207037cb32d02625fc3278e8e2347c0c9327fc0f0e509
|
4
|
+
data.tar.gz: 12e7a758a10cba960027ce2152187aed99cfec0c0ea2a434431a34a11a1e2f04
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 117c0157a09a7e040c3c487c6f0d51fa20ad9c9a6be965cb8083eb32c6201effa406d0cbbd428190e1ffc41b1097347113e3ce03a88eae273a5d4d6fd2a8c85d
|
7
|
+
data.tar.gz: 5261d94ef0a0a2223963b70fd0bd8cc6c822e31a693d5bbcc8f452e51f92ef519df20decea90351c3e85f3bfaf30e725be1d6a4d76b4d2748663de44a7772e88
|
data/README.md
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
Snapcrawl - crawl a website and take screenshots
|
2
|
-
==================================================
|
1
|
+
# Snapcrawl - crawl a website and take screenshots
|
3
2
|
|
4
3
|
[![Gem Version](https://badge.fury.io/rb/snapcrawl.svg)](http://badge.fury.io/rb/snapcrawl)
|
5
4
|
[![Build Status](https://github.com/DannyBen/snapcrawl/workflows/Test/badge.svg)](https://github.com/DannyBen/snapcrawl/actions?query=workflow%3ATest)
|
@@ -11,8 +10,7 @@ Snapcrawl is a command line utility for crawling a website and saving
|
|
11
10
|
screenshots.
|
12
11
|
|
13
12
|
|
14
|
-
Features
|
15
|
-
--------------------------------------------------
|
13
|
+
## Features
|
16
14
|
|
17
15
|
- Crawls a website to any given depth and saves screenshots
|
18
16
|
- Can capture the full length of the page
|
@@ -21,100 +19,109 @@ Features
|
|
21
19
|
- Uses local caching to avoid expensive crawl operations if not needed
|
22
20
|
- Reports broken links
|
23
21
|
|
22
|
+
## Install
|
24
23
|
|
25
|
-
|
26
|
-
--------------------------------------------------
|
27
|
-
|
28
|
-
Snapcrawl requires [PhantomJS][1] and [ImageMagick][2].
|
29
|
-
|
30
|
-
|
31
|
-
Docker Image
|
32
|
-
--------------------------------------------------
|
24
|
+
**Using Docker**
|
33
25
|
|
34
26
|
You can run Snapcrawl by using this docker image (which contains all the
|
35
27
|
necessary prerequisites):
|
36
28
|
|
37
|
-
```
|
38
|
-
$ docker
|
29
|
+
```shell
|
30
|
+
$ alias snapcrawl="docker run --rm -it --volume $PWD:/app dannyben/snapcrawl"
|
39
31
|
```
|
40
32
|
|
41
|
-
|
33
|
+
For more information on the Docker image, refer to the [docker-snapcrawl][3] repository.
|
42
34
|
|
43
|
-
|
44
|
-
|
35
|
+
**Using Ruby**
|
36
|
+
|
37
|
+
```shell
|
38
|
+
$ gem install snapcrawl
|
45
39
|
```
|
46
40
|
|
47
|
-
|
41
|
+
Note that Snapcrawl requires [PhantomJS][1] and [ImageMagick][2].
|
48
42
|
|
43
|
+
## Usage
|
49
44
|
|
50
|
-
|
51
|
-
--------------------------------------------------
|
45
|
+
Snapcrawl can be configured either through a configuration file (YAML), or by specifying options in the command line.
|
52
46
|
|
47
|
+
```shell
|
48
|
+
$ snapcrawl
|
49
|
+
Usage:
|
50
|
+
snapcrawl URL [--config FILE] [SETTINGS...]
|
51
|
+
snapcrawl -h | --help
|
52
|
+
snapcrawl -v | --version
|
53
53
|
```
|
54
|
-
|
54
|
+
|
55
|
+
The default configuration filename is `snapcrawl.yml`.
|
56
|
+
|
57
|
+
Using the `--config` flag will create a template configuration file if it is not present:
|
58
|
+
|
59
|
+
```shell
|
60
|
+
$ snapcrawl example.com --config snapcrawl
|
55
61
|
```
|
56
62
|
|
63
|
+
### Specifying options in the command line
|
57
64
|
|
58
|
-
|
59
|
-
--------------------------------------------------
|
65
|
+
All configuration options can be specified in the command line as `key=value` pairs:
|
60
66
|
|
67
|
+
```shell
|
68
|
+
$ snapcrawl example.com log_level=0 depth=2 width=1024
|
61
69
|
```
|
62
|
-
$ snapcrawl --help
|
63
70
|
|
64
|
-
|
71
|
+
### Sample configuration file
|
65
72
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
73
|
+
```yaml
|
74
|
+
# All values below are the default values
|
75
|
+
|
76
|
+
# log level (0-4) 0=DEBUG 1=INFO 2=WARN 3=ERROR 4=FATAL
|
77
|
+
log_level: 1
|
70
78
|
|
71
|
-
|
72
|
-
|
73
|
-
|
79
|
+
# log_color (yes, no, auto)
|
80
|
+
# yes = always show log color
|
81
|
+
# no = never use colors
|
82
|
+
# auto = only use colors when running in an interactive terminal
|
83
|
+
log_color: auto
|
74
84
|
|
75
|
-
|
76
|
-
|
77
|
-
use the captured URL in the filename [default: %{url}]
|
85
|
+
# number of levels to crawl, 0 means capture only the root URL
|
86
|
+
depth: 1
|
78
87
|
|
79
|
-
|
80
|
-
|
88
|
+
# screenshot width in pixels
|
89
|
+
width: 1280
|
81
90
|
|
82
|
-
|
83
|
-
|
91
|
+
# screenshot height in pixels, 0 means the entire height
|
92
|
+
height: 0
|
84
93
|
|
85
|
-
|
86
|
-
|
94
|
+
# number of seconds to consider the page cache and its screenshot fresh
|
95
|
+
cache_life: 86400
|
87
96
|
|
88
|
-
|
89
|
-
|
97
|
+
# where to store the HTML page cache
|
98
|
+
cache_dir: cache
|
90
99
|
|
91
|
-
|
92
|
-
|
100
|
+
# where to store screenshots
|
101
|
+
snaps_dir: snaps
|
93
102
|
|
94
|
-
|
95
|
-
|
103
|
+
# screenshot filename template, where '%{url}' will be replaced with a
|
104
|
+
# slug version of the URL (no need to include the .png extension)
|
105
|
+
name_template: '%{url}'
|
96
106
|
|
97
|
-
|
98
|
-
|
107
|
+
# urls not matching this regular expression will be ignored
|
108
|
+
url_whitelist:
|
99
109
|
|
100
|
-
|
101
|
-
|
110
|
+
# urls matching this regular expression will be ignored
|
111
|
+
url_blacklist:
|
102
112
|
|
103
|
-
|
104
|
-
|
105
|
-
snapcrawl example.com -d2 -fscreens
|
106
|
-
snapcrawl example.com -d2 > out.txt 2> err.txt &
|
107
|
-
snapcrawl example.com -W360 -H480
|
108
|
-
snapcrawl example.com --selector "#main-content"
|
109
|
-
snapcrawl example.com --only "products|collections"
|
110
|
-
snapcrawl example.com --name "screenshot-%{url}"
|
111
|
-
snapcrawl example.com --name "`date +%Y%m%d`_%{url}"
|
113
|
+
# take a screenshot of this CSS selector only
|
114
|
+
css_selector:
|
112
115
|
```
|
113
116
|
|
117
|
+
## Contributing / Support
|
118
|
+
If you experience any issue, have a question or a suggestion, or if you wish
|
119
|
+
to contribute, feel free to [open an issue][issues].
|
120
|
+
|
114
121
|
---
|
115
122
|
|
116
123
|
[1]: http://phantomjs.org/download.html
|
117
124
|
[2]: https://imagemagick.org/script/download.php
|
118
125
|
[3]: https://github.com/DannyBen/docker-snapcrawl
|
119
|
-
|
126
|
+
[issues]: https://github.com/DannyBen/snapcrawl/issues
|
120
127
|
|
data/bin/snapcrawl
CHANGED
@@ -1,22 +1,30 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require 'snapcrawl'
|
4
|
+
require 'colsole'
|
5
|
+
|
4
6
|
trap(:INT) { abort "\r\nGoodbye" }
|
7
|
+
|
5
8
|
include Snapcrawl
|
9
|
+
include Colsole
|
6
10
|
|
7
11
|
begin
|
8
|
-
|
12
|
+
CLI.new.call ARGV
|
13
|
+
|
9
14
|
rescue MissingPhantomJS => e
|
10
15
|
message = "Cannot find phantomjs executable in the path, please install it first."
|
11
16
|
say! "\n\n!undred!#{e.class}!txtrst!\n#{message}"
|
12
17
|
exit 2
|
18
|
+
|
13
19
|
rescue MissingImageMagick=> e
|
14
20
|
message = "Cannot find convert (ImageMagick) executable in the path, please install it first."
|
15
21
|
say! "\n\n!undred!#{e.class}!txtrst!\n#{message}"
|
16
22
|
exit 3
|
23
|
+
|
17
24
|
rescue => e
|
18
25
|
puts e.backtrace.reverse if ENV['DEBUG']
|
19
|
-
say! "\n
|
26
|
+
say! "\n!undred!#{e.class}!txtrst!\n#{e.message}"
|
20
27
|
exit 1
|
28
|
+
|
21
29
|
end
|
22
30
|
|
data/lib/snapcrawl.rb
CHANGED
@@ -1,6 +1,20 @@
|
|
1
1
|
require 'snapcrawl/version'
|
2
2
|
require 'snapcrawl/exceptions'
|
3
|
+
require 'snapcrawl/refinements/pair_split'
|
4
|
+
require 'snapcrawl/refinements/string_refinements'
|
5
|
+
require 'snapcrawl/log_helpers'
|
6
|
+
require 'snapcrawl/pretty_logger'
|
7
|
+
require 'snapcrawl/dependencies'
|
8
|
+
require 'snapcrawl/config'
|
9
|
+
require 'snapcrawl/screenshot'
|
10
|
+
require 'snapcrawl/page'
|
3
11
|
require 'snapcrawl/crawler'
|
12
|
+
require 'snapcrawl/cli'
|
4
13
|
|
5
|
-
|
14
|
+
if ENV['BYEBUG']
|
15
|
+
require 'byebug'
|
16
|
+
require 'lp'
|
17
|
+
end
|
6
18
|
|
19
|
+
Snapcrawl::Config.load
|
20
|
+
$logger = Snapcrawl::PrettyLogger.new
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'colsole'
|
2
|
+
require 'docopt'
|
3
|
+
require 'fileutils'
|
4
|
+
|
5
|
+
module Snapcrawl
|
6
|
+
class CLI
|
7
|
+
include Colsole
|
8
|
+
using StringRefinements
|
9
|
+
using PairSplit
|
10
|
+
|
11
|
+
def call(args = [])
|
12
|
+
begin
|
13
|
+
execute Docopt::docopt(docopt, version: VERSION, argv: args)
|
14
|
+
rescue Docopt::Exit => e
|
15
|
+
puts e.message
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
def execute(args)
|
22
|
+
status = Config.load args['--config']
|
23
|
+
$logger.debug 'config file created' if status == :created
|
24
|
+
|
25
|
+
tweaks = args['SETTINGS'].pair_split
|
26
|
+
apply_tweaks tweaks if tweaks
|
27
|
+
|
28
|
+
Dependencies.verify
|
29
|
+
|
30
|
+
$logger.debug 'initializing cli'
|
31
|
+
FileUtils.mkdir_p Config.snaps_dir
|
32
|
+
|
33
|
+
url = args['URL'].protocolize
|
34
|
+
crawler = Crawler.new url
|
35
|
+
|
36
|
+
crawler.crawl
|
37
|
+
end
|
38
|
+
|
39
|
+
def docopt
|
40
|
+
@doc ||= File.read docopt_path
|
41
|
+
end
|
42
|
+
|
43
|
+
def docopt_path
|
44
|
+
File.expand_path "templates/docopt.txt", __dir__
|
45
|
+
end
|
46
|
+
|
47
|
+
def apply_tweaks(tweaks)
|
48
|
+
tweaks.each do |key, value|
|
49
|
+
Config.settings[key] = value
|
50
|
+
$logger.level = value if key == 'log_level'
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'sting'
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
module Snapcrawl
|
5
|
+
class Config < Sting
|
6
|
+
class << self
|
7
|
+
def load(file = nil)
|
8
|
+
reset!
|
9
|
+
push defaults
|
10
|
+
|
11
|
+
return unless file
|
12
|
+
|
13
|
+
file = "#{file}.yml" unless file =~ /\.ya?ml$/
|
14
|
+
|
15
|
+
if File.exist? file
|
16
|
+
push file
|
17
|
+
else
|
18
|
+
create_config file
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def defaults
|
25
|
+
{
|
26
|
+
depth: 1,
|
27
|
+
width: 1280,
|
28
|
+
height: 0,
|
29
|
+
cache_life: 86400,
|
30
|
+
cache_dir: 'cache',
|
31
|
+
snaps_dir: 'snaps',
|
32
|
+
name_template: '%{url}',
|
33
|
+
url_whitelist: nil,
|
34
|
+
css_selector: nil,
|
35
|
+
log_level: 1,
|
36
|
+
log_color: 'auto',
|
37
|
+
}
|
38
|
+
end
|
39
|
+
|
40
|
+
def create_config(file)
|
41
|
+
$logger.debug "creating config file %{green}#{file}%{reset}"
|
42
|
+
content = File.read config_template
|
43
|
+
dir = File.dirname file
|
44
|
+
FileUtils.mkdir_p dir
|
45
|
+
File.write file, content
|
46
|
+
end
|
47
|
+
|
48
|
+
def config_template
|
49
|
+
File.expand_path 'templates/config.yml', __dir__
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
data/lib/snapcrawl/crawler.rb
CHANGED
@@ -1,267 +1,93 @@
|
|
1
|
-
require 'colsole'
|
2
|
-
require 'docopt'
|
3
1
|
require 'fileutils'
|
4
|
-
require 'httparty'
|
5
|
-
require 'nokogiri'
|
6
|
-
require 'ostruct'
|
7
|
-
require 'pstore'
|
8
|
-
require 'addressable/uri'
|
9
|
-
require 'webshot'
|
10
2
|
|
11
3
|
module Snapcrawl
|
12
|
-
include Colsole
|
13
|
-
|
14
4
|
class Crawler
|
15
|
-
|
16
|
-
|
17
|
-
def initialize
|
18
|
-
@storefile = "snapcrawl.pstore"
|
19
|
-
@store = PStore.new(@storefile)
|
20
|
-
end
|
5
|
+
using StringRefinements
|
21
6
|
|
22
|
-
|
23
|
-
@done = []
|
24
|
-
begin
|
25
|
-
execute Docopt::docopt(doc, version: VERSION, argv: args)
|
26
|
-
rescue Docopt::Exit => e
|
27
|
-
puts e.message
|
28
|
-
end
|
29
|
-
end
|
7
|
+
attr_reader :url
|
30
8
|
|
31
|
-
def
|
32
|
-
|
33
|
-
|
34
|
-
|
9
|
+
def initialize(url)
|
10
|
+
$logger.debug "initializing crawler with %{green}#{url}%{reset}"
|
11
|
+
|
12
|
+
config_for_display = Config.settings.dup
|
13
|
+
config_for_display['name_template'] = '%%{url}'
|
14
|
+
|
15
|
+
$logger.debug "config #{config_for_display}"
|
16
|
+
@url = url
|
35
17
|
end
|
36
18
|
|
37
|
-
def
|
38
|
-
|
19
|
+
def crawl
|
20
|
+
Dependencies.verify
|
21
|
+
todo[url] = Page.new url
|
22
|
+
process_todo while todo.any?
|
39
23
|
end
|
40
24
|
|
41
25
|
private
|
42
26
|
|
43
|
-
def
|
44
|
-
|
45
|
-
defaults = {
|
46
|
-
width: 1280,
|
47
|
-
height: 0,
|
48
|
-
depth: 1,
|
49
|
-
age: 86400,
|
50
|
-
folder: 'snaps',
|
51
|
-
name: '%{url}',
|
52
|
-
base: url,
|
53
|
-
}
|
54
|
-
urls = [url]
|
55
|
-
|
56
|
-
@opts = OpenStruct.new defaults.merge(opts)
|
27
|
+
def process_todo
|
28
|
+
$logger.debug "processing queue: %{green}#{todo.count} remaining%{reset}"
|
57
29
|
|
58
|
-
|
30
|
+
url, page = todo.shift
|
31
|
+
done.push url
|
59
32
|
|
60
|
-
|
61
|
-
|
33
|
+
if process_page page
|
34
|
+
register_sub_pages page.pages if page.depth < Config.depth
|
62
35
|
end
|
63
36
|
end
|
64
37
|
|
65
|
-
def
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
say " Snap: Skipping. Does not match regex"
|
73
|
-
else
|
74
|
-
snap url
|
38
|
+
def register_sub_pages(pages)
|
39
|
+
pages.each do |sub_page|
|
40
|
+
next if todo.has_key?(sub_page) or done.include?(sub_page)
|
41
|
+
|
42
|
+
if Config.url_whitelist and sub_page.path !~ /#{Config.url_whitelist}/
|
43
|
+
$logger.debug "ignoring %{purple}%{underlined}#{sub_page.url}%{reset}, reason: whitelist"
|
44
|
+
next
|
75
45
|
end
|
76
|
-
new_urls += extract_urls_from url
|
77
|
-
end
|
78
|
-
new_urls
|
79
|
-
end
|
80
46
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
say " Snap: Skipping. File exists and seems fresh"
|
86
|
-
else
|
87
|
-
snap!(url)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
# Take a screenshot of the URL, even if file exists
|
92
|
-
def snap!(url)
|
93
|
-
say " !txtblu!Snap!!txtrst! Snapping picture... "
|
94
|
-
image_path = image_path_for url
|
47
|
+
if Config.url_blacklist and sub_page.path =~ /#{Config.url_blacklist}/
|
48
|
+
$logger.debug "ignoring %{purple}%{underlined}#{sub_page.url}%{reset}, reason: blacklist"
|
49
|
+
next
|
50
|
+
end
|
95
51
|
|
96
|
-
|
97
|
-
if @opts.selector
|
98
|
-
fetch_opts[:selector] = @opts.selector
|
99
|
-
fetch_opts[:full] = false
|
52
|
+
todo[sub_page.url] = sub_page
|
100
53
|
end
|
101
|
-
|
102
|
-
webshot_capture url, image_path, fetch_opts
|
103
|
-
say "done"
|
104
54
|
end
|
105
55
|
|
106
|
-
def
|
107
|
-
|
108
|
-
rescue => e
|
109
|
-
say "!txtred!FAILED"
|
110
|
-
say "!txtred! ! #{e.class}: #{e.message.strip}"
|
111
|
-
end
|
56
|
+
def process_page(page)
|
57
|
+
outfile = "#{Config.snaps_dir}/#{Config.name_template}.png" % { url: page.url.to_slug }
|
112
58
|
|
113
|
-
|
114
|
-
hide_output do
|
115
|
-
webshot.capture url, image_path, fetch_opts do |magick|
|
116
|
-
magick.combine_options do |c|
|
117
|
-
c.background "white"
|
118
|
-
c.gravity 'north'
|
119
|
-
c.quality 100
|
120
|
-
c.extent @opts.height > 0 ? "#{@opts.width}x#{@opts.height}" : "#{@opts.width}x"
|
121
|
-
end
|
122
|
-
end
|
123
|
-
end
|
124
|
-
end
|
59
|
+
$logger.info "processing %{purple}%{underlined}#{page.url}%{reset}, depth: #{page.depth}"
|
125
60
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
if cached
|
130
|
-
say " Crawl: Page was cached. Reading subsequent URLs from cache"
|
131
|
-
return cached
|
132
|
-
else
|
133
|
-
return extract_urls_from! url
|
61
|
+
if !page.valid?
|
62
|
+
$logger.debug "page #{page.path} is invalid, aborting process"
|
63
|
+
return false
|
134
64
|
end
|
135
|
-
end
|
136
|
-
|
137
|
-
def extract_urls_from!(url)
|
138
|
-
say " !txtblu!Crawl!!txtrst! Extracting links... "
|
139
65
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
links, warnings = normalize_links links
|
146
|
-
@store.transaction { @store[url] = links }
|
147
|
-
say "done"
|
148
|
-
warnings.each do |warning|
|
149
|
-
say "!txtylw! Warn: #{warning[:link]}"
|
150
|
-
say word_wrap " #{warning[:message]}"
|
151
|
-
end
|
152
|
-
else
|
153
|
-
links = []
|
154
|
-
say "!txtred!FAILED"
|
155
|
-
say "!txtred! ! HTTP Error: #{response.code} #{response.message.strip} at #{url}"
|
156
|
-
end
|
66
|
+
if file_fresh? outfile
|
67
|
+
$logger.info "screenshot for #{page.path} already exists"
|
68
|
+
else
|
69
|
+
$logger.info "%{bold}capturing screenshot for #{page.path}%{reset}"
|
70
|
+
page.save_screenshot outfile
|
157
71
|
end
|
158
|
-
links
|
159
|
-
end
|
160
|
-
|
161
|
-
# mkdir the screenshots folder, if needed
|
162
|
-
def make_screenshot_dir(dir)
|
163
|
-
Dir.exist? dir or FileUtils.mkdir_p dir
|
164
|
-
end
|
165
72
|
|
166
|
-
|
167
|
-
def handelize(str)
|
168
|
-
str.downcase.gsub(/[^a-z0-9]+/, '-')
|
73
|
+
true
|
169
74
|
end
|
170
75
|
|
171
|
-
# Return proper image path for a UR
|
172
|
-
def image_path_for(url)
|
173
|
-
"#{@opts.folder}/#{@opts.name}.png" % { url: handelize(url) }
|
174
|
-
end
|
175
|
-
|
176
|
-
# Add protocol to a URL if neeed
|
177
|
-
def protocolize(url)
|
178
|
-
url =~ /^http/ ? url : "http://#{url}"
|
179
|
-
end
|
180
|
-
|
181
|
-
# Return true if the file exists and is not too old
|
182
76
|
def file_fresh?(file)
|
183
|
-
|
77
|
+
Config.cache_life > 0 and File.exist?(file) and file_age(file) < Config.cache_life
|
184
78
|
end
|
185
79
|
|
186
|
-
# Return file age in seconds
|
187
80
|
def file_age(file)
|
188
81
|
(Time.now - File.stat(file).mtime).to_i
|
189
82
|
end
|
190
83
|
|
191
|
-
|
192
|
-
|
193
|
-
extensions = "png|gif|jpg|pdf|zip"
|
194
|
-
beginnings = "mailto|tel"
|
195
|
-
|
196
|
-
links_array = []
|
197
|
-
warnings = []
|
198
|
-
|
199
|
-
links.each do |link|
|
200
|
-
link = link.attribute('href').to_s.dup
|
201
|
-
|
202
|
-
# Remove #hash
|
203
|
-
link.gsub!(/#.+$/, '')
|
204
|
-
next if link.empty?
|
205
|
-
|
206
|
-
# Remove links to specific extensions and protocols
|
207
|
-
next if link =~ /\.(#{extensions})(\?.*)?$/
|
208
|
-
next if link =~ /^(#{beginnings})/
|
209
|
-
|
210
|
-
# Strip spaces
|
211
|
-
link.strip!
|
212
|
-
|
213
|
-
# Convert relative links to absolute
|
214
|
-
begin
|
215
|
-
link = Addressable::URI.join( @opts.base, link ).to_s.dup
|
216
|
-
rescue => e
|
217
|
-
warnings << { link: link, message: "#{e.class} #{e.message}" }
|
218
|
-
next
|
219
|
-
end
|
220
|
-
|
221
|
-
# Keep only links in our base domain
|
222
|
-
next unless link.include? @opts.base
|
223
|
-
|
224
|
-
links_array << link
|
225
|
-
end
|
226
|
-
|
227
|
-
[links_array.uniq, warnings]
|
228
|
-
end
|
229
|
-
|
230
|
-
def doc
|
231
|
-
@doc ||= File.read docopt
|
84
|
+
def todo
|
85
|
+
@todo ||= {}
|
232
86
|
end
|
233
87
|
|
234
|
-
def
|
235
|
-
|
88
|
+
def done
|
89
|
+
@done ||= []
|
236
90
|
end
|
237
91
|
|
238
|
-
def opts_from_args(args)
|
239
|
-
opts = {}
|
240
|
-
%w[folder name selector only].each do |opt|
|
241
|
-
opts[opt.to_sym] = args["--#{opt}"] if args["--#{opt}"]
|
242
|
-
end
|
243
|
-
|
244
|
-
%w[age depth width height].each do |opt|
|
245
|
-
opts[opt.to_sym] = args["--#{opt}"].to_i if args["--#{opt}"]
|
246
|
-
end
|
247
|
-
|
248
|
-
opts
|
249
|
-
end
|
250
|
-
|
251
|
-
def webshot
|
252
|
-
@webshot ||= Webshot::Screenshot.instance
|
253
|
-
end
|
254
|
-
|
255
|
-
# The webshot gem messes with stdout/stderr streams so we keep it in
|
256
|
-
# check by using this method. Also, in some sites (e.g. uown.co) it
|
257
|
-
# prints some output to stdout, this is why we override $stdout for
|
258
|
-
# the duration of the run.
|
259
|
-
def hide_output
|
260
|
-
keep_stdout, keep_stderr = $stdout, $stderr
|
261
|
-
$stdout, $stderr = StringIO.new, StringIO.new
|
262
|
-
yield
|
263
|
-
ensure
|
264
|
-
$stdout, $stderr = keep_stdout, keep_stderr
|
265
|
-
end
|
266
92
|
end
|
267
93
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'colsole'
|
2
|
+
|
3
|
+
module Snapcrawl
|
4
|
+
class Dependencies
|
5
|
+
class << self
|
6
|
+
include Colsole
|
7
|
+
|
8
|
+
def verify
|
9
|
+
return if @verified
|
10
|
+
|
11
|
+
$logger.debug 'verifying %{green}phantomjs%{reset} is present'
|
12
|
+
raise MissingPhantomJS unless command_exist? "phantomjs"
|
13
|
+
|
14
|
+
$logger.debug 'verifying %{green}imagemagick%{reset} is present'
|
15
|
+
raise MissingImageMagick unless command_exist? "convert"
|
16
|
+
|
17
|
+
@verified = true
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/snapcrawl/exceptions.rb
CHANGED
@@ -0,0 +1,57 @@
|
|
1
|
+
module Snapcrawl
|
2
|
+
module LogHelpers
|
3
|
+
SEVERITY_COLORS = {
|
4
|
+
'INFO' => :blue,
|
5
|
+
'WARN' => :yellow,
|
6
|
+
'ERROR' => :red,
|
7
|
+
'FATAL' => :red,
|
8
|
+
'DEBUG' => :cyan
|
9
|
+
}
|
10
|
+
|
11
|
+
def log_formatter
|
12
|
+
proc do |severity, _time, _prog, message|
|
13
|
+
severity_color = SEVERITY_COLORS[severity]
|
14
|
+
|
15
|
+
"%{#{severity_color}}#{severity.rjust 5}%{reset} : #{message}\n" % log_colors
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def log_colors
|
20
|
+
@log_colors ||= log_colors!
|
21
|
+
end
|
22
|
+
|
23
|
+
def log_colors!
|
24
|
+
colors? ? actual_colors : empty_colors
|
25
|
+
end
|
26
|
+
|
27
|
+
def actual_colors
|
28
|
+
{
|
29
|
+
red: "\e[31m", green: "\e[32m", yellow: "\e[33m",
|
30
|
+
blue: "\e[34m", purple: "\e[35m", cyan: "\e[36m",
|
31
|
+
underlined: "\e[4m", bold: "\e[1m",
|
32
|
+
none: "", reset: "\e[0m"
|
33
|
+
}
|
34
|
+
end
|
35
|
+
|
36
|
+
def empty_colors
|
37
|
+
{
|
38
|
+
red: "", green: "", yellow: "",
|
39
|
+
blue: "", purple: "", cyan: "",
|
40
|
+
underlined: "", bold: "",
|
41
|
+
none: "", reset: ""
|
42
|
+
}
|
43
|
+
end
|
44
|
+
|
45
|
+
def colors?
|
46
|
+
if Config.log_color == 'auto'
|
47
|
+
tty?
|
48
|
+
else
|
49
|
+
Config.log_color
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def tty?
|
54
|
+
ENV['TTY'] == 'on' ? true : ENV['TTY'] == 'off' ? false : $stdout.tty?
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'httparty'
|
4
|
+
require 'lightly'
|
5
|
+
require 'nokogiri'
|
6
|
+
|
7
|
+
module Snapcrawl
|
8
|
+
class Page
|
9
|
+
using StringRefinements
|
10
|
+
|
11
|
+
attr_reader :url, :depth
|
12
|
+
|
13
|
+
EXTENSION_BLACKLIST = "png|gif|jpg|pdf|zip"
|
14
|
+
PROTOCOL_BLACKLIST = "mailto|tel"
|
15
|
+
|
16
|
+
def initialize(url, depth: 0)
|
17
|
+
@url, @depth = url.protocolize, depth
|
18
|
+
end
|
19
|
+
|
20
|
+
def valid?
|
21
|
+
http_response&.success?
|
22
|
+
end
|
23
|
+
|
24
|
+
def site
|
25
|
+
@site ||= Addressable::URI.parse(url).site
|
26
|
+
end
|
27
|
+
|
28
|
+
def path
|
29
|
+
@path ||= Addressable::URI.parse(url).request_uri
|
30
|
+
end
|
31
|
+
|
32
|
+
def links
|
33
|
+
return nil unless valid?
|
34
|
+
doc = Nokogiri::HTML http_response.body
|
35
|
+
normalize_links doc.css('a')
|
36
|
+
end
|
37
|
+
|
38
|
+
def pages
|
39
|
+
return nil unless valid?
|
40
|
+
links.map { |link| Page.new link, depth: depth+1 }
|
41
|
+
end
|
42
|
+
|
43
|
+
def save_screenshot(outfile)
|
44
|
+
return false unless valid?
|
45
|
+
Screenshot.new(url).save "#{outfile}"
|
46
|
+
end
|
47
|
+
|
48
|
+
private
|
49
|
+
|
50
|
+
def http_response
|
51
|
+
@http_response ||= http_response!
|
52
|
+
end
|
53
|
+
|
54
|
+
def http_response!
|
55
|
+
response = cache.get(url) { HTTParty.get url }
|
56
|
+
|
57
|
+
if !response.success?
|
58
|
+
$logger.warn "http error on %{purple}%{underlined}#{url}%{reset}, code: %{yellow}#{response.code}%{reset}, message: #{response.message.strip}"
|
59
|
+
end
|
60
|
+
|
61
|
+
response
|
62
|
+
|
63
|
+
rescue => e
|
64
|
+
$logger.error "http error on %{purple}%{underlined}#{url}%{reset} - %{red}#{e.class}%{reset}: #{e.message}"
|
65
|
+
nil
|
66
|
+
|
67
|
+
end
|
68
|
+
|
69
|
+
def normalize_links(links)
|
70
|
+
result = []
|
71
|
+
|
72
|
+
links.each do |link|
|
73
|
+
valid_link = normalize_link link
|
74
|
+
result << valid_link if valid_link
|
75
|
+
end
|
76
|
+
|
77
|
+
result.uniq
|
78
|
+
end
|
79
|
+
|
80
|
+
def normalize_link(link)
|
81
|
+
link = link.attribute('href').to_s.dup
|
82
|
+
|
83
|
+
# Remove #hash
|
84
|
+
link.gsub!(/#.+$/, '')
|
85
|
+
return nil if link.empty?
|
86
|
+
|
87
|
+
# Remove links to specific extensions and protocols
|
88
|
+
return nil if link =~ /\.(#{EXTENSION_BLACKLIST})(\?.*)?$/
|
89
|
+
return nil if link =~ /^(#{PROTOCOL_BLACKLIST}):/
|
90
|
+
|
91
|
+
# Strip spaces
|
92
|
+
link.strip!
|
93
|
+
|
94
|
+
# Convert relative links to absolute
|
95
|
+
begin
|
96
|
+
link = Addressable::URI.join(url, link).to_s.dup
|
97
|
+
rescue => e
|
98
|
+
$logger.warn "%{red}#{e.class}%{reset}: #{e.message} on #{path} (link: #{link})"
|
99
|
+
return nil
|
100
|
+
end
|
101
|
+
|
102
|
+
# Keep only links in our base domain
|
103
|
+
return nil unless link.include? site
|
104
|
+
link
|
105
|
+
end
|
106
|
+
|
107
|
+
def cache
|
108
|
+
Lightly.new life: Config.cache_life
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Snapcrawl
|
2
|
+
module PairSplit
|
3
|
+
refine Array do
|
4
|
+
def pair_split
|
5
|
+
map do |pair|
|
6
|
+
key, value = pair.split '='
|
7
|
+
|
8
|
+
value = if value =~ /^\d+$/
|
9
|
+
value.to_i
|
10
|
+
elsif ['no', 'false'].include? value
|
11
|
+
false
|
12
|
+
elsif ['yes', 'true'].include? value
|
13
|
+
true
|
14
|
+
else
|
15
|
+
value
|
16
|
+
end
|
17
|
+
|
18
|
+
[key, value]
|
19
|
+
end.to_h
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'webshot'
|
2
|
+
|
3
|
+
module Snapcrawl
|
4
|
+
class Screenshot
|
5
|
+
using StringRefinements
|
6
|
+
|
7
|
+
attr_reader :url
|
8
|
+
|
9
|
+
def initialize(url)
|
10
|
+
@url = url
|
11
|
+
end
|
12
|
+
|
13
|
+
def save(outfile = nil)
|
14
|
+
outfile ||= "#{url.to_slug}.png"
|
15
|
+
|
16
|
+
fetch_opts = { allowed_status_codes: [404, 401, 403] }
|
17
|
+
if Config.selector
|
18
|
+
fetch_opts[:selector] = Config.selector
|
19
|
+
fetch_opts[:full] = false
|
20
|
+
end
|
21
|
+
|
22
|
+
webshot_capture url, outfile, fetch_opts
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
def webshot_capture(url, image_path, fetch_opts)
|
28
|
+
webshot_capture! url, image_path, fetch_opts
|
29
|
+
rescue => e
|
30
|
+
raise ScreenshotError, "#{e.class} #{e.message}"
|
31
|
+
end
|
32
|
+
|
33
|
+
def webshot_capture!(url, image_path, fetch_opts)
|
34
|
+
hide_output do
|
35
|
+
webshot.capture url, image_path, fetch_opts do |magick|
|
36
|
+
magick.combine_options do |c|
|
37
|
+
c.background "white"
|
38
|
+
c.gravity 'north'
|
39
|
+
c.quality 100
|
40
|
+
c.extent Config.height > 0 ? "#{Config.width}x#{Config.height}" : "#{Config.width}x"
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def webshot
|
47
|
+
@webshot ||= Webshot::Screenshot.instance
|
48
|
+
end
|
49
|
+
|
50
|
+
# The webshot gem messes with stdout/stderr streams so we keep it in
|
51
|
+
# check by using this method. Also, in some sites (e.g. uown.co) it
|
52
|
+
# prints some output to stdout, this is why we override $stdout for
|
53
|
+
# the duration of the run.
|
54
|
+
def hide_output
|
55
|
+
keep_stdout, keep_stderr = $stdout, $stderr
|
56
|
+
$stdout, $stderr = StringIO.new, StringIO.new
|
57
|
+
yield
|
58
|
+
ensure
|
59
|
+
$stdout, $stderr = keep_stdout, keep_stderr
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# All values below are the default values
|
2
|
+
|
3
|
+
# log level (0-4) 0=DEBUG 1=INFO 2=WARN 3=ERROR 4=FATAL
|
4
|
+
log_level: 1
|
5
|
+
|
6
|
+
# log_color (yes, no, auto)
|
7
|
+
# yes = always show log color
|
8
|
+
# no = never use colors
|
9
|
+
# auto = only use colors when running in an interactive terminal
|
10
|
+
log_color: auto
|
11
|
+
|
12
|
+
# number of levels to crawl, 0 means capture only the root URL
|
13
|
+
depth: 1
|
14
|
+
|
15
|
+
# screenshot width in pixels
|
16
|
+
width: 1280
|
17
|
+
|
18
|
+
# screenshot height in pixels, 0 means the entire height
|
19
|
+
height: 0
|
20
|
+
|
21
|
+
# number of seconds to consider the page cache and its screenshot fresh
|
22
|
+
cache_life: 86400
|
23
|
+
|
24
|
+
# where to store the HTML page cache
|
25
|
+
cache_dir: cache
|
26
|
+
|
27
|
+
# where to store screenshots
|
28
|
+
snaps_dir: snaps
|
29
|
+
|
30
|
+
# screenshot filename template, where '%{url}' will be replaced with a
|
31
|
+
# slug version of the URL (no need to include the .png extension)
|
32
|
+
name_template: '%{url}'
|
33
|
+
|
34
|
+
# urls not matching this regular expression will be ignored
|
35
|
+
url_whitelist:
|
36
|
+
|
37
|
+
# urls matching this regular expression will be ignored
|
38
|
+
url_blacklist:
|
39
|
+
|
40
|
+
# take a screenshot of this CSS selector only
|
41
|
+
css_selector:
|
@@ -0,0 +1,26 @@
|
|
1
|
+
Snapcrawl
|
2
|
+
|
3
|
+
Usage:
|
4
|
+
snapcrawl URL [--config FILE] [SETTINGS...]
|
5
|
+
snapcrawl -h | --help
|
6
|
+
snapcrawl -v | --version
|
7
|
+
|
8
|
+
Options:
|
9
|
+
-c, --config FILE
|
10
|
+
Path to config file, with or without the .yml extension
|
11
|
+
A sample file will be created if not found
|
12
|
+
[default: snapcrawl.yml]
|
13
|
+
|
14
|
+
-h, --help
|
15
|
+
Show this screen
|
16
|
+
|
17
|
+
-v, --version
|
18
|
+
Show version number
|
19
|
+
|
20
|
+
Settings:
|
21
|
+
You may provide any of the options available in the config as 'key=value'.
|
22
|
+
|
23
|
+
Examples:
|
24
|
+
snapcrawl example.com
|
25
|
+
snapcrawl example.com --config simple
|
26
|
+
snapcrawl example.com depth=1 log_level=2
|
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snapcrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0.rc1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danny Ben Shitrit
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-03-
|
11
|
+
date: 2020-03-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: colsole
|
@@ -16,48 +16,42 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '0.
|
20
|
-
- - ">="
|
21
|
-
- !ruby/object:Gem::Version
|
22
|
-
version: 0.5.4
|
19
|
+
version: '0.7'
|
23
20
|
type: :runtime
|
24
21
|
prerelease: false
|
25
22
|
version_requirements: !ruby/object:Gem::Requirement
|
26
23
|
requirements:
|
27
24
|
- - "~>"
|
28
25
|
- !ruby/object:Gem::Version
|
29
|
-
version: '0.
|
30
|
-
- - ">="
|
31
|
-
- !ruby/object:Gem::Version
|
32
|
-
version: 0.5.4
|
26
|
+
version: '0.7'
|
33
27
|
- !ruby/object:Gem::Dependency
|
34
28
|
name: docopt
|
35
29
|
requirement: !ruby/object:Gem::Requirement
|
36
30
|
requirements:
|
37
31
|
- - "~>"
|
38
32
|
- !ruby/object:Gem::Version
|
39
|
-
version: '0.
|
33
|
+
version: '0.6'
|
40
34
|
type: :runtime
|
41
35
|
prerelease: false
|
42
36
|
version_requirements: !ruby/object:Gem::Requirement
|
43
37
|
requirements:
|
44
38
|
- - "~>"
|
45
39
|
- !ruby/object:Gem::Version
|
46
|
-
version: '0.
|
40
|
+
version: '0.6'
|
47
41
|
- !ruby/object:Gem::Dependency
|
48
42
|
name: nokogiri
|
49
43
|
requirement: !ruby/object:Gem::Requirement
|
50
44
|
requirements:
|
51
45
|
- - "~>"
|
52
46
|
- !ruby/object:Gem::Version
|
53
|
-
version: '1.
|
47
|
+
version: '1.10'
|
54
48
|
type: :runtime
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
51
|
requirements:
|
58
52
|
- - "~>"
|
59
53
|
- !ruby/object:Gem::Version
|
60
|
-
version: '1.
|
54
|
+
version: '1.10'
|
61
55
|
- !ruby/object:Gem::Dependency
|
62
56
|
name: webshot
|
63
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -78,14 +72,14 @@ dependencies:
|
|
78
72
|
requirements:
|
79
73
|
- - "~>"
|
80
74
|
- !ruby/object:Gem::Version
|
81
|
-
version: '0.
|
75
|
+
version: '0.18'
|
82
76
|
type: :runtime
|
83
77
|
prerelease: false
|
84
78
|
version_requirements: !ruby/object:Gem::Requirement
|
85
79
|
requirements:
|
86
80
|
- - "~>"
|
87
81
|
- !ruby/object:Gem::Version
|
88
|
-
version: '0.
|
82
|
+
version: '0.18'
|
89
83
|
- !ruby/object:Gem::Dependency
|
90
84
|
name: addressable
|
91
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -100,6 +94,34 @@ dependencies:
|
|
100
94
|
- - "~>"
|
101
95
|
- !ruby/object:Gem::Version
|
102
96
|
version: '2.7'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: lightly
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0.3'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0.3'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: sting
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0.4'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0.4'
|
103
125
|
description: Snapcrawl is a command line utility for crawling a website and saving
|
104
126
|
screenshots.
|
105
127
|
email: db@dannyben.com
|
@@ -111,9 +133,19 @@ files:
|
|
111
133
|
- README.md
|
112
134
|
- bin/snapcrawl
|
113
135
|
- lib/snapcrawl.rb
|
136
|
+
- lib/snapcrawl/cli.rb
|
137
|
+
- lib/snapcrawl/config.rb
|
114
138
|
- lib/snapcrawl/crawler.rb
|
115
|
-
- lib/snapcrawl/docopt.txt
|
139
|
+
- lib/snapcrawl/dependencies.rb
|
116
140
|
- lib/snapcrawl/exceptions.rb
|
141
|
+
- lib/snapcrawl/log_helpers.rb
|
142
|
+
- lib/snapcrawl/page.rb
|
143
|
+
- lib/snapcrawl/pretty_logger.rb
|
144
|
+
- lib/snapcrawl/refinements/pair_split.rb
|
145
|
+
- lib/snapcrawl/refinements/string_refinements.rb
|
146
|
+
- lib/snapcrawl/screenshot.rb
|
147
|
+
- lib/snapcrawl/templates/config.yml
|
148
|
+
- lib/snapcrawl/templates/docopt.txt
|
117
149
|
- lib/snapcrawl/version.rb
|
118
150
|
homepage: https://github.com/DannyBen/snapcrawl
|
119
151
|
licenses:
|
@@ -130,9 +162,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
130
162
|
version: '2.3'
|
131
163
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
132
164
|
requirements:
|
133
|
-
- - ">="
|
165
|
+
- - ">"
|
134
166
|
- !ruby/object:Gem::Version
|
135
|
-
version: '0'
|
167
|
+
version: 1.3.1
|
136
168
|
requirements: []
|
137
169
|
rubygems_version: 3.0.3
|
138
170
|
signing_key:
|
data/lib/snapcrawl/docopt.txt
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
Snapcrawl
|
2
|
-
|
3
|
-
Usage:
|
4
|
-
snapcrawl URL [options]
|
5
|
-
snapcrawl -h | --help
|
6
|
-
snapcrawl -v | --version
|
7
|
-
|
8
|
-
Options:
|
9
|
-
-f, --folder PATH
|
10
|
-
Where to save screenshots [default: snaps]
|
11
|
-
|
12
|
-
-n, --name TEMPLATE
|
13
|
-
Filename template. Include the string '%{url}' anywhere in the name to
|
14
|
-
use the captured URL in the filename [default: %{url}]
|
15
|
-
|
16
|
-
-a, --age SECONDS
|
17
|
-
Number of seconds to consider screenshots fresh [default: 86400]
|
18
|
-
|
19
|
-
-d, --depth LEVELS
|
20
|
-
Number of levels to crawl [default: 1]
|
21
|
-
|
22
|
-
-W, --width PIXELS
|
23
|
-
Screen width in pixels [default: 1280]
|
24
|
-
|
25
|
-
-H, --height PIXELS
|
26
|
-
Screen height in pixels. Use 0 to capture the full page [default: 0]
|
27
|
-
|
28
|
-
-s, --selector SELECTOR
|
29
|
-
CSS selector to capture
|
30
|
-
|
31
|
-
-o, --only REGEX
|
32
|
-
Include only URLs that match REGEX
|
33
|
-
|
34
|
-
-h, --help
|
35
|
-
Show this screen
|
36
|
-
|
37
|
-
-v, --version
|
38
|
-
Show version number
|
39
|
-
|
40
|
-
Examples:
|
41
|
-
snapcrawl example.com
|
42
|
-
snapcrawl example.com -d2 -fscreens
|
43
|
-
snapcrawl example.com -d2 > out.txt 2> err.txt &
|
44
|
-
snapcrawl example.com -W360 -H480
|
45
|
-
snapcrawl example.com --selector "#main-content"
|
46
|
-
snapcrawl example.com --only "products|collections"
|
47
|
-
snapcrawl example.com --name "screenshot-%{url}"
|
48
|
-
snapcrawl example.com --name "`date +%Y%m%d`_%{url}"
|