snapcrawl 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +38 -5
- data/lib/snapcrawl/crawler.rb +17 -7
- data/lib/snapcrawl/templates/docopt.txt +12 -0
- data/lib/snapcrawl/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7f8bfcb13d6d049104a97fe95b4f20527c9e93f9
|
4
|
+
data.tar.gz: 5d39a2e40270cbe8ddfd5e0863016a665059e747
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c0ad7d74dff9e73d5892870cf162c2b61c96a3f316f6ffedf2b9bd84c09c080bf0e330d32a9f567d4f3e8bec14964afe2773e010177236f438046e3f06b87624
|
7
|
+
data.tar.gz: 015a6dd81b525bcd59cc52360c5d4989542646af72e84e48a350719cb4b88751e6def5ab201b81de9c256c7036e9894dc5ca03ae30829e7a8d4ab2de0af47aaa
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# SnapCrawl - crawl a website and take screenshots
|
2
2
|
|
3
|
-
|
4
|
-
screenshots.
|
3
|
+
SnapCrawl is a command line utility for crawling a website and saving
|
4
|
+
screenshots.
|
5
5
|
|
6
6
|
## Features
|
7
7
|
|
@@ -18,6 +18,39 @@ screenshots. It is using [Runfile](https://github.com/DannyBen/runfile).
|
|
18
18
|
|
19
19
|
## Usage
|
20
20
|
|
21
|
-
$ snapcrawl --help
|
22
|
-
|
23
|
-
|
21
|
+
$ snapcrawl --help
|
22
|
+
|
23
|
+
Snapcrawl
|
24
|
+
|
25
|
+
Usage:
|
26
|
+
snapcrawl go <url> [options]
|
27
|
+
snapcrawl -h | --help
|
28
|
+
snapcrawl -v | --version
|
29
|
+
|
30
|
+
Options:
|
31
|
+
-f --folder <path> Where to save screenshots [default: snaps]
|
32
|
+
-a --age <n> Number of seconds to consider screenshots fresh
|
33
|
+
[default: 86400]
|
34
|
+
-d --depth <n> Number of levels to crawl [default: 1]
|
35
|
+
-W --width <n> Screen width in pixels [default: 1280]
|
36
|
+
-H --height <n> Screen height in pixels. Use 0 to capture the full
|
37
|
+
page [default: 0]
|
38
|
+
-s --selector <s> CSS selector to capture
|
39
|
+
-o --only <regex> Include only URLs that match <regex>
|
40
|
+
-h --help Show this screen
|
41
|
+
-v --version Show version
|
42
|
+
|
43
|
+
Examples:
|
44
|
+
snapcrawl go example.com
|
45
|
+
snapcrawl go example.com -d2 -fscreens
|
46
|
+
snapcrawl go example.com -d2 > out.txt 2> err.txt &
|
47
|
+
snapcrawl go example.com -W360 -H480
|
48
|
+
snapcrawl go example.com --selector "#main-content"
|
49
|
+
snapcrawl go example.com --only "products|collections"
|
50
|
+
|
51
|
+
---
|
52
|
+
|
53
|
+
## Notes
|
54
|
+
|
55
|
+
1. If a URL cannot be found, SnapCrawl will report to stderr.
|
56
|
+
You can create a report by running `snapcrawl go example.com 2> err.txt`
|
data/lib/snapcrawl/crawler.rb
CHANGED
@@ -36,6 +36,8 @@ module Snapcrawl
|
|
36
36
|
|
37
37
|
def crawl(url, opts={})
|
38
38
|
defaults = {
|
39
|
+
width: 1280,
|
40
|
+
height: 0,
|
39
41
|
depth: 1,
|
40
42
|
age: 86400,
|
41
43
|
dir: 'snaps',
|
@@ -60,7 +62,11 @@ module Snapcrawl
|
|
60
62
|
next if @done.include? url
|
61
63
|
@done << url
|
62
64
|
say "\n!txtgrn!-----> Visit: #{url}"
|
63
|
-
|
65
|
+
if @opts.only and url !~ /#{@opts.only}/
|
66
|
+
say " Snap: Skipping. Does not match regex"
|
67
|
+
else
|
68
|
+
snap url
|
69
|
+
end
|
64
70
|
new_urls += extract_urls_from url
|
65
71
|
end
|
66
72
|
new_urls
|
@@ -84,9 +90,8 @@ module Snapcrawl
|
|
84
90
|
fetch_opts = {}
|
85
91
|
fetch_opts[:output] = image_path_for(url)
|
86
92
|
fetch_opts[:width] = @opts.width
|
87
|
-
fetch_opts[:height] = @opts.height if @opts.height
|
88
|
-
|
89
|
-
# :div => '.header', # selector for a specific element to take screenshot of
|
93
|
+
fetch_opts[:height] = @opts.height if @opts.height > 0
|
94
|
+
fetch_opts[:div] = @opts.selector if @opts.selector
|
90
95
|
# :top => 0, :left => 0, :width => 100, :height => 100 # dimensions for a specific area
|
91
96
|
|
92
97
|
screenshot = f.fetch fetch_opts
|
@@ -191,9 +196,14 @@ module Snapcrawl
|
|
191
196
|
|
192
197
|
def opts_from_args(args)
|
193
198
|
opts = {}
|
194
|
-
|
195
|
-
|
196
|
-
|
199
|
+
%w[folder selector only].each do |opt|
|
200
|
+
opts[opt.to_sym] = args["--#{opt}"] if args["--#{opt}"]
|
201
|
+
end
|
202
|
+
|
203
|
+
%w[age depth width height].each do |opt|
|
204
|
+
opts[opt.to_sym] = args["--#{opt}"].to_i if args["--#{opt}"]
|
205
|
+
end
|
206
|
+
|
197
207
|
opts
|
198
208
|
end
|
199
209
|
end
|
data/lib/snapcrawl/templates/docopt.txt
CHANGED
@@ -10,6 +10,18 @@ Options:
|
|
10
10
|
-a --age <n> Number of seconds to consider screenshots fresh
|
11
11
|
[default: 86400]
|
12
12
|
-d --depth <n> Number of levels to crawl [default: 1]
|
13
|
+
-W --width <n> Screen width in pixels [default: 1280]
|
14
|
+
-H --height <n> Screen height in pixels. Use 0 to capture the full
|
15
|
+
page [default: 0]
|
16
|
+
-s --selector <s> CSS selector to capture
|
17
|
+
-o --only <regex> Include only URLs that match <regex>
|
13
18
|
-h --help Show this screen
|
14
19
|
-v --version Show version
|
15
20
|
|
21
|
+
Examples:
|
22
|
+
snapcrawl go example.com
|
23
|
+
snapcrawl go example.com -d2 -fscreens
|
24
|
+
snapcrawl go example.com -d2 > out.txt 2> err.txt &
|
25
|
+
snapcrawl go example.com -W360 -H480
|
26
|
+
snapcrawl go example.com --selector "#main-content"
|
27
|
+
snapcrawl go example.com --only "products|collections"
|
data/lib/snapcrawl/version.rb
CHANGED