snapcrawl 0.2.3 → 0.2.4rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +78 -79
- data/lib/snapcrawl/crawler.rb +236 -219
- data/lib/snapcrawl/templates/docopt.txt +27 -27
- data/lib/snapcrawl/version.rb +1 -1
- metadata +8 -84
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6cf8adbe42976bd97f013c67ffb70b5c162afaf566a1f5a16483dd0c8553dac0
|
4
|
+
data.tar.gz: c6aff321ae9078f352870899a917b7f4f9e412b1403e39399fb3fefd0c752d60
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d271cacd5fd3f60583b5c53340691c78965436c8ccab897202698c591b395d7be16325e05159d6633aae1c4076b919e361d157652fd2ab1739167c5711f893e1
|
7
|
+
data.tar.gz: 1a87b710f345e98429391dd285648302fb23ed26dc3508b4ecc788c1ceada5b9c89d5bbef56e9f37d2402b76dc6bf5c12277a73ceed6a5c1d7f5dd87a3937491
|
data/README.md
CHANGED
@@ -1,79 +1,78 @@
|
|
1
|
-
Snapcrawl - crawl a website and take screenshots
|
2
|
-
==================================================
|
3
|
-
|
4
|
-
[](http://badge.fury.io/rb/snapcrawl)
|
5
|
-
[](https://codeclimate.com/github/DannyBen/snapcrawl)
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
-
|
18
|
-
- Can
|
19
|
-
-
|
20
|
-
-
|
21
|
-
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
snapcrawl
|
40
|
-
snapcrawl -
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
-
|
45
|
-
|
46
|
-
|
47
|
-
-
|
48
|
-
-
|
49
|
-
|
50
|
-
|
51
|
-
-
|
52
|
-
-
|
53
|
-
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
snapcrawl go example.com
|
58
|
-
snapcrawl go example.com -d2
|
59
|
-
snapcrawl go example.com -
|
60
|
-
snapcrawl go example.com
|
61
|
-
snapcrawl go example.com --
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
- [
|
77
|
-
- [ ]
|
78
|
-
- [ ]
|
79
|
-
- [ ] Add screen size presets (also to user-overridable config)
|
1
|
+
Snapcrawl - crawl a website and take screenshots
|
2
|
+
==================================================
|
3
|
+
|
4
|
+
[](http://badge.fury.io/rb/snapcrawl)
|
5
|
+
[](https://codeclimate.com/github/DannyBen/snapcrawl)
|
6
|
+
|
7
|
+
---
|
8
|
+
|
9
|
+
Snapcrawl is a command line utility for crawling a website and saving
|
10
|
+
screenshots.
|
11
|
+
|
12
|
+
|
13
|
+
Features
|
14
|
+
--------------------------------------------------
|
15
|
+
|
16
|
+
- Crawls a website to any given depth and saves screenshots
|
17
|
+
- Can capture the full length of the page
|
18
|
+
- Can use a specific resolution for screenshots
|
19
|
+
- Skips capturing if the screenshot was already saved recently
|
20
|
+
- Uses local caching to avoid expensive crawl operations if not needed
|
21
|
+
- Reports broken links
|
22
|
+
|
23
|
+
|
24
|
+
Install
|
25
|
+
--------------------------------------------------
|
26
|
+
|
27
|
+
$ gem install snapcrawl
|
28
|
+
|
29
|
+
|
30
|
+
Usage
|
31
|
+
--------------------------------------------------
|
32
|
+
|
33
|
+
$ snapcrawl --help
|
34
|
+
|
35
|
+
Snapcrawl
|
36
|
+
|
37
|
+
Usage:
|
38
|
+
snapcrawl go <url> [options]
|
39
|
+
snapcrawl -h | --help
|
40
|
+
snapcrawl -v | --version
|
41
|
+
|
42
|
+
Options:
|
43
|
+
-f --folder <path> Where to save screenshots [default: snaps]
|
44
|
+
-a --age <n> Number of seconds to consider screenshots fresh
|
45
|
+
[default: 86400]
|
46
|
+
-d --depth <n> Number of levels to crawl [default: 1]
|
47
|
+
-W --width <n> Screen width in pixels [default: 1280]
|
48
|
+
-H --height <n> Screen height in pixels. Use 0 to capture the full
|
49
|
+
page [default: 0]
|
50
|
+
-s --selector <s> CSS selector to capture
|
51
|
+
-o --only <regex> Include only URLs that match <regex>
|
52
|
+
-h --help Show this screen
|
53
|
+
-v --version Show version
|
54
|
+
|
55
|
+
Examples:
|
56
|
+
snapcrawl go example.com
|
57
|
+
snapcrawl go example.com -d2 -fscreens
|
58
|
+
snapcrawl go example.com -d2 > out.txt 2> err.txt &
|
59
|
+
snapcrawl go example.com -W360 -H480
|
60
|
+
snapcrawl go example.com --selector "#main-content"
|
61
|
+
snapcrawl go example.com --only "products|collections"
|
62
|
+
|
63
|
+
---
|
64
|
+
|
65
|
+
Notes
|
66
|
+
--------------------------------------------------
|
67
|
+
|
68
|
+
1. If a URL cannot be found, Snapcrawl will report to stderr.
|
69
|
+
You can create a report by running `snapcrawl go example.com 2> err.txt`
|
70
|
+
|
71
|
+
|
72
|
+
Todo
|
73
|
+
--------------------------------------------------
|
74
|
+
|
75
|
+
- [x] Tests (probably against some ad hoc sinatra)
|
76
|
+
- [ ] Make the test server start/stop automatically when testing
|
77
|
+
- [ ] Move ignored file extensions and mailto/tel links to config
|
78
|
+
- [ ] Add screen size presets (also to user-overridable config)
|
data/lib/snapcrawl/crawler.rb
CHANGED
@@ -1,219 +1,236 @@
|
|
1
|
-
require 'colsole'
|
2
|
-
require 'docopt'
|
3
|
-
require 'fileutils'
|
4
|
-
require 'nokogiri'
|
5
|
-
require 'open-uri'
|
6
|
-
require 'ostruct'
|
7
|
-
require 'pstore'
|
8
|
-
require '
|
9
|
-
|
10
|
-
module Snapcrawl
|
11
|
-
include Colsole
|
12
|
-
|
13
|
-
class Crawler
|
14
|
-
def self.instance
|
15
|
-
@@instance ||= self.new
|
16
|
-
end
|
17
|
-
|
18
|
-
def initialize
|
19
|
-
@storefile = "snapcrawl.pstore"
|
20
|
-
@store = PStore.new(@storefile)
|
21
|
-
end
|
22
|
-
|
23
|
-
def handle(args)
|
24
|
-
@done = []
|
25
|
-
begin
|
26
|
-
execute Docopt::docopt(doc, argv: args)
|
27
|
-
rescue Docopt::Exit => e
|
28
|
-
puts e.message
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
def execute(args)
|
33
|
-
return show_version if args['--version']
|
34
|
-
crawl args['<url>'].dup, opts_from_args(args)
|
35
|
-
end
|
36
|
-
|
37
|
-
def clear_cache
|
38
|
-
FileUtils.rm @storefile if File.exist? @storefile
|
39
|
-
end
|
40
|
-
|
41
|
-
private
|
42
|
-
|
43
|
-
def crawl(url, opts={})
|
44
|
-
defaults = {
|
45
|
-
width: 1280,
|
46
|
-
height: 0,
|
47
|
-
depth: 1,
|
48
|
-
age: 86400,
|
49
|
-
dir: 'snaps',
|
50
|
-
base: url,
|
51
|
-
}
|
52
|
-
urls = [protocolize(url)]
|
53
|
-
|
54
|
-
@opts = OpenStruct.new defaults.merge(opts)
|
55
|
-
|
56
|
-
make_screenshot_dir @opts.dir
|
57
|
-
|
58
|
-
@opts.depth.times do
|
59
|
-
urls = crawl_and_snap urls
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
def crawl_and_snap(urls)
|
64
|
-
new_urls = []
|
65
|
-
urls.each do |url|
|
66
|
-
next if @done.include? url
|
67
|
-
@done << url
|
68
|
-
say "\n!txtgrn!-----> Visit: #{url}"
|
69
|
-
if @opts.only and url !~ /#{@opts.only}/
|
70
|
-
say " Snap: Skipping. Does not match regex"
|
71
|
-
else
|
72
|
-
snap url
|
73
|
-
end
|
74
|
-
new_urls += extract_urls_from url
|
75
|
-
end
|
76
|
-
new_urls
|
77
|
-
end
|
78
|
-
|
79
|
-
# Take a screenshot of a URL, unless we already did so recently
|
80
|
-
def snap(url)
|
81
|
-
file = image_path_for(url)
|
82
|
-
if file_fresh? file
|
83
|
-
say " Snap: Skipping. File exists and seems fresh"
|
84
|
-
else
|
85
|
-
snap!(url)
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
# Take a screenshot of the URL, even if file exists
|
90
|
-
def snap!(url)
|
91
|
-
say " !txtblu!Snap!!txtrst! Snapping picture... "
|
92
|
-
|
93
|
-
|
94
|
-
fetch_opts = {}
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
links
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
end
|
218
|
-
|
219
|
-
|
1
|
+
require 'colsole'
|
2
|
+
require 'docopt'
|
3
|
+
require 'fileutils'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'open-uri'
|
6
|
+
require 'ostruct'
|
7
|
+
require 'pstore'
|
8
|
+
require 'webshot'
|
9
|
+
|
10
|
+
module Snapcrawl
|
11
|
+
include Colsole
|
12
|
+
|
13
|
+
class Crawler
|
14
|
+
def self.instance
|
15
|
+
@@instance ||= self.new
|
16
|
+
end
|
17
|
+
|
18
|
+
def initialize
|
19
|
+
@storefile = "snapcrawl.pstore"
|
20
|
+
@store = PStore.new(@storefile)
|
21
|
+
end
|
22
|
+
|
23
|
+
def handle(args)
|
24
|
+
@done = []
|
25
|
+
begin
|
26
|
+
execute Docopt::docopt(doc, argv: args)
|
27
|
+
rescue Docopt::Exit => e
|
28
|
+
puts e.message
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def execute(args)
|
33
|
+
return show_version if args['--version']
|
34
|
+
crawl args['<url>'].dup, opts_from_args(args)
|
35
|
+
end
|
36
|
+
|
37
|
+
def clear_cache
|
38
|
+
FileUtils.rm @storefile if File.exist? @storefile
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def crawl(url, opts={})
|
44
|
+
defaults = {
|
45
|
+
width: 1280,
|
46
|
+
height: 0,
|
47
|
+
depth: 1,
|
48
|
+
age: 86400,
|
49
|
+
dir: 'snaps',
|
50
|
+
base: url,
|
51
|
+
}
|
52
|
+
urls = [protocolize(url)]
|
53
|
+
|
54
|
+
@opts = OpenStruct.new defaults.merge(opts)
|
55
|
+
|
56
|
+
make_screenshot_dir @opts.dir
|
57
|
+
|
58
|
+
@opts.depth.times do
|
59
|
+
urls = crawl_and_snap urls
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
def crawl_and_snap(urls)
|
64
|
+
new_urls = []
|
65
|
+
urls.each do |url|
|
66
|
+
next if @done.include? url
|
67
|
+
@done << url
|
68
|
+
say "\n!txtgrn!-----> Visit: #{url}"
|
69
|
+
if @opts.only and url !~ /#{@opts.only}/
|
70
|
+
say " Snap: Skipping. Does not match regex"
|
71
|
+
else
|
72
|
+
snap url
|
73
|
+
end
|
74
|
+
new_urls += extract_urls_from url
|
75
|
+
end
|
76
|
+
new_urls
|
77
|
+
end
|
78
|
+
|
79
|
+
# Take a screenshot of a URL, unless we already did so recently
|
80
|
+
def snap(url)
|
81
|
+
file = image_path_for(url)
|
82
|
+
if file_fresh? file
|
83
|
+
say " Snap: Skipping. File exists and seems fresh"
|
84
|
+
else
|
85
|
+
snap!(url)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# Take a screenshot of the URL, even if file exists
|
90
|
+
def snap!(url)
|
91
|
+
say " !txtblu!Snap!!txtrst! Snapping picture... "
|
92
|
+
image_path = image_path_for url
|
93
|
+
|
94
|
+
fetch_opts = { allowed_status_codes: [404, 401, 403] }
|
95
|
+
if @opts.selector
|
96
|
+
fetch_opts[:selector] = @opts.selector
|
97
|
+
fetch_opts[:full] = false
|
98
|
+
end
|
99
|
+
|
100
|
+
# The webshot gem messes with stdout/stderr streams so we keep it in
|
101
|
+
# check
|
102
|
+
$keep_stdout, $keep_stderr = $stdout, $stderr
|
103
|
+
|
104
|
+
webshot.capture url, image_path, fetch_opts do |magick|
|
105
|
+
magick.combine_options do |c|
|
106
|
+
c.background "white"
|
107
|
+
c.gravity 'north'
|
108
|
+
c.quality 100
|
109
|
+
c.extent @opts.height > 0 ? "#{@opts.width}x#{@opts.height}" : "#{@opts.width}x"
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
$stdout, $stderr = $keep_stdout, $keep_stderr
|
114
|
+
|
115
|
+
say "done"
|
116
|
+
end
|
117
|
+
|
118
|
+
def extract_urls_from(url)
|
119
|
+
cached = nil
|
120
|
+
@store.transaction { cached = @store[url] }
|
121
|
+
if cached
|
122
|
+
say " Crawl: Page was cached. Reading subsequent URLs from cache"
|
123
|
+
return cached
|
124
|
+
else
|
125
|
+
return extract_urls_from! url
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
def extract_urls_from!(url)
|
130
|
+
say " !txtblu!Crawl!!txtrst! Extracting links... "
|
131
|
+
|
132
|
+
begin
|
133
|
+
doc = Nokogiri::HTML open url
|
134
|
+
links = doc.css('a')
|
135
|
+
links = normalize_links links
|
136
|
+
@store.transaction { @store[url] = links }
|
137
|
+
say "done"
|
138
|
+
rescue OpenURI::HTTPError => e
|
139
|
+
links = []
|
140
|
+
say "!txtred!FAILED"
|
141
|
+
say "!txtred! ! HTTP Error: #{e.message} at #{url}"
|
142
|
+
end
|
143
|
+
links
|
144
|
+
end
|
145
|
+
|
146
|
+
# mkdir the screenshots folder, if needed
|
147
|
+
def make_screenshot_dir(dir)
|
148
|
+
Dir.exist? dir or FileUtils.mkdir_p dir
|
149
|
+
end
|
150
|
+
|
151
|
+
# Convert any string to a proper handle
|
152
|
+
def handelize(str)
|
153
|
+
str.downcase.gsub(/[^a-z0-9]+/, '-')
|
154
|
+
end
|
155
|
+
|
156
|
+
# Return proper image path for a URL
|
157
|
+
def image_path_for(url)
|
158
|
+
"#{@opts.dir}/#{handelize(url)}.png"
|
159
|
+
end
|
160
|
+
|
161
|
+
# Add protocol to a URL if needed
|
162
|
+
def protocolize(url)
|
163
|
+
url =~ /^http/ ? url : "http://#{url}"
|
164
|
+
end
|
165
|
+
|
166
|
+
# Return true if the file exists and is not too old
|
167
|
+
def file_fresh?(file)
|
168
|
+
@opts.age > 0 and File.exist?(file) and file_age(file) < @opts.age
|
169
|
+
end
|
170
|
+
|
171
|
+
# Return file age in seconds
|
172
|
+
def file_age(file)
|
173
|
+
(Time.now - File.stat(file).mtime).to_i
|
174
|
+
end
|
175
|
+
|
176
|
+
# Process an array of links and return a better one
|
177
|
+
def normalize_links(links)
|
178
|
+
extensions = "png|gif|jpg|pdf|zip"
|
179
|
+
beginnings = "mailto|tel"
|
180
|
+
|
181
|
+
links_array = []
|
182
|
+
|
183
|
+
links.each_with_index do |link|
|
184
|
+
link = link.attribute('href').to_s
|
185
|
+
|
186
|
+
# Remove #hash
|
187
|
+
link.gsub!(/#.+$/, '')
|
188
|
+
next if link.empty?
|
189
|
+
|
190
|
+
# Remove links to specific extensions and protocols
|
191
|
+
next if link =~ /\.(#{extensions})(\?.*)?$/
|
192
|
+
next if link =~ /^(#{beginnings})/
|
193
|
+
|
194
|
+
# Add the base domain to relative URLs
|
195
|
+
link = link =~ /^http/ ? link : "#{@opts.base}#{link}"
|
196
|
+
link = "http://#{link}" unless link =~ /^http/
|
197
|
+
|
198
|
+
# Keep only links in our base domain
|
199
|
+
next unless link.include? @opts.base
|
200
|
+
|
201
|
+
links_array << link
|
202
|
+
end
|
203
|
+
|
204
|
+
links_array.uniq
|
205
|
+
end
|
206
|
+
|
207
|
+
def show_version
|
208
|
+
puts VERSION
|
209
|
+
end
|
210
|
+
|
211
|
+
def doc
|
212
|
+
@doc ||= File.read template 'docopt.txt'
|
213
|
+
end
|
214
|
+
|
215
|
+
def template(file)
|
216
|
+
File.expand_path("../templates/#{file}", __FILE__)
|
217
|
+
end
|
218
|
+
|
219
|
+
def opts_from_args(args)
|
220
|
+
opts = {}
|
221
|
+
%w[folder selector only].each do |opt|
|
222
|
+
opts[opt.to_sym] = args["--#{opt}"] if args["--#{opt}"]
|
223
|
+
end
|
224
|
+
|
225
|
+
%w[age depth width height].each do |opt|
|
226
|
+
opts[opt.to_sym] = args["--#{opt}"].to_i if args["--#{opt}"]
|
227
|
+
end
|
228
|
+
|
229
|
+
opts
|
230
|
+
end
|
231
|
+
|
232
|
+
def webshot
|
233
|
+
Webshot::Screenshot.instance
|
234
|
+
end
|
235
|
+
end
|
236
|
+
end
|
@@ -1,27 +1,27 @@
|
|
1
|
-
Snapcrawl
|
2
|
-
|
3
|
-
Usage:
|
4
|
-
snapcrawl go <url> [options]
|
5
|
-
snapcrawl -h | --help
|
6
|
-
snapcrawl -v | --version
|
7
|
-
|
8
|
-
Options:
|
9
|
-
-f --folder <path> Where to save screenshots [default: snaps]
|
10
|
-
-a --age <n> Number of seconds to consider screenshots fresh
|
11
|
-
[default: 86400]
|
12
|
-
-d --depth <n> Number of levels to crawl [default: 1]
|
13
|
-
-W --width <n> Screen width in pixels [default: 1280]
|
14
|
-
-H --height <n> Screen height in pixels. Use 0 to capture the full
|
15
|
-
page [default: 0]
|
16
|
-
-s --selector <s> CSS selector to capture
|
17
|
-
-o --only <regex> Include only URLs that match <regex>
|
18
|
-
-h --help Show this screen
|
19
|
-
-v --version Show version
|
20
|
-
|
21
|
-
Examples:
|
22
|
-
snapcrawl go example.com
|
23
|
-
snapcrawl go example.com -d2 -fscreens
|
24
|
-
snapcrawl go example.com -d2 > out.txt 2> err.txt &
|
25
|
-
snapcrawl go example.com -W360 -H480
|
26
|
-
snapcrawl go example.com --selector "#main-content"
|
27
|
-
snapcrawl go example.com --only "products|collections"
|
1
|
+
Snapcrawl
|
2
|
+
|
3
|
+
Usage:
|
4
|
+
snapcrawl go <url> [options]
|
5
|
+
snapcrawl -h | --help
|
6
|
+
snapcrawl -v | --version
|
7
|
+
|
8
|
+
Options:
|
9
|
+
-f --folder <path> Where to save screenshots [default: snaps]
|
10
|
+
-a --age <n> Number of seconds to consider screenshots fresh
|
11
|
+
[default: 86400]
|
12
|
+
-d --depth <n> Number of levels to crawl [default: 1]
|
13
|
+
-W --width <n> Screen width in pixels [default: 1280]
|
14
|
+
-H --height <n> Screen height in pixels. Use 0 to capture the full
|
15
|
+
page [default: 0]
|
16
|
+
-s --selector <s> CSS selector to capture
|
17
|
+
-o --only <regex> Include only URLs that match <regex>
|
18
|
+
-h --help Show this screen
|
19
|
+
-v --version Show version
|
20
|
+
|
21
|
+
Examples:
|
22
|
+
snapcrawl go example.com
|
23
|
+
snapcrawl go example.com -d2 -fscreens
|
24
|
+
snapcrawl go example.com -d2 > out.txt 2> err.txt &
|
25
|
+
snapcrawl go example.com -W360 -H480
|
26
|
+
snapcrawl go example.com --selector "#main-content"
|
27
|
+
snapcrawl go example.com --only "products|collections"
|
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snapcrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.4rc1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danny Ben Shitrit
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-10-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: colsole
|
@@ -53,7 +53,7 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '1.6'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: webshot
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
@@ -70,92 +70,16 @@ dependencies:
|
|
70
70
|
name: phantomjs
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - "
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: 1.9.8
|
76
|
-
- - "<"
|
73
|
+
- - ">="
|
77
74
|
- !ruby/object:Gem::Version
|
78
75
|
version: '2.0'
|
79
76
|
type: :runtime
|
80
77
|
prerelease: false
|
81
78
|
version_requirements: !ruby/object:Gem::Requirement
|
82
79
|
requirements:
|
83
|
-
- - "
|
84
|
-
- !ruby/object:Gem::Version
|
85
|
-
version: 1.9.8
|
86
|
-
- - "<"
|
80
|
+
- - ">="
|
87
81
|
- !ruby/object:Gem::Version
|
88
82
|
version: '2.0'
|
89
|
-
- !ruby/object:Gem::Dependency
|
90
|
-
name: runfile
|
91
|
-
requirement: !ruby/object:Gem::Requirement
|
92
|
-
requirements:
|
93
|
-
- - "~>"
|
94
|
-
- !ruby/object:Gem::Version
|
95
|
-
version: '0.5'
|
96
|
-
type: :development
|
97
|
-
prerelease: false
|
98
|
-
version_requirements: !ruby/object:Gem::Requirement
|
99
|
-
requirements:
|
100
|
-
- - "~>"
|
101
|
-
- !ruby/object:Gem::Version
|
102
|
-
version: '0.5'
|
103
|
-
- !ruby/object:Gem::Dependency
|
104
|
-
name: run-gem-dev
|
105
|
-
requirement: !ruby/object:Gem::Requirement
|
106
|
-
requirements:
|
107
|
-
- - "~>"
|
108
|
-
- !ruby/object:Gem::Version
|
109
|
-
version: '0.2'
|
110
|
-
type: :development
|
111
|
-
prerelease: false
|
112
|
-
version_requirements: !ruby/object:Gem::Requirement
|
113
|
-
requirements:
|
114
|
-
- - "~>"
|
115
|
-
- !ruby/object:Gem::Version
|
116
|
-
version: '0.2'
|
117
|
-
- !ruby/object:Gem::Dependency
|
118
|
-
name: minitest
|
119
|
-
requirement: !ruby/object:Gem::Requirement
|
120
|
-
requirements:
|
121
|
-
- - "~>"
|
122
|
-
- !ruby/object:Gem::Version
|
123
|
-
version: '5.8'
|
124
|
-
type: :development
|
125
|
-
prerelease: false
|
126
|
-
version_requirements: !ruby/object:Gem::Requirement
|
127
|
-
requirements:
|
128
|
-
- - "~>"
|
129
|
-
- !ruby/object:Gem::Version
|
130
|
-
version: '5.8'
|
131
|
-
- !ruby/object:Gem::Dependency
|
132
|
-
name: minitest-reporters
|
133
|
-
requirement: !ruby/object:Gem::Requirement
|
134
|
-
requirements:
|
135
|
-
- - "~>"
|
136
|
-
- !ruby/object:Gem::Version
|
137
|
-
version: '1.1'
|
138
|
-
type: :development
|
139
|
-
prerelease: false
|
140
|
-
version_requirements: !ruby/object:Gem::Requirement
|
141
|
-
requirements:
|
142
|
-
- - "~>"
|
143
|
-
- !ruby/object:Gem::Version
|
144
|
-
version: '1.1'
|
145
|
-
- !ruby/object:Gem::Dependency
|
146
|
-
name: simplecov
|
147
|
-
requirement: !ruby/object:Gem::Requirement
|
148
|
-
requirements:
|
149
|
-
- - "~>"
|
150
|
-
- !ruby/object:Gem::Version
|
151
|
-
version: '0.10'
|
152
|
-
type: :development
|
153
|
-
prerelease: false
|
154
|
-
version_requirements: !ruby/object:Gem::Requirement
|
155
|
-
requirements:
|
156
|
-
- - "~>"
|
157
|
-
- !ruby/object:Gem::Version
|
158
|
-
version: '0.10'
|
159
83
|
description: Snapcrawl is a command line utility for crawling a website and saving
|
160
84
|
screenshots.
|
161
85
|
email: db@dannyben.com
|
@@ -185,12 +109,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
185
109
|
version: '2.0'
|
186
110
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
187
111
|
requirements:
|
188
|
-
- - "
|
112
|
+
- - ">"
|
189
113
|
- !ruby/object:Gem::Version
|
190
|
-
version:
|
114
|
+
version: 1.3.1
|
191
115
|
requirements: []
|
192
116
|
rubyforge_project:
|
193
|
-
rubygems_version: 2.
|
117
|
+
rubygems_version: 2.7.6
|
194
118
|
signing_key:
|
195
119
|
specification_version: 4
|
196
120
|
summary: Crawl a website and take screenshots (CLI + Library)
|