snapcrawl 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +20 -9
- data/lib/snapcrawl/crawler.rb +4 -5
- data/lib/snapcrawl/version.rb +1 -1
- metadata +23 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 76e5f98ee14c7d5f4c8d3d6589a22dd1b9b66335
|
|
4
|
+
data.tar.gz: e9389d5b2782b71854022f819ba5c93f599ca664
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 60da4ba17c53a7fe8f9c61374277f75329045412f4e3ad172cf0ed9fe2684bdf62a37d05cc5cc90265034b47561917960fd74b2e8a11f156cad3062ebf4c42d6
|
|
7
|
+
data.tar.gz: 7d63c4de115ca54183f6933eee913b90c7c9302f1132ac0414eb75941a6564d2989516e3fd033c14c036024e21f02e99b5bb4c15f18f2b509eca9bc38ff0677f
|
data/README.md
CHANGED
|
@@ -1,14 +1,18 @@
|
|
|
1
|
-
|
|
1
|
+
Snapcrawl - crawl a website and take screenshots
|
|
2
|
+
==================================================
|
|
2
3
|
|
|
3
4
|
[Gem Version](http://badge.fury.io/rb/snapcrawl)
|
|
4
5
|
[Code Climate](https://codeclimate.com/github/DannyBen/snapcrawl)
|
|
5
6
|
[Dependency Status](https://gemnasium.com/DannyBen/snapcrawl)
|
|
6
7
|
|
|
8
|
+
---
|
|
7
9
|
|
|
8
|
-
|
|
10
|
+
Snapcrawl is a command line utility for crawling a website and saving
|
|
9
11
|
screenshots.
|
|
10
12
|
|
|
11
|
-
|
|
13
|
+
|
|
14
|
+
Features
|
|
15
|
+
--------------------------------------------------
|
|
12
16
|
|
|
13
17
|
- Crawls a website to any given depth and saves screenshots
|
|
14
18
|
- Can capture the full length of the page
|
|
@@ -17,11 +21,15 @@ screenshots.
|
|
|
17
21
|
- Uses local caching to avoid expensive crawl operations if not needed
|
|
18
22
|
- Reports broken links
|
|
19
23
|
|
|
20
|
-
|
|
24
|
+
|
|
25
|
+
Install
|
|
26
|
+
--------------------------------------------------
|
|
21
27
|
|
|
22
28
|
$ gem install snapcrawl
|
|
23
29
|
|
|
24
|
-
|
|
30
|
+
|
|
31
|
+
Usage
|
|
32
|
+
--------------------------------------------------
|
|
25
33
|
|
|
26
34
|
$ snapcrawl --help
|
|
27
35
|
|
|
@@ -55,14 +63,17 @@ screenshots.
|
|
|
55
63
|
|
|
56
64
|
---
|
|
57
65
|
|
|
58
|
-
|
|
66
|
+
Notes
|
|
67
|
+
--------------------------------------------------
|
|
59
68
|
|
|
60
|
-
1. If a URL cannot be found,
|
|
69
|
+
1. If a URL cannot be found, Snapcrawl will report to stderr.
|
|
61
70
|
You can create a report by running `snapcrawl go example.com 2> err.txt`
|
|
62
71
|
|
|
63
|
-
|
|
72
|
+
|
|
73
|
+
Todo
|
|
74
|
+
--------------------------------------------------
|
|
64
75
|
|
|
65
76
|
- [x] Tests (probably against some ad hoc sinatra)
|
|
66
|
-
- [ ] Make
|
|
77
|
+
- [ ] Make the test server start/stop automatically when testing
|
|
67
78
|
- [ ] Move ignored file extensions and mailto/tel links to config
|
|
68
79
|
- [ ] Add screen size presets (also to user-overridable config)
|
data/lib/snapcrawl/crawler.rb
CHANGED
|
@@ -170,11 +170,11 @@ module Snapcrawl
|
|
|
170
170
|
links.each_with_index do |link|
|
|
171
171
|
link = link.attribute('href').to_s
|
|
172
172
|
|
|
173
|
-
#
|
|
173
|
+
# Remove #hash
|
|
174
174
|
link.gsub!(/#.+$/, '')
|
|
175
175
|
next if link.empty?
|
|
176
176
|
|
|
177
|
-
# Remove links to
|
|
177
|
+
# Remove links to specific extensions and protocols
|
|
178
178
|
next if link =~ /\.(#{extensions})(\?.*)?$/
|
|
179
179
|
next if link =~ /^(#{beginnings})/
|
|
180
180
|
|
|
@@ -196,8 +196,7 @@ module Snapcrawl
|
|
|
196
196
|
end
|
|
197
197
|
|
|
198
198
|
def doc
|
|
199
|
-
|
|
200
|
-
@doc = File.read template 'docopt.txt'
|
|
199
|
+
@doc ||= File.read template 'docopt.txt'
|
|
201
200
|
end
|
|
202
201
|
|
|
203
202
|
def template(file)
|
|
@@ -217,4 +216,4 @@ module Snapcrawl
|
|
|
217
216
|
opts
|
|
218
217
|
end
|
|
219
218
|
end
|
|
220
|
-
end
|
|
219
|
+
end
|
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: snapcrawl
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.2
|
|
4
|
+
version: 0.2.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Danny Ben Shitrit
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2017-03-15 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: colsole
|
|
@@ -66,6 +66,26 @@ dependencies:
|
|
|
66
66
|
- - "~>"
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
68
|
version: '0.1'
|
|
69
|
+
- !ruby/object:Gem::Dependency
|
|
70
|
+
name: phantomjs
|
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
|
72
|
+
requirements:
|
|
73
|
+
- - "~>"
|
|
74
|
+
- !ruby/object:Gem::Version
|
|
75
|
+
version: 1.9.8
|
|
76
|
+
- - "<"
|
|
77
|
+
- !ruby/object:Gem::Version
|
|
78
|
+
version: '2.0'
|
|
79
|
+
type: :runtime
|
|
80
|
+
prerelease: false
|
|
81
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
82
|
+
requirements:
|
|
83
|
+
- - "~>"
|
|
84
|
+
- !ruby/object:Gem::Version
|
|
85
|
+
version: 1.9.8
|
|
86
|
+
- - "<"
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
version: '2.0'
|
|
69
89
|
- !ruby/object:Gem::Dependency
|
|
70
90
|
name: runfile
|
|
71
91
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -170,7 +190,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
170
190
|
version: '0'
|
|
171
191
|
requirements: []
|
|
172
192
|
rubyforge_project:
|
|
173
|
-
rubygems_version: 2.
|
|
193
|
+
rubygems_version: 2.6.6
|
|
174
194
|
signing_key:
|
|
175
195
|
specification_version: 4
|
|
176
196
|
summary: Crawl a website and take screenshots (CLI + Library)
|