snapcrawl 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +17 -5
- data/lib/snapcrawl/crawler.rb +32 -22
- data/lib/snapcrawl/version.rb +1 -1
- metadata +43 -15
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e2794d3fe40619ef7de870f738767419cdb893fc
|
|
4
|
+
data.tar.gz: 6a866a6ac4808e7a522e67db0415e31d1d463b62
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6195bff2403b3c4e7900b6c6e33e962eafa37bbbfed78940da5fce4152318bb6715073ce78bc8d7865451eb287a9e57aa77806064634e9e388c20fef8c5f929a
|
|
7
|
+
data.tar.gz: f2b0207a6b976a34e1554adaa305f415191903cdd51b7dfd7bf70735fd225f4ef7de58589ed3aa179cde10e621be170b93897c3523f9e323cbfddf35dfca854f
|
data/README.md
CHANGED
|
@@ -1,5 +1,10 @@
|
|
|
1
1
|
# SnapCrawl - crawl a website and take screenshots
|
|
2
2
|
|
|
3
|
+
[](http://badge.fury.io/rb/snapcrawl)
|
|
4
|
+
[](https://codeclimate.com/github/DannyBen/snapcrawl)
|
|
5
|
+
[](https://gemnasium.com/DannyBen/snapcrawl)
|
|
6
|
+
|
|
7
|
+
|
|
3
8
|
SnapCrawl is a command line utility for crawling a website and saving
|
|
4
9
|
screenshots.
|
|
5
10
|
|
|
@@ -18,8 +23,8 @@ screenshots.
|
|
|
18
23
|
|
|
19
24
|
## Usage
|
|
20
25
|
|
|
21
|
-
$ snapcrawl --help
|
|
22
|
-
|
|
26
|
+
$ snapcrawl --help
|
|
27
|
+
|
|
23
28
|
Snapcrawl
|
|
24
29
|
|
|
25
30
|
Usage:
|
|
@@ -46,11 +51,18 @@ screenshots.
|
|
|
46
51
|
snapcrawl go example.com -d2 > out.txt 2> err.txt &
|
|
47
52
|
snapcrawl go example.com -W360 -H480
|
|
48
53
|
snapcrawl go example.com --selector "#main-content"
|
|
49
|
-
snapcrawl go example.com --only "products|collections"
|
|
50
|
-
|
|
54
|
+
snapcrawl go example.com --only "products|collections"
|
|
55
|
+
|
|
51
56
|
---
|
|
52
57
|
|
|
53
58
|
## Notes
|
|
54
59
|
|
|
55
60
|
1. If a URL cannot be found, SnapCrawl will report to stderr.
|
|
56
|
-
You can create a report by running `snapcrawl go example.com 2> err.txt`
|
|
61
|
+
You can create a report by running `snapcrawl go example.com 2> err.txt`
|
|
62
|
+
|
|
63
|
+
## Todo
|
|
64
|
+
|
|
65
|
+
- [x] Tests (probably against some ad hoc sinatra)
|
|
66
|
+
- [ ] Make the test server start/stop automatically when testing
|
|
67
|
+
- [ ] Move ignored file extensions and mailto/tel links to config
|
|
68
|
+
- [ ] Add screen size presets (also to user-overridable config)
|
data/lib/snapcrawl/crawler.rb
CHANGED
|
@@ -18,10 +18,10 @@ module Snapcrawl
|
|
|
18
18
|
def initialize
|
|
19
19
|
@storefile = "snapcrawl.pstore"
|
|
20
20
|
@store = PStore.new(@storefile)
|
|
21
|
-
@done = []
|
|
22
21
|
end
|
|
23
22
|
|
|
24
23
|
def handle(args)
|
|
24
|
+
@done = []
|
|
25
25
|
begin
|
|
26
26
|
execute Docopt::docopt(doc, argv: args)
|
|
27
27
|
rescue Docopt::Exit => e
|
|
@@ -34,6 +34,12 @@ module Snapcrawl
|
|
|
34
34
|
crawl args['<url>'].dup, opts_from_args(args)
|
|
35
35
|
end
|
|
36
36
|
|
|
37
|
+
def clear_cache
|
|
38
|
+
FileUtils.rm @storefile if File.exist? @storefile
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
private
|
|
42
|
+
|
|
37
43
|
def crawl(url, opts={})
|
|
38
44
|
defaults = {
|
|
39
45
|
width: 1280,
|
|
@@ -54,8 +60,6 @@ module Snapcrawl
|
|
|
54
60
|
end
|
|
55
61
|
end
|
|
56
62
|
|
|
57
|
-
private
|
|
58
|
-
|
|
59
63
|
def crawl_and_snap(urls)
|
|
60
64
|
new_urls = []
|
|
61
65
|
urls.each do |url|
|
|
@@ -94,7 +98,7 @@ module Snapcrawl
|
|
|
94
98
|
fetch_opts[:div] = @opts.selector if @opts.selector
|
|
95
99
|
# :top => 0, :left => 0, :width => 100, :height => 100 # dimensions for a specific area
|
|
96
100
|
|
|
97
|
-
|
|
101
|
+
f.fetch fetch_opts
|
|
98
102
|
say "done"
|
|
99
103
|
end
|
|
100
104
|
|
|
@@ -128,12 +132,12 @@ module Snapcrawl
|
|
|
128
132
|
|
|
129
133
|
# mkdir the screenshots folder, if needed
|
|
130
134
|
def make_screenshot_dir(dir)
|
|
131
|
-
Dir.
|
|
135
|
+
Dir.exist? dir or FileUtils.mkdir_p dir
|
|
132
136
|
end
|
|
133
137
|
|
|
134
138
|
# Convert any string to a proper handle
|
|
135
139
|
def handelize(str)
|
|
136
|
-
str.downcase.gsub
|
|
140
|
+
str.downcase.gsub(/[^a-z0-9]+/, '-')
|
|
137
141
|
end
|
|
138
142
|
|
|
139
143
|
# Return proper image path for a URL
|
|
@@ -148,7 +152,7 @@ module Snapcrawl
|
|
|
148
152
|
|
|
149
153
|
# Return true if the file exists and is not too old
|
|
150
154
|
def file_fresh?(file)
|
|
151
|
-
File.exist?(file) and file_age(file) < @opts.age
|
|
155
|
+
@opts.age > 0 and File.exist?(file) and file_age(file) < @opts.age
|
|
152
156
|
end
|
|
153
157
|
|
|
154
158
|
# Return file age in seconds
|
|
@@ -158,27 +162,33 @@ module Snapcrawl
|
|
|
158
162
|
|
|
159
163
|
# Process an array of links and return a better one
|
|
160
164
|
def normalize_links(links)
|
|
161
|
-
|
|
162
|
-
|
|
165
|
+
extensions = "png|gif|jpg|pdf|zip"
|
|
166
|
+
beginnings = "mailto|tel"
|
|
163
167
|
|
|
164
|
-
|
|
165
|
-
links = links.uniq.reject {|link| link.empty?}
|
|
168
|
+
links_array = []
|
|
166
169
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
links = links.reject {|link| link =~ /\.(#{extensions})(\?.*)?$/}
|
|
170
|
+
links.each_with_index do |link|
|
|
171
|
+
link = link.attribute('href').to_s
|
|
170
172
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
173
|
+
# remove #hash
|
|
174
|
+
link.gsub!(/#.+$/, '')
|
|
175
|
+
next if link.empty?
|
|
174
176
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
+
# Remove links to images and other files, as well as mailto/tel links
|
|
178
|
+
next if link =~ /\.(#{extensions})(\?.*)?$/
|
|
179
|
+
next if link =~ /^(#{beginnings})/
|
|
180
|
+
|
|
181
|
+
# Add the base domain to relative URLs
|
|
182
|
+
link = link =~ /^http/ ? link : "#{@opts.base}#{link}"
|
|
183
|
+
link = "http://#{link}" unless link =~ /^http/
|
|
177
184
|
|
|
178
|
-
|
|
179
|
-
|
|
185
|
+
# Keep only links in our base domain
|
|
186
|
+
next unless link.include? @opts.base
|
|
180
187
|
|
|
181
|
-
|
|
188
|
+
links_array << link
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
links_array.uniq
|
|
182
192
|
end
|
|
183
193
|
|
|
184
194
|
def show_version
|
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: snapcrawl
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.1
|
|
4
|
+
version: 0.2.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Danny Ben Shitrit
|
|
@@ -24,20 +24,6 @@ dependencies:
|
|
|
24
24
|
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
26
|
version: '0.3'
|
|
27
|
-
- !ruby/object:Gem::Dependency
|
|
28
|
-
name: net-ssh
|
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
|
30
|
-
requirements:
|
|
31
|
-
- - "~>"
|
|
32
|
-
- !ruby/object:Gem::Version
|
|
33
|
-
version: '3.0'
|
|
34
|
-
type: :runtime
|
|
35
|
-
prerelease: false
|
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
-
requirements:
|
|
38
|
-
- - "~>"
|
|
39
|
-
- !ruby/object:Gem::Version
|
|
40
|
-
version: '3.0'
|
|
41
27
|
- !ruby/object:Gem::Dependency
|
|
42
28
|
name: docopt
|
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -108,6 +94,48 @@ dependencies:
|
|
|
108
94
|
- - "~>"
|
|
109
95
|
- !ruby/object:Gem::Version
|
|
110
96
|
version: '0.2'
|
|
97
|
+
- !ruby/object:Gem::Dependency
|
|
98
|
+
name: minitest
|
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
|
100
|
+
requirements:
|
|
101
|
+
- - "~>"
|
|
102
|
+
- !ruby/object:Gem::Version
|
|
103
|
+
version: '5.8'
|
|
104
|
+
type: :development
|
|
105
|
+
prerelease: false
|
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
107
|
+
requirements:
|
|
108
|
+
- - "~>"
|
|
109
|
+
- !ruby/object:Gem::Version
|
|
110
|
+
version: '5.8'
|
|
111
|
+
- !ruby/object:Gem::Dependency
|
|
112
|
+
name: minitest-reporters
|
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
|
114
|
+
requirements:
|
|
115
|
+
- - "~>"
|
|
116
|
+
- !ruby/object:Gem::Version
|
|
117
|
+
version: '1.1'
|
|
118
|
+
type: :development
|
|
119
|
+
prerelease: false
|
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
121
|
+
requirements:
|
|
122
|
+
- - "~>"
|
|
123
|
+
- !ruby/object:Gem::Version
|
|
124
|
+
version: '1.1'
|
|
125
|
+
- !ruby/object:Gem::Dependency
|
|
126
|
+
name: simplecov
|
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
|
128
|
+
requirements:
|
|
129
|
+
- - "~>"
|
|
130
|
+
- !ruby/object:Gem::Version
|
|
131
|
+
version: '0.10'
|
|
132
|
+
type: :development
|
|
133
|
+
prerelease: false
|
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
135
|
+
requirements:
|
|
136
|
+
- - "~>"
|
|
137
|
+
- !ruby/object:Gem::Version
|
|
138
|
+
version: '0.10'
|
|
111
139
|
description: Snapcrawl is a command line utility for crawling a website and saving
|
|
112
140
|
screenshots.
|
|
113
141
|
email: db@dannyben.com
|