snapcrawl 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +17 -5
- data/lib/snapcrawl/crawler.rb +32 -22
- data/lib/snapcrawl/version.rb +1 -1
- metadata +43 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e2794d3fe40619ef7de870f738767419cdb893fc
|
4
|
+
data.tar.gz: 6a866a6ac4808e7a522e67db0415e31d1d463b62
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6195bff2403b3c4e7900b6c6e33e962eafa37bbbfed78940da5fce4152318bb6715073ce78bc8d7865451eb287a9e57aa77806064634e9e388c20fef8c5f929a
|
7
|
+
data.tar.gz: f2b0207a6b976a34e1554adaa305f415191903cdd51b7dfd7bf70735fd225f4ef7de58589ed3aa179cde10e621be170b93897c3523f9e323cbfddf35dfca854f
|
data/README.md
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
# SnapCrawl - crawl a website and take screenshots
|
2
2
|
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/snapcrawl.svg)](http://badge.fury.io/rb/snapcrawl)
|
4
|
+
[![Code Climate](https://codeclimate.com/github/DannyBen/snapcrawl/badges/gpa.svg)](https://codeclimate.com/github/DannyBen/snapcrawl)
|
5
|
+
[![Dependency Status](https://gemnasium.com/DannyBen/snapcrawl.svg)](https://gemnasium.com/DannyBen/snapcrawl)
|
6
|
+
|
7
|
+
|
3
8
|
SnapCrawl is a command line utility for crawling a website and saving
|
4
9
|
screenshots.
|
5
10
|
|
@@ -18,8 +23,8 @@ screenshots.
|
|
18
23
|
|
19
24
|
## Usage
|
20
25
|
|
21
|
-
$ snapcrawl --help
|
22
|
-
|
26
|
+
$ snapcrawl --help
|
27
|
+
|
23
28
|
Snapcrawl
|
24
29
|
|
25
30
|
Usage:
|
@@ -46,11 +51,18 @@ screenshots.
|
|
46
51
|
snapcrawl go example.com -d2 > out.txt 2> err.txt &
|
47
52
|
snapcrawl go example.com -W360 -H480
|
48
53
|
snapcrawl go example.com --selector "#main-content"
|
49
|
-
snapcrawl go example.com --only "products|collections"
|
50
|
-
|
54
|
+
snapcrawl go example.com --only "products|collections"
|
55
|
+
|
51
56
|
---
|
52
57
|
|
53
58
|
## Notes
|
54
59
|
|
55
60
|
1. If a URL cannot be found, SnapCrawl will report to stderr.
|
56
|
-
You can create a report by running `snapcrawl go example.com 2> err.txt`
|
61
|
+
You can create a report by running `snapcrawl go example.com 2> err.txt`
|
62
|
+
|
63
|
+
## Todo
|
64
|
+
|
65
|
+
- [x] Tests (probably against some ad hoc sinatra)
|
66
|
+
- [ ] Make the test server start/stop automatically when testing
|
67
|
+
- [ ] Move ignored file extensions and mailto/tel links to config
|
68
|
+
- [ ] Add screen size presets (also to user-overridable config)
|
data/lib/snapcrawl/crawler.rb
CHANGED
@@ -18,10 +18,10 @@ module Snapcrawl
|
|
18
18
|
def initialize
|
19
19
|
@storefile = "snapcrawl.pstore"
|
20
20
|
@store = PStore.new(@storefile)
|
21
|
-
@done = []
|
22
21
|
end
|
23
22
|
|
24
23
|
def handle(args)
|
24
|
+
@done = []
|
25
25
|
begin
|
26
26
|
execute Docopt::docopt(doc, argv: args)
|
27
27
|
rescue Docopt::Exit => e
|
@@ -34,6 +34,12 @@ module Snapcrawl
|
|
34
34
|
crawl args['<url>'].dup, opts_from_args(args)
|
35
35
|
end
|
36
36
|
|
37
|
+
def clear_cache
|
38
|
+
FileUtils.rm @storefile if File.exist? @storefile
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
37
43
|
def crawl(url, opts={})
|
38
44
|
defaults = {
|
39
45
|
width: 1280,
|
@@ -54,8 +60,6 @@ module Snapcrawl
|
|
54
60
|
end
|
55
61
|
end
|
56
62
|
|
57
|
-
private
|
58
|
-
|
59
63
|
def crawl_and_snap(urls)
|
60
64
|
new_urls = []
|
61
65
|
urls.each do |url|
|
@@ -94,7 +98,7 @@ module Snapcrawl
|
|
94
98
|
fetch_opts[:div] = @opts.selector if @opts.selector
|
95
99
|
# :top => 0, :left => 0, :width => 100, :height => 100 # dimensions for a specific area
|
96
100
|
|
97
|
-
|
101
|
+
f.fetch fetch_opts
|
98
102
|
say "done"
|
99
103
|
end
|
100
104
|
|
@@ -128,12 +132,12 @@ module Snapcrawl
|
|
128
132
|
|
129
133
|
# mkdir the screenshots folder, if needed
|
130
134
|
def make_screenshot_dir(dir)
|
131
|
-
Dir.
|
135
|
+
Dir.exist? dir or FileUtils.mkdir_p dir
|
132
136
|
end
|
133
137
|
|
134
138
|
# Convert any string to a proper handle
|
135
139
|
def handelize(str)
|
136
|
-
str.downcase.gsub
|
140
|
+
str.downcase.gsub(/[^a-z0-9]+/, '-')
|
137
141
|
end
|
138
142
|
|
139
143
|
# Return proper image path for a URL
|
@@ -148,7 +152,7 @@ module Snapcrawl
|
|
148
152
|
|
149
153
|
# Return true if the file exists and is not too old
|
150
154
|
def file_fresh?(file)
|
151
|
-
File.exist?(file) and file_age(file) < @opts.age
|
155
|
+
@opts.age > 0 and File.exist?(file) and file_age(file) < @opts.age
|
152
156
|
end
|
153
157
|
|
154
158
|
# Return file age in seconds
|
@@ -158,27 +162,33 @@ module Snapcrawl
|
|
158
162
|
|
159
163
|
# Process an array of links and return a better one
|
160
164
|
def normalize_links(links)
|
161
|
-
|
162
|
-
|
165
|
+
extensions = "png|gif|jpg|pdf|zip"
|
166
|
+
beginnings = "mailto|tel"
|
163
167
|
|
164
|
-
|
165
|
-
links = links.uniq.reject {|link| link.empty?}
|
168
|
+
links_array = []
|
166
169
|
|
167
|
-
|
168
|
-
|
169
|
-
links = links.reject {|link| link =~ /\.(#{extensions})(\?.*)?$/}
|
170
|
+
links.each_with_index do |link|
|
171
|
+
link = link.attribute('href').to_s
|
170
172
|
|
171
|
-
|
172
|
-
|
173
|
-
|
173
|
+
# remove #hash
|
174
|
+
link.gsub!(/#.+$/, '')
|
175
|
+
next if link.empty?
|
174
176
|
|
175
|
-
|
176
|
-
|
177
|
+
# Remove links to images and other files then to mailto/tel
|
178
|
+
next if link =~ /\.(#{extensions})(\?.*)?$/
|
179
|
+
next if link =~ /^(#{beginnings})/
|
180
|
+
|
181
|
+
# Add the base domain to relative URLs
|
182
|
+
link = link =~ /^http/ ? link : "#{@opts.base}#{link}"
|
183
|
+
link = "http://#{link}" unless link =~ /^http/
|
177
184
|
|
178
|
-
|
179
|
-
|
185
|
+
# Keep only links in our base domain
|
186
|
+
next unless link.include? @opts.base
|
180
187
|
|
181
|
-
|
188
|
+
links_array << link
|
189
|
+
end
|
190
|
+
|
191
|
+
links_array.uniq
|
182
192
|
end
|
183
193
|
|
184
194
|
def show_version
|
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snapcrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danny Ben Shitrit
|
@@ -24,20 +24,6 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0.3'
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: net-ssh
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - "~>"
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '3.0'
|
34
|
-
type: :runtime
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - "~>"
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '3.0'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: docopt
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -108,6 +94,48 @@ dependencies:
|
|
108
94
|
- - "~>"
|
109
95
|
- !ruby/object:Gem::Version
|
110
96
|
version: '0.2'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: minitest
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '5.8'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '5.8'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: minitest-reporters
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '1.1'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '1.1'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: simplecov
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0.10'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0.10'
|
111
139
|
description: Snapcrawl is a command line utility for crawling a website and saving
|
112
140
|
screenshots.
|
113
141
|
email: db@dannyben.com
|