snapcrawl 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7f8bfcb13d6d049104a97fe95b4f20527c9e93f9
4
- data.tar.gz: 5d39a2e40270cbe8ddfd5e0863016a665059e747
3
+ metadata.gz: e2794d3fe40619ef7de870f738767419cdb893fc
4
+ data.tar.gz: 6a866a6ac4808e7a522e67db0415e31d1d463b62
5
5
  SHA512:
6
- metadata.gz: c0ad7d74dff9e73d5892870cf162c2b61c96a3f316f6ffedf2b9bd84c09c080bf0e330d32a9f567d4f3e8bec14964afe2773e010177236f438046e3f06b87624
7
- data.tar.gz: 015a6dd81b525bcd59cc52360c5d4989542646af72e84e48a350719cb4b88751e6def5ab201b81de9c256c7036e9894dc5ca03ae30829e7a8d4ab2de0af47aaa
6
+ metadata.gz: 6195bff2403b3c4e7900b6c6e33e962eafa37bbbfed78940da5fce4152318bb6715073ce78bc8d7865451eb287a9e57aa77806064634e9e388c20fef8c5f929a
7
+ data.tar.gz: f2b0207a6b976a34e1554adaa305f415191903cdd51b7dfd7bf70735fd225f4ef7de58589ed3aa179cde10e621be170b93897c3523f9e323cbfddf35dfca854f
data/README.md CHANGED
@@ -1,5 +1,10 @@
1
1
  # SnapCrawl - crawl a website and take screenshots
2
2
 
3
+ [![Gem Version](https://badge.fury.io/rb/snapcrawl.svg)](http://badge.fury.io/rb/snapcrawl)
4
+ [![Code Climate](https://codeclimate.com/github/DannyBen/snapcrawl/badges/gpa.svg)](https://codeclimate.com/github/DannyBen/snapcrawl)
5
+ [![Dependency Status](https://gemnasium.com/DannyBen/snapcrawl.svg)](https://gemnasium.com/DannyBen/snapcrawl)
6
+
7
+
3
8
  SnapCrawl is a command line utility for crawling a website and saving
4
9
  screenshots.
5
10
 
@@ -18,8 +23,8 @@ screenshots.
18
23
 
19
24
  ## Usage
20
25
 
21
- $ snapcrawl --help
22
-
26
+ $ snapcrawl --help
27
+
23
28
  Snapcrawl
24
29
 
25
30
  Usage:
@@ -46,11 +51,18 @@ screenshots.
46
51
  snapcrawl go example.com -d2 > out.txt 2> err.txt &
47
52
  snapcrawl go example.com -W360 -H480
48
53
  snapcrawl go example.com --selector "#main-content"
49
- snapcrawl go example.com --only "products|collections"
50
-
54
+ snapcrawl go example.com --only "products|collections"
55
+
51
56
  ---
52
57
 
53
58
  ## Notes
54
59
 
55
60
  1. If a URL cannot be found, SnapCrawl will report to stderr.
56
- You can create a report by running `snapcrawl go example.com 2> err.txt`
61
+ You can create a report by running `snapcrawl go example.com 2> err.txt`
62
+
63
+ ## Todo
64
+
65
+ - [x] Tests (probably against some ad hoc sinatra)
66
+ - [ ] Make the test server start/stop automatically when testing
67
+ - [ ] Move ignored file extensions and mailto/tel links to config
68
+ - [ ] Add screen size presets (also to user-overridable config)
@@ -18,10 +18,10 @@ module Snapcrawl
18
18
  def initialize
19
19
  @storefile = "snapcrawl.pstore"
20
20
  @store = PStore.new(@storefile)
21
- @done = []
22
21
  end
23
22
 
24
23
  def handle(args)
24
+ @done = []
25
25
  begin
26
26
  execute Docopt::docopt(doc, argv: args)
27
27
  rescue Docopt::Exit => e
@@ -34,6 +34,12 @@ module Snapcrawl
34
34
  crawl args['<url>'].dup, opts_from_args(args)
35
35
  end
36
36
 
37
+ def clear_cache
38
+ FileUtils.rm @storefile if File.exist? @storefile
39
+ end
40
+
41
+ private
42
+
37
43
  def crawl(url, opts={})
38
44
  defaults = {
39
45
  width: 1280,
@@ -54,8 +60,6 @@ module Snapcrawl
54
60
  end
55
61
  end
56
62
 
57
- private
58
-
59
63
  def crawl_and_snap(urls)
60
64
  new_urls = []
61
65
  urls.each do |url|
@@ -94,7 +98,7 @@ module Snapcrawl
94
98
  fetch_opts[:div] = @opts.selector if @opts.selector
95
99
  # :top => 0, :left => 0, :width => 100, :height => 100 # dimensions for a specific area
96
100
 
97
- screenshot = f.fetch fetch_opts
101
+ f.fetch fetch_opts
98
102
  say "done"
99
103
  end
100
104
 
@@ -128,12 +132,12 @@ module Snapcrawl
128
132
 
129
133
  # mkdir the screenshots folder, if needed
130
134
  def make_screenshot_dir(dir)
131
- Dir.exists? dir or FileUtils.mkdir_p dir
135
+ Dir.exist? dir or FileUtils.mkdir_p dir
132
136
  end
133
137
 
134
138
  # Convert any string to a proper handle
135
139
  def handelize(str)
136
- str.downcase.gsub /[^a-z0-9]+/, '-'
140
+ str.downcase.gsub(/[^a-z0-9]+/, '-')
137
141
  end
138
142
 
139
143
  # Return proper image path for a URL
@@ -148,7 +152,7 @@ module Snapcrawl
148
152
 
149
153
  # Return true if the file exists and is not too old
150
154
  def file_fresh?(file)
151
- File.exist?(file) and file_age(file) < @opts.age
155
+ @opts.age > 0 and File.exist?(file) and file_age(file) < @opts.age
152
156
  end
153
157
 
154
158
  # Return file age in seconds
@@ -158,27 +162,33 @@ module Snapcrawl
158
162
 
159
163
  # Process an array of links and return a better one
160
164
  def normalize_links(links)
161
- # Remove the #hash part from all links
162
- links = links.map {|link| link.attribute('href').to_s.gsub(/#.+$/, '')}
165
+ extensions = "png|gif|jpg|pdf|zip"
166
+ beginnings = "mailto|tel"
163
167
 
164
- # Make unique and remove empties
165
- links = links.uniq.reject {|link| link.empty?}
168
+ links_array = []
166
169
 
167
- # Remove links to images and other files
168
- extensions = "png|gif|jpg|pdf|zip"
169
- links = links.reject {|link| link =~ /\.(#{extensions})(\?.*)?$/}
170
+ links.each_with_index do |link|
171
+ link = link.attribute('href').to_s
170
172
 
171
- # Remove mailto, tel links
172
- beginnings = "mailto|tel"
173
- links = links.reject {|link| link =~ /^(#{beginnings})/}
173
+ # remove #hash
174
+ link.gsub!(/#.+$/, '')
175
+ next if link.empty?
174
176
 
175
- # Add the base domain to relative URLs
176
- links = links.map {|link| link =~ /^http/ ? link : "http://#{@base}#{link}"}
177
+ # Remove links to images and other files then to mailto/tel
178
+ next if link =~ /\.(#{extensions})(\?.*)?$/
179
+ next if link =~ /^(#{beginnings})/
180
+
181
+ # Add the base domain to relative URLs
182
+ link = link =~ /^http/ ? link : "#{@opts.base}#{link}"
183
+ link = "http://#{link}" unless link =~ /^http/
177
184
 
178
- # Keep only links in our base domain
179
- links = links.select {|link| link =~ /https?:\/\/#{@base}.*/}
185
+ # Keep only links in our base domain
186
+ next unless link.include? @opts.base
180
187
 
181
- links
188
+ links_array << link
189
+ end
190
+
191
+ links_array.uniq
182
192
  end
183
193
 
184
194
  def show_version
@@ -1,3 +1,3 @@
1
1
  module Snapcrawl
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: snapcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Danny Ben Shitrit
@@ -24,20 +24,6 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0.3'
27
- - !ruby/object:Gem::Dependency
28
- name: net-ssh
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - "~>"
32
- - !ruby/object:Gem::Version
33
- version: '3.0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - "~>"
39
- - !ruby/object:Gem::Version
40
- version: '3.0'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: docopt
43
29
  requirement: !ruby/object:Gem::Requirement
@@ -108,6 +94,48 @@ dependencies:
108
94
  - - "~>"
109
95
  - !ruby/object:Gem::Version
110
96
  version: '0.2'
97
+ - !ruby/object:Gem::Dependency
98
+ name: minitest
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '5.8'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '5.8'
111
+ - !ruby/object:Gem::Dependency
112
+ name: minitest-reporters
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.1'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '1.1'
125
+ - !ruby/object:Gem::Dependency
126
+ name: simplecov
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '0.10'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '0.10'
111
139
  description: Snapcrawl is a command line utility for crawling a website and saving
112
140
  screenshots.
113
141
  email: db@dannyben.com