snapcrawl 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7f8bfcb13d6d049104a97fe95b4f20527c9e93f9
4
- data.tar.gz: 5d39a2e40270cbe8ddfd5e0863016a665059e747
3
+ metadata.gz: e2794d3fe40619ef7de870f738767419cdb893fc
4
+ data.tar.gz: 6a866a6ac4808e7a522e67db0415e31d1d463b62
5
5
  SHA512:
6
- metadata.gz: c0ad7d74dff9e73d5892870cf162c2b61c96a3f316f6ffedf2b9bd84c09c080bf0e330d32a9f567d4f3e8bec14964afe2773e010177236f438046e3f06b87624
7
- data.tar.gz: 015a6dd81b525bcd59cc52360c5d4989542646af72e84e48a350719cb4b88751e6def5ab201b81de9c256c7036e9894dc5ca03ae30829e7a8d4ab2de0af47aaa
6
+ metadata.gz: 6195bff2403b3c4e7900b6c6e33e962eafa37bbbfed78940da5fce4152318bb6715073ce78bc8d7865451eb287a9e57aa77806064634e9e388c20fef8c5f929a
7
+ data.tar.gz: f2b0207a6b976a34e1554adaa305f415191903cdd51b7dfd7bf70735fd225f4ef7de58589ed3aa179cde10e621be170b93897c3523f9e323cbfddf35dfca854f
data/README.md CHANGED
@@ -1,5 +1,10 @@
1
1
  # SnapCrawl - crawl a website and take screenshots
2
2
 
3
+ [![Gem Version](https://badge.fury.io/rb/snapcrawl.svg)](http://badge.fury.io/rb/snapcrawl)
4
+ [![Code Climate](https://codeclimate.com/github/DannyBen/snapcrawl/badges/gpa.svg)](https://codeclimate.com/github/DannyBen/snapcrawl)
5
+ [![Dependency Status](https://gemnasium.com/DannyBen/snapcrawl.svg)](https://gemnasium.com/DannyBen/snapcrawl)
6
+
7
+
3
8
  SnapCrawl is a command line utility for crawling a website and saving
4
9
  screenshots.
5
10
 
@@ -18,8 +23,8 @@ screenshots.
18
23
 
19
24
  ## Usage
20
25
 
21
- $ snapcrawl --help
22
-
26
+ $ snapcrawl --help
27
+
23
28
  Snapcrawl
24
29
 
25
30
  Usage:
@@ -46,11 +51,18 @@ screenshots.
46
51
  snapcrawl go example.com -d2 > out.txt 2> err.txt &
47
52
  snapcrawl go example.com -W360 -H480
48
53
  snapcrawl go example.com --selector "#main-content"
49
- snapcrawl go example.com --only "products|collections"
50
-
54
+ snapcrawl go example.com --only "products|collections"
55
+
51
56
  ---
52
57
 
53
58
  ## Notes
54
59
 
55
60
  1. If a URL cannot be found, SnapCrawl will report to stderr.
56
- You can create a report by running `snapcrawl go example.com 2> err.txt`
61
+ You can create a report by running `snapcrawl go example.com 2> err.txt`
62
+
63
+ ## Todo
64
+
65
+ - [x] Tests (probably against some ad hoc sinatra)
66
+ - [ ] Make the test server start/stop automatically when testing
67
+ - [ ] Move ignored file extensions and mailto/tel links to config
68
+ - [ ] Add screen size presets (also to user-overridable config)
@@ -18,10 +18,10 @@ module Snapcrawl
18
18
  def initialize
19
19
  @storefile = "snapcrawl.pstore"
20
20
  @store = PStore.new(@storefile)
21
- @done = []
22
21
  end
23
22
 
24
23
  def handle(args)
24
+ @done = []
25
25
  begin
26
26
  execute Docopt::docopt(doc, argv: args)
27
27
  rescue Docopt::Exit => e
@@ -34,6 +34,12 @@ module Snapcrawl
34
34
  crawl args['<url>'].dup, opts_from_args(args)
35
35
  end
36
36
 
37
+ def clear_cache
38
+ FileUtils.rm @storefile if File.exist? @storefile
39
+ end
40
+
41
+ private
42
+
37
43
  def crawl(url, opts={})
38
44
  defaults = {
39
45
  width: 1280,
@@ -54,8 +60,6 @@ module Snapcrawl
54
60
  end
55
61
  end
56
62
 
57
- private
58
-
59
63
  def crawl_and_snap(urls)
60
64
  new_urls = []
61
65
  urls.each do |url|
@@ -94,7 +98,7 @@ module Snapcrawl
94
98
  fetch_opts[:div] = @opts.selector if @opts.selector
95
99
  # :top => 0, :left => 0, :width => 100, :height => 100 # dimensions for a specific area
96
100
 
97
- screenshot = f.fetch fetch_opts
101
+ f.fetch fetch_opts
98
102
  say "done"
99
103
  end
100
104
 
@@ -128,12 +132,12 @@ module Snapcrawl
128
132
 
129
133
  # mkdir the screenshots folder, if needed
130
134
  def make_screenshot_dir(dir)
131
- Dir.exists? dir or FileUtils.mkdir_p dir
135
+ Dir.exist? dir or FileUtils.mkdir_p dir
132
136
  end
133
137
 
134
138
  # Convert any string to a proper handle
135
139
  def handelize(str)
136
- str.downcase.gsub /[^a-z0-9]+/, '-'
140
+ str.downcase.gsub(/[^a-z0-9]+/, '-')
137
141
  end
138
142
 
139
143
  # Return proper image path for a URL
@@ -148,7 +152,7 @@ module Snapcrawl
148
152
 
149
153
  # Return true if the file exists and is not too old
150
154
  def file_fresh?(file)
151
- File.exist?(file) and file_age(file) < @opts.age
155
+ @opts.age > 0 and File.exist?(file) and file_age(file) < @opts.age
152
156
  end
153
157
 
154
158
  # Return file age in seconds
@@ -158,27 +162,33 @@ module Snapcrawl
158
162
 
159
163
  # Process an array of links and return a better one
160
164
  def normalize_links(links)
161
- # Remove the #hash part from all links
162
- links = links.map {|link| link.attribute('href').to_s.gsub(/#.+$/, '')}
165
+ extensions = "png|gif|jpg|pdf|zip"
166
+ beginnings = "mailto|tel"
163
167
 
164
- # Make unique and remove empties
165
- links = links.uniq.reject {|link| link.empty?}
168
+ links_array = []
166
169
 
167
- # Remove links to images and other files
168
- extensions = "png|gif|jpg|pdf|zip"
169
- links = links.reject {|link| link =~ /\.(#{extensions})(\?.*)?$/}
170
+ links.each_with_index do |link|
171
+ link = link.attribute('href').to_s
170
172
 
171
- # Remove mailto, tel links
172
- beginnings = "mailto|tel"
173
- links = links.reject {|link| link =~ /^(#{beginnings})/}
173
+ # remove #hash
174
+ link.gsub!(/#.+$/, '')
175
+ next if link.empty?
174
176
 
175
- # Add the base domain to relative URLs
176
- links = links.map {|link| link =~ /^http/ ? link : "http://#{@base}#{link}"}
177
+ # Remove links to images and other files, then mailto/tel links
178
+ next if link =~ /\.(#{extensions})(\?.*)?$/
179
+ next if link =~ /^(#{beginnings})/
180
+
181
+ # Add the base domain to relative URLs
182
+ link = link =~ /^http/ ? link : "#{@opts.base}#{link}"
183
+ link = "http://#{link}" unless link =~ /^http/
177
184
 
178
- # Keep only links in our base domain
179
- links = links.select {|link| link =~ /https?:\/\/#{@base}.*/}
185
+ # Keep only links in our base domain
186
+ next unless link.include? @opts.base
180
187
 
181
- links
188
+ links_array << link
189
+ end
190
+
191
+ links_array.uniq
182
192
  end
183
193
 
184
194
  def show_version
@@ -1,3 +1,3 @@
1
1
  module Snapcrawl
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: snapcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Danny Ben Shitrit
@@ -24,20 +24,6 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0.3'
27
- - !ruby/object:Gem::Dependency
28
- name: net-ssh
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - "~>"
32
- - !ruby/object:Gem::Version
33
- version: '3.0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - "~>"
39
- - !ruby/object:Gem::Version
40
- version: '3.0'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: docopt
43
29
  requirement: !ruby/object:Gem::Requirement
@@ -108,6 +94,48 @@ dependencies:
108
94
  - - "~>"
109
95
  - !ruby/object:Gem::Version
110
96
  version: '0.2'
97
+ - !ruby/object:Gem::Dependency
98
+ name: minitest
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '5.8'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '5.8'
111
+ - !ruby/object:Gem::Dependency
112
+ name: minitest-reporters
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.1'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '1.1'
125
+ - !ruby/object:Gem::Dependency
126
+ name: simplecov
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '0.10'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '0.10'
111
139
  description: Snapcrawl is a command line utility for crawling a website and saving
112
140
  screenshots.
113
141
  email: db@dannyben.com