snapcrawl 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e0dba7d3febfccf88e22194125387c6219cf500f0cb0e00fe4ed343a57fe9034
4
- data.tar.gz: d05440bccd6ddde9773b24c425f3420b1970c3985805f6eb685171237e382cda
3
+ metadata.gz: d5c46ba3f4a171eb59bbc9afa6eca7327c8ef1964268bb63f407354201bb553b
4
+ data.tar.gz: beff7935a00a34edf97207f443a0a8a1b7e4734a6149f3b60fc6c625030a1be2
5
5
  SHA512:
6
- metadata.gz: 71f65748af4339bf606765e2f1ea0aab90f4099ff986ce59318c6c966363ca6386a92d08ce175373cfa8e83c096af844983da2cde22ccbbebd2ab151514b6e72
7
- data.tar.gz: a641b038a18114117375249d260895d73d08cd2cf47c7f668641c0891cd4b4723db80d4313a6a64cd89de02be0ea72aa9fa7b15f74d1ba6e0724017486c2d9c4
6
+ metadata.gz: 6e72274c6fa25d0b397bfd20fd676f0f8c83abb1d493610d58d091af4bdab700614ed8530aeaa75bea89c4c754a9fbea25feec669a62cdfb79d759c296d530ef
7
+ data.tar.gz: 349670f9155d5dae148c77af37f9aab2c6b334c560d73b48ca09a8fe10c15b485daee015082ac76c1698f0aa2d313bccbd747560d731f64738f8c94a801e2b4d
@@ -46,6 +46,7 @@ module Snapcrawl
46
46
  private
47
47
 
48
48
  def crawl(url, opts={})
49
+ url = protocolize url
49
50
  defaults = {
50
51
  width: 1280,
51
52
  height: 0,
@@ -55,13 +56,13 @@ module Snapcrawl
55
56
  name: '%{url}',
56
57
  base: url,
57
58
  }
58
- urls = [protocolize(url)]
59
+ urls = [url]
59
60
 
60
61
  @opts = OpenStruct.new defaults.merge(opts)
61
62
 
62
63
  make_screenshot_dir @opts.folder
63
64
 
64
- @opts.depth.times do
65
+ @opts.depth.times do
65
66
  urls = crawl_and_snap urls
66
67
  end
67
68
  end
@@ -136,9 +137,13 @@ module Snapcrawl
136
137
  if response.success?
137
138
  doc = Nokogiri::HTML response.body
138
139
  links = doc.css('a')
139
- links = normalize_links links
140
+ links, warnings = normalize_links links
140
141
  @store.transaction { @store[url] = links }
141
142
  say "done"
143
+ warnings.each do |warning|
144
+ say "!txtylw! Warn: #{warning[:link]}"
145
+ say word_wrap " #{warning[:message]}"
146
+ end
142
147
  else
143
148
  links = []
144
149
  say "!txtred!FAILED"
@@ -184,6 +189,7 @@ module Snapcrawl
184
189
  beginnings = "mailto|tel"
185
190
 
186
191
  links_array = []
192
+ warnings = []
187
193
 
188
194
  links.each do |link|
189
195
  link = link.attribute('href').to_s
@@ -200,7 +206,16 @@ module Snapcrawl
200
206
  link.strip!
201
207
 
202
208
  # Convert relative links to absolute
203
- link = URI.join( @opts.base, link ).to_s
209
+ begin
210
+ link = URI.join( @opts.base, link ).to_s
211
+ rescue URI::InvalidURIError
212
+ escaped_link = URI.escape link
213
+ warnings << { link: link, message: "Using escaped link: #{escaped_link}" }
214
+ link = URI.join( @opts.base, escaped_link ).to_s
215
+ rescue => e
216
+ warnings << { link: link, message: "#{e.class} #{e.message}" }
217
+ next
218
+ end
204
219
 
205
220
  # Keep only links in our base domain
206
221
  next unless link.include? @opts.base
@@ -208,7 +223,7 @@ module Snapcrawl
208
223
  links_array << link
209
224
  end
210
225
 
211
- links_array.uniq
226
+ [links_array.uniq, warnings]
212
227
  end
213
228
 
214
229
  def doc
@@ -233,7 +248,7 @@ module Snapcrawl
233
248
  end
234
249
 
235
250
  def webshot
236
- Webshot::Screenshot.instance
251
+ @webshot ||= Webshot::Screenshot.instance
237
252
  end
238
253
 
239
254
  # The webshot gem messes with stdout/stderr streams so we keep it in
@@ -1,3 +1,3 @@
1
1
  module Snapcrawl
2
- VERSION = "0.3.0"
2
+ VERSION = "0.3.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: snapcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Danny Ben Shitrit
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-09-10 00:00:00.000000000 Z
11
+ date: 2019-09-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: colsole