snapcrawl 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e0dba7d3febfccf88e22194125387c6219cf500f0cb0e00fe4ed343a57fe9034
4
- data.tar.gz: d05440bccd6ddde9773b24c425f3420b1970c3985805f6eb685171237e382cda
3
+ metadata.gz: d5c46ba3f4a171eb59bbc9afa6eca7327c8ef1964268bb63f407354201bb553b
4
+ data.tar.gz: beff7935a00a34edf97207f443a0a8a1b7e4734a6149f3b60fc6c625030a1be2
5
5
  SHA512:
6
- metadata.gz: 71f65748af4339bf606765e2f1ea0aab90f4099ff986ce59318c6c966363ca6386a92d08ce175373cfa8e83c096af844983da2cde22ccbbebd2ab151514b6e72
7
- data.tar.gz: a641b038a18114117375249d260895d73d08cd2cf47c7f668641c0891cd4b4723db80d4313a6a64cd89de02be0ea72aa9fa7b15f74d1ba6e0724017486c2d9c4
6
+ metadata.gz: 6e72274c6fa25d0b397bfd20fd676f0f8c83abb1d493610d58d091af4bdab700614ed8530aeaa75bea89c4c754a9fbea25feec669a62cdfb79d759c296d530ef
7
+ data.tar.gz: 349670f9155d5dae148c77af37f9aab2c6b334c560d73b48ca09a8fe10c15b485daee015082ac76c1698f0aa2d313bccbd747560d731f64738f8c94a801e2b4d
@@ -46,6 +46,7 @@ module Snapcrawl
46
46
  private
47
47
 
48
48
  def crawl(url, opts={})
49
+ url = protocolize url
49
50
  defaults = {
50
51
  width: 1280,
51
52
  height: 0,
@@ -55,13 +56,13 @@ module Snapcrawl
55
56
  name: '%{url}',
56
57
  base: url,
57
58
  }
58
- urls = [protocolize(url)]
59
+ urls = [url]
59
60
 
60
61
  @opts = OpenStruct.new defaults.merge(opts)
61
62
 
62
63
  make_screenshot_dir @opts.folder
63
64
 
64
- @opts.depth.times do
65
+ @opts.depth.times do
65
66
  urls = crawl_and_snap urls
66
67
  end
67
68
  end
@@ -136,9 +137,13 @@ module Snapcrawl
136
137
  if response.success?
137
138
  doc = Nokogiri::HTML response.body
138
139
  links = doc.css('a')
139
- links = normalize_links links
140
+ links, warnings = normalize_links links
140
141
  @store.transaction { @store[url] = links }
141
142
  say "done"
143
+ warnings.each do |warning|
144
+ say "!txtylw! Warn: #{warning[:link]}"
145
+ say word_wrap " #{warning[:message]}"
146
+ end
142
147
  else
143
148
  links = []
144
149
  say "!txtred!FAILED"
@@ -184,6 +189,7 @@ module Snapcrawl
184
189
  beginnings = "mailto|tel"
185
190
 
186
191
  links_array = []
192
+ warnings = []
187
193
 
188
194
  links.each do |link|
189
195
  link = link.attribute('href').to_s
@@ -200,7 +206,16 @@ module Snapcrawl
200
206
  link.strip!
201
207
 
202
208
  # Convert relative links to absolute
203
- link = URI.join( @opts.base, link ).to_s
209
+ begin
210
+ link = URI.join( @opts.base, link ).to_s
211
+ rescue URI::InvalidURIError
212
+ escaped_link = URI.escape link
213
+ warnings << { link: link, message: "Using escaped link: #{escaped_link}" }
214
+ link = URI.join( @opts.base, escaped_link ).to_s
215
+ rescue => e
216
+ warnings << { link: link, message: "#{e.class} #{e.message}" }
217
+ next
218
+ end
204
219
 
205
220
  # Keep only links in our base domain
206
221
  next unless link.include? @opts.base
@@ -208,7 +223,7 @@ module Snapcrawl
208
223
  links_array << link
209
224
  end
210
225
 
211
- links_array.uniq
226
+ [links_array.uniq, warnings]
212
227
  end
213
228
 
214
229
  def doc
@@ -233,7 +248,7 @@ module Snapcrawl
233
248
  end
234
249
 
235
250
  def webshot
236
- Webshot::Screenshot.instance
251
+ @webshot ||= Webshot::Screenshot.instance
237
252
  end
238
253
 
239
254
  # The webshot gem messes with stdout/stderr streams so we keep it in
@@ -1,3 +1,3 @@
1
1
  module Snapcrawl
2
- VERSION = "0.3.0"
2
+ VERSION = "0.3.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: snapcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Danny Ben Shitrit
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-09-10 00:00:00.000000000 Z
11
+ date: 2019-09-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: colsole