snapcrawl 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/snapcrawl/crawler.rb +21 -6
- data/lib/snapcrawl/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d5c46ba3f4a171eb59bbc9afa6eca7327c8ef1964268bb63f407354201bb553b
|
4
|
+
data.tar.gz: beff7935a00a34edf97207f443a0a8a1b7e4734a6149f3b60fc6c625030a1be2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6e72274c6fa25d0b397bfd20fd676f0f8c83abb1d493610d58d091af4bdab700614ed8530aeaa75bea89c4c754a9fbea25feec669a62cdfb79d759c296d530ef
|
7
|
+
data.tar.gz: 349670f9155d5dae148c77af37f9aab2c6b334c560d73b48ca09a8fe10c15b485daee015082ac76c1698f0aa2d313bccbd747560d731f64738f8c94a801e2b4d
|
data/lib/snapcrawl/crawler.rb
CHANGED
@@ -46,6 +46,7 @@ module Snapcrawl
|
|
46
46
|
private
|
47
47
|
|
48
48
|
def crawl(url, opts={})
|
49
|
+
url = protocolize url
|
49
50
|
defaults = {
|
50
51
|
width: 1280,
|
51
52
|
height: 0,
|
@@ -55,13 +56,13 @@ module Snapcrawl
|
|
55
56
|
name: '%{url}',
|
56
57
|
base: url,
|
57
58
|
}
|
58
|
-
urls = [
|
59
|
+
urls = [url]
|
59
60
|
|
60
61
|
@opts = OpenStruct.new defaults.merge(opts)
|
61
62
|
|
62
63
|
make_screenshot_dir @opts.folder
|
63
64
|
|
64
|
-
@opts.depth.times do
|
65
|
+
@opts.depth.times do
|
65
66
|
urls = crawl_and_snap urls
|
66
67
|
end
|
67
68
|
end
|
@@ -136,9 +137,13 @@ module Snapcrawl
|
|
136
137
|
if response.success?
|
137
138
|
doc = Nokogiri::HTML response.body
|
138
139
|
links = doc.css('a')
|
139
|
-
links = normalize_links links
|
140
|
+
links, warnings = normalize_links links
|
140
141
|
@store.transaction { @store[url] = links }
|
141
142
|
say "done"
|
143
|
+
warnings.each do |warning|
|
144
|
+
say "!txtylw! Warn: #{warning[:link]}"
|
145
|
+
say word_wrap " #{warning[:message]}"
|
146
|
+
end
|
142
147
|
else
|
143
148
|
links = []
|
144
149
|
say "!txtred!FAILED"
|
@@ -184,6 +189,7 @@ module Snapcrawl
|
|
184
189
|
beginnings = "mailto|tel"
|
185
190
|
|
186
191
|
links_array = []
|
192
|
+
warnings = []
|
187
193
|
|
188
194
|
links.each do |link|
|
189
195
|
link = link.attribute('href').to_s
|
@@ -200,7 +206,16 @@ module Snapcrawl
|
|
200
206
|
link.strip!
|
201
207
|
|
202
208
|
# Convert relative links to absolute
|
203
|
-
|
209
|
+
begin
|
210
|
+
link = URI.join( @opts.base, link ).to_s
|
211
|
+
rescue URI::InvalidURIError
|
212
|
+
escaped_link = URI.escape link
|
213
|
+
warnings << { link: link, message: "Using escaped link: #{escaped_link}" }
|
214
|
+
link = URI.join( @opts.base, escaped_link ).to_s
|
215
|
+
rescue => e
|
216
|
+
warnings << { link: link, message: "#{e.class} #{e.message}" }
|
217
|
+
next
|
218
|
+
end
|
204
219
|
|
205
220
|
# Keep only links in our base domain
|
206
221
|
next unless link.include? @opts.base
|
@@ -208,7 +223,7 @@ module Snapcrawl
|
|
208
223
|
links_array << link
|
209
224
|
end
|
210
225
|
|
211
|
-
links_array.uniq
|
226
|
+
[links_array.uniq, warnings]
|
212
227
|
end
|
213
228
|
|
214
229
|
def doc
|
@@ -233,7 +248,7 @@ module Snapcrawl
|
|
233
248
|
end
|
234
249
|
|
235
250
|
def webshot
|
236
|
-
Webshot::Screenshot.instance
|
251
|
+
@webshot ||= Webshot::Screenshot.instance
|
237
252
|
end
|
238
253
|
|
239
254
|
# The webshot gem messes with stdout/stderr streams so we keep it in
|
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snapcrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danny Ben Shitrit
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-09-
|
11
|
+
date: 2019-09-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: colsole
|