snapcrawl 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/snapcrawl/crawler.rb +21 -6
- data/lib/snapcrawl/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d5c46ba3f4a171eb59bbc9afa6eca7327c8ef1964268bb63f407354201bb553b
|
4
|
+
data.tar.gz: beff7935a00a34edf97207f443a0a8a1b7e4734a6149f3b60fc6c625030a1be2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6e72274c6fa25d0b397bfd20fd676f0f8c83abb1d493610d58d091af4bdab700614ed8530aeaa75bea89c4c754a9fbea25feec669a62cdfb79d759c296d530ef
|
7
|
+
data.tar.gz: 349670f9155d5dae148c77af37f9aab2c6b334c560d73b48ca09a8fe10c15b485daee015082ac76c1698f0aa2d313bccbd747560d731f64738f8c94a801e2b4d
|
data/lib/snapcrawl/crawler.rb
CHANGED
@@ -46,6 +46,7 @@ module Snapcrawl
|
|
46
46
|
private
|
47
47
|
|
48
48
|
def crawl(url, opts={})
|
49
|
+
url = protocolize url
|
49
50
|
defaults = {
|
50
51
|
width: 1280,
|
51
52
|
height: 0,
|
@@ -55,13 +56,13 @@ module Snapcrawl
|
|
55
56
|
name: '%{url}',
|
56
57
|
base: url,
|
57
58
|
}
|
58
|
-
urls = [
|
59
|
+
urls = [url]
|
59
60
|
|
60
61
|
@opts = OpenStruct.new defaults.merge(opts)
|
61
62
|
|
62
63
|
make_screenshot_dir @opts.folder
|
63
64
|
|
64
|
-
@opts.depth.times do
|
65
|
+
@opts.depth.times do
|
65
66
|
urls = crawl_and_snap urls
|
66
67
|
end
|
67
68
|
end
|
@@ -136,9 +137,13 @@ module Snapcrawl
|
|
136
137
|
if response.success?
|
137
138
|
doc = Nokogiri::HTML response.body
|
138
139
|
links = doc.css('a')
|
139
|
-
links = normalize_links links
|
140
|
+
links, warnings = normalize_links links
|
140
141
|
@store.transaction { @store[url] = links }
|
141
142
|
say "done"
|
143
|
+
warnings.each do |warning|
|
144
|
+
say "!txtylw! Warn: #{warning[:link]}"
|
145
|
+
say word_wrap " #{warning[:message]}"
|
146
|
+
end
|
142
147
|
else
|
143
148
|
links = []
|
144
149
|
say "!txtred!FAILED"
|
@@ -184,6 +189,7 @@ module Snapcrawl
|
|
184
189
|
beginnings = "mailto|tel"
|
185
190
|
|
186
191
|
links_array = []
|
192
|
+
warnings = []
|
187
193
|
|
188
194
|
links.each do |link|
|
189
195
|
link = link.attribute('href').to_s
|
@@ -200,7 +206,16 @@ module Snapcrawl
|
|
200
206
|
link.strip!
|
201
207
|
|
202
208
|
# Convert relative links to absolute
|
203
|
-
|
209
|
+
begin
|
210
|
+
link = URI.join( @opts.base, link ).to_s
|
211
|
+
rescue URI::InvalidURIError
|
212
|
+
escaped_link = URI.escape link
|
213
|
+
warnings << { link: link, message: "Using escaped link: #{escaped_link}" }
|
214
|
+
link = URI.join( @opts.base, escaped_link ).to_s
|
215
|
+
rescue => e
|
216
|
+
warnings << { link: link, message: "#{e.class} #{e.message}" }
|
217
|
+
next
|
218
|
+
end
|
204
219
|
|
205
220
|
# Keep only links in our base domain
|
206
221
|
next unless link.include? @opts.base
|
@@ -208,7 +223,7 @@ module Snapcrawl
|
|
208
223
|
links_array << link
|
209
224
|
end
|
210
225
|
|
211
|
-
links_array.uniq
|
226
|
+
[links_array.uniq, warnings]
|
212
227
|
end
|
213
228
|
|
214
229
|
def doc
|
@@ -233,7 +248,7 @@ module Snapcrawl
|
|
233
248
|
end
|
234
249
|
|
235
250
|
def webshot
|
236
|
-
Webshot::Screenshot.instance
|
251
|
+
@webshot ||= Webshot::Screenshot.instance
|
237
252
|
end
|
238
253
|
|
239
254
|
# The webshot gem messes with stdout/stderr streams so we keep it in
|
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snapcrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danny Ben Shitrit
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-09-
|
11
|
+
date: 2019-09-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: colsole
|