snapcrawl 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/snapcrawl.rb +1 -1
- data/lib/snapcrawl/crawler.rb +4 -12
- data/lib/snapcrawl/exceptions.rb +4 -0
- data/lib/snapcrawl/version.rb +1 -1
- metadata +2 -2
- data/lib/snapcrawl/templates/docopt.txt +0 -48
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 327da92cb63a1a8e6f58e58a4806d4e10b2cfa491960306544165be6423c9b3e
|
4
|
+
data.tar.gz: c6f2056f9ca5614a76bce68bdf2f001668ab626764bce89cf1b1bc4a8f68f833
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 91b00e39fbf5943501cc7f67eb1c684811d10ae3f5acc0263a3a6259ae64ad51d01d89aba75576990d7517a07a53660ec1f63d13adbbaf5bdd6380b7d9dd8050
|
7
|
+
data.tar.gz: 318d7c11aa087a20a8f5c0dd922e9f11f2eeca1be7c165bfa04d54d775d0688e3d3532434a4987cca039c362bb3817542331ce135f8ec686f0178d6e5fa343e5
|
data/lib/snapcrawl.rb
CHANGED
data/lib/snapcrawl/crawler.rb
CHANGED
@@ -5,16 +5,12 @@ require 'httparty'
|
|
5
5
|
require 'nokogiri'
|
6
6
|
require 'ostruct'
|
7
7
|
require 'pstore'
|
8
|
-
require 'uri'
|
9
8
|
require 'addressable/uri'
|
10
9
|
require 'webshot'
|
11
10
|
|
12
11
|
module Snapcrawl
|
13
12
|
include Colsole
|
14
13
|
|
15
|
-
class MissingPhantomJS < StandardError; end
|
16
|
-
class MissingImageMagick < StandardError; end
|
17
|
-
|
18
14
|
class Crawler
|
19
15
|
include Singleton
|
20
16
|
|
@@ -206,11 +202,7 @@ module Snapcrawl
|
|
206
202
|
|
207
203
|
# Convert relative links to absolute
|
208
204
|
begin
|
209
|
-
link = URI.join( @opts.base, link ).to_s.dup
|
210
|
-
rescue URI::InvalidURIError
|
211
|
-
escaped_link = Addressable::URI.encode link
|
212
|
-
warnings << { link: link, message: "Using escaped link: #{escaped_link}" }
|
213
|
-
link = URI.join( @opts.base, escaped_link ).to_s.dup
|
205
|
+
link = Addressable::URI.join( @opts.base, link ).to_s.dup
|
214
206
|
rescue => e
|
215
207
|
warnings << { link: link, message: "#{e.class} #{e.message}" }
|
216
208
|
next
|
@@ -226,11 +218,11 @@ module Snapcrawl
|
|
226
218
|
end
|
227
219
|
|
228
220
|
def doc
|
229
|
-
@doc ||= File.read
|
221
|
+
@doc ||= File.read docopt
|
230
222
|
end
|
231
223
|
|
232
|
-
def
|
233
|
-
File.expand_path
|
224
|
+
def docopt
|
225
|
+
File.expand_path "docopt.txt", __dir__
|
234
226
|
end
|
235
227
|
|
236
228
|
def opts_from_args(args)
|
data/lib/snapcrawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: snapcrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.1
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danny Ben Shitrit
|
@@ -112,7 +112,7 @@ files:
|
|
112
112
|
- bin/snapcrawl
|
113
113
|
- lib/snapcrawl.rb
|
114
114
|
- lib/snapcrawl/crawler.rb
|
115
|
-
- lib/snapcrawl/templates/docopt.txt
|
115
|
+
- lib/snapcrawl/exceptions.rb
|
116
116
|
- lib/snapcrawl/version.rb
|
117
117
|
homepage: https://github.com/DannyBen/snapcrawl
|
118
118
|
licenses:
|
@@ -1,48 +0,0 @@
|
|
1
|
-
Snapcrawl
|
2
|
-
|
3
|
-
Usage:
|
4
|
-
snapcrawl URL [options]
|
5
|
-
snapcrawl -h | --help
|
6
|
-
snapcrawl -v | --version
|
7
|
-
|
8
|
-
Options:
|
9
|
-
-f, --folder PATH
|
10
|
-
Where to save screenshots [default: snaps]
|
11
|
-
|
12
|
-
-n, --name TEMPLATE
|
13
|
-
Filename template. Include the string '%{url}' anywhere in the name to
|
14
|
-
use the captured URL in the filename [default: %{url}]
|
15
|
-
|
16
|
-
-a, --age SECONDS
|
17
|
-
Number of seconds to consider screenshots fresh [default: 86400]
|
18
|
-
|
19
|
-
-d, --depth LEVELS
|
20
|
-
Number of levels to crawl [default: 1]
|
21
|
-
|
22
|
-
-W, --width PIXELS
|
23
|
-
Screen width in pixels [default: 1280]
|
24
|
-
|
25
|
-
-H, --height PIXELS
|
26
|
-
Screen height in pixels. Use 0 to capture the full page [default: 0]
|
27
|
-
|
28
|
-
-s, --selector SELECTOR
|
29
|
-
CSS selector to capture
|
30
|
-
|
31
|
-
-o, --only REGEX
|
32
|
-
Include only URLs that match REGEX
|
33
|
-
|
34
|
-
-h, --help
|
35
|
-
Show this screen
|
36
|
-
|
37
|
-
-v, --version
|
38
|
-
Show version number
|
39
|
-
|
40
|
-
Examples:
|
41
|
-
snapcrawl example.com
|
42
|
-
snapcrawl example.com -d2 -fscreens
|
43
|
-
snapcrawl example.com -d2 > out.txt 2> err.txt &
|
44
|
-
snapcrawl example.com -W360 -H480
|
45
|
-
snapcrawl example.com --selector "#main-content"
|
46
|
-
snapcrawl example.com --only "products|collections"
|
47
|
-
snapcrawl example.com --name "screenshot-%{url}"
|
48
|
-
snapcrawl example.com --name "`date +%Y%m%d`_%{url}"
|