powerdlz23 1.2.4 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/grell/.rspec +2 -0
- package/grell/.travis.yml +28 -0
- package/grell/CHANGELOG.md +111 -0
- package/grell/Gemfile +7 -0
- package/grell/LICENSE.txt +22 -0
- package/grell/README.md +213 -0
- package/grell/Rakefile +2 -0
- package/grell/grell.gemspec +36 -0
- package/grell/lib/grell/capybara_driver.rb +44 -0
- package/grell/lib/grell/crawler.rb +83 -0
- package/grell/lib/grell/crawler_manager.rb +84 -0
- package/grell/lib/grell/grell_logger.rb +10 -0
- package/grell/lib/grell/page.rb +275 -0
- package/grell/lib/grell/page_collection.rb +62 -0
- package/grell/lib/grell/rawpage.rb +62 -0
- package/grell/lib/grell/reader.rb +18 -0
- package/grell/lib/grell/version.rb +3 -0
- package/grell/lib/grell.rb +11 -0
- package/grell/spec/lib/capybara_driver_spec.rb +38 -0
- package/grell/spec/lib/crawler_manager_spec.rb +174 -0
- package/grell/spec/lib/crawler_spec.rb +361 -0
- package/grell/spec/lib/page_collection_spec.rb +159 -0
- package/grell/spec/lib/page_spec.rb +418 -0
- package/grell/spec/lib/reader_spec.rb +43 -0
- package/grell/spec/spec_helper.rb +66 -0
- package/heartmagic/config.py +1 -0
- package/heartmagic/heart.py +3 -0
- package/heartmagic/pytransform/__init__.py +483 -0
- package/heartmagic/pytransform/_pytransform.dll +0 -0
- package/heartmagic/pytransform/_pytransform.so +0 -0
- package/httpStatusCode/README.md +2 -0
- package/httpStatusCode/httpStatusCode.js +4 -0
- package/httpStatusCode/reasonPhrases.js +344 -0
- package/httpStatusCode/statusCodes.js +344 -0
- package/package.json +1 -1
- package/snapcrawl/.changelog.old.md +157 -0
- package/snapcrawl/.gitattributes +1 -0
- package/snapcrawl/.github/workflows/test.yml +41 -0
- package/snapcrawl/.rspec +3 -0
- package/snapcrawl/.rubocop.yml +23 -0
- package/snapcrawl/CHANGELOG.md +182 -0
- package/snapcrawl/Gemfile +15 -0
- package/snapcrawl/LICENSE +21 -0
- package/snapcrawl/README.md +135 -0
- package/snapcrawl/Runfile +35 -0
- package/snapcrawl/bin/snapcrawl +25 -0
- package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
- package/snapcrawl/lib/snapcrawl/config.rb +60 -0
- package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
- package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
- package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
- package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
- package/snapcrawl/lib/snapcrawl/page.rb +118 -0
- package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
- package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
- package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
- package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
- package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
- package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
- package/snapcrawl/lib/snapcrawl/version.rb +3 -0
- package/snapcrawl/lib/snapcrawl.rb +20 -0
- package/snapcrawl/snapcrawl.gemspec +27 -0
- package/snapcrawl/snapcrawl.yml +41 -0
- package/snapcrawl/spec/README.md +16 -0
- package/snapcrawl/spec/approvals/bin/help +26 -0
- package/snapcrawl/spec/approvals/bin/usage +4 -0
- package/snapcrawl/spec/approvals/cli/usage +4 -0
- package/snapcrawl/spec/approvals/config/defaults +15 -0
- package/snapcrawl/spec/approvals/config/minimal +15 -0
- package/snapcrawl/spec/approvals/integration/blacklist +14 -0
- package/snapcrawl/spec/approvals/integration/default-config +14 -0
- package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
- package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
- package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
- package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
- package/snapcrawl/spec/approvals/integration/whitelist +14 -0
- package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
- package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
- package/snapcrawl/spec/server/config.ru +97 -0
- package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
- package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
- package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
- package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
- package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
- package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
- package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
- package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
- package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
- package/snapcrawl/spec/spec_helper.rb +22 -0
- package/snapcrawl/spec/spec_mixin.rb +10 -0
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
require 'addressable/uri'
|
|
2
|
+
require 'fileutils'
|
|
3
|
+
require 'httparty'
|
|
4
|
+
require 'lightly'
|
|
5
|
+
require 'nokogiri'
|
|
6
|
+
|
|
7
|
+
module Snapcrawl
  # Represents a single crawled URL: wraps the HTTP fetch (with caching),
  # link extraction, and screenshot delegation for one page.
  class Page
    using StringRefinements

    attr_reader :url, :depth

    # Link targets ending with these extensions are never followed.
    EXTENSION_BLACKLIST = 'png|gif|jpg|pdf|zip'

    # Link targets using these URI schemes are never followed.
    PROTOCOL_BLACKLIST = 'mailto|tel'

    # url   - the page address (protocol added by StringRefinements#protocolize if missing)
    # depth - crawl depth of this page relative to the root (root is 0)
    def initialize(url, depth: 0)
      @url = url.protocolize
      @depth = depth
    end

    # True when the page was fetched and returned a successful HTTP status.
    def valid?
      http_response&.success?
    end

    # Scheme + host portion of the URL (e.g. "https://example.com").
    def site
      @site ||= Addressable::URI.parse(url).site
    end

    # Path + query portion of the URL (e.g. "/about?x=1").
    def path
      @path ||= Addressable::URI.parse(url).request_uri
    end

    # All unique, same-site, absolute link URLs found in the page body,
    # or nil when the page could not be fetched successfully.
    def links
      return nil unless valid?

      anchors = Nokogiri::HTML(http_response.body).css('a')
      normalize_links anchors
    end

    # Child Page objects (depth + 1) for each link, or nil when invalid.
    def pages
      return nil unless valid?

      links.map { |target| Page.new target, depth: depth + 1 }
    end

    # Captures a screenshot of this page into outfile.
    # Returns false without capturing when the page is invalid.
    def save_screenshot(outfile)
      return false unless valid?

      Screenshot.new(url).save outfile
    end

    private

    # Memoized HTTP response; nil when the request raised an error.
    def http_response
      @http_response ||= http_response!
    end

    # Fetches the page through the cache. Logs a warning for non-success
    # status codes, and returns nil (after logging) on any raised error.
    def http_response!
      response = cache.get(url) { HTTParty.get url, httparty_options }

      unless response.success?
        $logger.warn "http error on mu`#{url}`, code: y`#{response.code}`, message: #{response.message.strip}"
      end

      response
    rescue => e
      $logger.error "http error on mu`#{url}` - r`#{e.class}`: #{e.message}"
      nil
    end

    # Extra options for HTTParty, honoring the SSL verification setting.
    def httparty_options
      Config.skip_ssl_verification ? { verify: false } : {}
    end

    # Maps raw anchor nodes to normalized URLs, dropping rejected links
    # and duplicates.
    def normalize_links(links)
      links.filter_map { |anchor| normalize_link anchor }.uniq
    end

    # Returns an absolute, same-site URL for the anchor, or nil when the
    # link is empty, blacklisted, unparsable, or points off-site.
    def normalize_link(link)
      link = link.attribute('href').to_s.dup

      # Remove #hash
      link.gsub!(/#.+$/, '')
      return nil if link.empty?

      # Remove links to specific extensions and protocols
      return nil if /\.(#{EXTENSION_BLACKLIST})(\?.*)?$/o.match?(link)
      return nil if /^(#{PROTOCOL_BLACKLIST}):/o.match?(link)

      # Strip spaces
      link.strip!

      # Convert relative links to absolute
      begin
        link = Addressable::URI.join(url, link).to_s.dup
      rescue => e
        $logger.warn "r`#{e.class}`: #{e.message} on #{path} (link: #{link})"
        return nil
      end

      # Keep only links in our base domain
      return nil unless link.include? site

      link
    end

    # A cache handle honoring the configured cache life.
    def cache
      Lightly.new life: Config.cache_life
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
module Snapcrawl
  # Refines Array with #pair_split, which converts an array of
  # 'key=value' strings into a hash with lightly-typed values.
  module PairSplit
    refine Array do
      # Converts ['depth=1', 'log_color=no'] to { 'depth' => 1, 'log_color' => false }.
      #
      # Values are cast as follows:
      # - digits only    => Integer
      # - 'no' / 'false' => false
      # - 'yes' / 'true' => true
      # - anything else  => String (nil when the pair has no '=')
      def pair_split
        false_values = %w[no false]
        true_values = %w[yes true]

        to_h do |pair|
          # Split on the first '=' only, so values may themselves contain '='
          # (e.g. 'url_whitelist=page=1' => 'page=1')
          key, value = pair.split('=', 2)

          # \A..\z anchors the whole string; ^..$ would match per line and
          # cast multi-line values like "1\n2" to an Integer
          value = if value && /\A\d+\z/.match?(value)
            value.to_i
          elsif false_values.include? value
            false
          elsif true_values.include? value
            true
          else
            value
          end

          [key, value]
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
require 'webshot'
|
|
2
|
+
|
|
3
|
+
module Snapcrawl
  # Captures a PNG screenshot of a single URL via the webshot gem,
  # applying the globally configured dimensions and capture options.
  class Screenshot
    using StringRefinements

    attr_reader :url

    def initialize(url)
      @url = url
    end

    # Saves a screenshot of the URL to outfile.
    # When outfile is nil, a slug of the URL (with .png) is used instead.
    def save(outfile = nil)
      webshot_capture url, outfile || "#{url.to_slug}.png"
    end

    private

    # Wraps the capture so any failure surfaces as a ScreenshotError.
    def webshot_capture(url, image_path)
      webshot_capture! url, image_path
    rescue => e
      raise ScreenshotError, "#{e.class} #{e.message}"
    end

    # Performs the actual capture, post-processing the image with
    # ImageMagick: white background, top-aligned, full quality, sized to
    # the configured width (and height, when a height is set).
    def webshot_capture!(url, image_path)
      hide_output do
        webshot.capture url, image_path, webshot_options do |magick|
          magick.combine_options do |options|
            size = Config.height.positive? ? "#{Config.width}x#{Config.height}" : "#{Config.width}x"

            options.background 'white'
            options.gravity 'north'
            options.quality 100
            options.extent size
          end
        end
      end
    end

    # Options forwarded to Webshot#capture, derived from Config.
    def webshot_options
      result = { allowed_status_codes: [404, 401, 403] }

      if Config.css_selector
        result[:selector] = Config.css_selector
        result[:full] = false
      end

      result[:timeout] = Config.screenshot_delay if Config.screenshot_delay

      result
    end

    # Memoized webshot singleton.
    def webshot
      @webshot ||= Webshot::Screenshot.instance
    end

    # The webshot gem messes with stdout/stderr streams so we keep it in
    # check by using this method. Also, in some sites (e.g. uown.co) it
    # prints some output to stdout, this is why we override $stdout for
    # the duration of the run.
    def hide_output
      saved_stdout = $stdout
      saved_stderr = $stderr
      $stdout = StringIO.new
      $stderr = StringIO.new
      yield
    ensure
      $stdout = saved_stdout
      $stderr = saved_stderr
    end
  end
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# All values below are the default values
|
|
2
|
+
|
|
3
|
+
# log level (0-4) 0=DEBUG 1=INFO 2=WARN 3=ERROR 4=FATAL
|
|
4
|
+
log_level: 1
|
|
5
|
+
|
|
6
|
+
# log_color (yes, no, auto)
|
|
7
|
+
# yes = always show log color
|
|
8
|
+
# no = never use colors
|
|
9
|
+
# auto = only use colors when running in an interactive terminal
|
|
10
|
+
log_color: auto
|
|
11
|
+
|
|
12
|
+
# number of levels to crawl, 0 means capture only the root URL
|
|
13
|
+
depth: 1
|
|
14
|
+
|
|
15
|
+
# screenshot width in pixels
|
|
16
|
+
width: 1280
|
|
17
|
+
|
|
18
|
+
# screenshot height in pixels, 0 means the entire height
|
|
19
|
+
height: 0
|
|
20
|
+
|
|
21
|
+
# number of seconds to consider the page cache and its screenshot fresh
|
|
22
|
+
cache_life: 86400
|
|
23
|
+
|
|
24
|
+
# where to store the HTML page cache
|
|
25
|
+
cache_dir: cache
|
|
26
|
+
|
|
27
|
+
# where to store screenshots
|
|
28
|
+
snaps_dir: snaps
|
|
29
|
+
|
|
30
|
+
# screenshot filename template, where '%{url}' will be replaced with a
|
|
31
|
+
# slug version of the URL (no need to include the .png extension)
|
|
32
|
+
name_template: '%{url}'
|
|
33
|
+
|
|
34
|
+
# urls not matching this regular expression will be ignored
|
|
35
|
+
url_whitelist:
|
|
36
|
+
|
|
37
|
+
# urls matching this regular expression will be ignored
|
|
38
|
+
url_blacklist:
|
|
39
|
+
|
|
40
|
+
# take a screenshot of this CSS selector only
|
|
41
|
+
css_selector:
|
|
42
|
+
|
|
43
|
+
# when true, ignore SSL related errors
|
|
44
|
+
skip_ssl_verification: false
|
|
45
|
+
|
|
46
|
+
# set to any number of seconds to wait for the page to load before taking
|
|
47
|
+
# a screenshot, leave empty to not wait at all (only needed for pages with
|
|
48
|
+
# animations or other post-load events).
|
|
49
|
+
screenshot_delay:
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Snapcrawl
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
snapcrawl URL [--config FILE] [SETTINGS...]
|
|
5
|
+
snapcrawl -h | --help
|
|
6
|
+
snapcrawl -v | --version
|
|
7
|
+
|
|
8
|
+
Options:
|
|
9
|
+
-c, --config FILE
|
|
10
|
+
Path to config file, with or without the .yml extension.
|
|
11
|
+
A sample file will be created if not found.
|
|
12
|
+
The default filename is 'snapcrawl.yml'.
|
|
13
|
+
|
|
14
|
+
-h, --help
|
|
15
|
+
Show this screen
|
|
16
|
+
|
|
17
|
+
-v, --version
|
|
18
|
+
Show version number
|
|
19
|
+
|
|
20
|
+
Settings:
|
|
21
|
+
Provide any of the options available in the config as 'key=value'.
|
|
22
|
+
|
|
23
|
+
Examples:
|
|
24
|
+
snapcrawl example.com
|
|
25
|
+
snapcrawl example.com --config simple
|
|
26
|
+
snapcrawl example.com depth=1 log_level=2 width=768
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
require 'snapcrawl/version'
require 'snapcrawl/exceptions'
require 'snapcrawl/refinements/pair_split'
require 'snapcrawl/refinements/string_refinements'
require 'snapcrawl/log_helpers'
require 'snapcrawl/pretty_logger'
require 'snapcrawl/dependencies'
require 'snapcrawl/config'
require 'snapcrawl/screenshot'
require 'snapcrawl/page'
require 'snapcrawl/crawler'
require 'snapcrawl/cli'

# Debugging aids, loaded only when the BYEBUG environment variable is set
if ENV['BYEBUG']
  require 'byebug'
  require 'lp'
end

# Load configuration defaults and expose a global logger used throughout
Snapcrawl::Config.load
$logger = Snapcrawl::PrettyLogger.new
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Make lib/ requirable so the version constant can be loaded below
lib = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'snapcrawl/version'

Gem::Specification.new do |s|
  s.name = 'snapcrawl'
  s.version = Snapcrawl::VERSION
  s.summary = 'Crawl a website and take screenshots (CLI + Library)'
  s.description = 'Snapcrawl is a command line utility for crawling a website and saving screenshots.'
  s.authors = ['Danny Ben Shitrit']
  s.email = 'db@dannyben.com'
  # Ship only the readme and the library code
  s.files = Dir['README.md', 'lib/**/*']
  s.executables = ['snapcrawl']
  s.homepage = 'https://github.com/DannyBen/snapcrawl'
  s.license = 'MIT'
  s.required_ruby_version = '>= 3.0'

  s.add_runtime_dependency 'addressable', '~> 2.7'
  s.add_runtime_dependency 'colsole', '>= 0.8.1', '< 2'
  s.add_runtime_dependency 'docopt', '~> 0.6'
  s.add_runtime_dependency 'httparty', '~> 0.21'
  s.add_runtime_dependency 'lightly', '~> 0.3'
  s.add_runtime_dependency 'nokogiri', '~> 1.10'
  s.add_runtime_dependency 'sting', '~> 0.4'
  s.add_runtime_dependency 'webshot', '~> 0.1'

  # Require multi-factor authentication for publishing this gem
  s.metadata['rubygems_mfa_required'] = 'true'
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# All values below are the default values
|
|
2
|
+
|
|
3
|
+
# log level (0-4) 0=DEBUG 1=INFO 2=WARN 3=ERROR 4=FATAL
|
|
4
|
+
log_level: 1
|
|
5
|
+
|
|
6
|
+
# log_color (yes, no, auto)
|
|
7
|
+
# yes = always show log color
|
|
8
|
+
# no = never use colors
|
|
9
|
+
# auto = only use colors when running in an interactive terminal
|
|
10
|
+
log_color: auto
|
|
11
|
+
|
|
12
|
+
# number of levels to crawl, 0 means capture only the root URL
|
|
13
|
+
depth: 1
|
|
14
|
+
|
|
15
|
+
# screenshot width in pixels
|
|
16
|
+
width: 1280
|
|
17
|
+
|
|
18
|
+
# screenshot height in pixels, 0 means the entire height
|
|
19
|
+
height: 0
|
|
20
|
+
|
|
21
|
+
# number of seconds to consider the page cache and its screenshot fresh
|
|
22
|
+
cache_life: 86400
|
|
23
|
+
|
|
24
|
+
# where to store the HTML page cache
|
|
25
|
+
cache_dir: cache
|
|
26
|
+
|
|
27
|
+
# where to store screenshots
|
|
28
|
+
snaps_dir: snaps
|
|
29
|
+
|
|
30
|
+
# screenshot filename template, where '%{url}' will be replaced with a
|
|
31
|
+
# slug version of the URL (no need to include the .png extension)
|
|
32
|
+
name_template: '%{url}'
|
|
33
|
+
|
|
34
|
+
# urls not matching this regular expression will be ignored
|
|
35
|
+
url_whitelist:
|
|
36
|
+
|
|
37
|
+
# urls matching this regular expression will be ignored
|
|
38
|
+
url_blacklist:
|
|
39
|
+
|
|
40
|
+
# take a screenshot of this CSS selector only
|
|
41
|
+
css_selector:
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Snapcrawl Tests
|
|
2
|
+
==================================================
|
|
3
|
+
|
|
4
|
+
Running Tests
|
|
5
|
+
--------------------------------------------------
|
|
6
|
+
|
|
7
|
+
```shell
|
|
8
|
+
# Start a dummy sinatra server
|
|
9
|
+
$ run server start -d
|
|
10
|
+
|
|
11
|
+
# Run all tests
|
|
12
|
+
$ run spec
|
|
13
|
+
|
|
14
|
+
# Stop the server if you are done testing
|
|
15
|
+
$ run server stop
|
|
16
|
+
```
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Snapcrawl
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
snapcrawl URL [--config FILE] [SETTINGS...]
|
|
5
|
+
snapcrawl -h | --help
|
|
6
|
+
snapcrawl -v | --version
|
|
7
|
+
|
|
8
|
+
Options:
|
|
9
|
+
-c, --config FILE
|
|
10
|
+
Path to config file, with or without the .yml extension.
|
|
11
|
+
A sample file will be created if not found.
|
|
12
|
+
The default filename is 'snapcrawl.yml'.
|
|
13
|
+
|
|
14
|
+
-h, --help
|
|
15
|
+
Show this screen
|
|
16
|
+
|
|
17
|
+
-v, --version
|
|
18
|
+
Show version number
|
|
19
|
+
|
|
20
|
+
Settings:
|
|
21
|
+
Provide any of the options available in the config as 'key=value'.
|
|
22
|
+
|
|
23
|
+
Examples:
|
|
24
|
+
snapcrawl example.com
|
|
25
|
+
snapcrawl example.com --config simple
|
|
26
|
+
snapcrawl example.com depth=1 log_level=2 width=768
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
---
|
|
2
|
+
depth: 1
|
|
3
|
+
width: 1280
|
|
4
|
+
height: 0
|
|
5
|
+
cache_life: 86400
|
|
6
|
+
cache_dir: cache
|
|
7
|
+
snaps_dir: snaps
|
|
8
|
+
name_template: "%{url}"
|
|
9
|
+
url_whitelist:
|
|
10
|
+
url_blacklist:
|
|
11
|
+
css_selector:
|
|
12
|
+
log_level: 1
|
|
13
|
+
log_color: auto
|
|
14
|
+
skip_ssl_verification: false
|
|
15
|
+
screenshot_delay:
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
---
|
|
2
|
+
depth: 3
|
|
3
|
+
width: 768
|
|
4
|
+
height: 0
|
|
5
|
+
cache_life: 86400
|
|
6
|
+
cache_dir: cache
|
|
7
|
+
snaps_dir: snaps
|
|
8
|
+
name_template: "%{url}"
|
|
9
|
+
url_whitelist:
|
|
10
|
+
url_blacklist:
|
|
11
|
+
css_selector:
|
|
12
|
+
log_level: 3
|
|
13
|
+
log_color: false
|
|
14
|
+
skip_ssl_verification: false
|
|
15
|
+
screenshot_delay:
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
[36mDEBUG[0m : initializing cli
|
|
2
|
+
[36mDEBUG[0m : initializing crawler with [32mhttp://localhost:3000/filters[0m
|
|
3
|
+
[36mDEBUG[0m : config {"depth"=>1, "width"=>1280, "height"=>0, "cache_life"=>86400, "cache_dir"=>"cache", "snaps_dir"=>"snaps", "name_template"=>"%%{url}", "url_whitelist"=>nil, "url_blacklist"=>"exclude", "css_selector"=>nil, "log_level"=>0, "log_color"=>"auto", "skip_ssl_verification"=>false, "screenshot_delay"=>nil}
|
|
4
|
+
[36mDEBUG[0m : processing queue: [32m1 remaining[0m
|
|
5
|
+
[34m INFO[0m : processing [4m[35mhttp://localhost:3000/filters[0m, depth: 0
|
|
6
|
+
[34m INFO[0m : screenshot for /filters already exists
|
|
7
|
+
[36mDEBUG[0m : ignoring [4m[35mhttp://localhost:3000/filters/exclude-me/1[0m, reason: blacklist
|
|
8
|
+
[36mDEBUG[0m : ignoring [4m[35mhttp://localhost:3000/filters/exclude-me/2[0m, reason: blacklist
|
|
9
|
+
[36mDEBUG[0m : processing queue: [32m2 remaining[0m
|
|
10
|
+
[34m INFO[0m : processing [4m[35mhttp://localhost:3000/filters/include-me/1[0m, depth: 1
|
|
11
|
+
[34m INFO[0m : screenshot for /filters/include-me/1 already exists
|
|
12
|
+
[36mDEBUG[0m : processing queue: [32m1 remaining[0m
|
|
13
|
+
[34m INFO[0m : processing [4m[35mhttp://localhost:3000/filters/include-me/2[0m, depth: 1
|
|
14
|
+
[34m INFO[0m : screenshot for /filters/include-me/2 already exists
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
[36mDEBUG[0m : verifying [32mphantomjs[0m is present
|
|
2
|
+
[36mDEBUG[0m : verifying [32mimagemagick[0m is present
|
|
3
|
+
[36mDEBUG[0m : initializing cli
|
|
4
|
+
[36mDEBUG[0m : initializing crawler with [32mhttp://localhost:3000[0m
|
|
5
|
+
[36mDEBUG[0m : config {"depth"=>1, "width"=>1280, "height"=>0, "cache_life"=>86400, "cache_dir"=>"cache", "snaps_dir"=>"snaps", "name_template"=>"%%{url}", "url_whitelist"=>nil, "url_blacklist"=>nil, "css_selector"=>nil, "log_level"=>1, "log_color"=>"auto", "skip_ssl_verification"=>false, "screenshot_delay"=>nil}
|
|
6
|
+
[36mDEBUG[0m : processing queue: [32m1 remaining[0m
|
|
7
|
+
[34m INFO[0m : processing [4m[35mhttp://localhost:3000[0m, depth: 0
|
|
8
|
+
[34m INFO[0m : [1m[32mcapturing screenshot for /[0m
|
|
9
|
+
[36mDEBUG[0m : processing queue: [32m2 remaining[0m
|
|
10
|
+
[34m INFO[0m : processing [4m[35mhttp://localhost:3000/page[0m, depth: 1
|
|
11
|
+
[34m INFO[0m : [1m[32mcapturing screenshot for /page[0m
|
|
12
|
+
[36mDEBUG[0m : processing queue: [32m1 remaining[0m
|
|
13
|
+
[34m INFO[0m : processing [4m[35mhttp://localhost:3000/errors[0m, depth: 1
|
|
14
|
+
[34m INFO[0m : [1m[32mcapturing screenshot for /errors[0m
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
[36mDEBUG[0m : initializing cli
|
|
2
|
+
[36mDEBUG[0m : initializing crawler with [32mhttp://localhost:3000[0m
|
|
3
|
+
[36mDEBUG[0m : config {"depth"=>0, "width"=>1280, "height"=>0, "cache_life"=>86400, "cache_dir"=>"cache", "snaps_dir"=>"snaps", "name_template"=>"%%{url}", "url_whitelist"=>nil, "url_blacklist"=>nil, "css_selector"=>nil, "log_level"=>1, "log_color"=>"auto", "skip_ssl_verification"=>false, "screenshot_delay"=>nil}
|
|
4
|
+
[36mDEBUG[0m : processing queue: [32m1 remaining[0m
|
|
5
|
+
[34m INFO[0m : processing [4m[35mhttp://localhost:3000[0m, depth: 0
|
|
6
|
+
[34m INFO[0m : screenshot for / already exists
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
[33m WARN[0m : [31mAddressable::URI::InvalidURIError[0m: Invalid scheme format: '\problematic ' on /page (link: \problematic : link)
|
|
2
|
+
[33m WARN[0m : http error on [4m[35mhttp://localhost:3000/broken[0m, code: [33m404[0m, message: Not Found
|
|
3
|
+
[33m WARN[0m : http error on [4m[35mhttp://localhost:3000/secret[0m, code: [33m401[0m, message: Unauthorized
|
|
4
|
+
[33m WARN[0m : http error on [4m[35mhttp://localhost:3000/500[0m, code: [33m500[0m, message: Internal Server Error
|
|
5
|
+
[33m WARN[0m : http error on [4m[35mhttp://localhost:3000/401[0m, code: [33m401[0m, message: Unauthorized
|
|
6
|
+
[33m WARN[0m : http error on [4m[35mhttp://localhost:3000/403[0m, code: [33m403[0m, message: Forbidden
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
INFO : processing http://localhost:3000, depth: 0
|
|
2
|
+
INFO : screenshot for / already exists
|
|
3
|
+
INFO : processing http://localhost:3000/page, depth: 1
|
|
4
|
+
INFO : screenshot for /page already exists
|
|
5
|
+
INFO : processing http://localhost:3000/errors, depth: 1
|
|
6
|
+
INFO : screenshot for /errors already exists
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
[36mDEBUG[0m : initializing cli
|
|
2
|
+
[36mDEBUG[0m : initializing crawler with [32mhttp://localhost:3000/filters[0m
|
|
3
|
+
[36mDEBUG[0m : config {"depth"=>1, "width"=>1280, "height"=>0, "cache_life"=>86400, "cache_dir"=>"cache", "snaps_dir"=>"snaps", "name_template"=>"%%{url}", "url_whitelist"=>"include", "url_blacklist"=>nil, "css_selector"=>nil, "log_level"=>0, "log_color"=>"auto", "skip_ssl_verification"=>false, "screenshot_delay"=>nil}
|
|
4
|
+
[36mDEBUG[0m : processing queue: [32m1 remaining[0m
|
|
5
|
+
[34m INFO[0m : processing [4m[35mhttp://localhost:3000/filters[0m, depth: 0
|
|
6
|
+
[34m INFO[0m : [1m[32mcapturing screenshot for /filters[0m
|
|
7
|
+
[36mDEBUG[0m : ignoring [4m[35mhttp://localhost:3000/filters/exclude-me/1[0m, reason: whitelist
|
|
8
|
+
[36mDEBUG[0m : ignoring [4m[35mhttp://localhost:3000/filters/exclude-me/2[0m, reason: whitelist
|
|
9
|
+
[36mDEBUG[0m : processing queue: [32m2 remaining[0m
|
|
10
|
+
[34m INFO[0m : processing [4m[35mhttp://localhost:3000/filters/include-me/1[0m, depth: 1
|
|
11
|
+
[34m INFO[0m : [1m[32mcapturing screenshot for /filters/include-me/1[0m
|
|
12
|
+
[36mDEBUG[0m : processing queue: [32m1 remaining[0m
|
|
13
|
+
[34m INFO[0m : processing [4m[35mhttp://localhost:3000/filters/include-me/2[0m, depth: 1
|
|
14
|
+
[34m INFO[0m : [1m[32mcapturing screenshot for /filters/include-me/2[0m
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
[34m INFO[0m : [32mHello World[0m
|