snapcrawl 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +23 -0
- data/bin/snapcrawl +7 -0
- data/lib/snapcrawl.rb +4 -0
- data/lib/snapcrawl/crawler.rb +200 -0
- data/lib/snapcrawl/templates/docopt.txt +15 -0
- data/lib/snapcrawl/version.rb +3 -0
- metadata +149 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 8de37399949606ebc180cc7a80bf3ad91869e064
  data.tar.gz: 0b56bd7207f4b0f0e44ecdfcffe890d50bf1d34c
SHA512:
  metadata.gz: 1ef25f70c86ce8f8d626b43d080d4f4048cae9a421111a3fbd232ebc6f26d9d4b67d69a90bbe306b3ff995c136ea8a4252b693dac6e1f2f82c1ff0d8a7e379e9
  data.tar.gz: 9fb48c4490dc64c14284cbfd2bd4809ec1c6de67d535a6e4175aed4140ecc7366ebe92c791551668c08ef5b30a0b9d4f9ff6eeebee83af715d6910b9ab9f9df4
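These digests let a consumer verify a downloaded gem's payload. A minimal sketch of checking the SHA512 entry with Ruby's standard Digest and YAML libraries (the file paths assume an unpacked gem directory, which is not shown in this diff):

    require 'digest'
    require 'yaml'

    # Compare the recorded SHA512 digest against the actual file on disk
    checksums = YAML.load_file 'checksums.yaml'
    expected  = checksums['SHA512']['data.tar.gz']
    actual    = Digest::SHA512.file('data.tar.gz').hexdigest
    puts actual == expected ? 'OK' : 'MISMATCH'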
data/README.md
ADDED
@@ -0,0 +1,23 @@
# SnapCrawl - crawl a website and take screenshots

`snapcrawl` is a command line utility for crawling a website and saving
screenshots. It uses [Runfile](https://github.com/DannyBen/runfile).

## Features

- Crawls a website to any given depth and saves screenshots
- Can capture the full length of the page
- Can use a specific resolution for screenshots
- Skips capturing if the screenshot was already saved recently
- Uses local caching to avoid repeating expensive crawl operations
- Reports broken links

## Install

    $ gem install snapcrawl

## Usage

    $ snapcrawl --help
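For reference, a fuller invocation based on the docopt usage text bundled with the gem (shown later in this diff); the domain is a placeholder:

    $ snapcrawl go example.com --depth 2 --folder screenshots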
data/bin/snapcrawl
ADDED
data/lib/snapcrawl.rb
ADDED
data/lib/snapcrawl/crawler.rb
ADDED
@@ -0,0 +1,200 @@
require 'colsole'
require 'docopt'
require 'fileutils'
require 'nokogiri'
require 'open-uri'
require 'ostruct'
require 'pstore'
require 'screencap'

module Snapcrawl
  class Crawler
    include Colsole

    def self.instance
      @@instance ||= self.new
    end

    def initialize
      @storefile = "snapcrawl.pstore"
      @store = PStore.new(@storefile)
      @done = []
    end

    def handle(args)
      execute Docopt::docopt(doc, argv: args)
    rescue Docopt::Exit => e
      puts e.message
    end

    def execute(args)
      return show_version if args['--version']
      crawl args['<url>'].dup, opts_from_args(args)
    end

    def crawl(url, opts={})
      defaults = {
        depth: 1,
        age: 86400,
        dir: 'snaps',
        base: url,
      }
      urls = [protocolize(url)]

      @opts = OpenStruct.new defaults.merge(opts)

      make_screenshot_dir @opts.dir

      @opts.depth.times do
        urls = crawl_and_snap urls
      end
    end

    private

    def crawl_and_snap(urls)
      new_urls = []
      urls.each do |url|
        next if @done.include? url
        @done << url
        say "\n!txtgrn!-----> Visit: #{url}"
        snap url
        new_urls += extract_urls_from url
      end
      new_urls
    end

    # Take a screenshot of a URL, unless we already did so recently
    def snap(url)
      file = image_path_for(url)
      if file_fresh? file
        say "  Snap: Skipping. File exists and seems fresh"
      else
        snap!(url)
      end
    end

    # Take a screenshot of the URL, even if the file exists
    def snap!(url)
      say "  !txtblu!Snap!!txtrst! Snapping picture... "

      f = Screencap::Fetcher.new url
      fetch_opts = {}
      fetch_opts[:output] = image_path_for(url)
      fetch_opts[:width]  = @opts.width if @opts.width
      fetch_opts[:height] = @opts.height if @opts.height
      # Other options supported by Screencap::Fetcher#fetch:
      #   :div => '.header'  # selector for a specific element to capture
      #   :top => 0, :left => 0, :width => 100, :height => 100  # specific area

      f.fetch fetch_opts
      say "done"
    end

    def extract_urls_from(url)
      cached = nil
      @store.transaction { cached = @store[url] }
      if cached
        say "  Crawl: Page was cached. Reading subsequent URLs from cache"
        cached
      else
        extract_urls_from! url
      end
    end

    def extract_urls_from!(url)
      say "  !txtblu!Crawl!!txtrst! Extracting links... "

      begin
        doc = Nokogiri::HTML open(url)
        links = normalize_links doc.css('a')
        @store.transaction { @store[url] = links }
        say "done"
      rescue OpenURI::HTTPError => e
        links = []
        say "!txtred!FAILED"
        say! "!txtred!  !  HTTP Error: #{e.message} at #{url}"
      end
      links
    end

    # mkdir the screenshots folder, if needed
    def make_screenshot_dir(dir)
      Dir.exist? dir or FileUtils.mkdir_p dir
    end

    # Convert any string to a proper handle
    def handelize(str)
      str.downcase.gsub /[^a-z0-9]+/, '-'
    end

    # Return the proper image path for a URL
    def image_path_for(url)
      "#{@opts.dir}/#{handelize(url)}.png"
    end

    # Add a protocol to a URL if needed
    def protocolize(url)
      url =~ /^http/ ? url : "http://#{url}"
    end

    # Return true if the file exists and is not too old
    def file_fresh?(file)
      File.exist?(file) and file_age(file) < @opts.age
    end

    # Return file age in seconds
    def file_age(file)
      (Time.now - File.stat(file).mtime).to_i
    end

    # Process an array of links and return a cleaner one
    def normalize_links(links)
      # Remove the #hash part from all links
      links = links.map {|link| link.attribute('href').to_s.gsub(/#.+$/, '')}

      # Make unique and remove empties
      links = links.uniq.reject {|link| link.empty?}

      # Remove links to images and other files
      extensions = "png|gif|jpg|pdf|zip"
      links = links.reject {|link| link =~ /\.(#{extensions})(\?.*)?$/}

      # Remove mailto and tel links
      beginnings = "mailto|tel"
      links = links.reject {|link| link =~ /^(#{beginnings})/}

      # Add the base domain to relative URLs
      links = links.map {|link| link =~ /^http/ ? link : "http://#{@opts.base}#{link}"}

      # Keep only links in our base domain
      links.select {|link| link =~ /https?:\/\/#{@opts.base}.*/}
    end

    def show_version
      puts VERSION
    end

    def doc
      @doc ||= File.read template('docopt.txt')
    end

    def template(file)
      File.expand_path("../templates/#{file}", __FILE__)
    end

    def opts_from_args(args)
      opts = {}
      opts[:dir]   = args['--folder'] if args['--folder']
      opts[:age]   = args['--age'].to_i if args['--age']
      opts[:depth] = args['--depth'].to_i if args['--depth']
      opts
    end
  end
end
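The contents of data/bin/snapcrawl and data/lib/snapcrawl.rb are not rendered in this diff. Based on Crawler#handle above, a minimal sketch of what the executable would need to do (the require and the ARGV delegation are assumptions, not taken from this diff):

    #!/usr/bin/env ruby
    require 'snapcrawl'

    # Hand the raw command line over to the singleton crawler,
    # which parses it with docopt and dispatches to #crawl.
    Snapcrawl::Crawler.instance.handle ARGV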
data/lib/snapcrawl/templates/docopt.txt
ADDED
@@ -0,0 +1,15 @@
Snapcrawl

Usage:
  snapcrawl go <url> [options]
  snapcrawl -h | --help
  snapcrawl -v | --version

Options:
  -f --folder <path>   Where to save screenshots [default: snaps]
  -a --age <n>         Number of seconds to consider screenshots fresh
                       [default: 86400]
  -d --depth <n>       Number of levels to crawl [default: 1]
  -h --help            Show this screen
  -v --version         Show version
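As an illustration of how Crawler#handle consumes this template, parsing a sample argument vector with the docopt gem (the sample arguments are hypothetical):

    require 'docopt'

    usage = File.read 'docopt.txt'  # the template above
    args  = Docopt::docopt(usage, argv: %w[go example.com --depth 2])
    args['<url>']    # => "example.com"
    args['--depth']  # => "2" (opts_from_args converts it with to_i)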
metadata
ADDED
@@ -0,0 +1,149 @@
--- !ruby/object:Gem::Specification
name: snapcrawl
version: !ruby/object:Gem::Version
  version: 0.2.0
platform: ruby
authors:
- Danny Ben Shitrit
autorequire:
bindir: bin
cert_chain: []
date: 2015-12-05 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: colsole
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.3'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.3'
- !ruby/object:Gem::Dependency
  name: net-ssh
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '3.0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '3.0'
- !ruby/object:Gem::Dependency
  name: docopt
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.5'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.5'
- !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.6'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.6'
- !ruby/object:Gem::Dependency
  name: screencap
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.1'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.1'
- !ruby/object:Gem::Dependency
  name: runfile
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.5'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.5'
- !ruby/object:Gem::Dependency
  name: run-gem-dev
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.2'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.2'
description: Snapcrawl is a command line utility for crawling a website and saving
  screenshots.
email: db@dannyben.com
executables:
- snapcrawl
extensions: []
extra_rdoc_files: []
files:
- README.md
- bin/snapcrawl
- lib/snapcrawl.rb
- lib/snapcrawl/crawler.rb
- lib/snapcrawl/templates/docopt.txt
- lib/snapcrawl/version.rb
homepage: https://github.com/DannyBen/snapcrawl
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '2.0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.4.6
signing_key:
specification_version: 4
summary: Crawl a website and take screenshots (CLI + Library)
test_files: []
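Once the gem is installed, the dependency list above can be inspected programmatically through the standard RubyGems API; a minimal sketch:

    require 'rubygems'

    # Load the installed gemspec and list its runtime dependencies
    spec = Gem::Specification.find_by_name('snapcrawl', '0.2.0')
    spec.runtime_dependencies.each do |dep|
      puts "#{dep.name} #{dep.requirement}"  # e.g. "colsole ~> 0.3"
    end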