snapcrawl 0.2.0
- checksums.yaml +7 -0
- data/README.md +23 -0
- data/bin/snapcrawl +7 -0
- data/lib/snapcrawl.rb +4 -0
- data/lib/snapcrawl/crawler.rb +200 -0
- data/lib/snapcrawl/templates/docopt.txt +15 -0
- data/lib/snapcrawl/version.rb +3 -0
- metadata +149 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 8de37399949606ebc180cc7a80bf3ad91869e064
+  data.tar.gz: 0b56bd7207f4b0f0e44ecdfcffe890d50bf1d34c
+SHA512:
+  metadata.gz: 1ef25f70c86ce8f8d626b43d080d4f4048cae9a421111a3fbd232ebc6f26d9d4b67d69a90bbe306b3ff995c136ea8a4252b693dac6e1f2f82c1ff0d8a7e379e9
+  data.tar.gz: 9fb48c4490dc64c14284cbfd2bd4809ec1c6de67d535a6e4175aed4140ecc7366ebe92c791551668c08ef5b30a0b9d4f9ff6eeebee83af715d6910b9ab9f9df4
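These digests cover the two archives packed inside the .gem file. A minimal verification sketch in Ruby, assuming the package was first unpacked with `tar xf snapcrawl-0.2.0.gem` (a .gem is a tar archive containing metadata.gz and data.tar.gz):

    require 'digest'

    # Print SHA512 digests to compare against the checksums.yaml entries above
    %w[metadata.gz data.tar.gz].each do |file|
      puts "#{file}: #{Digest::SHA512.file(file).hexdigest}"
    end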
data/README.md
ADDED
@@ -0,0 +1,23 @@
+# SnapCrawl - crawl a website and take screenshots
+
+`snapcrawl` is a command line utility for crawling a website and saving
+screenshots. It uses [Runfile](https://github.com/DannyBen/runfile).
+
+## Features
+
+- Crawls a website to any given depth and saves screenshots
+- Can capture the full length of the page
+- Can use a specific resolution for screenshots
+- Skips capturing if the screenshot was already saved recently
+- Uses local caching to avoid expensive crawl operations where possible
+- Reports broken links
+
+## Install
+
+    $ gem install snapcrawl
+
+## Usage
+
+    $ snapcrawl --help
+
+
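A typical first run, per the docopt template further down (`go` plus its optional flags; example.com is a placeholder):

    $ snapcrawl go example.com --depth 2 --folder snaps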
data/bin/snapcrawl
ADDED
data/lib/snapcrawl.rb
ADDED
data/lib/snapcrawl/crawler.rb
ADDED
@@ -0,0 +1,200 @@
+require 'colsole'
+require 'docopt'
+require 'fileutils'
+require 'nokogiri'
+require 'open-uri'
+require 'ostruct'
+require 'pstore'
+require 'screencap'
+
+module Snapcrawl
+  include Colsole
+
+  class Crawler
+    def self.instance
+      @@instance ||= self.new
+    end
+
+    def initialize
+      @storefile = "snapcrawl.pstore"
+      @store = PStore.new(@storefile)
+      @done = []
+    end
+
+    # Parse command line arguments with docopt and execute
+    def handle(args)
+      begin
+        execute Docopt::docopt(doc, argv: args)
+      rescue Docopt::Exit => e
+        puts e.message
+      end
+    end
+
+    def execute(args)
+      return show_version if args['--version']
+      crawl args['<url>'].dup, opts_from_args(args)
+    end
+
+    # Crawl a URL to the requested depth, snapping screenshots as we go
+    def crawl(url, opts={})
+      defaults = {
+        depth: 1,
+        age: 86400,
+        dir: 'snaps',
+        base: url,
+      }
+      urls = [protocolize(url)]
+
+      @opts = OpenStruct.new defaults.merge(opts)
+
+      make_screenshot_dir @opts.dir
+
+      @opts.depth.times do
+        urls = crawl_and_snap urls
+      end
+    end
+
+    private
+
+    def crawl_and_snap(urls)
+      new_urls = []
+      urls.each do |url|
+        next if @done.include? url
+        @done << url
+        say "\n!txtgrn!-----> Visit: #{url}"
+        snap url
+        new_urls += extract_urls_from url
+      end
+      new_urls
+    end
+
+    # Take a screenshot of a URL, unless we already did so recently
+    def snap(url)
+      file = image_path_for(url)
+      if file_fresh? file
+        say "  Snap: Skipping. File exists and seems fresh"
+      else
+        snap!(url)
+      end
+    end
+
+    # Take a screenshot of the URL, even if file exists
+    def snap!(url)
+      say "  !txtblu!Snap!!txtrst! Snapping picture... "
+
+      f = Screencap::Fetcher.new url
+      fetch_opts = {}
+      fetch_opts[:output] = image_path_for(url)
+      fetch_opts[:width]  = @opts.width
+      fetch_opts[:height] = @opts.height if @opts.height
+      # :height => 768,
+      # :div => '.header', # selector for a specific element to take screenshot of
+      # :top => 0, :left => 0, :width => 100, :height => 100 # dimensions for a specific area
+
+      screenshot = f.fetch fetch_opts
+      say "done"
+    end
+
+    # Return the links for a URL, from cache if possible
+    def extract_urls_from(url)
+      cached = nil
+      @store.transaction { cached = @store[url] }
+      if cached
+        say "  Crawl: Page was cached. Reading subsequent URLs from cache"
+        return cached
+      else
+        return extract_urls_from! url
+      end
+    end
+
+    # Fetch the page and extract its links, even if cached
+    def extract_urls_from!(url)
+      say "  !txtblu!Crawl!!txtrst! Extracting links... "
+
+      begin
+        doc = Nokogiri::HTML open url
+        links = doc.css('a')
+        links = normalize_links links
+        @store.transaction { @store[url] = links }
+        say "done"
+      rescue OpenURI::HTTPError => e
+        links = []
+        say "!txtred!FAILED"
+        say! "!txtred! ! HTTP Error: #{e.message} at #{url}"
+      end
+      links
+    end
+
+    # mkdir the screenshots folder, if needed
+    def make_screenshot_dir(dir)
+      Dir.exist? dir or FileUtils.mkdir_p dir
+    end
+
+    # Convert any string to a proper handle
+    def handelize(str)
+      str.downcase.gsub /[^a-z0-9]+/, '-'
+    end
+
+    # Return the proper image path for a URL
+    def image_path_for(url)
+      "#{@opts.dir}/#{handelize(url)}.png"
+    end
+
+    # Add protocol to a URL if needed
+    def protocolize(url)
+      url =~ /^http/ ? url : "http://#{url}"
+    end
+
+    # Return true if the file exists and is not too old
+    def file_fresh?(file)
+      File.exist?(file) and file_age(file) < @opts.age
+    end
+
+    # Return file age in seconds
+    def file_age(file)
+      (Time.now - File.stat(file).mtime).to_i
+    end
+
+    # Process an array of links and return a better one
+    def normalize_links(links)
+      # Remove the #hash part from all links
+      links = links.map {|link| link.attribute('href').to_s.gsub(/#.+$/, '')}
+
+      # Make unique and remove empties
+      links = links.uniq.reject {|link| link.empty?}
+
+      # Remove links to images and other files
+      extensions = "png|gif|jpg|pdf|zip"
+      links = links.reject {|link| link =~ /\.(#{extensions})(\?.*)?$/}
+
+      # Remove mailto and tel links
+      beginnings = "mailto|tel"
+      links = links.reject {|link| link =~ /^(#{beginnings})/}
+
+      # Add the base domain to relative URLs
+      links = links.map {|link| link =~ /^http/ ? link : "http://#{@opts.base}#{link}"}
+
+      # Keep only links in our base domain
+      links = links.select {|link| link =~ /https?:\/\/#{@opts.base}.*/}
+
+      links
+    end
+
+    def show_version
+      puts VERSION
+    end
+
+    # Load and cache the docopt usage template
+    def doc
+      return @doc if @doc
+      @doc = File.read template 'docopt.txt'
+    end
+
+    def template(file)
+      File.expand_path("../templates/#{file}", __FILE__)
+    end
+
+    # Convert docopt arguments to the options hash expected by #crawl
+    def opts_from_args(args)
+      opts = {}
+      opts[:dir]   = args['--folder'] if args['--folder']
+      opts[:age]   = args['--age'].to_i if args['--age']
+      opts[:depth] = args['--depth'].to_i if args['--depth']
+      opts
+    end
+  end
+end
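The gemspec below bills Snapcrawl as "CLI + Library". As a usage note, a minimal library-style sketch against the class above; it assumes `require 'snapcrawl'` loads the Crawler class (the actual require lives in lib/snapcrawl.rb, whose 4 lines are not shown here), and it passes the same option keys that `crawl` merges over its defaults:

    require 'snapcrawl'

    # Crawl two levels deep, treat screenshots younger than an hour
    # as fresh, and save images under ./shots
    Snapcrawl::Crawler.instance.crawl 'example.com', depth: 2, age: 3600, dir: 'shots'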
data/lib/snapcrawl/templates/docopt.txt
ADDED
@@ -0,0 +1,15 @@
+Snapcrawl
+
+Usage:
+  snapcrawl go <url> [options]
+  snapcrawl -h | --help
+  snapcrawl -v | --version
+
+Options:
+  -f --folder <path>   Where to save screenshots [default: snaps]
+  -a --age <n>         Number of seconds to consider screenshots fresh
+                       [default: 86400]
+  -d --depth <n>       Number of levels to crawl [default: 1]
+  -h --help            Show this screen
+  -v --version         Show version
+
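For reference, `Docopt::docopt` parses a command line against this template and returns a plain hash keyed by the tokens above, which is what `opts_from_args` consumes. Roughly the expected result for `snapcrawl go example.com --depth 2` (a sketch; values, including filled-in defaults, arrive as strings):

    {
      "go"        => true,
      "<url>"     => "example.com",
      "--folder"  => "snaps",   # default from the template
      "--age"     => "86400",   # default from the template
      "--depth"   => "2",
      "--help"    => false,
      "--version" => false,
    }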
metadata
ADDED
@@ -0,0 +1,149 @@
+--- !ruby/object:Gem::Specification
+name: snapcrawl
+version: !ruby/object:Gem::Version
+  version: 0.2.0
+platform: ruby
+authors:
+- Danny Ben Shitrit
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-12-05 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: colsole
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.3'
+- !ruby/object:Gem::Dependency
+  name: net-ssh
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+- !ruby/object:Gem::Dependency
+  name: docopt
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.5'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.5'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: screencap
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.1'
+- !ruby/object:Gem::Dependency
+  name: runfile
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.5'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.5'
+- !ruby/object:Gem::Dependency
+  name: run-gem-dev
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.2'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.2'
+description: Snapcrawl is a command line utility for crawling a website and saving
+  screenshots.
+email: db@dannyben.com
+executables:
+- snapcrawl
+extensions: []
+extra_rdoc_files: []
+files:
+- README.md
+- bin/snapcrawl
+- lib/snapcrawl.rb
+- lib/snapcrawl/crawler.rb
+- lib/snapcrawl/templates/docopt.txt
+- lib/snapcrawl/version.rb
+homepage: https://github.com/DannyBen/snapcrawl
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '2.0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.6
+signing_key:
+specification_version: 4
+summary: Crawl a website and take screenshots (CLI + Library)
+test_files: []