snapcrawl 0.2.0

@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 8de37399949606ebc180cc7a80bf3ad91869e064
+   data.tar.gz: 0b56bd7207f4b0f0e44ecdfcffe890d50bf1d34c
+ SHA512:
+   metadata.gz: 1ef25f70c86ce8f8d626b43d080d4f4048cae9a421111a3fbd232ebc6f26d9d4b67d69a90bbe306b3ff995c136ea8a4252b693dac6e1f2f82c1ff0d8a7e379e9
+   data.tar.gz: 9fb48c4490dc64c14284cbfd2bd4809ec1c6de67d535a6e4175aed4140ecc7366ebe92c791551668c08ef5b30a0b9d4f9ff6eeebee83af715d6910b9ab9f9df4
README.md ADDED
@@ -0,0 +1,23 @@
+ # SnapCrawl - crawl a website and take screenshots
+
+ `snapcrawl` is a command line utility for crawling a website and saving
+ screenshots. It uses [Runfile](https://github.com/DannyBen/runfile) for development tasks.
+
+ ## Features
+
+ - Crawls a website to any given depth and saves screenshots
+ - Can capture the full length of the page
+ - Can use a specific resolution for screenshots
+ - Skips capturing if the screenshot was already saved recently
+ - Uses local caching to avoid expensive crawl operations when they are not needed
+ - Reports broken links
+
+ ## Install
+
+     $ gem install snapcrawl
+
+ ## Usage
+
+     $ snapcrawl --help
+
+
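For illustration, here is a hypothetical invocation built from the options defined in the bundled docopt template (the URL and values are examples, not taken from the gem):

    $ snapcrawl go example.com --depth 2 --folder snaps --age 3600

This would crawl example.com two levels deep, save PNG screenshots into the `snaps` folder, and reuse any screenshot that is less than an hour old.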
bin/snapcrawl ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/env ruby
+
+ require 'snapcrawl'
+ trap(:INT) { abort "\r\nGoodbye" }
+ include Snapcrawl
+ Crawler.instance.handle ARGV
+
lib/snapcrawl.rb ADDED
@@ -0,0 +1,4 @@
+ require 'snapcrawl/version'
+ require 'snapcrawl/crawler'
+
+ self.extend Snapcrawl
lib/snapcrawl/crawler.rb ADDED
@@ -0,0 +1,200 @@
+ require 'colsole'
+ require 'docopt'
+ require 'fileutils'
+ require 'nokogiri'
+ require 'open-uri'
+ require 'ostruct'
+ require 'pstore'
+ require 'screencap'
+
+ module Snapcrawl
+   include Colsole
+
+   class Crawler
+     def self.instance
+       @@instance ||= self.new
+     end
+
+     def initialize
+       @storefile = "snapcrawl.pstore"
+       @store = PStore.new(@storefile)
+       @done = []
+     end
+
+     def handle(args)
+       begin
+         execute Docopt::docopt(doc, argv: args)
+       rescue Docopt::Exit => e
+         puts e.message
+       end
+     end
+
+     def execute(args)
+       return show_version if args['--version']
+       crawl args['<url>'].dup, opts_from_args(args)
+     end
+
+     def crawl(url, opts={})
+       defaults = {
+         depth: 1,
+         age: 86400,
+         dir: 'snaps',
+         base: url,
+       }
+       urls = [protocolize(url)]
+
+       @opts = OpenStruct.new defaults.merge(opts)
+
+       make_screenshot_dir @opts.dir
+
+       @opts.depth.times do
+         urls = crawl_and_snap urls
+       end
+     end
+
+     private
+
+     def crawl_and_snap(urls)
+       new_urls = []
+       urls.each do |url|
+         next if @done.include? url
+         @done << url
+         say "\n!txtgrn!-----> Visit: #{url}"
+         snap url
+         new_urls += extract_urls_from url
+       end
+       new_urls
+     end
+
+     # Take a screenshot of a URL, unless we already did so recently
+     def snap(url)
+       file = image_path_for(url)
+       if file_fresh? file
+         say " Snap: Skipping. File exists and seems fresh"
+       else
+         snap!(url)
+       end
+     end
+
+     # Take a screenshot of the URL, even if file exists
+     def snap!(url)
+       say " !txtblu!Snap!!txtrst! Snapping picture... "
+
+       f = Screencap::Fetcher.new url
+       fetch_opts = {}
+       fetch_opts[:output] = image_path_for(url)
+       fetch_opts[:width]  = @opts.width  if @opts.width
+       fetch_opts[:height] = @opts.height if @opts.height
+       # :height => 768,
+       # :div => '.header', # selector for a specific element to take screenshot of
+       # :top => 0, :left => 0, :width => 100, :height => 100 # dimensions for a specific area
+
+       screenshot = f.fetch fetch_opts
+       say "done"
+     end
+
+     def extract_urls_from(url)
+       cached = nil
+       @store.transaction { cached = @store[url] }
+       if cached
+         say " Crawl: Page was cached. Reading subsequent URLs from cache"
+         return cached
+       else
+         return extract_urls_from! url
+       end
+     end
+
+     def extract_urls_from!(url)
+       say " !txtblu!Crawl!!txtrst! Extracting links... "
+
+       begin
+         doc = Nokogiri::HTML open(url)
+         links = doc.css('a')
+         links = normalize_links links
+         @store.transaction { @store[url] = links }
+         say "done"
+       rescue OpenURI::HTTPError => e
+         links = []
+         say "!txtred!FAILED"
+         say! "!txtred! ! HTTP Error: #{e.message} at #{url}"
+       end
+       links
+     end
+
+     # mkdir the screenshots folder, if needed
+     def make_screenshot_dir(dir)
+       Dir.exist? dir or FileUtils.mkdir_p dir
+     end
+
+     # Convert any string to a proper handle
+     def handelize(str)
+       str.downcase.gsub(/[^a-z0-9]+/, '-')
+     end
+
+     # Return the proper image path for a URL
+     def image_path_for(url)
+       "#{@opts.dir}/#{handelize(url)}.png"
+     end
+
+     # Add protocol to a URL if needed
+     def protocolize(url)
+       url =~ /^http/ ? url : "http://#{url}"
+     end
+
+     # Return true if the file exists and is not too old
+     def file_fresh?(file)
+       File.exist?(file) and file_age(file) < @opts.age
+     end
+
+     # Return file age in seconds
+     def file_age(file)
+       (Time.now - File.stat(file).mtime).to_i
+     end
+
+     # Process an array of links and return a better one
+     def normalize_links(links)
+       # Remove the #hash part from all links
+       links = links.map {|link| link.attribute('href').to_s.gsub(/#.+$/, '')}
+
+       # Make unique and remove empties
+       links = links.uniq.reject {|link| link.empty?}
+
+       # Remove links to images and other files
+       extensions = "png|gif|jpg|pdf|zip"
+       links = links.reject {|link| link =~ /\.(#{extensions})(\?.*)?$/}
+
+       # Remove mailto, tel links
+       beginnings = "mailto|tel"
+       links = links.reject {|link| link =~ /^(#{beginnings})/}
+
+       # Add the base domain to relative URLs
+       links = links.map {|link| link =~ /^http/ ? link : "http://#{@opts.base}#{link}"}
+
+       # Keep only links in our base domain
+       links = links.select {|link| link =~ /https?:\/\/#{@opts.base}.*/}
+
+       links
+     end
+
+     def show_version
+       puts VERSION
+     end
+
+     def doc
+       return @doc if @doc
+       @doc = File.read template 'docopt.txt'
+     end
+
+     def template(file)
+       File.expand_path("../templates/#{file}", __FILE__)
+     end
+
+     def opts_from_args(args)
+       opts = {}
+       opts[:dir] = args['--folder'] if args['--folder']
+       opts[:age] = args['--age'].to_i if args['--age']
+       opts[:depth] = args['--depth'].to_i if args['--depth']
+       opts
+     end
+   end
+ end
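The gemspec summary describes the gem as "CLI + Library". A minimal library-style sketch, using only the public `crawl` method and the option keys it merges into its defaults (`depth`, `age`, `dir`); the URL and values below are illustrative, not from the gem:

    require 'snapcrawl'

    # Crawl two levels deep, keep screenshots in ./screenshots,
    # and treat files younger than one hour as fresh.
    Snapcrawl::Crawler.instance.crawl 'example.com', depth: 2, dir: 'screenshots', age: 3600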
lib/snapcrawl/templates/docopt.txt ADDED
@@ -0,0 +1,15 @@
+ Snapcrawl
+
+ Usage:
+   snapcrawl go <url> [options]
+   snapcrawl -h | --help
+   snapcrawl -v | --version
+
+ Options:
+   -f --folder <path>  Where to save screenshots [default: snaps]
+   -a --age <n>        Number of seconds to consider screenshots fresh
+                       [default: 86400]
+   -d --depth <n>      Number of levels to crawl [default: 1]
+   -h --help           Show this screen
+   -v --version        Show version
+
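For reference, a sketch of the hash that Docopt would hand to `execute` for this template; the argument values are examples, and the exact hash is an assumption based on docopt's documented behavior, not output captured from the gem:

    require 'docopt'

    doc  = File.read 'docopt.txt'  # the template shown above
    args = Docopt::docopt(doc, argv: %w[go example.com --depth 2])
    # args is a Hash along these lines:
    #   "go" => true, "<url>" => "example.com",
    #   "--depth" => "2", "--folder" => "snaps", "--age" => "86400",
    #   "--help" => false, "--version" => false
    #
    # opts_from_args then converts the values it cares about:
    #   { dir: "snaps", age: 86400, depth: 2 }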
lib/snapcrawl/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Snapcrawl
+   VERSION = "0.2.0"
+ end
metadata ADDED
@@ -0,0 +1,149 @@
+ --- !ruby/object:Gem::Specification
+ name: snapcrawl
+ version: !ruby/object:Gem::Version
+   version: 0.2.0
+ platform: ruby
+ authors:
+ - Danny Ben Shitrit
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-12-05 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: colsole
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.3'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.3'
+ - !ruby/object:Gem::Dependency
+   name: net-ssh
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+ - !ruby/object:Gem::Dependency
+   name: docopt
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.5'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.5'
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ - !ruby/object:Gem::Dependency
+   name: screencap
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.1'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.1'
+ - !ruby/object:Gem::Dependency
+   name: runfile
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.5'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.5'
+ - !ruby/object:Gem::Dependency
+   name: run-gem-dev
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.2'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.2'
+ description: Snapcrawl is a command line utility for crawling a website and saving
+   screenshots.
+ email: db@dannyben.com
+ executables:
+ - snapcrawl
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - README.md
+ - bin/snapcrawl
+ - lib/snapcrawl.rb
+ - lib/snapcrawl/crawler.rb
+ - lib/snapcrawl/templates/docopt.txt
+ - lib/snapcrawl/version.rb
+ homepage: https://github.com/DannyBen/snapcrawl
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '2.0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.6
+ signing_key:
+ specification_version: 4
+ summary: Crawl a website and take screenshots (CLI + Library)
+ test_files: []
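The metadata block above is the gem's serialized gemspec. For readers who prefer the DSL form, a reconstructed `snapcrawl.gemspec` that would produce roughly this metadata might look like the sketch below; it is an editor's approximation (the original gemspec file is not shipped in the gem):

    Gem::Specification.new do |s|
      s.name        = 'snapcrawl'
      s.version     = '0.2.0'
      s.summary     = 'Crawl a website and take screenshots (CLI + Library)'
      s.description = 'Snapcrawl is a command line utility for crawling a website and saving screenshots.'
      s.authors     = ['Danny Ben Shitrit']
      s.email       = 'db@dannyben.com'
      s.homepage    = 'https://github.com/DannyBen/snapcrawl'
      s.license     = 'MIT'
      s.required_ruby_version = '>= 2.0'
      s.files       = Dir['README.md', 'bin/snapcrawl', 'lib/**/*']
      s.executables = ['snapcrawl']

      s.add_runtime_dependency 'colsole',   '~> 0.3'
      s.add_runtime_dependency 'net-ssh',   '~> 3.0'
      s.add_runtime_dependency 'docopt',    '~> 0.5'
      s.add_runtime_dependency 'nokogiri',  '~> 1.6'
      s.add_runtime_dependency 'screencap', '~> 0.1'

      s.add_development_dependency 'runfile',     '~> 0.5'
      s.add_development_dependency 'run-gem-dev', '~> 0.2'
    end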