snapcrawl 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8de37399949606ebc180cc7a80bf3ad91869e064
4
+ data.tar.gz: 0b56bd7207f4b0f0e44ecdfcffe890d50bf1d34c
5
+ SHA512:
6
+ metadata.gz: 1ef25f70c86ce8f8d626b43d080d4f4048cae9a421111a3fbd232ebc6f26d9d4b67d69a90bbe306b3ff995c136ea8a4252b693dac6e1f2f82c1ff0d8a7e379e9
7
+ data.tar.gz: 9fb48c4490dc64c14284cbfd2bd4809ec1c6de67d535a6e4175aed4140ecc7366ebe92c791551668c08ef5b30a0b9d4f9ff6eeebee83af715d6910b9ab9f9df4
@@ -0,0 +1,23 @@
1
+ # SnapCrawl - crawl a website and take screenshots
2
+
3
+ `snapcrawl` is a command line utility for crawling a website and saving
4
+ screenshots. It uses [Runfile](https://github.com/DannyBen/runfile).
5
+
6
+ ## Features
7
+
8
+ - Crawls a website to any given depth and saves screenshots
9
+ - Can capture the full length of the page
10
+ - Can use a specific resolution for screenshots
11
+ - Skips capturing if the screenshot was already saved recently
12
+ - Uses local caching to avoid expensive crawl operations if not needed
13
+ - Reports broken links
14
+
15
+ ## Install
16
+
17
+ $ gem install snapcrawl
18
+
19
+ ## Usage
20
+
21
+ $ snapcrawl --help
22
+
23
+
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'snapcrawl'
4
+ trap(:INT) { abort "\r\nGoodbye" }
5
+ include Snapcrawl
6
+ Crawler.instance.handle ARGV
7
+
@@ -0,0 +1,4 @@
1
+ require 'snapcrawl/version'
2
+ require 'snapcrawl/crawler'
3
+
4
+ self.extend Snapcrawl
@@ -0,0 +1,200 @@
1
+ require 'colsole'
2
+ require 'docopt'
3
+ require 'fileutils'
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ require 'ostruct'
7
+ require 'pstore'
8
+ require 'screencap'
9
+
10
+ module Snapcrawl
11
+ include Colsole
12
+
13
+ class Crawler
14
+ def self.instance
15
+ @@instance ||= self.new
16
+ end
17
+
18
+ def initialize
19
+ @storefile = "snapcrawl.pstore"
20
+ @store = PStore.new(@storefile)
21
+ @done = []
22
+ end
23
+
24
+ def handle(args)
25
+ begin
26
+ execute Docopt::docopt(doc, argv: args)
27
+ rescue Docopt::Exit => e
28
+ puts e.message
29
+ end
30
+ end
31
+
32
+ def execute(args)
33
+ return show_version if args['--version']
34
+ crawl args['<url>'].dup, opts_from_args(args)
35
+ end
36
+
37
+ def crawl(url, opts={})
38
+ defaults = {
39
+ depth: 1,
40
+ age: 86400,
41
+ dir: 'snaps',
42
+ base: url,
43
+ }
44
+ urls = [protocolize(url)]
45
+
46
+ @opts = OpenStruct.new defaults.merge(opts)
47
+
48
+ make_screenshot_dir @opts.dir
49
+
50
+ @opts.depth.times do
51
+ urls = crawl_and_snap urls
52
+ end
53
+ end
54
+
55
+ private
56
+
57
+ def crawl_and_snap(urls)
58
+ new_urls = []
59
+ urls.each do |url|
60
+ next if @done.include? url
61
+ @done << url
62
+ say "\n!txtgrn!-----> Visit: #{url}"
63
+ snap url
64
+ new_urls += extract_urls_from url
65
+ end
66
+ new_urls
67
+ end
68
+
69
+ # Take a screenshot of a URL, unless we already did so recently
70
+ def snap(url)
71
+ file = image_path_for(url)
72
+ if file_fresh? file
73
+ say " Snap: Skipping. File exists and seems fresh"
74
+ else
75
+ snap!(url)
76
+ end
77
+ end
78
+
79
+ # Take a screenshot of the URL, even if file exists
80
+ def snap!(url)
81
+ say " !txtblu!Snap!!txtrst! Snapping picture... "
82
+
83
+ f = Screencap::Fetcher.new url
84
+ fetch_opts = {}
85
+ fetch_opts[:output] = image_path_for(url)
86
+ fetch_opts[:width] = @opts.width
87
+ fetch_opts[:height] = @opts.height if @opts.height
88
+ # :height => 768,
89
+ # :div => '.header', # selector for a specific element to take screenshot of
90
+ # :top => 0, :left => 0, :width => 100, :height => 100 # dimensions for a specific area
91
+
92
+ screenshot = f.fetch fetch_opts
93
+ say "done"
94
+ end
95
+
96
+ def extract_urls_from(url)
97
+ cached = nil
98
+ @store.transaction { cached = @store[url] }
99
+ if cached
100
+ say " Crawl: Page was cached. Reading subsequent URLs from cache"
101
+ return cached
102
+ else
103
+ return extract_urls_from! url
104
+ end
105
+ end
106
+
107
+ def extract_urls_from!(url)
108
+ say " !txtblu!Crawl!!txtrst! Extracting links... "
109
+
110
+ begin
111
+ doc = Nokogiri::HTML open url
112
+ links = doc.css('a')
113
+ links = normalize_links links
114
+ @store.transaction { @store[url] = links }
115
+ say "done"
116
+ rescue OpenURI::HTTPError => e
117
+ links = []
118
+ say "!txtred!FAILED"
119
+ say! "!txtred! ! HTTP Error: #{e.message} at #{url}"
120
+ end
121
+ links
122
+ end
123
+
124
+ # mkdir the screenshots folder, if needed
125
+ def make_screenshot_dir(dir)
126
+ Dir.exists? dir or FileUtils.mkdir_p dir
127
+ end
128
+
129
+ # Convert any string to a proper handle
130
+ def handelize(str)
131
+ str.downcase.gsub /[^a-z0-9]+/, '-'
132
+ end
133
+
134
+ # Return proper image path for a UR
135
+ def image_path_for(url)
136
+ "#{@opts.dir}/#{handelize(url)}.png"
137
+ end
138
+
139
+ # Add protocol to a URL if neeed
140
+ def protocolize(url)
141
+ url =~ /^http/ ? url : "http://#{url}"
142
+ end
143
+
144
+ # Return true if the file exists and is not too old
145
+ def file_fresh?(file)
146
+ File.exist?(file) and file_age(file) < @opts.age
147
+ end
148
+
149
+ # Return file age in seconds
150
+ def file_age(file)
151
+ (Time.now - File.stat(file).mtime).to_i
152
+ end
153
+
154
+ # Process an array of links and return a better one
155
+ def normalize_links(links)
156
+ # Remove the #hash part from all links
157
+ links = links.map {|link| link.attribute('href').to_s.gsub(/#.+$/, '')}
158
+
159
+ # Make unique and remove empties
160
+ links = links.uniq.reject {|link| link.empty?}
161
+
162
+ # Remove links to images and other files
163
+ extensions = "png|gif|jpg|pdf|zip"
164
+ links = links.reject {|link| link =~ /\.(#{extensions})(\?.*)?$/}
165
+
166
+ # Remove mailto, tel links
167
+ beginnings = "mailto|tel"
168
+ links = links.reject {|link| link =~ /^(#{beginnings})/}
169
+
170
+ # Add the base domain to relative URLs
171
+ links = links.map {|link| link =~ /^http/ ? link : "http://#{@base}#{link}"}
172
+
173
+ # Keep only links in our base domain
174
+ links = links.select {|link| link =~ /https?:\/\/#{@base}.*/}
175
+
176
+ links
177
+ end
178
+
179
+ def show_version
180
+ puts VERSION
181
+ end
182
+
183
+ def doc
184
+ return @doc if @doc
185
+ @doc = File.read template 'docopt.txt'
186
+ end
187
+
188
+ def template(file)
189
+ File.expand_path("../templates/#{file}", __FILE__)
190
+ end
191
+
192
+ def opts_from_args(args)
193
+ opts = {}
194
+ opts[:folder] = args['--folder'] if args['--folder']
195
+ opts[:age] = args['--age'].to_i if args['--age']
196
+ opts[:depth] = args['--depth'].to_i if args['--depth']
197
+ opts
198
+ end
199
+ end
200
+ end
@@ -0,0 +1,15 @@
1
+ Snapcrawl
2
+
3
+ Usage:
4
+ snapcrawl go <url> [options]
5
+ snapcrawl -h | --help
6
+ snapcrawl -v | --version
7
+
8
+ Options:
9
+ -f --folder <path> Where to save screenshots [default: snaps]
10
+ -a --age <n> Number of seconds to consider screenshots fresh
11
+ [default: 86400]
12
+ -d --depth <n> Number of levels to crawl [default: 1]
13
+ -h --help Show this screen
14
+ -v --version Show version
15
+
@@ -0,0 +1,3 @@
1
+ module Snapcrawl
2
+ VERSION = "0.2.0"
3
+ end
metadata ADDED
@@ -0,0 +1,149 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: snapcrawl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Danny Ben Shitrit
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: colsole
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.3'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: net-ssh
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '3.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '3.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: docopt
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.5'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.5'
55
+ - !ruby/object:Gem::Dependency
56
+ name: nokogiri
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.6'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.6'
69
+ - !ruby/object:Gem::Dependency
70
+ name: screencap
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.1'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.1'
83
+ - !ruby/object:Gem::Dependency
84
+ name: runfile
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.5'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.5'
97
+ - !ruby/object:Gem::Dependency
98
+ name: run-gem-dev
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '0.2'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '0.2'
111
+ description: Snapcrawl is a command line utility for crawling a website and saving
112
+ screenshots.
113
+ email: db@dannyben.com
114
+ executables:
115
+ - snapcrawl
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - README.md
120
+ - bin/snapcrawl
121
+ - lib/snapcrawl.rb
122
+ - lib/snapcrawl/crawler.rb
123
+ - lib/snapcrawl/templates/docopt.txt
124
+ - lib/snapcrawl/version.rb
125
+ homepage: https://github.com/DannyBen/snapcrawl
126
+ licenses:
127
+ - MIT
128
+ metadata: {}
129
+ post_install_message:
130
+ rdoc_options: []
131
+ require_paths:
132
+ - lib
133
+ required_ruby_version: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: '2.0'
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
143
+ requirements: []
144
+ rubyforge_project:
145
+ rubygems_version: 2.4.6
146
+ signing_key:
147
+ specification_version: 4
148
+ summary: Crawl a website and take screenshots (CLI + Library)
149
+ test_files: []