snapshotify 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2155e4a1be045e2d84a53ff9b1732b8ebf478c62
4
+ data.tar.gz: f7e78a6b078025ab3ebb6325a291583286fa093f
5
+ SHA512:
6
+ metadata.gz: 95b6c3096a7fe7f0c753f3254ddbe6df17a741b32e237dea7381a504664728cf88381320bb1c0df02a3a7501ead0a1c9cf048616bbf0897cdd9d8a7f4ce61816
7
+ data.tar.gz: b91a62e4dab128e1ccdf3d696b42b29c91cc01e3889eae10cbf22469fc7e27be1f2f101840b535bed3ba175201caa6cc2d9bc629f12594daf57249136152f495
@@ -0,0 +1,3 @@
1
+ site
2
+ pkg
3
+ hackference.co.uk
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,40 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ snapshotify (0.1.0)
5
+ awesome_print
6
+ colorize
7
+ commander
8
+ httparty
9
+ oga
10
+
11
+ GEM
12
+ remote: https://rubygems.org/
13
+ specs:
14
+ ansi (1.5.0)
15
+ ast (2.3.0)
16
+ awesome_print (1.8.0)
17
+ colorize (0.8.1)
18
+ commander (4.4.3)
19
+ highline (~> 1.7.2)
20
+ highline (1.7.8)
21
+ httparty (0.15.6)
22
+ multi_xml (>= 0.5.2)
23
+ multi_xml (0.6.0)
24
+ oga (2.11)
25
+ ast
26
+ ruby-ll (~> 2.1)
27
+ rake (12.1.0)
28
+ ruby-ll (2.1.2)
29
+ ansi
30
+ ast
31
+
32
+ PLATFORMS
33
+ ruby
34
+
35
+ DEPENDENCIES
36
+ rake
37
+ snapshotify!
38
+
39
+ BUNDLED WITH
40
+ 1.16.0.pre.3
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2016 Cristiano Betta
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,25 @@
1
+ # Snapshotify
2
+
3
+ A simple tool for backing up a full site into a static instance.
4
+
5
+ ## To install
6
+ ```sh
7
+ gem install snapshotify
8
+ ```
9
+
10
+ ## To fetch website
11
+ ```sh
12
+ snapshotify go fetch [website_url] [--debug]
13
+ ```
14
+
15
+ ## To serve
16
+ ```sh
17
+ snapshotify serve [website_url]
18
+ ```
19
+
20
+ ## Todo
21
+
22
+ * Add tests
23
+ * Add code to limit hops
24
+ * Add code to limit domains to scrape
25
+ * Fix bugs in asset rewriting
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rake/testtask'
3
+
4
# Replaces the rake process with an IRB session that has ./lib on the load
# path and the snapshotify library already required.
task(:console) { exec 'irb -r snapshotify -I ./lib' }
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+ require 'snapshotify'
3
+ require 'commander'
4
+
5
+ Snapshotify::Command.new.run
@@ -0,0 +1,9 @@
1
+ require 'snapshotify/version'
2
+ require 'snapshotify/command'
3
+ require 'snapshotify/scraper'
4
+ require 'snapshotify/document'
5
+ require 'snapshotify/emitter'
6
+ require 'snapshotify/writer'
7
+ require 'snapshotify/url'
8
+ require 'snapshotify/resource'
9
+ require 'snapshotify/rewriter'
@@ -0,0 +1,55 @@
1
+ require 'commander'
2
+
3
module Snapshotify
  # Command-line front end built on Commander.
  #
  # Declares two commands: `snap` (aliased as `go fetch`, and the default
  # command), which runs a Scraper over a URL, and `serve`, which starts Ruby's
  # built-in httpd over a local folder on port 8000.
  class Command
    include Commander::Methods

    attr_accessor :emitter

    def initialize
      self.emitter = Snapshotify::Emitter.new
    end

    # Registers program metadata, options and commands, then dispatches via
    # Commander's `run!`.
    def run
      program :name, 'snapshotify'
      program :version, Snapshotify::VERSION
      program :description, 'A convenient scraper for archiving sites'
      program :help, 'Author', 'Cristiano Betta <cristiano@betta.io>'

      global_option('--debug') { emitter.debug! }

      command :snap do |c|
        # The help text previously advertised a non-existent `spider` command.
        c.syntax = 'snap <url>'
        c.description = 'Spiders a site starting from the given URL, staying within the original domain'
        c.action do |args, _options|
          # Without a URL the scraper would fail on nil; stop with a clear message.
          abort 'A URL is required: snapshotify snap <url>' if args.empty?

          scraper = Snapshotify::Scraper.new(args.first)
          scraper.emitter = emitter
          scraper.run

          # NOTE(review): Emitter defines no `say` in this codebase; presumably
          # it resolves to the HighLine helper Commander loads -- confirm.
          emitter.newline
          emitter.newline
          emitter.say "Processed"
          emitter.pretty scraper.processed
          emitter.newline
          emitter.say "Could not be processed"
          emitter.pretty scraper.invalid

          emitter.newline
          emitter.say "Run 'snapshotify serve #{args.first}' to start a webserver on port 8000 with your local copy"
        end
      end
      alias_command :'go fetch', :'snap'
      default_command :snap

      command :serve do |c|
        c.syntax = 'serve <folder_name>'
        c.description = 'Serve local folder'
        c.action do |args, _options|
          # Use the arguments Commander parsed rather than ARGV[1], which is the
          # wrong token whenever a global option precedes the command name.
          # The argv form of exec bypasses the shell, so the folder name cannot
          # inject extra shell commands.
          exec 'ruby', '-run', '-ehttpd', "./#{args.first}", '-p8000'
        end
      end

      run!
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'httparty'
2
+ require 'oga'
3
+
4
module Snapshotify
  # An HTML page addressed by a Snapshotify::Url. The page is fetched with
  # HTTParty and parsed with Oga the first time `doc` is needed.
  class Document
    attr_accessor :url, :emitter

    # url    - raw URL string for this page
    # parent - optional Url the raw string is resolved against
    def initialize(url, parent = nil)
      @url = Snapshotify::Url.new(url, parent)
    end

    # Returns one Document per <a> element that carries an href attribute,
    # each resolved against this document's URL. The list is memoized.
    def links
      @links ||= begin
        hrefs = doc.xpath('//a').map { |anchor| anchor.attribute('href') }.compact
        hrefs.map { |href| Snapshotify::Document.new(href.value, url) }
      end
    end

    # Serialised markup of the parsed document.
    def data
      doc.to_xml
    end

    # Hands this document to a Writer that shares our emitter.
    def write!
      Snapshotify::Writer.new(self).tap { |writer| writer.emitter = emitter }.write
    end

    # Hands this document to a Rewriter that shares our emitter.
    def rewrite
      Snapshotify::Rewriter.new(self).tap { |rewriter| rewriter.emitter = emitter }.rewrite
    end

    # Parsed document, fetched from the canonical URL on first use.
    def doc
      return @doc if @doc

      response = HTTParty.get(url.canonical_url)
      @doc = Oga.parse_html(response)
    end
  end
end
@@ -0,0 +1,40 @@
1
+ require 'awesome_print'
2
+ require 'colorize'
3
+
4
module Snapshotify
  # Console reporter. In debug mode messages are printed in full; otherwise
  # each message is collapsed to a single coloured dot.
  class Emitter
    attr_accessor :debug

    def initialize
      @debug = false
    end

    # Switches on full-message output.
    def debug!
      @debug = true
    end

    # Prints an empty line.
    def newline
      puts "\n"
    end

    # Prints a value through awesome_print.
    def pretty(value)
      ap(value)
    end

    # Regular progress: the message in debug mode, a green dot otherwise.
    def log(message)
      return puts(message) if debug

      print ".".colorize(:green)
    end

    # Warning: the message in yellow in debug mode, a yellow dot otherwise.
    def warning(message)
      return puts(message.colorize(:yellow)) if debug

      print ".".colorize(:yellow)
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'httparty'
2
+
3
module Snapshotify
  # A non-HTML asset (stylesheet, script, image) referenced by a page.
  #
  # The asset is downloaded with HTTParty on first access to `data` and saved
  # through a Writer. For stylesheets, `write!(true)` first downloads every
  # `url(...)` reference found in the body and rewrites it to a local path.
  class Resource
    # Captures: (1) `url(` plus optional whitespace/opening quote,
    # (2) the referenced URL, (3) optional closing quote/whitespace plus `)`.
    NESTED_URL = /(url\(\s*['"]?)(.*?)(['"]?\s*\))/i

    attr_accessor :url, :filename, :emitter, :parent

    # url    - raw URL string of the asset
    # parent - Url of the page or stylesheet that references it
    def initialize(url, parent)
      self.url = Snapshotify::Url.new(url, parent)
      self.parent = parent
    end

    # Saves the asset and records the name it was stored under in `filename`.
    #
    # nested_urls - when truthy, `url(...)` references inside the body are
    #               fetched and rewritten before the asset itself is written.
    def write!(nested_urls = false)
      process_nested_urls if nested_urls

      writer = Snapshotify::Writer.new(self)
      writer.emitter = emitter
      writer.write
      self.filename = writer.canonical_filename
    end

    # Body of the asset; fetched once and memoized.
    def data
      @data ||= HTTParty.get(url.canonical_url)
    end

    private

    # Rewrites each `url(...)` occurrence exactly where it appears. The
    # previous implementation ran a global substring replace per URL, which
    # rewrote repeated URLs twice and corrupted URLs containing another URL
    # as a substring.
    def process_nested_urls
      @data = data.gsub(NESTED_URL) do |whole|
        # Re-match the occurrence rather than relying on $~, which is not
        # dependable when gsub is reached through a delegating response object.
        prefix, nested_url, suffix = NESTED_URL.match(whole).captures
        next whole unless fetchable?(nested_url)

        resource = Snapshotify::Resource.new(nested_url, url)
        next whole unless resource.url.valid

        resource.emitter = emitter
        resource.write!

        "#{prefix}#{resource.url.resource_path}#{suffix}"
      end
    end

    # Inline data URIs, fragment-only references and empty values have nothing
    # to download, so they are left untouched.
    def fetchable?(nested_url)
      candidate = nested_url.strip
      !(candidate.empty? || candidate.start_with?('#') || candidate =~ /\Adata:/i)
    end
  end
end
@@ -0,0 +1,67 @@
1
module Snapshotify
  # Rewrites a parsed document in place so it can be served from the local
  # snapshot: linked stylesheets, scripts and images are downloaded and their
  # attributes pointed at the stored file, and <a> hrefs are resolved.
  class Rewriter
    attr_accessor :resource, :emitter

    # resource - object exposing `doc` (parsed document) and `url`
    def initialize(resource)
      self.resource = resource
    end

    # Rewrites asset references first, then anchor hrefs.
    def rewrite
      rewrite_linked_resources
      rewrite_links
    end

    private

    def rewrite_linked_resources
      write_css
      write_js
      write_images
    end

    # Stylesheets are the only assets whose nested url(...) references are
    # also processed (replace: true).
    def write_css
      resource.doc.xpath('//link[@rel="stylesheet"]').each do |element|
        write_element(element, 'href', replace: true)
      end
    end

    def write_images
      resource.doc.xpath('//img').each do |element|
        write_element(element, 'src', replace: false)
      end
    end

    def write_js
      resource.doc.xpath('//script').each do |element|
        write_element(element, 'src', replace: false)
      end
    end

    # Downloads the asset named by element[key] and points the attribute at
    # the stored filename. Elements without that attribute are skipped.
    def write_element(element, key, options = {})
      attribute = element.attribute(key)
      return unless attribute

      url = attribute.value
      asset = Snapshotify::Resource.new(url, resource.url)
      asset.emitter = emitter
      asset.write!(options[:replace])

      emitter.log("# Rewriting #{url} => #{asset.filename}") if url != asset.filename
      attribute.value = asset.filename
    end

    # Resolves every anchor href; invalid URLs are left as they were.
    def rewrite_links
      resource.doc.xpath('//a').each do |element|
        attribute = element.attribute('href')
        # `next`, not `return`: an anchor without href must not stop the
        # remaining anchors from being rewritten.
        next unless attribute

        href = attribute.value
        url = Snapshotify::Url.new(href, resource.url)
        new_url = url.valid ? url.absolute_path_or_external_url : href

        emitter.log("# Rewriting #{href} => #{new_url}") if href != new_url
        attribute.value = new_url
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
module Snapshotify
  # Breadth-first crawler. Starting from one page it follows links that stay
  # on the starting host, rewriting and writing each page exactly once.
  class Scraper
    attr_accessor :emitter, :unprocessed, :processed, :invalid, :unprocessed_urls, :valid_domains

    # initial_url - address of the page the crawl starts from
    def initialize(initial_url)
      first_page = Snapshotify::Document.new(initial_url)

      @valid_domains = [first_page.url.host]
      @unprocessed = [first_page]
      @unprocessed_urls = [first_page.url.canonical_url]
      @processed = []
      @invalid = []
    end

    # Drains the queue of pending documents and returns the scraper itself.
    def run
      process(unprocessed.shift) until unprocessed.empty?
      self
    end

    private

    # Queues the page's links, then rewrites and stores the page.
    def process(document)
      emitter.log(document.url.canonical_url.colorize(:green))
      document.emitter = emitter

      enqueue(document.links)
      document.rewrite
      document.write!

      processed.push(document.url.canonical_url)
    end

    # Adds every link that is valid, on an allowed host and not yet seen.
    def enqueue(links)
      links.each do |candidate|
        next unless valid?(candidate) && unseen_on_allowed_host?(candidate)

        emitter.log("> Enqueued: #{candidate.url.canonical_url}")

        unprocessed.push(candidate)
        unprocessed_urls.push(candidate.url.canonical_url)
      end
    end

    # True when the document's host is allowed and its URL has been neither
    # processed nor queued.
    def unseen_on_allowed_host?(document)
      return false unless valid_domains.include?(document.url.host)

      address = document.url.canonical_url
      !(processed.include?(address) || unprocessed_urls.include?(address))
    end

    # Returns the URL's validity flag; the first time an invalid raw URL is
    # seen it is reported and remembered in `invalid`.
    def valid?(document)
      usable = document.url.valid
      remember_invalid(document.url.raw_url) unless usable
      document.url.valid
    end

    def remember_invalid(raw_url)
      return if invalid.include?(raw_url)

      emitter.warning("> Invalid URL: #{raw_url}")
      invalid.push(raw_url)
    end
  end
end
@@ -0,0 +1,71 @@
1
module Snapshotify
  # A URL found while crawling, resolved against the page it was found on.
  #
  # Construction never raises for malformed or unsupported references;
  # instead `valid` is set to false. Callers are expected to check `valid`
  # before using `host`, `canonical_url` and the path helpers.
  class Url
    # Only these schemes can be downloaded by the crawler.
    FETCHABLE_SCHEMES = %w[http https].freeze

    attr_accessor :parent, :raw_url, :valid, :uri

    # raw_url - reference as written in the source (surrounding whitespace is
    #           stripped)
    # parent  - Url of the referring page, or nil for the crawl's seed URL
    def initialize(raw_url, parent)
      self.raw_url = raw_url.strip
      self.parent = parent
      self.valid = true

      sanitize_string
      parse_uri if valid
    end

    def host
      uri.host
    end

    # Host of the referring page, or nil when there is no parent.
    def parent_host
      return unless parent
      parent.host
    end

    def scheme
      uri.scheme
    end

    # Absolute URL with the fragment removed and an empty path turned into "/".
    def canonical_url
      uri.to_s
    end

    # Path only when on the parent's host, otherwise the full absolute URL.
    def absolute_path_or_external_url
      if parent && parent.host == host
        uri.path
      else
        canonical_url
      end
    end

    # Path only when on the parent's host, otherwise "/<host><path>" (the
    # absolute URL with its "http:/" or "https:/" prefix removed).
    def resource_path
      if parent && parent.host == host
        uri.path
      else
        canonical_url.gsub('http:/', '').gsub('https:/', '')
      end
    end

    private

    # Turns raw_url into an absolute URL string.
    #
    # With a parent, every non-absolute reference (relative, root-relative or
    # protocol-relative) is resolved with URI.join, which keeps the parent's
    # scheme and port. Without a parent, the value is assumed to be an http
    # host/path; a root-relative path cannot be resolved and is invalid.
    def sanitize_string
      return if raw_url.start_with?('http://', 'https://')

      if parent
        self.raw_url = URI.join(parent.canonical_url, raw_url).to_s
      elsif raw_url.start_with?('//')
        self.raw_url = "http:#{raw_url}"
      elsif raw_url.start_with?('/')
        self.valid = false
      else
        self.raw_url = "http://#{raw_url}"
      end
    rescue URI::Error
      # Malformed references (e.g. containing spaces) are flagged, not raised.
      self.valid = false
    end

    # Parses raw_url, rejecting anything that is not http(s), and normalises
    # the result (non-empty path, no fragment).
    def parse_uri
      self.uri = URI.parse(raw_url)
      unless FETCHABLE_SCHEMES.include?(uri.scheme)
        # mailto:, javascript:, data:, tel: ... cannot be crawled.
        self.valid = false
        return
      end

      uri.path = "/" if uri.path.empty?
      uri.fragment = nil
    rescue StandardError
      self.valid = false
    end
  end
end
@@ -0,0 +1,3 @@
1
# Top-level namespace of the snapshotify gem.
module Snapshotify
  # Version string of this release.
  VERSION = '0.1.0'
end
@@ -0,0 +1,74 @@
1
# FileUtils.mkdir_p is used below; require it here instead of relying on
# another library having loaded it.
require 'fileutils'

module Snapshotify
  # Stores a resource's data on disk under a directory named after the host.
  #
  # Layout, relative to the current working directory:
  #   <host><path>                 when the resource is on its parent's host
  #                                or has no parent
  #   <parent_host>/<host><path>   when it lives on another host
  # Paths ending in "/" or whose last segment has no "." get "index.html"
  # appended.
  class Writer
    attr_accessor :resource, :emitter

    # resource - object exposing `url` (with host, parent_host, uri.path)
    #            and `data`
    def initialize(resource)
      self.resource = resource
    end

    # Writes the resource unless a file already exists at its target path.
    def write
      write_file
    end

    # Path under which the stored file is reachable from the parent's site
    # root: the bare filename on the same host, "/<host><filename>" otherwise.
    def canonical_filename
      if host == parent_host
        "#{filename}"
      else
        "/#{host}#{filename}"
      end
    end

    private

    def write_file
      return if File.exist?(full_filename)
      ensure_directory full_filename

      emitter.log("! Saving #{full_filename}") if emitter

      # Binary mode: payloads include images, which text mode can corrupt on
      # platforms that translate line endings.
      File.open(full_filename, 'wb') do |file|
        file.write(resource.data)
      end
    end

    # On-disk location relative to the working directory (memoized).
    def full_filename
      @full_filename ||= begin
        if host == parent_host || !parent_host
          "#{host}#{filename}"
        else
          "#{parent_host}/#{host}#{filename}"
        end
      end
    end

    # URL path mapped to a file name (memoized). The branches yield a value
    # instead of using `return`, which previously skipped the memoization.
    def filename
      @filename ||= begin
        path = resource.url.uri.path
        if path.end_with?('/')
          "#{path}index.html"
        elsif !File.basename(path).include?(".")
          "#{path}/index.html"
        else
          path
        end
      end
    end

    # Creates the directory that will contain `filename` if it is missing.
    def ensure_directory(filename)
      dir = File.dirname(filename)
      FileUtils.mkdir_p(dir) unless File.directory?(dir)
    end

    def host
      resource.url.host
    end

    def parent_host
      resource.url.parent_host
    end
  end
end
@@ -0,0 +1,26 @@
1
+ $:.push File.expand_path("../lib", __FILE__)
2
+ require "snapshotify/version"
3
+
4
# Gem specification for snapshotify. File lists are taken from what git
# tracks, so building the gem requires running inside the repository.
Gem::Specification.new do |spec|
  # Identity
  spec.name        = 'snapshotify'
  spec.version     = Snapshotify::VERSION
  spec.license     = 'MIT'

  # People and links
  spec.authors     = ["Cristiano Betta"]
  spec.email       = ['cbetta@gmail.com']
  spec.homepage    = "http://github.com/cbetta/snapshotify"

  # Descriptions
  spec.summary     = "Backup tool for Hackference"
  spec.description = "Backup tool for Hackference"

  # Packaged files
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Dependencies
  spec.add_development_dependency('rake')

  %w[commander httparty oga awesome_print colorize].each do |gem_name|
    spec.add_dependency(gem_name)
  end
end
metadata ADDED
@@ -0,0 +1,147 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: snapshotify
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Cristiano Betta
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-10-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: commander
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: httparty
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: oga
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: awesome_print
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: colorize
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Backup tool for Hackference
98
+ email:
99
+ - cbetta@gmail.com
100
+ executables:
101
+ - snapshotify
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - ".gitignore"
106
+ - Gemfile
107
+ - Gemfile.lock
108
+ - LICENSE
109
+ - README.md
110
+ - Rakefile
111
+ - bin/snapshotify
112
+ - lib/snapshotify.rb
113
+ - lib/snapshotify/command.rb
114
+ - lib/snapshotify/document.rb
115
+ - lib/snapshotify/emitter.rb
116
+ - lib/snapshotify/resource.rb
117
+ - lib/snapshotify/rewriter.rb
118
+ - lib/snapshotify/scraper.rb
119
+ - lib/snapshotify/url.rb
120
+ - lib/snapshotify/version.rb
121
+ - lib/snapshotify/writer.rb
122
+ - snapshotify.gemspec
123
+ homepage: http://github.com/cbetta/snapshotify
124
+ licenses:
125
+ - MIT
126
+ metadata: {}
127
+ post_install_message:
128
+ rdoc_options: []
129
+ require_paths:
130
+ - lib
131
+ required_ruby_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ required_rubygems_version: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ requirements: []
142
+ rubyforge_project:
143
+ rubygems_version: 2.6.13
144
+ signing_key:
145
+ specification_version: 4
146
+ summary: Backup tool for Hackference
147
+ test_files: []