snapshotify 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2155e4a1be045e2d84a53ff9b1732b8ebf478c62
4
+ data.tar.gz: f7e78a6b078025ab3ebb6325a291583286fa093f
5
+ SHA512:
6
+ metadata.gz: 95b6c3096a7fe7f0c753f3254ddbe6df17a741b32e237dea7381a504664728cf88381320bb1c0df02a3a7501ead0a1c9cf048616bbf0897cdd9d8a7f4ce61816
7
+ data.tar.gz: b91a62e4dab128e1ccdf3d696b42b29c91cc01e3889eae10cbf22469fc7e27be1f2f101840b535bed3ba175201caa6cc2d9bc629f12594daf57249136152f495
@@ -0,0 +1,3 @@
1
+ site
2
+ pkg
3
+ hackference.co.uk
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,40 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ snapshotify (0.1.0)
5
+ awesome_print
6
+ colorize
7
+ commander
8
+ httparty
9
+ oga
10
+
11
+ GEM
12
+ remote: https://rubygems.org/
13
+ specs:
14
+ ansi (1.5.0)
15
+ ast (2.3.0)
16
+ awesome_print (1.8.0)
17
+ colorize (0.8.1)
18
+ commander (4.4.3)
19
+ highline (~> 1.7.2)
20
+ highline (1.7.8)
21
+ httparty (0.15.6)
22
+ multi_xml (>= 0.5.2)
23
+ multi_xml (0.6.0)
24
+ oga (2.11)
25
+ ast
26
+ ruby-ll (~> 2.1)
27
+ rake (12.1.0)
28
+ ruby-ll (2.1.2)
29
+ ansi
30
+ ast
31
+
32
+ PLATFORMS
33
+ ruby
34
+
35
+ DEPENDENCIES
36
+ rake
37
+ snapshotify!
38
+
39
+ BUNDLED WITH
40
+ 1.16.0.pre.3
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2016 Cristiano Betta
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,25 @@
1
+ # Snapshotify
2
+
3
+ A simple tool for backing up a full site into a static instance.
4
+
5
+ ## To install
6
+ ```sh
7
+ gem install snapshotify
8
+ ```
9
+
10
+ ## To fetch a website
11
+ ```sh
12
+ snapshotify go fetch [website_url] [--debug]
13
+ ```
14
+
15
+ ## To serve
16
+ ```sh
17
+ snapshotify serve [website_url]
18
+ ```
19
+
20
+ ## Todo
21
+
22
+ * Add tests
23
+ * Add code to limit hops
24
+ * Add code to limit domains to scrape
25
+ * Fix bugs in asset rewriting
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rake/testtask'
3
+
4
# Replaces the rake process with an IRB session that has ./lib on the load
# path and the snapshotify library already required.
task(:console) { exec "irb -r snapshotify -I ./lib" }
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+ require 'snapshotify'
3
+ require 'commander'
4
+
5
+ Snapshotify::Command.new.run
@@ -0,0 +1,9 @@
1
+ require 'snapshotify/version'
2
+ require 'snapshotify/command'
3
+ require 'snapshotify/scraper'
4
+ require 'snapshotify/document'
5
+ require 'snapshotify/emitter'
6
+ require 'snapshotify/writer'
7
+ require 'snapshotify/url'
8
+ require 'snapshotify/resource'
9
+ require 'snapshotify/rewriter'
@@ -0,0 +1,55 @@
1
+ require 'commander'
2
+
3
module Snapshotify
  # Command line front-end built on Commander. Defines the `snap` command
  # (also reachable as `go fetch`, and the default command) and the `serve`
  # command.
  class Command
    include Commander::Methods

    attr_accessor :emitter

    # Creates the emitter that is shared with the scraper for all output.
    def initialize
      self.emitter = Snapshotify::Emitter.new
    end

    # Declares the program metadata and commands, then hands control to
    # Commander via `run!`.
    def run
      program :name, 'snapshotify'
      program :version, Snapshotify::VERSION
      program :description, 'A convenient scraper for archiving sites'
      program :help, 'Author', 'Cristiano Betta <cristiano@betta.io>'

      global_option('--debug') { emitter.debug! }

      define_snap_command
      alias_command :'go fetch', :'snap'
      default_command :snap
      define_serve_command

      run!
    end

    private

    # Registers `snap`: crawls the given URL and prints a summary.
    def define_snap_command
      command :snap do |c|
        c.syntax = 'snapshotify snap <url>'
        c.description = 'Spiders a site starting from the given URL, staying within the original domain'
        c.action do |args, _options|
          # Without a URL the scraper would fail with an unhelpful error.
          abort 'Usage: snapshotify snap <url>' if args.empty?

          scraper = Snapshotify::Scraper.new(args.first)
          scraper.emitter = emitter
          scraper.run

          report(scraper, args.first)
        end
      end
    end

    # Registers `serve`: serves a previously snapshotted folder on port 8000.
    def define_serve_command
      command :serve do |c|
        c.syntax = 'snapshotify serve <folder_name>'
        c.description = 'Serve local folder'
        c.action do |args, _options|
          # Use the arguments Commander parsed (not raw ARGV, whose indexes
          # shift when global options are given) and pass them as separate
          # argv entries so the folder name is never interpreted by a shell.
          exec 'ruby', '-run', '-ehttpd', "./#{args.first}", '-p8000'
        end
      end
    end

    # Prints the processed and unprocessable URLs followed by a usage hint.
    def report(scraper, target)
      emitter.newline
      emitter.newline
      emitter.say "Processed"
      emitter.pretty scraper.processed
      emitter.newline
      emitter.say "Could not be processed"
      emitter.pretty scraper.invalid

      emitter.newline
      emitter.say "Run 'snapshotify serve #{target}' to start a webserver on port 8000 with your local copy"
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'httparty'
2
+ require 'oga'
3
+
4
module Snapshotify
  # An HTML page identified by a URL. The page is downloaded and parsed
  # lazily, the first time `doc` is needed.
  class Document
    attr_accessor :url, :emitter

    # url    - raw URL string of the page.
    # parent - optional Snapshotify::Url the raw URL is resolved against.
    def initialize(url, parent = nil)
      self.url = Snapshotify::Url.new(url, parent)
    end

    # Returns one Document per <a> element that carries an href attribute,
    # each resolved against this document's URL. Memoized.
    def links
      @links ||= begin
        hrefs = doc.xpath('//a').map { |anchor| anchor.attribute('href') }.compact
        hrefs.map { |href| Snapshotify::Document.new(href.value, url) }
      end
    end

    # Returns the (possibly rewritten) document serialized by Oga.
    def data
      doc.to_xml
    end

    # Writes the serialized document to disk through a Writer.
    def write!
      writer = Snapshotify::Writer.new(self)
      writer.emitter = emitter
      writer.write
    end

    # Downloads linked assets and rewrites asset/anchor URLs in `doc`.
    def rewrite
      rewriter = Snapshotify::Rewriter.new(self)
      rewriter.emitter = emitter
      rewriter.rewrite
    end

    # Returns the parsed page, fetching it on first use.
    def doc
      @doc ||= begin
        response = HTTParty.get(url.canonical_url)
        # Hand Oga the raw body string rather than the response wrapper, so
        # parsing does not depend on how HTTParty decoded the payload.
        Oga.parse_html(response.body.to_s)
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
+ require 'awesome_print'
2
+ require 'colorize'
3
+
4
module Snapshotify
  # Console output helper. In debug mode messages are printed in full;
  # otherwise each message is reduced to a single coloured progress dot.
  class Emitter
    attr_accessor :debug

    # Debug output is off until `debug!` is called.
    def initialize
      self.debug = false
    end

    # Turns on full message output.
    def debug!
      self.debug = true
    end

    # Prints a line break.
    def newline
      puts "\n"
    end

    # Pretty-prints a value with awesome_print.
    def pretty(value)
      ap(value)
    end

    # Prints the message in debug mode, a green dot otherwise.
    def log(message)
      return puts(message) if debug

      print ".".colorize(:green)
    end

    # Prints the message in yellow in debug mode, a yellow dot otherwise.
    def warning(message)
      return puts(message.colorize(:yellow)) if debug

      print ".".colorize(:yellow)
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'httparty'
2
+
3
module Snapshotify
  # A non-HTML asset (stylesheet, script, image, ...) referenced by a page
  # or by another asset.
  class Resource
    # Matches CSS-style `url(...)` references; the capture is the raw,
    # possibly quoted and padded, target.
    NESTED_URL_PATTERN = /url\((.*?)\)/i

    attr_accessor :url, :filename, :emitter, :parent

    # url    - raw URL string of the asset.
    # parent - Snapshotify::Url the raw URL is resolved against.
    def initialize(url, parent)
      self.url = Snapshotify::Url.new(url, parent)
      self.parent = parent
    end

    # Saves the asset through a Writer and records the writer's canonical
    # filename in `filename`.
    #
    # nested_urls - when truthy, `url(...)` references inside the asset are
    #               downloaded and rewritten before saving.
    def write!(nested_urls = false)
      process_nested_urls if nested_urls

      writer = Snapshotify::Writer.new(self)
      writer.emitter = emitter
      writer.write
      self.filename = writer.canonical_filename
    end

    # Returns the asset body as a String, downloading it on first use.
    def data
      @data ||= HTTParty.get(url.canonical_url).body.to_s
    end

    private

    # Downloads every fetchable `url(...)` target and points the reference
    # at its local path. Only the reference itself is rewritten, so text
    # that merely contains the same characters elsewhere is left alone.
    def process_nested_urls
      local_paths = {}

      @data = data.gsub(NESTED_URL_PATTERN) do |reference|
        quote, target = split_quotes(Regexp.last_match(1))
        next reference unless fetchable?(target)

        # Each distinct target is downloaded once, even if referenced often.
        local_paths[target] = fetch_nested(target) unless local_paths.key?(target)
        local_path = local_paths[target]
        local_path ? "url(#{quote}#{local_path}#{quote})" : reference
      end
    end

    # Splits a raw capture into [quote character or "", unquoted target].
    def split_quotes(raw)
      value = raw.strip
      quote = value[0]
      if value.length >= 2 && ['"', "'"].include?(quote) && value.end_with?(quote)
        [quote, value[1...-1]]
      else
        ['', value]
      end
    end

    # Inline data URIs and same-document fragments have nothing to download.
    def fetchable?(target)
      !(target.empty? || target.start_with?('#') || target =~ /\Adata:/i)
    end

    # Saves the nested asset and returns its local path, or nil when the
    # target does not resolve to a valid URL.
    def fetch_nested(target)
      nested = Snapshotify::Resource.new(target, url)
      return nil unless nested.url.valid

      nested.emitter = emitter
      nested.write!
      nested.url.resource_path
    end
  end
end
@@ -0,0 +1,67 @@
1
module Snapshotify
  # Downloads the assets a document links to and rewrites the document's
  # asset and anchor URLs so they point at the local snapshot.
  class Rewriter
    attr_accessor :resource, :emitter

    # resource - object exposing `doc` (the parsed page) and `url`.
    def initialize(resource)
      self.resource = resource
    end

    # Rewrites stylesheet, script and image references, then anchors.
    def rewrite
      rewrite_linked_resources
      rewrite_links
    end

    private

    def rewrite_linked_resources
      write_css
      write_js
      write_images
    end

    # Stylesheets are saved with nested `url(...)` processing enabled.
    def write_css
      resource.doc.xpath('//link[@rel="stylesheet"]').each do |element|
        write_element(element, 'href', replace: true)
      end
    end

    def write_images
      resource.doc.xpath('//img').each do |element|
        write_element(element, 'src', replace: false)
      end
    end

    def write_js
      resource.doc.xpath('//script').each do |element|
        write_element(element, 'src', replace: false)
      end
    end

    # Saves the asset referenced by element[key] and points the attribute
    # at the saved file. Elements without the attribute (e.g. inline
    # scripts) and assets whose URL is invalid are left untouched.
    #
    # options - :replace is forwarded to Resource#write! as its nested-URL flag.
    def write_element(element, key, options = {})
      attribute = element.attribute(key)
      return unless attribute

      url = attribute.value
      asset = Snapshotify::Resource.new(url, resource.url)
      unless asset.url.valid
        emitter.warning("> Invalid URL: #{url}")
        return
      end

      asset.emitter = emitter
      asset.write!(options[:replace])

      emitter.log("# Rewriting #{url} => #{asset.filename}") if url != asset.filename
      attribute.value = asset.filename
    end

    # Rewrites every <a href> to an absolute path (same host) or to the
    # canonical external URL; hrefs that are not valid URLs are kept as-is.
    def rewrite_links
      resource.doc.xpath('//a').each do |element|
        attribute = element.attribute('href')
        # `next`, not `return`: an anchor without href must not stop the
        # rewriting of the anchors that follow it.
        next unless attribute

        href = attribute.value
        url = Snapshotify::Url.new(href, resource.url)
        new_url = url.valid ? url.absolute_path_or_external_url : href

        emitter.log("# Rewriting #{href} => #{new_url}") if href != new_url
        attribute.value = new_url
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
module Snapshotify
  # Breadth-first crawler: starts from a seed document and keeps processing
  # queued documents until the queue is empty.
  class Scraper
    attr_accessor :emitter, :unprocessed, :processed, :invalid, :unprocessed_urls, :valid_domains

    # initial_url - raw URL of the page the crawl starts from. Only links on
    #               the seed's host are followed.
    def initialize(initial_url)
      seed = Snapshotify::Document.new(initial_url)

      self.valid_domains    = [seed.url.host]
      self.unprocessed      = [seed]
      self.unprocessed_urls = [seed.url.canonical_url]
      self.processed        = []
      self.invalid          = []
    end

    # Drains the queue and returns the scraper itself.
    def run
      process(unprocessed.shift) until unprocessed.empty?
      self
    end

    private

    # Queues the document's links, rewrites and saves the document, then
    # records its canonical URL as processed.
    def process(document)
      emitter.log(document.url.canonical_url.colorize(:green))
      document.emitter = emitter

      enqueue(document.links)
      document.rewrite
      document.write!

      processed << document.url.canonical_url
    end

    # Queues every linked document that is valid, on an allowed host and
    # neither processed nor already queued.
    def enqueue(links)
      links.each do |candidate|
        next unless valid?(candidate)

        target = candidate.url
        next unless valid_domains.include?(target.host)

        address = target.canonical_url
        next if processed.include?(address) || unprocessed_urls.include?(address)

        emitter.log("> Enqueued: #{address}")

        unprocessed << candidate
        unprocessed_urls << address
      end
    end

    # Returns the URL's validity flag; the first time an invalid raw URL is
    # seen it is reported and remembered in `invalid`.
    def valid?(document)
      target = document.url
      unless target.valid || invalid.include?(target.raw_url)
        emitter.warning("> Invalid URL: #{target.raw_url}")
        invalid << target.raw_url
      end
      target.valid
    end
  end
end
@@ -0,0 +1,71 @@
1
module Snapshotify
  # Normalizes a raw link into an absolute URI, resolving it against an
  # optional parent Url. `valid` is false when the link cannot be turned
  # into a URI with a path (malformed, mailto:, javascript:, ...).
  class Url
    # Already-absolute web URLs are left as they are.
    ABSOLUTE_HTTP = %r{\Ahttps?://}i

    attr_accessor :parent, :raw_url, :valid, :uri

    # raw_url - link text as found in the source.
    # parent  - Url of the page/asset containing the link, or nil for a seed.
    def initialize(raw_url, parent)
      self.raw_url = raw_url.strip
      self.parent = parent
      self.valid = true

      begin
        sanitize_string
        parse_uri
      rescue StandardError
        # Any failure while resolving or parsing marks the URL invalid
        # instead of aborting the crawl; raw_url keeps the offending text.
        self.valid = false
      end
    end

    # Host of the parsed URI, or nil when nothing could be parsed.
    def host
      uri && uri.host
    end

    # Host of the parent Url, or nil when there is no parent.
    def parent_host
      return unless parent
      parent.host
    end

    # Scheme of the parsed URI, or nil when nothing could be parsed.
    def scheme
      uri && uri.scheme
    end

    # Absolute URL without fragment ("" when nothing could be parsed).
    def canonical_url
      uri.to_s
    end

    # Path only for links on the parent's host, full URL otherwise.
    def absolute_path_or_external_url
      if parent && parent.host == host
        uri.path
      else
        canonical_url
      end
    end

    # Local path used when the URL is referenced from a saved asset: the
    # path for same-host URLs, otherwise "/<host><path>".
    def resource_path
      if parent && parent.host == host
        uri.path
      else
        canonical_url.gsub('http:/', '').gsub('https:/', '')
      end
    end

    private

    # Turns raw_url into an absolute URL string.
    def sanitize_string
      return if ABSOLUTE_HTTP =~ raw_url

      self.raw_url =
        if parent
          # URI.join covers relative, root-relative and protocol-relative
          # links, keeping the parent's scheme and port.
          URI.join(parent.canonical_url, raw_url).to_s
        elsif raw_url.start_with?('//')
          "http:#{raw_url}"
        elsif raw_url.start_with?('/')
          raise URI::InvalidURIError, "cannot resolve #{raw_url} without a parent"
        else
          "http://#{raw_url}"
        end
    end

    # Parses raw_url, defaults an empty path to "/" and drops the fragment.
    def parse_uri
      self.uri = URI.parse(raw_url)
      # Opaque URIs (mailto:, javascript:, data:) have no path to fetch.
      raise URI::InvalidURIError, "no path in #{raw_url}" if uri.path.nil?

      uri.path = "/" if uri.path.empty?
      uri.fragment = nil
    end
  end
end
@@ -0,0 +1,3 @@
1
module Snapshotify
  # Gem version; frozen so the shared constant cannot be mutated in place.
  VERSION = '0.1.0'.freeze
end
@@ -0,0 +1,74 @@
1
require 'fileutils'

module Snapshotify
  # Saves a resource (anything exposing `url` and `data`) below a folder
  # named after the site's host, relative to the current directory.
  class Writer
    attr_accessor :resource, :emitter

    # resource - object whose `url` exposes `uri.path`, `host` and
    #            `parent_host`, and whose `data` is the content to save.
    def initialize(resource)
      self.resource = resource
    end

    # Writes the resource to disk unless the target file already exists.
    def write
      write_file
    end

    # Path used to reference the saved file from a page: the filename for
    # resources on their parent's host, "/<host><filename>" otherwise.
    def canonical_filename
      if host == parent_host
        filename
      else
        "/#{host}#{filename}"
      end
    end

    private

    def write_file
      return if File.exist?(full_filename)

      ensure_directory full_filename
      emitter.log("! Saving #{full_filename}")

      # Binary mode: images and fonts must be written byte-for-byte, with
      # no newline translation.
      File.open(full_filename, 'wb') do |file|
        file.write(resource.data)
      end
    end

    # On-disk location: "<host><filename>" for same-host or parentless
    # resources, "<parent_host>/<host><filename>" for external ones.
    def full_filename
      @full_filename ||=
        if host == parent_host || !parent_host
          "#{host}#{filename}"
        else
          "#{parent_host}/#{host}#{filename}"
        end
    end

    # URL path mapped to a file: directory-like paths (trailing slash, or a
    # last segment without a dot) get an index.html.
    def filename
      @filename ||= begin
        path = resource.url.uri.path
        if path.end_with?('/')
          "#{path}index.html"
        elsif !path.split('/').last.to_s.include?('.')
          "#{path}/index.html"
        else
          path
        end
      end
    end

    # Creates the directory that will hold `filename` if it is missing.
    def ensure_directory(filename)
      FileUtils.mkdir_p(File.dirname(filename))
    end

    def host
      resource.url.host
    end

    def parent_host
      resource.url.parent_host
    end
  end
end
@@ -0,0 +1,26 @@
1
+ $:.push File.expand_path("../lib", __FILE__)
2
+ require "snapshotify/version"
3
+
4
# Gem specification for snapshotify. The file lists are taken from git, so
# the gem has to be built from a git checkout.
Gem::Specification.new do |spec|
  spec.name        = 'snapshotify'
  spec.version     = Snapshotify::VERSION
  spec.authors     = ["Cristiano Betta"]
  spec.email       = ['cbetta@gmail.com']
  spec.homepage    = "http://github.com/cbetta/snapshotify"
  spec.summary     = "Backup tool for Hackference"
  spec.description = "Backup tool for Hackference"
  spec.license     = 'MIT'

  tracked = ->(command) { `#{command}`.split("\n") }

  spec.files         = tracked.call('git ls-files')
  spec.test_files    = tracked.call('git ls-files -- {test,spec,features}/*')
  spec.executables   = tracked.call('git ls-files -- bin/*').map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency('rake')

  # Runtime dependencies, all unconstrained.
  %w[commander httparty oga awesome_print colorize].each do |gem_name|
    spec.add_dependency(gem_name)
  end
end
metadata ADDED
@@ -0,0 +1,147 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: snapshotify
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Cristiano Betta
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-10-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: commander
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: httparty
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: oga
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: awesome_print
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: colorize
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Backup tool for Hackference
98
+ email:
99
+ - cbetta@gmail.com
100
+ executables:
101
+ - snapshotify
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - ".gitignore"
106
+ - Gemfile
107
+ - Gemfile.lock
108
+ - LICENSE
109
+ - README.md
110
+ - Rakefile
111
+ - bin/snapshotify
112
+ - lib/snapshotify.rb
113
+ - lib/snapshotify/command.rb
114
+ - lib/snapshotify/document.rb
115
+ - lib/snapshotify/emitter.rb
116
+ - lib/snapshotify/resource.rb
117
+ - lib/snapshotify/rewriter.rb
118
+ - lib/snapshotify/scraper.rb
119
+ - lib/snapshotify/url.rb
120
+ - lib/snapshotify/version.rb
121
+ - lib/snapshotify/writer.rb
122
+ - snapshotify.gemspec
123
+ homepage: http://github.com/cbetta/snapshotify
124
+ licenses:
125
+ - MIT
126
+ metadata: {}
127
+ post_install_message:
128
+ rdoc_options: []
129
+ require_paths:
130
+ - lib
131
+ required_ruby_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ required_rubygems_version: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ requirements: []
142
+ rubyforge_project:
143
+ rubygems_version: 2.6.13
144
+ signing_key:
145
+ specification_version: 4
146
+ summary: Backup tool for Hackference
147
+ test_files: []