elsmore 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7a1918ab7c969ec280e5fc1f7c6aebbee87fa19e
4
+ data.tar.gz: 5344fc276d3dec1cfabe38481dba0b5703369b74
5
+ SHA512:
6
+ metadata.gz: 591510236ccc9fba7eacd7b3d03a08b861a84a8d970ded0c3ca2e6289d49973799eca4e31653345d3b52e04b4d7129de361f87ea95c140094442f9c456edb872
7
+ data.tar.gz: 41c518b723dbc0c945060c27e351396e21cf198174ae3b57b22950f98b247ea8a37770d21a19733782d76fb0e32a71d7ae9114a77061c268cde06947996a0d4f
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2016 Cristiano Betta
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1 @@
1
+ # Work in Progress
data/elsmore.gemspec ADDED
@@ -0,0 +1,22 @@
1
+ require File.expand_path('lib/elsmore/version', File.dirname(__FILE__))
2
+
3
# Gem specification for the elsmore gem.
# Elsmore::VERSION comes from lib/elsmore/version, which is required just
# above this block.
Gem::Specification.new do |s|
  s.name         = 'elsmore'
  s.version      = Elsmore::VERSION
  s.summary      = 'Backup tool for Hackference'
  s.description  = 'Backup tool for Hackference'
  s.authors      = ['Cristiano Betta', 'Mike Elsmore']
  s.email        = ['cbetta@gmail.com', 'mike@hackference.co.uk']
  # Ship everything under lib/ and spec/ plus the top-level project files.
  s.files        = Dir.glob('{lib,spec}/**/*') + %w(LICENSE README.md elsmore.gemspec)
  s.homepage     = 'https://github.com/cbetta/elsmore'
  s.license      = 'MIT'
  s.require_path = 'lib'

  s.add_development_dependency('rake')

  # Runtime dependencies required by the files under lib/elsmore.
  s.add_dependency('commander')
  s.add_dependency('httparty')
  s.add_dependency('oga')
  s.add_dependency('awesome_print')
  s.add_dependency('colorize')
end
@@ -0,0 +1,43 @@
1
+ require 'commander'
2
+
3
module Elsmore
  # Command-line front end for the scraper.
  #
  # The DSL calls used in #run (program, command, alias_command,
  # default_command, run!) are provided by the included Commander::Methods.
  class Command
    include Commander::Methods

    # Object used for all console output; an Elsmore::Emitter by default.
    attr_accessor :emitter

    def initialize
      self.emitter = Elsmore::Emitter.new
    end

    # Declares the CLI program and its single `snap` command (also reachable
    # as `go fetch` and as the default command), then hands control to
    # Commander via run!.
    def run
      program :name, 'elsmore'
      program :version, Elsmore::VERSION
      program :description, 'A convenient scraper for archiving sites'
      program :help, 'Author', 'Cristiano Betta <cristiano@betta.io>'

      command :snap do |c|
        # The syntax line names the command that is actually registered.
        c.syntax = 'elsmore snap <url> [options]'
        c.description = 'Spiders a site starting from the given URL, staying within the original domain'
        c.action do |args, _options|
          url = args.first
          # Without a URL the scraper would fail on a nil value; report the
          # problem instead and leave the action block.
          if url.nil? || url.strip.empty?
            emitter.say 'Please provide a URL to snap, e.g. elsmore snap example.com'
            next
          end

          archive(url)
        end
      end
      alias_command :'go fetch', :snap
      default_command :snap

      run!
    end

    private

    # Runs the scraper for +url+ and prints the processed and unprocessable
    # URLs it reports.
    def archive(url)
      scraper = Elsmore::Scraper.new(url)
      scraper.emitter = emitter
      result = scraper.run

      emitter.newline
      emitter.newline
      emitter.say 'Processed'
      emitter.pretty result[:processed]
      emitter.newline
      emitter.say 'Could not be processed'
      emitter.pretty result[:invalid]
    end
  end
end
@@ -0,0 +1,41 @@
1
+ require 'httparty'
2
+ require 'oga'
3
+
4
module Elsmore
  # An HTML page identified by a URL. The page is fetched and parsed lazily
  # the first time #doc is called.
  class Document
    # Elsmore::Url wrapping the address of this page.
    attr_accessor :url

    # url    - String address or href of the page.
    # parent - Elsmore::Url of the page the href was found on (nil for a seed).
    def initialize(url, parent = nil)
      self.url = Elsmore::Url.new(url, parent)
    end

    # Returns one Elsmore::Document per <a> element that carries an href
    # value, each resolved against this page's URL. Memoised.
    def links
      @links ||= begin
        hrefs = doc.xpath('//a').map { |anchor| anchor.attribute('href') }.compact
        # Drop attributes without a value; Elsmore::Url needs a string.
        hrefs.map(&:value).compact.map { |href| Elsmore::Document.new(href, url) }
      end
    end

    # Serialised markup of the parsed (and possibly rewritten) document.
    def data
      doc.to_xml
    end

    # Writes the document through Elsmore::Writer.
    def write!
      Elsmore::Writer.new(self).write
    end

    # Rewrites asset references and links through Elsmore::Rewriter.
    def rewrite
      Elsmore::Rewriter.new(self).rewrite
    end

    # Fetches the page and parses it with Oga. Memoised so the page is
    # downloaded only once.
    def doc
      @doc ||= begin
        response = HTTParty.get(url.canonical_url)
        # Hand Oga the response body string rather than the response object.
        Oga.parse_html(response.body)
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ require 'awesome_print'
2
+ require 'colorize'
3
+
4
module Elsmore
  # Console reporter: plain lines, single-character coloured progress marks
  # and pretty-printed values. Colouring relies on String#colorize and
  # pretty printing on `ap`, from the colorize and awesome_print libraries
  # required at the top of this file.
  class Emitter
    # Emits a line break.
    def newline
      say("\n")
    end

    # Prints +value+ followed by a line break.
    def say(value)
      $stdout.puts(value)
    end

    # Green "." progress mark.
    def dot
      mark('.', :green)
    end

    # Red "x" mark.
    def x
      mark('x', :red)
    end

    # Blue "?" mark.
    def unsure
      mark('?', :blue)
    end

    # Pretty-prints +value+ using awesome_print.
    def pretty(value)
      ap(value)
    end

    private

    # Prints +glyph+ in +colour+ without a trailing line break.
    def mark(glyph, colour)
      $stdout.print(glyph.colorize(colour))
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'open-uri'
2
+
3
module Elsmore
  # A linked asset (stylesheet, script, image) that is downloaded and stored
  # next to the page that references it.
  class Resource
    # url      - Elsmore::Url of the asset.
    # filename - path the asset was stored under; set by #write!.
    attr_accessor :url, :filename

    # url    - String reference as found in the page.
    # parent - Elsmore::Url of the referencing page.
    def initialize(url, parent)
      self.url = Elsmore::Url.new(url, parent)
    end

    # Stores the asset through Elsmore::Writer and remembers the writer's
    # canonical filename so the referencing element can be rewritten.
    def write!
      writer = Elsmore::Writer.new(self)
      writer.write
      self.filename = writer.canonical_filename
    end

    # Downloads the asset body once and memoises it.
    #
    # The parsed URI's own #open (added to HTTP/HTTPS/FTP URIs by open-uri)
    # is used instead of Kernel#open: the address comes from a scraped page,
    # and Kernel#open would also accept local paths and pipe commands.
    # The block form closes the handle once the body has been read.
    def data
      @data ||= url.uri.open(&:read)
    end
  end
end
@@ -0,0 +1,57 @@
1
module Elsmore
  # Rewrites a parsed document in place: linked assets are downloaded and
  # their references pointed at the stored copies, and anchor hrefs are
  # replaced with the value computed by Elsmore::Url.
  class Rewriter
    # The document being rewritten; must respond to #doc and #url.
    attr_accessor :resource

    def initialize(resource)
      self.resource = resource
    end

    # Archives linked assets first, then rewrites the anchors.
    def rewrite
      rewrite_linked_resources
      rewrite_links
    end

    private

    def rewrite_linked_resources
      write_css
      write_js
      write_images
    end

    # Stylesheets: <link rel="stylesheet" href="...">. The predicate tests
    # the rel *attribute*; without the "@" and the quotes XPath would compare
    # child elements named "rel" and "stylesheet" and match nothing.
    def write_css
      resource.doc.xpath('//link[@rel="stylesheet"]').each do |element|
        write_element(element, 'href')
      end
    end

    # Images: <img src="...">.
    def write_images
      resource.doc.xpath('//img').each do |element|
        write_element(element, 'src')
      end
    end

    # Scripts: <script src="...">; inline scripts have no src and are skipped
    # by write_element.
    def write_js
      resource.doc.xpath('//script').each do |element|
        write_element(element, 'src')
      end
    end

    # Downloads the asset referenced by attribute +key+ of +element+ and
    # replaces the attribute value with the stored filename. Elements without
    # that attribute (or without a value for it) are left untouched.
    def write_element(element, key)
      attribute = element.attribute(key)
      return unless attribute && attribute.value

      asset = Elsmore::Resource.new(attribute.value, resource.url)
      asset.write!
      attribute.value = asset.filename
    end

    # Rewrites the href of every anchor that has one.
    def rewrite_links
      resource.doc.xpath('//a').each do |element|
        attribute = element.attribute('href')
        # `next`, not `return`: an anchor without href must not stop the
        # remaining anchors from being rewritten.
        next unless attribute && attribute.value

        target = Elsmore::Url.new(attribute.value, resource.url)
        attribute.value = target.absolute_path_or_external_url
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
module Elsmore
  # Breadth-first crawler: starting from a seed page it follows links that
  # stay on the seed's host, rewriting and writing every page it reaches.
  class Scraper
    # Progress reporter; must respond to #dot, #x and #unsure. It has to be
    # assigned before #run is called.
    attr_accessor :emitter

    # initial_url - String address of the page the crawl starts from.
    def initialize(initial_url)
      seed = Elsmore::Document.new(initial_url)

      @valid_domains = [seed.url.host]
      @unprocessed = [seed]
      # Canonical URLs that have ever been queued. Each page enters the queue
      # once, instead of once per link pointing at it.
      @queued = { seed.url.canonical_url => true }
      @processed = []
      @invalid = []
    end

    # Processes queued documents until the queue is empty.
    #
    # Returns a Hash with :processed (canonical URLs written, in crawl order)
    # and :invalid (raw URLs that could not be parsed plus canonical URLs of
    # pages whose processing raised).
    def run
      process(@unprocessed.shift) until @unprocessed.empty?

      {
        processed: @processed,
        invalid: @invalid
      }
    end

    private

    # Queues the document's links, then rewrites and writes the document.
    # A failure on one page is marked with an "x" and recorded as invalid
    # so that it does not abort the rest of the crawl.
    def process(document)
      emitter.dot

      enqueue(document.links)
      document.rewrite
      document.write!

      @processed << document.url.canonical_url
    rescue StandardError
      emitter.x
      @invalid << document.url.canonical_url
    end

    # Adds same-host links that have not been queued before. Links whose URL
    # could not be parsed are reported and recorded as invalid.
    def enqueue(links)
      links.each do |document|
        unless document.url.valid
          emitter.unsure
          @invalid << document.url.raw_url
          next
        end

        next unless @valid_domains.include?(document.url.host)

        key = document.url.canonical_url
        next if @queued.key?(key)

        @queued[key] = true
        @unprocessed << document
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
require 'uri'

module Elsmore
  # A URL found while crawling, resolved against the page it was found on.
  #
  # Construction never raises for a malformed address: +valid+ is set to
  # false instead and the host/scheme accessors return nil.
  class Url
    # parent  - Elsmore::Url of the referencing page, or nil for a seed URL.
    # raw_url - the stripped input, replaced by the absolute form when the
    #           address could be resolved.
    # valid   - false when the address could not be resolved or parsed.
    # uri     - the parsed URI (nil when resolving or parsing raised before
    #           one was built).
    attr_accessor :parent, :raw_url, :valid, :uri

    # raw_url - String address or href; surrounding whitespace is removed.
    # parent  - Elsmore::Url the address is relative to, or nil.
    def initialize(raw_url, parent)
      self.raw_url = raw_url.strip
      self.parent = parent
      self.valid = true

      begin
        sanitize_string
        parse_uri
      rescue StandardError
        # Resolution and parsing are both covered, so a bad href marks this
        # URL invalid instead of raising out of the constructor.
        self.valid = false
      end
    end

    # Host of the parsed URI, or nil when there is none.
    def host
      uri && uri.host
    end

    # Host of the parent URL, or nil without a parent.
    def parent_host
      parent && parent.host
    end

    # Scheme of the parsed URI, or nil when there is none.
    def scheme
      uri && uri.scheme
    end

    # Absolute URL without fragment; an empty string when nothing was parsed.
    def canonical_url
      uri.to_s
    end

    # Value to put into a rewritten link: the bare path for same-host URLs,
    # the full URL for other hosts, and the unparsed text for invalid URLs so
    # that such links are left as they were found.
    def absolute_path_or_external_url
      return raw_url unless valid

      if parent && parent.host == host
        uri.path
      else
        canonical_url
      end
    end

    private

    # Turns raw_url into an absolute URL string.
    def sanitize_string
      if raw_url.start_with?('//')
        # Protocol-relative: use the referencing page's scheme, falling back
        # to http when there is no parent.
        self.raw_url = "#{parent ? parent.scheme : 'http'}:#{raw_url}"
      elsif raw_url.start_with?('http://', 'https://')
        # Already absolute.
      elsif parent
        # Root-relative and relative references: URI.join resolves both
        # against the parent's full URL.
        self.raw_url = URI.join(parent.canonical_url, raw_url).to_s
      elsif raw_url.start_with?('/')
        raise URI::InvalidURIError, "cannot resolve #{raw_url.inspect} without a parent URL"
      else
        # A bare "example.com" style seed.
        self.raw_url = "http://#{raw_url}"
      end
    end

    # Parses raw_url, normalises an empty path to "/" and drops the fragment
    # so that "#section" variants of a page share one canonical URL.
    def parse_uri
      self.uri = URI.parse(raw_url)
      uri.path = '/' if uri.path.empty?
      uri.fragment = nil
    end
  end
end
@@ -0,0 +1,3 @@
1
module Elsmore
  # Release version of the gem. Frozen so the constant cannot be mutated in
  # place.
  VERSION = '0.1.1'.freeze
end
@@ -0,0 +1,71 @@
1
require 'fileutils'

module Elsmore
  # Stores a fetched resource on disk, in a directory tree named after the
  # host it came from and relative to the current working directory.
  class Writer
    # The object being stored; must respond to #url (host, parent_host,
    # uri.path) and #data.
    attr_accessor :resource

    def initialize(resource)
      self.resource = resource
    end

    # Writes the resource unless a file already exists at its target path.
    def write
      write_file
    end

    # Path under which the stored file is referenced from the page that
    # links to it: the plain path for same-host resources, prefixed with
    # "/<host>" otherwise.
    def canonical_filename
      if host == parent_host
        filename.dup
      else
        "/#{host}#{filename}"
      end
    end

    private

    def write_file
      return if File.exist?(full_filename)

      FileUtils.mkdir_p(File.dirname(full_filename))
      # Binary mode: resources include images and other non-text data.
      File.open(full_filename, 'wb') do |file|
        file.write(resource.data)
      end
    end

    # Location on disk. Resources from another host are nested inside the
    # directory of the host that referenced them.
    def full_filename
      @full_filename ||= begin
        if host == parent_host || !parent_host
          "#{host}#{filename}"
        else
          "#{parent_host}/#{host}#{filename}"
        end
      end
    end

    # File path derived from the URL path. Directory-like paths (trailing
    # slash, or a last segment without a dot) map to an index.html inside
    # that directory.
    def filename
      @filename ||= begin
        path = contained_path(resource.url.uri.path)
        if path.end_with?('/')
          "#{path}index.html"
        elsif path.split('/').last.include?('.')
          path
        else
          "#{path}/index.html"
        end
      end
    end

    # Resolves "." and ".." segments without ever climbing above the root.
    # The path comes from a remote page, so this keeps the written file
    # inside the host's directory.
    def contained_path(path)
      segments = path.split('/', -1).each_with_object([]) do |segment, kept|
        case segment
        when '.'
          next
        when '..'
          # kept[0] is the empty segment before the leading slash.
          kept.pop if kept.size > 1
        else
          kept << segment
        end
      end

      resolved = segments.join('/')
      resolved.empty? ? '/' : resolved
    end

    def host
      resource.url.host
    end

    def parent_host
      resource.url.parent_host
    end
  end
end
data/lib/elsmore.rb ADDED
@@ -0,0 +1,9 @@
1
+ require 'elsmore/version'
2
+ require 'elsmore/command'
3
+ require 'elsmore/scraper'
4
+ require 'elsmore/document'
5
+ require 'elsmore/emitter'
6
+ require 'elsmore/writer'
7
+ require 'elsmore/url'
8
+ require 'elsmore/resource'
9
+ require 'elsmore/rewriter'
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: elsmore
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Cristiano Betta
8
+ - Mike Elsmore
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2016-10-22 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ type: :development
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: '0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: commander
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: httparty
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ type: :runtime
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ - !ruby/object:Gem::Dependency
57
+ name: oga
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ type: :runtime
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: awesome_print
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ - !ruby/object:Gem::Dependency
85
+ name: colorize
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ type: :runtime
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ description: Backup tool for Hackference
99
+ email:
100
+ - cbetta@gmail.com
101
+ - mike@hackference.co.uk
102
+ executables: []
103
+ extensions: []
104
+ extra_rdoc_files: []
105
+ files:
106
+ - LICENSE
107
+ - README.md
108
+ - elsmore.gemspec
109
+ - lib/elsmore.rb
110
+ - lib/elsmore/command.rb
111
+ - lib/elsmore/document.rb
112
+ - lib/elsmore/emitter.rb
113
+ - lib/elsmore/resource.rb
114
+ - lib/elsmore/rewriter.rb
115
+ - lib/elsmore/scraper.rb
116
+ - lib/elsmore/url.rb
117
+ - lib/elsmore/version.rb
118
+ - lib/elsmore/writer.rb
119
+ homepage: https://github.com/cbetta/elsmore
120
+ licenses:
121
+ - MIT
122
+ metadata: {}
123
+ post_install_message:
124
+ rdoc_options: []
125
+ require_paths:
126
+ - lib
127
+ required_ruby_version: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ required_rubygems_version: !ruby/object:Gem::Requirement
133
+ requirements:
134
+ - - ">="
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ requirements: []
138
+ rubyforge_project:
139
+ rubygems_version: 2.5.1
140
+ signing_key:
141
+ specification_version: 4
142
+ summary: Backup tool for Hackference
143
+ test_files: []