elsmore 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7a1918ab7c969ec280e5fc1f7c6aebbee87fa19e
4
+ data.tar.gz: 5344fc276d3dec1cfabe38481dba0b5703369b74
5
+ SHA512:
6
+ metadata.gz: 591510236ccc9fba7eacd7b3d03a08b861a84a8d970ded0c3ca2e6289d49973799eca4e31653345d3b52e04b4d7129de361f87ea95c140094442f9c456edb872
7
+ data.tar.gz: 41c518b723dbc0c945060c27e351396e21cf198174ae3b57b22950f98b247ea8a37770d21a19733782d76fb0e32a71d7ae9114a77061c268cde06947996a0d4f
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2016 Cristiano Betta
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1 @@
1
+ # Work in Progress
data/elsmore.gemspec ADDED
@@ -0,0 +1,22 @@
# Gem specification for elsmore. The version constant is loaded from
# lib/elsmore/version so it is defined in exactly one place.
require File.expand_path('lib/elsmore/version', File.dirname(__FILE__))

Gem::Specification.new do |s|
  s.name         = 'elsmore'
  s.version      = Elsmore::VERSION
  s.summary      = 'Backup tool for Hackference'
  s.description  = 'Backup tool for Hackference'
  s.authors      = ['Cristiano Betta', 'Mike Elsmore']
  s.email        = ['cbetta@gmail.com', 'mike@hackference.co.uk']
  # Ship everything under lib/ and spec/ plus the top-level project files.
  s.files        = Dir.glob('{lib,spec}/**/*') + %w(LICENSE README.md elsmore.gemspec)
  s.homepage     = 'https://github.com/cbetta/elsmore'
  s.license      = 'MIT'
  s.require_path = 'lib'

  s.add_development_dependency('rake')

  # Runtime dependencies used by the library files.
  s.add_dependency('commander')
  s.add_dependency('httparty')
  s.add_dependency('oga')
  s.add_dependency('awesome_print')
  s.add_dependency('colorize')
end
@@ -0,0 +1,43 @@
1
+ require 'commander'
2
+
module Elsmore
  # Command-line front end built on Commander. It declares a single `snap`
  # command (also the default command) that scrapes a site and prints a
  # summary through the emitter.
  class Command
    include Commander::Methods

    # Object used for all console output; replaceable by callers.
    attr_accessor :emitter

    def initialize
      self.emitter = Elsmore::Emitter.new
    end

    # Declares program metadata and commands, then hands control to
    # Commander via `run!`.
    def run
      program :name, 'elsmore'
      program :version, Elsmore::VERSION
      program :description, 'A convenient scraper for archiving sites'
      program :help, 'Author', 'Cristiano Betta <cristiano@betta.io>'

      command :snap do |c|
        c.syntax = 'elsmore snap <url> [options]'
        c.description = 'Spiders a site starting from the given URL, staying within the original domain'
        c.action do |args, _options|
          url = args.first
          # Without this guard a missing argument surfaces as a NoMethodError
          # deep inside URL parsing (nil.strip).
          abort 'Please provide a URL to snap, e.g. `elsmore snap example.com`' if url.to_s.strip.empty?

          scraper = Elsmore::Scraper.new(url)
          scraper.emitter = emitter
          report(scraper.run)
        end
      end
      alias_command :'go fetch', :'snap'
      default_command :snap

      run!
    end

    private

    # Prints the processed and unprocessable URL lists returned by the scraper.
    def report(result)
      emitter.newline
      emitter.newline
      emitter.say 'Processed'
      emitter.pretty result[:processed]
      emitter.newline
      emitter.say 'Could not be processed'
      emitter.pretty result[:invalid]
    end
  end
end
@@ -0,0 +1,41 @@
1
+ require 'httparty'
2
+ require 'oga'
3
+
module Elsmore
  # An HTML page identified by a URL. The page is fetched and parsed lazily
  # the first time `doc` is needed, and the parsed tree is cached.
  class Document
    attr_accessor :url

    # url    - raw URL string (absolute or relative).
    # parent - Elsmore::Url of the page the link was found on, or nil.
    def initialize(url, parent = nil)
      self.url = Elsmore::Url.new(url, parent)
    end

    # Returns one Document per <a> element that carries an href attribute,
    # each resolved against this document's URL. The list is memoized.
    def links
      @links ||= doc.xpath('//a').each_with_object([]) do |anchor, found|
        href = anchor.attribute('href')
        found << Elsmore::Document.new(href.value, url) if href
      end
    end

    # Serialized form of the (possibly rewritten) parsed tree.
    def data
      doc.to_xml
    end

    # Persists the serialized document through Elsmore::Writer.
    def write!
      Elsmore::Writer.new(self).write
    end

    # Downloads linked assets and rewrites links through Elsmore::Rewriter.
    def rewrite
      Elsmore::Rewriter.new(self).rewrite
    end

    # Parsed tree for the page, fetched on first use.
    def doc
      @doc ||= begin
        response = HTTParty.get(url.canonical_url)
        # Hand the parser the body string explicitly instead of relying on
        # the response object behaving like a String.
        Oga.parse_html(response.body.to_s)
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ require 'awesome_print'
2
+ require 'colorize'
3
+
module Elsmore
  # Console reporter: plain text lines, single-character coloured progress
  # markers, and pretty-printed values.
  class Emitter
    # Writes a line break.
    def newline
      say("\n")
    end

    # Writes +value+ followed by a newline (unless it already ends in one).
    def say(value)
      $stdout.puts(value)
    end

    # Green "." progress marker.
    def dot
      mark('.', :green)
    end

    # Red "x" progress marker.
    def x
      mark('x', :red)
    end

    # Blue "?" progress marker.
    def unsure
      mark('?', :blue)
    end

    # Pretty-prints +value+ using awesome_print.
    def pretty(value)
      ap(value)
    end

    private

    # Prints one coloured character without a trailing newline.
    def mark(character, colour)
      $stdout.print(character.colorize(colour))
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'open-uri'
2
+
module Elsmore
  # A linked asset (stylesheet, script, image) referenced from a page.
  class Resource
    # url      - Elsmore::Url of the asset.
    # filename - path the asset was stored under; set by write!.
    attr_accessor :url, :filename

    # url    - raw URL string taken from the referencing element.
    # parent - Elsmore::Url of the page that references the asset.
    def initialize(url, parent)
      self.url = Elsmore::Url.new(url, parent)
    end

    # Stores the asset through Elsmore::Writer and remembers the filename
    # the writer reports for it.
    def write!
      writer = Elsmore::Writer.new(self)
      writer.write
      self.filename = writer.canonical_filename
    end

    # Body of the asset, downloaded on first access and memoized.
    #
    # Opens the parsed URI (open-uri) rather than passing a string to
    # Kernel#open: Kernel#open treats strings starting with "|" as commands
    # and no longer opens URLs on Ruby 3+. The block form also closes the
    # underlying IO once the body has been read.
    def data
      @data ||= url.uri.open(&:read)
    end
  end
end
@@ -0,0 +1,57 @@
module Elsmore
  # Downloads the assets a parsed page references and rewrites the page's
  # asset references and anchors in place.
  class Rewriter
    # Object exposing `doc` (parsed tree) and `url` (Elsmore::Url).
    attr_accessor :resource

    def initialize(resource)
      self.resource = resource
    end

    # Saves linked assets first, then rewrites the anchors.
    def rewrite
      rewrite_linked_resources
      rewrite_links
    end

    private

    def rewrite_linked_resources
      write_css
      write_js
      write_images
    end

    def write_css
      # The attribute test needs "@" and a quoted value; the bare form
      # `[rel=stylesheet]` compares child elements and never matches.
      resource.doc.xpath('//link[@rel="stylesheet"]').each do |element|
        write_element(element, 'href')
      end
    end

    def write_images
      resource.doc.xpath('//img').each do |element|
        write_element(element, 'src')
      end
    end

    def write_js
      resource.doc.xpath('//script').each do |element|
        write_element(element, 'src')
      end
    end

    # Stores the asset named by element[key] and points the attribute at the
    # stored filename. Elements without that attribute (e.g. inline scripts)
    # are left untouched.
    def write_element(element, key)
      attribute = element.attribute(key)
      return unless attribute

      asset = Elsmore::Resource.new(attribute.value, resource.url)
      asset.write!
      attribute.value = asset.filename
    end

    # Rewrites every anchor's href to a site-absolute path for same-host
    # links, or the full URL for external ones.
    def rewrite_links
      resource.doc.xpath('//a').each do |element|
        attribute = element.attribute('href')
        # `next`, not `return`: an anchor without href must not stop the
        # remaining anchors from being rewritten.
        next unless attribute

        target = Elsmore::Url.new(attribute.value, resource.url)
        attribute.value = target.absolute_path_or_external_url
      end
    end
  end
end
@@ -0,0 +1,49 @@
module Elsmore
  # Breadth-first crawler: starts at a seed page and follows links that stay
  # on the seed's host, rewriting and writing every page it visits.
  class Scraper
    # Reporter receiving progress markers; must be assigned before `run`.
    attr_accessor :emitter

    # initial_url - URL string of the page the crawl starts from.
    def initialize(initial_url)
      seed = Elsmore::Document.new(initial_url)

      @valid_domains = [seed.url.host]
      @unprocessed = [seed]
      @processed = []
      @invalid = []
      # Canonical URLs that have ever been queued. Lets enqueue skip
      # duplicates with a hash lookup instead of scanning the result arrays,
      # and keeps the same URL from piling up in the queue.
      @known = { seed.url.canonical_url => true }
    end

    # Processes the queue until it is empty.
    #
    # Returns a Hash with :processed (canonical URLs written, in visit order)
    # and :invalid (raw URLs that could not be parsed).
    def run
      until @unprocessed.empty?
        document = @unprocessed.shift
        emitter.dot

        enqueue(document.links)
        document.rewrite
        document.write!

        @processed << document.url.canonical_url
      end

      { processed: @processed, invalid: @invalid }
    end

    private

    # Queues every link that parsed cleanly, lives on an allowed host and
    # has not been queued before. Unparsable links are recorded as invalid.
    def enqueue(links)
      links.each do |document|
        unless document.url.valid
          emitter.unsure
          @invalid << document.url.raw_url
          next
        end

        next unless @valid_domains.include?(document.url.host)

        key = document.url.canonical_url
        next if @known.key?(key)

        @known[key] = true
        @unprocessed << document
      end
    end
  end
end
@@ -0,0 +1,63 @@
require 'uri'

module Elsmore
  # A link target resolved against the page it was found on.
  #
  # `valid` is false when the link cannot be turned into a fetchable URL
  # (unparsable text, opaque schemes such as mailto:, or a root-relative
  # path with no parent to resolve it against). Construction never raises
  # for such input.
  class Url
    # Matches links that already carry an http(s) scheme.
    ABSOLUTE_HTTP = %r{\Ahttps?://}i

    attr_accessor :parent, :raw_url, :valid, :uri

    # raw_url - link text as found in the page (surrounding whitespace is
    #           stripped; nil is treated as an empty string).
    # parent  - Elsmore::Url of the referencing page, or nil for a seed URL.
    def initialize(raw_url, parent = nil)
      self.raw_url = raw_url.to_s.strip
      self.parent = parent
      self.valid = true

      resolve
    end

    # Host of the parsed URI, or nil when parsing failed.
    def host
      uri && uri.host
    end

    # Host of the parent URL, or nil when there is no parent.
    def parent_host
      return unless parent
      parent.host
    end

    # Scheme of the parsed URI, or nil when parsing failed.
    def scheme
      uri && uri.scheme
    end

    # Absolute URL without fragment; empty string when parsing failed.
    def canonical_url
      uri.to_s
    end

    # Path only for links on the parent's host, the full URL otherwise.
    # Links that could not be parsed are returned as they stand.
    def absolute_path_or_external_url
      return raw_url unless uri

      if parent && parent.host == host
        uri.path
      else
        canonical_url
      end
    end

    private

    # Runs both resolution steps; any failure marks the URL invalid instead
    # of propagating out of the constructor.
    def resolve
      sanitize_string
      parse_uri if valid
    rescue StandardError
      self.valid = false
    end

    # Turns raw_url into an absolute URL string.
    def sanitize_string
      return if raw_url =~ ABSOLUTE_HTTP

      if parent
        # URI.join covers relative, root-relative ("/x") and scheme-relative
        # ("//host/x") references and keeps the parent's scheme and port.
        self.raw_url = URI.join(parent.canonical_url, raw_url).to_s
      elsif raw_url.start_with?('//')
        self.raw_url = "http:#{raw_url}"
      elsif raw_url.start_with?('/')
        # A root-relative path cannot be resolved without a parent.
        self.valid = false
      else
        self.raw_url = "http://#{raw_url}"
      end
    end

    # Parses raw_url, normalizes an empty path to "/" and drops the fragment.
    def parse_uri
      self.uri = URI.parse(raw_url)

      # Opaque URIs (mailto:, tel:, javascript:) have no path to fetch.
      if uri.opaque || uri.path.nil?
        self.valid = false
        return
      end

      uri.path = '/' if uri.path.empty?
      uri.fragment = nil
    end
  end
end
@@ -0,0 +1,3 @@
module Elsmore
  # Current gem version; frozen so the shared constant cannot be mutated.
  VERSION = '0.1.1'.freeze
end
@@ -0,0 +1,71 @@
require 'fileutils'

module Elsmore
  # Stores a resource's data on disk, under a directory named after the host
  # (relative to the current working directory).
  class Writer
    # Object exposing `url` (with `uri.path`, `host`, `parent_host`) and `data`.
    attr_accessor :resource

    def initialize(resource)
      self.resource = resource
    end

    # Writes the resource unless a file already exists at its target path.
    def write
      write_file
    end

    # Path a referencing page should use for the stored resource: the plain
    # filename for same-host resources, prefixed with "/<host>" otherwise.
    def canonical_filename
      host == parent_host ? filename : "/#{host}#{filename}"
    end

    private

    def write_file
      return if File.exist?(full_filename)

      FileUtils.mkdir_p(File.dirname(full_filename))
      # Binary mode: assets such as images must be written byte-for-byte.
      File.open(full_filename, 'wb') { |file| file.write(resource.data) }
    end

    # On-disk location; resources from another host are nested under the
    # parent's host directory.
    def full_filename
      @full_filename ||= begin
        if parent_host.nil? || host == parent_host
          "#{host}#{filename}"
        else
          "#{parent_host}/#{host}#{filename}"
        end
      end
    end

    # URL path mapped to a file path: directory-like paths (trailing slash or
    # no "." in the last segment) get an index.html appended.
    def filename
      @filename ||= begin
        path = safe_path(resource.url.uri.path.to_s)
        if path.end_with?('/')
          "#{path}index.html"
        elsif File.basename(path).include?('.')
          path
        else
          "#{path}/index.html"
        end
      end
    end

    # Resolves "." and ".." segments inside the path so a crafted URL such as
    # "/../../x" cannot place files outside the host directory.
    def safe_path(path)
      segments = path.split('/').each_with_object([]) do |segment, kept|
        case segment
        when '', '.' then next
        when '..' then kept.pop
        else kept << segment
        end
      end

      cleaned = "/#{segments.join('/')}"
      cleaned << '/' if path.end_with?('/') && !segments.empty?
      cleaned
    end

    def host
      resource.url.host
    end

    def parent_host
      resource.url.parent_host
    end
  end
end
data/lib/elsmore.rb ADDED
@@ -0,0 +1,9 @@
1
+ require 'elsmore/version'
2
+ require 'elsmore/command'
3
+ require 'elsmore/scraper'
4
+ require 'elsmore/document'
5
+ require 'elsmore/emitter'
6
+ require 'elsmore/writer'
7
+ require 'elsmore/url'
8
+ require 'elsmore/resource'
9
+ require 'elsmore/rewriter'
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: elsmore
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Cristiano Betta
8
+ - Mike Elsmore
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2016-10-22 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ type: :development
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: '0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: commander
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: httparty
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ type: :runtime
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ - !ruby/object:Gem::Dependency
57
+ name: oga
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ type: :runtime
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: awesome_print
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ - !ruby/object:Gem::Dependency
85
+ name: colorize
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ type: :runtime
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ description: Backup too for Hackference
99
+ email:
100
+ - cbetta@gmail.com
101
+ - mike@hackference.co.uk
102
+ executables: []
103
+ extensions: []
104
+ extra_rdoc_files: []
105
+ files:
106
+ - LICENSE
107
+ - README.md
108
+ - elsmore.gemspec
109
+ - lib/elsmore.rb
110
+ - lib/elsmore/command.rb
111
+ - lib/elsmore/document.rb
112
+ - lib/elsmore/emitter.rb
113
+ - lib/elsmore/resource.rb
114
+ - lib/elsmore/rewriter.rb
115
+ - lib/elsmore/scraper.rb
116
+ - lib/elsmore/url.rb
117
+ - lib/elsmore/version.rb
118
+ - lib/elsmore/writer.rb
119
+ homepage: https://github.com/cbetta/elsmore
120
+ licenses:
121
+ - MIT
122
+ metadata: {}
123
+ post_install_message:
124
+ rdoc_options: []
125
+ require_paths:
126
+ - lib
127
+ required_ruby_version: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ required_rubygems_version: !ruby/object:Gem::Requirement
133
+ requirements:
134
+ - - ">="
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ requirements: []
138
+ rubyforge_project:
139
+ rubygems_version: 2.5.1
140
+ signing_key:
141
+ specification_version: 4
142
+ summary: Backup too for Hackference
143
+ test_files: []