website_cloner 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/website-cloner +45 -0
- data/lib/website_cloner/downloader.rb +84 -0
- data/lib/website_cloner/parser.rb +147 -0
- data/lib/website_cloner/utils.rb +20 -0
- data/lib/website_cloner.rb +38 -0
- metadata +49 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: 6ccbe12c76bec756160265a01eefbea698eb18eb178564650b975f873ac34003
  data.tar.gz: 0d94db0f3a421e6d2446d6e0813ad8260c3ced2925d9e21c05136a9b78e61bfb
SHA512:
  metadata.gz: 671c26fc0b66875fd22d7bc823188eec4926a085f214d9bd5675939231e1605a8928eaceae3c3f7691e6768b06009f9666e0482781ff2063c2ea6675a1776b21
  data.tar.gz: b69244f3a6235f4393c1dbbbabd77427cd8b8856a523808ccc97114cce71952ea250ad92a92904f29e61fc63bff547caee21854404d5718c8934680c93ce9f82
data/bin/website-cloner
ADDED
@@ -0,0 +1,45 @@
#!/usr/bin/env ruby

require_relative '../lib/website_cloner'
require 'optparse'

options = {
  max_pages: 20,
  session_cookie: nil
}

OptionParser.new do |opts|
  opts.banner = "Usage: website-cloner <url> <output_directory> [options]"

  opts.on("-m", "--max-pages PAGES", Integer, "Maximum number of pages to clone (default: 20)") do |m|
    options[:max_pages] = m
  end

  opts.on("-s", "--session-cookie COOKIE", "Session cookie for authenticated access") do |s|
    options[:session_cookie] = s
  end

  opts.on("-h", "--help", "Prints this help") do
    puts opts
    exit
  end
end.parse!

if ARGV.length != 2
  puts "Usage: website-cloner <url> <output_directory> [options]"
  puts "Run 'website-cloner --help' for more information."
  exit 1
end

url = ARGV[0]
output_dir = ARGV[1]

begin
  WebsiteCloner.clone(url, output_dir, max_pages: options[:max_pages], session_cookie: options[:session_cookie])

  puts "\nWebsite cloned successfully!"
rescue StandardError => e
  puts "\nAn error occurred during cloning: #{e.message}"
rescue Interrupt
  puts "\nOK, exiting on interrupt."
end
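
Based on the option parser above, a typical invocation of the executable would look like the following sketch; the URL, output directory, and cookie value are placeholders, not values taken from the package:

  website-cloner https://example.com ./example-clone --max-pages 50 --session-cookie "_session_id=abc123"

Both options are optional; omitting them falls back to the defaults shown in the script (20 pages, no cookie).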
data/lib/website_cloner/downloader.rb
ADDED
@@ -0,0 +1,84 @@
require 'net/http'
require 'uri'
require 'fileutils'
require 'openssl'

module WebsiteCloner
  class Downloader
    attr_reader :output_dir, :base_url, :session_cookie

    def initialize(base_url, output_dir, session_cookie = nil)
      @base_url = URI.parse(base_url)
      @output_dir = output_dir
      @session_cookie = session_cookie
      FileUtils.mkdir_p(@output_dir)
      FileUtils.mkdir_p(File.join(@output_dir, 'assets'))
      FileUtils.mkdir_p(File.join(@output_dir, 'css'))
      FileUtils.mkdir_p(File.join(@output_dir, 'js'))
    end

    def download_page(url)
      Utils.logger.info "Downloading page: #{url}"
      uri = URI.parse(url)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = (uri.scheme == 'https')
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE

      request_path = uri.path.empty? ? '/' : uri.path
      request_path += "?#{uri.query}" if uri.query

      request = Net::HTTP::Get.new(request_path)
      request['Cookie'] = @session_cookie if @session_cookie

      response = http.request(request)

      case response
      when Net::HTTPSuccess
        response.body
      when Net::HTTPRedirection
        download_page(response['location'])
      else
        response.error!
      end
    end

    def download_asset(url, type)
      Utils.logger.info "Downloading asset: #{url}"
      uri = URI.parse(URI.join(@base_url, url))
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = (uri.scheme == 'https')
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE

      request_path = uri.path.empty? ? '/' : uri.path
      request_path += "?#{uri.query}" if uri.query

      request = Net::HTTP::Get.new(request_path)
      request['Cookie'] = @session_cookie if @session_cookie

      response = http.request(request)

      case response
      when Net::HTTPSuccess
        content = response.body
        filename = File.basename(uri.path).gsub(/^[0-9a-f]+_/, '')
        filename = URI.decode_www_form_component(filename).gsub('%20', '-')
        dir = case type
              when 'css' then 'css'
              when 'js' then 'js'
              else 'assets'
              end
        path = File.join(@output_dir, dir, filename)
        FileUtils.mkdir_p(File.dirname(path))

        File.open(path, 'wb') do |file|
          file.write(content)
        end

        "#{dir}/#{filename}"
      else
        Utils.logger.warn "Failed to download asset: #{url}"
        url # Return the original URL if download fails
      end
    end
  end
end
data/lib/website_cloner/parser.rb
ADDED
@@ -0,0 +1,147 @@
require 'nokogiri'
require 'uri'

module WebsiteCloner
  class Parser
    def initialize(downloader)
      @downloader = downloader
      @file_mapping = {}
    end

    def parse_and_download(content, url)
      doc = Nokogiri::HTML(content)
      base_uri = URI.parse(url)

      # Ensure the path is valid and not empty
      path = base_uri.path.empty? || base_uri.path == '/' ? '/index.html' : base_uri.path

      # Calculate the depth of the directory structure
      depth = [path.count('/') - 1, 0].max
      prefix = '../' * depth

      new_pages = []

      # Download and update image sources
      doc.css('img').each do |img|
        src = img['src']
        next if src.nil? || src.empty?
        new_src = @downloader.download_asset(src, 'image')
        img['src'] = prefix + new_src # Add the correct prefix for assets in subdirs
        @file_mapping[src] = new_src

        # Remove srcset attribute to prevent loading from CDN
        img.remove_attribute('srcset')
        img.remove_attribute('imagesrcset')

        # Update sizes attribute if present
        img['sizes'] = '100vw' if img['sizes']
      end

      # Download and update stylesheet links
      doc.css('link[rel="stylesheet"]').each do |link|
        href = link['href']
        next if href.nil? || href.empty?
        new_href = @downloader.download_asset(href, 'css')
        link['href'] = prefix + new_href # Add the correct prefix for assets in subdirs
        @file_mapping[href] = new_href
      end

      # Download and update script sources
      doc.css('script').each do |script|
        src = script['src']
        next if src.nil? || src.empty?
        new_src = @downloader.download_asset(src, 'js')
        script['src'] = prefix + new_src # Add the correct prefix for assets in subdirs
        @file_mapping[src] = new_src
      end

      # Handle internal links starting with '/'
      doc.css('a').each do |a|
        href = a['href']
        next if href.nil? || href.empty?

        # Target only internal links that start with '/'
        if href.start_with?('/')
          # Add the new URL to new_pages for downloading before modification
          new_pages << URI.join(base_uri, href).to_s

          # Special handling for homepage
          if href == '/'
            a['href'] = prefix + 'index.html'
          else
            # Remove leading '/' for saving the local file
            href.sub!(/^\//, '')

            # Append '.html' if it's missing and not a file download (like .pdf)
            href += '.html' unless href =~ /\.\w+$/

            # Update the href attribute
            a['href'] = href
          end
        end
      end

      # Save the updated HTML
      save_html(doc.to_html, url)

      new_pages
    end

    def organize_files
      Dir.glob(File.join(@downloader.output_dir, '**', '*')).each do |file|
        next if File.directory?(file)

        relative_path = file.sub(@downloader.output_dir + '/', '')
        dirname = File.dirname(relative_path)
        basename = File.basename(relative_path)

        if dirname.match?(/^[0-9a-f]+$/)
          new_basename = URI.decode_www_form_component(basename).gsub('%20', '-')
          new_path = case
                     when new_basename.end_with?('.css')
                       File.join(@downloader.output_dir, 'css', new_basename.gsub(/^[0-9a-f]+_/, ''))
                     when new_basename.end_with?('.js')
                       File.join(@downloader.output_dir, 'js', new_basename.gsub(/^[0-9a-f]+_/, ''))
                     else
                       File.join(@downloader.output_dir, 'assets', new_basename.gsub(/^[0-9a-f]+_/, ''))
                     end

          FileUtils.mv(file, new_path)
          @file_mapping["/#{relative_path}"] = "#{new_path.sub(@downloader.output_dir + '/', '')}"
        elsif !basename.include?('.') && !dirname.start_with?('css', 'js', 'assets')
          # This is likely a subpage without an extension
          new_path = "#{file}.html"
          FileUtils.mv(file, new_path)
          @file_mapping["/#{relative_path}"] = "#{new_path.sub(@downloader.output_dir + '/', '')}"
        end
      end

      update_references
    end

    private

    def save_html(content, url)
      uri = URI.parse(url)
      path = uri.path.empty? || uri.path == '/' ? '/index.html' : uri.path
      path += '.html' unless path.end_with?('.html')
      full_path = File.join(@downloader.output_dir, path)
      FileUtils.mkdir_p(File.dirname(full_path))

      File.open(full_path, 'w') do |file|
        file.write(content)
      end
    end

    def update_references
      Dir.glob(File.join(@downloader.output_dir, '**', '*.html')).each do |html_file|
        content = File.read(html_file)
        @file_mapping.each do |old_path, new_path|
          content.gsub!(old_path, new_path)
          content.gsub!("//#{new_path}", new_path) # Remove any double slashes
        end
        File.write(html_file, content)
      end
    end
  end
end
data/lib/website_cloner/utils.rb
ADDED
@@ -0,0 +1,20 @@
require 'logger'
require 'set'

module WebsiteCloner
  module Utils
    def self.logger
      @logger ||= Logger.new(STDOUT).tap do |log|
        log.formatter = proc do |severity, datetime, progname, msg|
          color = case severity
                  when 'INFO' then "\e[32m"  # Green
                  when 'WARN' then "\e[33m"  # Yellow
                  when 'ERROR' then "\e[31m" # Red
                  else "\e[0m"               # Default
                  end
          "#{color}[#{severity}] #{msg}\e[0m\n"
        end
      end
    end
  end
end
data/lib/website_cloner.rb
ADDED
@@ -0,0 +1,38 @@
require_relative 'website_cloner/downloader'
require_relative 'website_cloner/parser'
require_relative 'website_cloner/utils'
require 'set'

module WebsiteCloner
  class Error < StandardError; end

  def self.clone(url, output_dir, max_pages: 20, session_cookie: nil)
    Utils.logger.info "Starting to clone #{url}"
    downloader = Downloader.new(url, output_dir, session_cookie)
    parser = Parser.new(downloader)

    visited_pages = Set.new
    pages_to_visit = [url]

    while !pages_to_visit.empty? && visited_pages.size < max_pages
      current_url = pages_to_visit.shift
      next if visited_pages.include?(current_url)

      visited_pages.add(current_url)
      Utils.logger.info "Processing page #{visited_pages.size}/#{max_pages}: #{current_url}"

      begin
        content = downloader.download_page(current_url)
        new_pages = parser.parse_and_download(content, current_url)
        pages_to_visit.concat(new_pages - visited_pages.to_a)
      rescue => e
        Utils.logger.error "Error processing #{current_url}: #{e.message}"
      end
    end

    Utils.logger.info "Finished cloning. Processed #{visited_pages.size} pages."
    Utils.logger.info "Organizing files and updating references..."
    parser.organize_files
    Utils.logger.info "Done organizing files and updating references."
  end
end
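
The entry point above can also be called from Ruby rather than through the bin/website-cloner executable. A minimal sketch, assuming the gem is installed and using placeholder arguments (the URL and directory are not values from the package):

  require 'website_cloner'

  # Crawl up to 10 pages starting from the placeholder URL and write the
  # result into ./example-clone; session_cookie is optional and left nil here.
  WebsiteCloner.clone('https://example.com', './example-clone',
                      max_pages: 10, session_cookie: nil)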
metadata
ADDED
@@ -0,0 +1,49 @@
--- !ruby/object:Gem::Specification
name: website_cloner
version: !ruby/object:Gem::Version
  version: 0.0.1
platform: ruby
authors:
- Bhavyansh Yadav
autorequire:
bindir: bin
cert_chain: []
date: 2024-09-11 00:00:00.000000000 Z
dependencies: []
description: Website Cloner is a Ruby gem that allows you to create local copies of
  websites, including all assets and linked pages. It's designed to be easy to use
  while providing powerful features for customization.
email: bhavyansh001@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- bin/website-cloner
- lib/website_cloner.rb
- lib/website_cloner/downloader.rb
- lib/website_cloner/parser.rb
- lib/website_cloner/utils.rb
homepage: https://rubygems.org/gems/website_cloner
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubygems_version: 3.5.18
signing_key:
specification_version: 4
summary: Create local copies of websites, including all assets and linked pages.
test_files: []