website_cloner 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 6ccbe12c76bec756160265a01eefbea698eb18eb178564650b975f873ac34003
+   data.tar.gz: 0d94db0f3a421e6d2446d6e0813ad8260c3ced2925d9e21c05136a9b78e61bfb
+ SHA512:
+   metadata.gz: 671c26fc0b66875fd22d7bc823188eec4926a085f214d9bd5675939231e1605a8928eaceae3c3f7691e6768b06009f9666e0482781ff2063c2ea6675a1776b21
+   data.tar.gz: b69244f3a6235f4393c1dbbbabd77427cd8b8856a523808ccc97114cce71952ea250ad92a92904f29e61fc63bff547caee21854404d5718c8934680c93ce9f82
bin/website-cloner ADDED
@@ -0,0 +1,45 @@
+ #!/usr/bin/env ruby
+
+ require_relative '../lib/website_cloner'
+ require 'optparse'
+
+ options = {
+   max_pages: 20,
+   session_cookie: nil
+ }
+
+ OptionParser.new do |opts|
+   opts.banner = "Usage: website-cloner <url> <output_directory> [options]"
+
+   opts.on("-m", "--max-pages PAGES", Integer, "Maximum number of pages to clone (default: 20)") do |m|
+     options[:max_pages] = m
+   end
+
+   opts.on("-s", "--session-cookie COOKIE", "Session cookie for authenticated access") do |s|
+     options[:session_cookie] = s
+   end
+
+   opts.on("-h", "--help", "Prints this help") do
+     puts opts
+     exit
+   end
+ end.parse!
+
+ if ARGV.length != 2
+   puts "Usage: website-cloner <url> <output_directory> [options]"
+   puts "Run 'website-cloner --help' for more information."
+   exit 1
+ end
+
+ url = ARGV[0]
+ output_dir = ARGV[1]
+
+ begin
+   WebsiteCloner.clone(url, output_dir, max_pages: options[:max_pages], session_cookie: options[:session_cookie])
+
+   puts "\nWebsite cloned successfully!"
+ rescue StandardError => e
+   puts "\nAn error occurred during cloning: #{e.message}"
+ rescue Interrupt
+   puts "\nOK, exiting on interrupt."
+ end
lib/website_cloner/downloader.rb ADDED
@@ -0,0 +1,84 @@
+ require 'net/http'
+ require 'uri'
+ require 'fileutils'
+ require 'openssl'
+
+ module WebsiteCloner
+   class Downloader
+     attr_reader :output_dir, :base_url, :session_cookie
+
+     def initialize(base_url, output_dir, session_cookie = nil)
+       @base_url = URI.parse(base_url)
+       @output_dir = output_dir
+       @session_cookie = session_cookie
+       FileUtils.mkdir_p(@output_dir)
+       FileUtils.mkdir_p(File.join(@output_dir, 'assets'))
+       FileUtils.mkdir_p(File.join(@output_dir, 'css'))
+       FileUtils.mkdir_p(File.join(@output_dir, 'js'))
+     end
+
+     def download_page(url)
+       Utils.logger.info "Downloading page: #{url}"
+       uri = URI.parse(url)
+       http = Net::HTTP.new(uri.host, uri.port)
+       http.use_ssl = (uri.scheme == 'https')
+       http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+
+       request_path = uri.path.empty? ? '/' : uri.path
+       request_path += "?#{uri.query}" if uri.query
+
+       request = Net::HTTP::Get.new(request_path)
+       request['Cookie'] = @session_cookie if @session_cookie
+
+       response = http.request(request)
+
+       case response
+       when Net::HTTPSuccess
+         response.body
+       when Net::HTTPRedirection
+         download_page(response['location'])
+       else
+         response.error!
+       end
+     end
+
+     def download_asset(url, type)
+       Utils.logger.info "Downloading asset: #{url}"
+       uri = URI.parse(URI.join(@base_url, url))
+       http = Net::HTTP.new(uri.host, uri.port)
+       http.use_ssl = (uri.scheme == 'https')
+       http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+
+       request_path = uri.path.empty? ? '/' : uri.path
+       request_path += "?#{uri.query}" if uri.query
+
+       request = Net::HTTP::Get.new(request_path)
+       request['Cookie'] = @session_cookie if @session_cookie
+
+       response = http.request(request)
+
+       case response
+       when Net::HTTPSuccess
+         content = response.body
+         filename = File.basename(uri.path).gsub(/^[0-9a-f]+_/, '')
+         filename = URI.decode_www_form_component(filename).gsub('%20', '-')
+         dir = case type
+               when 'css' then 'css'
+               when 'js' then 'js'
+               else 'assets'
+               end
+         path = File.join(@output_dir, dir, filename)
+         FileUtils.mkdir_p(File.dirname(path))
+
+         File.open(path, 'wb') do |file|
+           file.write(content)
+         end
+
+         "#{dir}/#{filename}"
+       else
+         Utils.logger.warn "Failed to download asset: #{url}"
+         url # Return the original URL if download fails
+       end
+     end
+   end
+ end
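As a rough sketch of how this class is driven (normally by the Parser and the top-level clone method), assuming the gem is already loaded; the host and asset path below are placeholders:

    downloader = WebsiteCloner::Downloader.new('https://example.com', './example-clone')
    html  = downloader.download_page('https://example.com/')   # raw HTML string
    local = downloader.download_asset('/logo.png', 'image')    # => "assets/logo.png" on success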
lib/website_cloner/parser.rb ADDED
@@ -0,0 +1,147 @@
+ require 'nokogiri'
+ require 'uri'
+
+ module WebsiteCloner
+   class Parser
+     def initialize(downloader)
+       @downloader = downloader
+       @file_mapping = {}
+     end
+
+     def parse_and_download(content, url)
+       doc = Nokogiri::HTML(content)
+       base_uri = URI.parse(url)
+
+       # Ensure the path is valid and not empty
+       path = base_uri.path.empty? || base_uri.path == '/' ? '/index.html' : base_uri.path
+
+       # Calculate the depth of the directory structure
+       depth = [path.count('/') - 1, 0].max
+       prefix = '../' * depth
+
+       new_pages = []
+
+       # Download and update image sources
+       doc.css('img').each do |img|
+         src = img['src']
+         next if src.nil? || src.empty?
+         new_src = @downloader.download_asset(src, 'image')
+         img['src'] = prefix + new_src # Add the correct prefix for assets in subdirs
+         @file_mapping[src] = new_src
+
+         # Remove srcset attribute to prevent loading from CDN
+         img.remove_attribute('srcset')
+         img.remove_attribute('imagesrcset')
+
+         # Update sizes attribute if present
+         img['sizes'] = '100vw' if img['sizes']
+       end
+
+       # Download and update stylesheet links
+       doc.css('link[rel="stylesheet"]').each do |link|
+         href = link['href']
+         next if href.nil? || href.empty?
+         new_href = @downloader.download_asset(href, 'css')
+         link['href'] = prefix + new_href # Add the correct prefix for assets in subdirs
+         @file_mapping[href] = new_href
+       end
+
+       # Download and update script sources
+       doc.css('script').each do |script|
+         src = script['src']
+         next if src.nil? || src.empty?
+         new_src = @downloader.download_asset(src, 'js')
+         script['src'] = prefix + new_src # Add the correct prefix for assets in subdirs
+         @file_mapping[src] = new_src
+       end
+
+       # Handle internal links starting with '/'
+       doc.css('a').each do |a|
+         href = a['href']
+         next if href.nil? || href.empty?
+
+         # Target only internal links that start with '/'
+         if href.start_with?('/')
+           # Add the new URL to new_pages for downloading before modification
+           new_pages << URI.join(base_uri, href).to_s
+
+           # Special handling for homepage
+           if href == '/'
+             a['href'] = prefix + 'index.html'
+           else
+             # Remove leading '/' for saving the local file
+             href.sub!(/^\//, '')
+
+             # Append '.html' if it's missing and not a file download (like .pdf)
+             href += '.html' unless href =~ /\.\w+$/
+
+             # Update the href attribute
+             a['href'] = href
+           end
+         end
+       end
+
+       # Save the updated HTML
+       save_html(doc.to_html, url)
+
+       new_pages
+     end
+
+     def organize_files
+       Dir.glob(File.join(@downloader.output_dir, '**', '*')).each do |file|
+         next if File.directory?(file)
+
+         relative_path = file.sub(@downloader.output_dir + '/', '')
+         dirname = File.dirname(relative_path)
+         basename = File.basename(relative_path)
+
+         if dirname.match?(/^[0-9a-f]+$/)
+           new_basename = URI.decode_www_form_component(basename).gsub('%20', '-')
+           new_path = case
+                      when new_basename.end_with?('.css')
+                        File.join(@downloader.output_dir, 'css', new_basename.gsub(/^[0-9a-f]+_/, ''))
+                      when new_basename.end_with?('.js')
+                        File.join(@downloader.output_dir, 'js', new_basename.gsub(/^[0-9a-f]+_/, ''))
+                      else
+                        File.join(@downloader.output_dir, 'assets', new_basename.gsub(/^[0-9a-f]+_/, ''))
+                      end
+
+           FileUtils.mv(file, new_path)
+           @file_mapping["/#{relative_path}"] = "#{new_path.sub(@downloader.output_dir + '/', '')}"
+         elsif !basename.include?('.') && !dirname.start_with?('css', 'js', 'assets')
+           # This is likely a subpage without an extension
+           new_path = "#{file}.html"
+           FileUtils.mv(file, new_path)
+           @file_mapping["/#{relative_path}"] = "#{new_path.sub(@downloader.output_dir + '/', '')}"
+         end
+       end
+
+       update_references
+     end
+
+     private
+
+     def save_html(content, url)
+       uri = URI.parse(url)
+       path = uri.path.empty? || uri.path == '/' ? '/index.html' : uri.path
+       path += '.html' unless path.end_with?('.html')
+       full_path = File.join(@downloader.output_dir, path)
+       FileUtils.mkdir_p(File.dirname(full_path))
+
+       File.open(full_path, 'w') do |file|
+         file.write(content)
+       end
+     end
+
+     def update_references
+       Dir.glob(File.join(@downloader.output_dir, '**', '*.html')).each do |html_file|
+         content = File.read(html_file)
+         @file_mapping.each do |old_path, new_path|
+           content.gsub!(old_path, new_path)
+           content.gsub!("//#{new_path}", new_path) # Remove any double slashes
+         end
+         File.write(html_file, content)
+       end
+     end
+   end
+ end
lib/website_cloner/utils.rb ADDED
@@ -0,0 +1,20 @@
+ require 'logger'
+ require 'set'
+
+ module WebsiteCloner
+   module Utils
+     def self.logger
+       @logger ||= Logger.new(STDOUT).tap do |log|
+         log.formatter = proc do |severity, datetime, progname, msg|
+           color = case severity
+                   when 'INFO' then "\e[32m"  # Green
+                   when 'WARN' then "\e[33m"  # Yellow
+                   when 'ERROR' then "\e[31m" # Red
+                   else "\e[0m"               # Default
+                   end
+           "#{color}[#{severity}] #{msg}\e[0m\n"
+         end
+       end
+     end
+   end
+ end
lib/website_cloner.rb ADDED
@@ -0,0 +1,38 @@
+ require_relative 'website_cloner/downloader'
+ require_relative 'website_cloner/parser'
+ require_relative 'website_cloner/utils'
+ require 'set'
+
+ module WebsiteCloner
+   class Error < StandardError; end
+
+   def self.clone(url, output_dir, max_pages: 20, session_cookie: nil)
+     Utils.logger.info "Starting to clone #{url}"
+     downloader = Downloader.new(url, output_dir, session_cookie)
+     parser = Parser.new(downloader)
+
+     visited_pages = Set.new
+     pages_to_visit = [url]
+
+     while !pages_to_visit.empty? && visited_pages.size < max_pages
+       current_url = pages_to_visit.shift
+       next if visited_pages.include?(current_url)
+
+       visited_pages.add(current_url)
+       Utils.logger.info "Processing page #{visited_pages.size}/#{max_pages}: #{current_url}"
+
+       begin
+         content = downloader.download_page(current_url)
+         new_pages = parser.parse_and_download(content, current_url)
+         pages_to_visit.concat(new_pages - visited_pages.to_a)
+       rescue => e
+         Utils.logger.error "Error processing #{current_url}: #{e.message}"
+       end
+     end
+
+     Utils.logger.info "Finished cloning. Processed #{visited_pages.size} pages."
+     Utils.logger.info "Organizing files and updating references..."
+     parser.organize_files
+     Utils.logger.info "Done organizing files and updating references."
+   end
+ end
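The same entry point used by the executable can also be called from Ruby directly; a minimal sketch, with the URL, directory, and cookie as illustrative placeholders:

    require 'website_cloner'

    WebsiteCloner.clone(
      'https://example.com',
      './example-clone',
      max_pages: 10,                        # defaults to 20
      session_cookie: '_session_id=abc123'  # optional; omit for public sites
    )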
metadata ADDED
@@ -0,0 +1,49 @@
+ --- !ruby/object:Gem::Specification
+ name: website_cloner
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Bhavyansh Yadav
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2024-09-11 00:00:00.000000000 Z
+ dependencies: []
+ description: Website Cloner is a Ruby gem that allows you to create local copies of
+   websites, including all assets and linked pages. It's designed to be easy to use
+   while providing powerful features for customization.
+ email: bhavyansh001@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/website-cloner
+ - lib/website_cloner.rb
+ - lib/website_cloner/downloader.rb
+ - lib/website_cloner/parser.rb
+ - lib/website_cloner/utils.rb
+ homepage: https://rubygems.org/gems/website_cloner
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.5.18
+ signing_key:
+ specification_version: 4
+ summary: Create local copies of websites, including all assets and linked pages.
+ test_files: []