website_cloner 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 6ccbe12c76bec756160265a01eefbea698eb18eb178564650b975f873ac34003
+   data.tar.gz: 0d94db0f3a421e6d2446d6e0813ad8260c3ced2925d9e21c05136a9b78e61bfb
+ SHA512:
+   metadata.gz: 671c26fc0b66875fd22d7bc823188eec4926a085f214d9bd5675939231e1605a8928eaceae3c3f7691e6768b06009f9666e0482781ff2063c2ea6675a1776b21
+   data.tar.gz: b69244f3a6235f4393c1dbbbabd77427cd8b8856a523808ccc97114cce71952ea250ad92a92904f29e61fc63bff547caee21854404d5718c8934680c93ce9f82
bin/website-cloner ADDED
@@ -0,0 +1,45 @@
+ #!/usr/bin/env ruby
+
+ require_relative '../lib/website_cloner'
+ require 'optparse'
+
+ options = {
+   max_pages: 20,
+   session_cookie: nil
+ }
+
+ OptionParser.new do |opts|
+   opts.banner = "Usage: website-cloner <url> <output_directory> [options]"
+
+   opts.on("-m", "--max-pages PAGES", Integer, "Maximum number of pages to clone (default: 20)") do |m|
+     options[:max_pages] = m
+   end
+
+   opts.on("-s", "--session-cookie COOKIE", "Session cookie for authenticated access") do |s|
+     options[:session_cookie] = s
+   end
+
+   opts.on("-h", "--help", "Prints this help") do
+     puts opts
+     exit
+   end
+ end.parse!
+
+ if ARGV.length != 2
+   puts "Usage: website-cloner <url> <output_directory> [options]"
+   puts "Run 'website-cloner --help' for more information."
+   exit 1
+ end
+
+ url = ARGV[0]
+ output_dir = ARGV[1]
+
+ begin
+   WebsiteCloner.clone(url, output_dir, max_pages: options[:max_pages], session_cookie: options[:session_cookie])
+
+   puts "\nWebsite cloned successfully!"
+ rescue StandardError => e
+   puts "\nAn error occurred during cloning: #{e.message}"
+ rescue Interrupt
+   puts "\nOK, exiting on interrupt."
+ end
lib/website_cloner/downloader.rb ADDED
@@ -0,0 +1,84 @@
+ require 'net/http'
+ require 'uri'
+ require 'fileutils'
+ require 'openssl'
+
+ module WebsiteCloner
+   class Downloader
+     attr_reader :output_dir, :base_url, :session_cookie
+
+     def initialize(base_url, output_dir, session_cookie = nil)
+       @base_url = URI.parse(base_url)
+       @output_dir = output_dir
+       @session_cookie = session_cookie
+       FileUtils.mkdir_p(@output_dir)
+       FileUtils.mkdir_p(File.join(@output_dir, 'assets'))
+       FileUtils.mkdir_p(File.join(@output_dir, 'css'))
+       FileUtils.mkdir_p(File.join(@output_dir, 'js'))
+     end
+
+     def download_page(url)
+       Utils.logger.info "Downloading page: #{url}"
+       uri = URI.parse(url)
+       http = Net::HTTP.new(uri.host, uri.port)
+       http.use_ssl = (uri.scheme == 'https')
+       http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+
+       request_path = uri.path.empty? ? '/' : uri.path
+       request_path += "?#{uri.query}" if uri.query
+
+       request = Net::HTTP::Get.new(request_path)
+       request['Cookie'] = @session_cookie if @session_cookie
+
+       response = http.request(request)
+
+       case response
+       when Net::HTTPSuccess
+         response.body
+       when Net::HTTPRedirection
+         download_page(response['location'])
+       else
+         response.error!
+       end
+     end
+
+     def download_asset(url, type)
+       Utils.logger.info "Downloading asset: #{url}"
+       uri = URI.join(@base_url, url) # URI.join already yields a URI object, so no second parse is needed
+       http = Net::HTTP.new(uri.host, uri.port)
+       http.use_ssl = (uri.scheme == 'https')
+       http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+
+       request_path = uri.path.empty? ? '/' : uri.path
+       request_path += "?#{uri.query}" if uri.query
+
+       request = Net::HTTP::Get.new(request_path)
+       request['Cookie'] = @session_cookie if @session_cookie
+
+       response = http.request(request)
+
+       case response
+       when Net::HTTPSuccess
+         content = response.body
+         filename = File.basename(uri.path).gsub(/^[0-9a-f]+_/, '')
+         filename = URI.decode_www_form_component(filename).gsub(' ', '-') # decode, then turn the decoded spaces into dashes
+         dir = case type
+               when 'css' then 'css'
+               when 'js' then 'js'
+               else 'assets'
+               end
+         path = File.join(@output_dir, dir, filename)
+         FileUtils.mkdir_p(File.dirname(path))
+
+         File.open(path, 'wb') do |file|
+           file.write(content)
+         end
+
+         "#{dir}/#{filename}"
+       else
+         Utils.logger.warn "Failed to download asset: #{url}"
+         url # Return the original URL if download fails
+       end
+     end
+   end
+ end
lib/website_cloner/parser.rb ADDED
@@ -0,0 +1,147 @@
+ require 'nokogiri'
+ require 'uri'
+
+ module WebsiteCloner
+   class Parser
+     def initialize(downloader)
+       @downloader = downloader
+       @file_mapping = {}
+     end
+
+     def parse_and_download(content, url)
+       doc = Nokogiri::HTML(content)
+       base_uri = URI.parse(url)
+
+       # Ensure the path is valid and not empty
+       path = base_uri.path.empty? || base_uri.path == '/' ? '/index.html' : base_uri.path
+
+       # Calculate the depth of the directory structure
+       depth = [path.count('/') - 1, 0].max
+       prefix = '../' * depth
+
+       new_pages = []
+
+       # Download and update image sources
+       doc.css('img').each do |img|
+         src = img['src']
+         next if src.nil? || src.empty?
+         new_src = @downloader.download_asset(src, 'image')
+         img['src'] = prefix + new_src # Add the correct prefix for assets in subdirs
+         @file_mapping[src] = new_src
+
+         # Remove srcset attribute to prevent loading from CDN
+         img.remove_attribute('srcset')
+         img.remove_attribute('imagesrcset')
+
+         # Update sizes attribute if present
+         img['sizes'] = '100vw' if img['sizes']
+       end
+
+       # Download and update stylesheet links
+       doc.css('link[rel="stylesheet"]').each do |link|
+         href = link['href']
+         next if href.nil? || href.empty?
+         new_href = @downloader.download_asset(href, 'css')
+         link['href'] = prefix + new_href # Add the correct prefix for assets in subdirs
+         @file_mapping[href] = new_href
+       end
+
+       # Download and update script sources
+       doc.css('script').each do |script|
+         src = script['src']
+         next if src.nil? || src.empty?
+         new_src = @downloader.download_asset(src, 'js')
+         script['src'] = prefix + new_src # Add the correct prefix for assets in subdirs
+         @file_mapping[src] = new_src
+       end
+
+       # Handle internal links starting with '/'
+       doc.css('a').each do |a|
+         href = a['href']
+         next if href.nil? || href.empty?
+
+         # Target only internal links that start with '/'
+         if href.start_with?('/')
+           # Add the new URL to new_pages for downloading before modification
+           new_pages << URI.join(base_uri, href).to_s
+
+           # Special handling for homepage
+           if href == '/'
+             a['href'] = prefix + 'index.html'
+           else
+             # Remove leading '/' for saving the local file
+             href.sub!(/^\//, '')
+
+             # Append '.html' if it's missing and not a file download (like .pdf)
+             href += '.html' unless href =~ /\.\w+$/
+
+             # Update the href attribute
+             a['href'] = href
+           end
+         end
+       end
+
+       # Save the updated HTML
+       save_html(doc.to_html, url)
+
+       new_pages
+     end
+
+     def organize_files
+       Dir.glob(File.join(@downloader.output_dir, '**', '*')).each do |file|
+         next if File.directory?(file)
+
+         relative_path = file.sub(@downloader.output_dir + '/', '')
+         dirname = File.dirname(relative_path)
+         basename = File.basename(relative_path)
+
+         if dirname.match?(/^[0-9a-f]+$/)
100
+ new_path = case
101
+ when new_basename.end_with?('.css')
102
+ File.join(@downloader.output_dir, 'css', new_basename.gsub(/^[0-9a-f]+_/, ''))
103
+ when new_basename.end_with?('.js')
104
+ File.join(@downloader.output_dir, 'js', new_basename.gsub(/^[0-9a-f]+_/, ''))
105
+ else
106
+ File.join(@downloader.output_dir, 'assets', new_basename.gsub(/^[0-9a-f]+_/, ''))
107
+ end
108
+
109
+ FileUtils.mv(file, new_path)
110
+ @file_mapping["/#{relative_path}"] = "#{new_path.sub(@downloader.output_dir + '/', '')}"
111
+ elsif !basename.include?('.') && !dirname.start_with?('css', 'js', 'assets')
112
+ # This is likely a subpage without an extension
113
+ new_path = "#{file}.html"
114
+ FileUtils.mv(file, new_path)
115
+ @file_mapping["/#{relative_path}"] = "#{new_path.sub(@downloader.output_dir + '/', '')}"
116
+ end
117
+ end
118
+
119
+ update_references
120
+ end
121
+
122
+ private
123
+
124
+ def save_html(content, url)
125
+ uri = URI.parse(url)
126
+ path = uri.path.empty? || uri.path == '/' ? '/index.html' : uri.path
127
+ path += '.html' unless path.end_with?('.html')
128
+ full_path = File.join(@downloader.output_dir, path)
129
+ FileUtils.mkdir_p(File.dirname(full_path))
130
+
131
+ File.open(full_path, 'w') do |file|
132
+ file.write(content)
133
+ end
134
+ end
135
+
136
+ def update_references
137
+ Dir.glob(File.join(@downloader.output_dir, '**', '*.html')).each do |html_file|
138
+ content = File.read(html_file)
139
+ @file_mapping.each do |old_path, new_path|
140
+ content.gsub!(old_path, new_path)
141
+ content.gsub!("//#{new_path}", new_path) # Remove any double slashes
142
+ end
143
+ File.write(html_file, content)
144
+ end
145
+ end
146
+ end
147
+ end
lib/website_cloner/utils.rb ADDED
@@ -0,0 +1,20 @@
+ require 'logger'
+ require 'set'
+
+ module WebsiteCloner
+   module Utils
+     def self.logger
+       @logger ||= Logger.new(STDOUT).tap do |log|
+         log.formatter = proc do |severity, datetime, progname, msg|
+           color = case severity
+                   when 'INFO' then "\e[32m" # Green
+                   when 'WARN' then "\e[33m" # Yellow
+                   when 'ERROR' then "\e[31m" # Red
+                   else "\e[0m" # Default
+                   end
+           "#{color}[#{severity}] #{msg}\e[0m\n"
+         end
+       end
+     end
+   end
+ end
lib/website_cloner.rb ADDED
@@ -0,0 +1,38 @@
+ require_relative 'website_cloner/downloader'
+ require_relative 'website_cloner/parser'
+ require_relative 'website_cloner/utils'
+ require 'set'
+
+ module WebsiteCloner
+   class Error < StandardError; end
+
+   def self.clone(url, output_dir, max_pages: 20, session_cookie: nil)
+     Utils.logger.info "Starting to clone #{url}"
+     downloader = Downloader.new(url, output_dir, session_cookie)
+     parser = Parser.new(downloader)
+
+     visited_pages = Set.new
+     pages_to_visit = [url]
+
+     while !pages_to_visit.empty? && visited_pages.size < max_pages
+       current_url = pages_to_visit.shift
+       next if visited_pages.include?(current_url)
+
+       visited_pages.add(current_url)
+       Utils.logger.info "Processing page #{visited_pages.size}/#{max_pages}: #{current_url}"
+
+       begin
+         content = downloader.download_page(current_url)
+         new_pages = parser.parse_and_download(content, current_url)
+         pages_to_visit.concat(new_pages - visited_pages.to_a)
+       rescue => e
+         Utils.logger.error "Error processing #{current_url}: #{e.message}"
+       end
+     end
+
+     Utils.logger.info "Finished cloning. Processed #{visited_pages.size} pages."
+     Utils.logger.info "Organizing files and updating references..."
+     parser.organize_files
+     Utils.logger.info "Done organizing files and updating references."
+   end
+ end
metadata ADDED
@@ -0,0 +1,49 @@
+ --- !ruby/object:Gem::Specification
+ name: website_cloner
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Bhavyansh Yadav
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2024-09-11 00:00:00.000000000 Z
+ dependencies: []
+ description: Website Cloner is a Ruby gem that allows you to create local copies of
+   websites, including all assets and linked pages. It's designed to be easy to use
+   while providing powerful features for customization.
+ email: bhavyansh001@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/website-cloner
+ - lib/website_cloner.rb
+ - lib/website_cloner/downloader.rb
+ - lib/website_cloner/parser.rb
+ - lib/website_cloner/utils.rb
+ homepage: https://rubygems.org/gems/website_cloner
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.5.18
+ signing_key:
+ specification_version: 4
+ summary: Create local copies of websites, including all assets and linked pages.
+ test_files: []
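
Usage sketch (editor's addition, not one of the gem's files): based on the bin/website-cloner script and the WebsiteCloner.clone entry point shown above, a clone run would look roughly like the following; the URL, output directory, and cookie value are placeholder examples.

  # From a checkout of the gem source (the gemspec declares no executables,
  # so the script is run directly):
  #   ruby bin/website-cloner https://example.com ./clone --max-pages 10 -s "session=abc123"

  # Or from Ruby, after installing the gem:
  require 'website_cloner'

  WebsiteCloner.clone(
    'https://example.com',            # placeholder start URL
    './clone',                        # placeholder output directory
    max_pages: 10,                    # crawl at most 10 pages (default is 20)
    session_cookie: 'session=abc123'  # optional Cookie header value for authenticated pages
  )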