website_cloner 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/website-cloner +45 -0
- data/lib/website_cloner/downloader.rb +84 -0
- data/lib/website_cloner/parser.rb +147 -0
- data/lib/website_cloner/utils.rb +20 -0
- data/lib/website_cloner.rb +38 -0
- metadata +49 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: 6ccbe12c76bec756160265a01eefbea698eb18eb178564650b975f873ac34003
  data.tar.gz: 0d94db0f3a421e6d2446d6e0813ad8260c3ced2925d9e21c05136a9b78e61bfb
SHA512:
  metadata.gz: 671c26fc0b66875fd22d7bc823188eec4926a085f214d9bd5675939231e1605a8928eaceae3c3f7691e6768b06009f9666e0482781ff2063c2ea6675a1776b21
  data.tar.gz: b69244f3a6235f4393c1dbbbabd77427cd8b8856a523808ccc97114cce71952ea250ad92a92904f29e61fc63bff547caee21854404d5718c8934680c93ce9f82
data/bin/website-cloner
ADDED
@@ -0,0 +1,45 @@
#!/usr/bin/env ruby

require_relative '../lib/website_cloner'
require 'optparse'

options = {
  max_pages: 20,
  session_cookie: nil
}

OptionParser.new do |opts|
  opts.banner = "Usage: website-cloner <url> <output_directory> [options]"

  opts.on("-m", "--max-pages PAGES", Integer, "Maximum number of pages to clone (default: 20)") do |m|
    options[:max_pages] = m
  end

  opts.on("-s", "--session-cookie COOKIE", "Session cookie for authenticated access") do |s|
    options[:session_cookie] = s
  end

  opts.on("-h", "--help", "Prints this help") do
    puts opts
    exit
  end
end.parse!

if ARGV.length != 2
  puts "Usage: website-cloner <url> <output_directory> [options]"
  puts "Run 'website-cloner --help' for more information."
  exit 1
end

url = ARGV[0]
output_dir = ARGV[1]

begin
  WebsiteCloner.clone(url, output_dir, max_pages: options[:max_pages], session_cookie: options[:session_cookie])

  puts "\nWebsite cloned successfully!"
rescue StandardError => e
  puts "\nAn error occurred during cloning: #{e.message}"
rescue Interrupt
  puts "\nOK, exiting on interrupt."
end
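
Based on the option parser above, a typical invocation of the executable would look like the following sketch; the URL, output directory, and cookie value are placeholders, not values taken from the package:

  website-cloner https://example.com ./example-clone --max-pages 50 --session-cookie "_session_id=abc123"

Both options are optional; omitting them falls back to the defaults shown in the script (20 pages, no cookie).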
data/lib/website_cloner/downloader.rb
ADDED
@@ -0,0 +1,84 @@
require 'net/http'
require 'uri'
require 'fileutils'
require 'openssl'

module WebsiteCloner
  class Downloader
    attr_reader :output_dir, :base_url, :session_cookie

    def initialize(base_url, output_dir, session_cookie = nil)
      @base_url = URI.parse(base_url)
      @output_dir = output_dir
      @session_cookie = session_cookie
      FileUtils.mkdir_p(@output_dir)
      FileUtils.mkdir_p(File.join(@output_dir, 'assets'))
      FileUtils.mkdir_p(File.join(@output_dir, 'css'))
      FileUtils.mkdir_p(File.join(@output_dir, 'js'))
    end

    def download_page(url)
      Utils.logger.info "Downloading page: #{url}"
      uri = URI.parse(url)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = (uri.scheme == 'https')
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE

      request_path = uri.path.empty? ? '/' : uri.path
      request_path += "?#{uri.query}" if uri.query

      request = Net::HTTP::Get.new(request_path)
      request['Cookie'] = @session_cookie if @session_cookie

      response = http.request(request)

      case response
      when Net::HTTPSuccess
        response.body
      when Net::HTTPRedirection
        download_page(response['location'])
      else
        response.error!
      end
    end

    def download_asset(url, type)
      Utils.logger.info "Downloading asset: #{url}"
      uri = URI.parse(URI.join(@base_url, url))
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = (uri.scheme == 'https')
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE

      request_path = uri.path.empty? ? '/' : uri.path
      request_path += "?#{uri.query}" if uri.query

      request = Net::HTTP::Get.new(request_path)
      request['Cookie'] = @session_cookie if @session_cookie

      response = http.request(request)

      case response
      when Net::HTTPSuccess
        content = response.body
        filename = File.basename(uri.path).gsub(/^[0-9a-f]+_/, '')
        filename = URI.decode_www_form_component(filename).gsub('%20', '-')
        dir = case type
              when 'css' then 'css'
              when 'js' then 'js'
              else 'assets'
              end
        path = File.join(@output_dir, dir, filename)
        FileUtils.mkdir_p(File.dirname(path))

        File.open(path, 'wb') do |file|
          file.write(content)
        end

        "#{dir}/#{filename}"
      else
        Utils.logger.warn "Failed to download asset: #{url}"
        url # Return the original URL if download fails
      end
    end
  end
end
data/lib/website_cloner/parser.rb
ADDED
@@ -0,0 +1,147 @@
require 'nokogiri'
require 'uri'

module WebsiteCloner
  class Parser
    def initialize(downloader)
      @downloader = downloader
      @file_mapping = {}
    end

    def parse_and_download(content, url)
      doc = Nokogiri::HTML(content)
      base_uri = URI.parse(url)

      # Ensure the path is valid and not empty
      path = base_uri.path.empty? || base_uri.path == '/' ? '/index.html' : base_uri.path

      # Calculate the depth of the directory structure
      depth = [path.count('/') - 1, 0].max
      prefix = '../' * depth

      new_pages = []

      # Download and update image sources
      doc.css('img').each do |img|
        src = img['src']
        next if src.nil? || src.empty?
        new_src = @downloader.download_asset(src, 'image')
        img['src'] = prefix + new_src # Add the correct prefix for assets in subdirs
        @file_mapping[src] = new_src

        # Remove srcset attribute to prevent loading from CDN
        img.remove_attribute('srcset')
        img.remove_attribute('imagesrcset')

        # Update sizes attribute if present
        img['sizes'] = '100vw' if img['sizes']
      end

      # Download and update stylesheet links
      doc.css('link[rel="stylesheet"]').each do |link|
        href = link['href']
        next if href.nil? || href.empty?
        new_href = @downloader.download_asset(href, 'css')
        link['href'] = prefix + new_href # Add the correct prefix for assets in subdirs
        @file_mapping[href] = new_href
      end

      # Download and update script sources
      doc.css('script').each do |script|
        src = script['src']
        next if src.nil? || src.empty?
        new_src = @downloader.download_asset(src, 'js')
        script['src'] = prefix + new_src # Add the correct prefix for assets in subdirs
        @file_mapping[src] = new_src
      end

      # Handle internal links starting with '/'
      doc.css('a').each do |a|
        href = a['href']
        next if href.nil? || href.empty?

        # Target only internal links that start with '/'
        if href.start_with?('/')
          # Add the new URL to new_pages for downloading before modification
          new_pages << URI.join(base_uri, href).to_s

          # Special handling for homepage
          if href == '/'
            a['href'] = prefix + 'index.html'
          else
            # Remove leading '/' for saving the local file
            href.sub!(/^\//, '')

            # Append '.html' if it's missing and not a file download (like .pdf)
            href += '.html' unless href =~ /\.\w+$/

            # Update the href attribute
            a['href'] = href
          end
        end
      end

      # Save the updated HTML
      save_html(doc.to_html, url)

      new_pages
    end

    def organize_files
      Dir.glob(File.join(@downloader.output_dir, '**', '*')).each do |file|
        next if File.directory?(file)

        relative_path = file.sub(@downloader.output_dir + '/', '')
        dirname = File.dirname(relative_path)
        basename = File.basename(relative_path)

        if dirname.match?(/^[0-9a-f]+$/)
          new_basename = URI.decode_www_form_component(basename).gsub('%20', '-')
          new_path = case
                     when new_basename.end_with?('.css')
                       File.join(@downloader.output_dir, 'css', new_basename.gsub(/^[0-9a-f]+_/, ''))
                     when new_basename.end_with?('.js')
                       File.join(@downloader.output_dir, 'js', new_basename.gsub(/^[0-9a-f]+_/, ''))
                     else
                       File.join(@downloader.output_dir, 'assets', new_basename.gsub(/^[0-9a-f]+_/, ''))
                     end

          FileUtils.mv(file, new_path)
          @file_mapping["/#{relative_path}"] = "#{new_path.sub(@downloader.output_dir + '/', '')}"
        elsif !basename.include?('.') && !dirname.start_with?('css', 'js', 'assets')
          # This is likely a subpage without an extension
          new_path = "#{file}.html"
          FileUtils.mv(file, new_path)
          @file_mapping["/#{relative_path}"] = "#{new_path.sub(@downloader.output_dir + '/', '')}"
        end
      end

      update_references
    end

    private

    def save_html(content, url)
      uri = URI.parse(url)
      path = uri.path.empty? || uri.path == '/' ? '/index.html' : uri.path
      path += '.html' unless path.end_with?('.html')
      full_path = File.join(@downloader.output_dir, path)
      FileUtils.mkdir_p(File.dirname(full_path))

      File.open(full_path, 'w') do |file|
        file.write(content)
      end
    end

    def update_references
      Dir.glob(File.join(@downloader.output_dir, '**', '*.html')).each do |html_file|
        content = File.read(html_file)
        @file_mapping.each do |old_path, new_path|
          content.gsub!(old_path, new_path)
          content.gsub!("//#{new_path}", new_path) # Remove any double slashes
        end
        File.write(html_file, content)
      end
    end
  end
end
data/lib/website_cloner/utils.rb
ADDED
@@ -0,0 +1,20 @@
require 'logger'
require 'set'

module WebsiteCloner
  module Utils
    def self.logger
      @logger ||= Logger.new(STDOUT).tap do |log|
        log.formatter = proc do |severity, datetime, progname, msg|
          color = case severity
                  when 'INFO' then "\e[32m"  # Green
                  when 'WARN' then "\e[33m"  # Yellow
                  when 'ERROR' then "\e[31m" # Red
                  else "\e[0m"               # Default
                  end
          "#{color}[#{severity}] #{msg}\e[0m\n"
        end
      end
    end
  end
end
data/lib/website_cloner.rb
ADDED
@@ -0,0 +1,38 @@
require_relative 'website_cloner/downloader'
require_relative 'website_cloner/parser'
require_relative 'website_cloner/utils'
require 'set'

module WebsiteCloner
  class Error < StandardError; end

  def self.clone(url, output_dir, max_pages: 20, session_cookie: nil)
    Utils.logger.info "Starting to clone #{url}"
    downloader = Downloader.new(url, output_dir, session_cookie)
    parser = Parser.new(downloader)

    visited_pages = Set.new
    pages_to_visit = [url]

    while !pages_to_visit.empty? && visited_pages.size < max_pages
      current_url = pages_to_visit.shift
      next if visited_pages.include?(current_url)

      visited_pages.add(current_url)
      Utils.logger.info "Processing page #{visited_pages.size}/#{max_pages}: #{current_url}"

      begin
        content = downloader.download_page(current_url)
        new_pages = parser.parse_and_download(content, current_url)
        pages_to_visit.concat(new_pages - visited_pages.to_a)
      rescue => e
        Utils.logger.error "Error processing #{current_url}: #{e.message}"
      end
    end

    Utils.logger.info "Finished cloning. Processed #{visited_pages.size} pages."
    Utils.logger.info "Organizing files and updating references..."
    parser.organize_files
    Utils.logger.info "Done organizing files and updating references."
  end
end
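
The entry point above can also be called from Ruby rather than through the bin/website-cloner executable. A minimal sketch, assuming the gem is installed and using placeholder arguments (the URL and directory are not values from the package):

  require 'website_cloner'

  # Crawl up to 10 pages starting from the placeholder URL and write the
  # result into ./example-clone; session_cookie is optional and left nil here.
  WebsiteCloner.clone('https://example.com', './example-clone',
                      max_pages: 10, session_cookie: nil)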
metadata
ADDED
@@ -0,0 +1,49 @@
--- !ruby/object:Gem::Specification
name: website_cloner
version: !ruby/object:Gem::Version
  version: 0.0.1
platform: ruby
authors:
- Bhavyansh Yadav
autorequire:
bindir: bin
cert_chain: []
date: 2024-09-11 00:00:00.000000000 Z
dependencies: []
description: Website Cloner is a Ruby gem that allows you to create local copies of
  websites, including all assets and linked pages. It's designed to be easy to use
  while providing powerful features for customization.
email: bhavyansh001@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- bin/website-cloner
- lib/website_cloner.rb
- lib/website_cloner/downloader.rb
- lib/website_cloner/parser.rb
- lib/website_cloner/utils.rb
homepage: https://rubygems.org/gems/website_cloner
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubygems_version: 3.5.18
signing_key:
specification_version: 4
summary: Create local copies of websites, including all assets and linked pages.
test_files: []