website_cloner 0.0.1
- checksums.yaml +7 -0
- data/bin/website-cloner +45 -0
- data/lib/website_cloner/downloader.rb +84 -0
- data/lib/website_cloner/parser.rb +147 -0
- data/lib/website_cloner/utils.rb +20 -0
- data/lib/website_cloner.rb +38 -0
- metadata +49 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 6ccbe12c76bec756160265a01eefbea698eb18eb178564650b975f873ac34003
+  data.tar.gz: 0d94db0f3a421e6d2446d6e0813ad8260c3ced2925d9e21c05136a9b78e61bfb
+SHA512:
+  metadata.gz: 671c26fc0b66875fd22d7bc823188eec4926a085f214d9bd5675939231e1605a8928eaceae3c3f7691e6768b06009f9666e0482781ff2063c2ea6675a1776b21
+  data.tar.gz: b69244f3a6235f4393c1dbbbabd77427cd8b8856a523808ccc97114cce71952ea250ad92a92904f29e61fc63bff547caee21854404d5718c8934680c93ce9f82
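These digests let a downloaded copy of the gem be verified before installation. A minimal sketch, assuming data.tar.gz has been extracted from the .gem archive into the current directory (the path is an assumption):

  require 'digest'

  # Compare the local archive against the SHA256 recorded in checksums.yaml above.
  expected = '0d94db0f3a421e6d2446d6e0813ad8260c3ced2925d9e21c05136a9b78e61bfb'
  actual   = Digest::SHA256.file('data.tar.gz').hexdigest
  puts(actual == expected ? 'checksum OK' : 'checksum mismatch')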
data/bin/website-cloner
ADDED
@@ -0,0 +1,45 @@
+#!/usr/bin/env ruby
+
+require_relative '../lib/website_cloner'
+require 'optparse'
+
+options = {
+  max_pages: 20,
+  session_cookie: nil
+}
+
+OptionParser.new do |opts|
+  opts.banner = "Usage: website-cloner <url> <output_directory> [options]"
+
+  opts.on("-m", "--max-pages PAGES", Integer, "Maximum number of pages to clone (default: 20)") do |m|
+    options[:max_pages] = m
+  end
+
+  opts.on("-s", "--session-cookie COOKIE", "Session cookie for authenticated access") do |s|
+    options[:session_cookie] = s
+  end
+
+  opts.on("-h", "--help", "Prints this help") do
+    puts opts
+    exit
+  end
+end.parse!
+
+if ARGV.length != 2
+  puts "Usage: website-cloner <url> <output_directory> [options]"
+  puts "Run 'website-cloner --help' for more information."
+  exit 1
+end
+
+url = ARGV[0]
+output_dir = ARGV[1]
+
+begin
+  WebsiteCloner.clone(url, output_dir, max_pages: options[:max_pages], session_cookie: options[:session_cookie])
+
+  puts "\nWebsite cloned successfully!"
+rescue StandardError => e
+  puts "\nAn error occurred during cloning: #{e.message}"
+rescue Interrupt
+  puts "\nOK, exiting on interrupt."
+end
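The executable parses --max-pages and --session-cookie, then hands both to WebsiteCloner.clone. A hypothetical invocation (URL, output directory and cookie value are placeholders):

  website-cloner https://example.com ./example-clone --max-pages 10 --session-cookie '_session_id=abc123'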
data/lib/website_cloner/downloader.rb
ADDED
@@ -0,0 +1,84 @@
+require 'net/http'
+require 'uri'
+require 'fileutils'
+require 'openssl'
+
+module WebsiteCloner
+  class Downloader
+    attr_reader :output_dir, :base_url, :session_cookie
+
+    def initialize(base_url, output_dir, session_cookie = nil)
+      @base_url = URI.parse(base_url)
+      @output_dir = output_dir
+      @session_cookie = session_cookie
+      FileUtils.mkdir_p(@output_dir)
+      FileUtils.mkdir_p(File.join(@output_dir, 'assets'))
+      FileUtils.mkdir_p(File.join(@output_dir, 'css'))
+      FileUtils.mkdir_p(File.join(@output_dir, 'js'))
+    end
+
+    def download_page(url)
+      Utils.logger.info "Downloading page: #{url}"
+      uri = URI.parse(url)
+      http = Net::HTTP.new(uri.host, uri.port)
+      http.use_ssl = (uri.scheme == 'https')
+      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+
+      request_path = uri.path.empty? ? '/' : uri.path
+      request_path += "?#{uri.query}" if uri.query
+
+      request = Net::HTTP::Get.new(request_path)
+      request['Cookie'] = @session_cookie if @session_cookie
+
+      response = http.request(request)
+
+      case response
+      when Net::HTTPSuccess
+        response.body
+      when Net::HTTPRedirection
+        download_page(response['location'])
+      else
+        response.error!
+      end
+    end
+
+    def download_asset(url, type)
+      Utils.logger.info "Downloading asset: #{url}"
+      uri = URI.parse(URI.join(@base_url, url))
+      http = Net::HTTP.new(uri.host, uri.port)
+      http.use_ssl = (uri.scheme == 'https')
+      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+
+      request_path = uri.path.empty? ? '/' : uri.path
+      request_path += "?#{uri.query}" if uri.query
+
+      request = Net::HTTP::Get.new(request_path)
+      request['Cookie'] = @session_cookie if @session_cookie
+
+      response = http.request(request)
+
+      case response
+      when Net::HTTPSuccess
+        content = response.body
+        filename = File.basename(uri.path).gsub(/^[0-9a-f]+_/, '')
+        filename = URI.decode_www_form_component(filename).gsub('%20', '-')
+        dir = case type
+              when 'css' then 'css'
+              when 'js' then 'js'
+              else 'assets'
+              end
+        path = File.join(@output_dir, dir, filename)
+        FileUtils.mkdir_p(File.dirname(path))
+
+        File.open(path, 'wb') do |file|
+          file.write(content)
+        end
+
+        "#{dir}/#{filename}"
+      else
+        Utils.logger.warn "Failed to download asset: #{url}"
+        url # Return the original URL if download fails
+      end
+    end
+  end
+end
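A minimal standalone sketch of the Downloader class; the URL and output directory are placeholder assumptions, and a network connection is needed when it runs:

  require 'website_cloner'

  # The constructor parses the base URL and creates the output directory
  # along with its assets/, css/ and js/ subdirectories.
  downloader = WebsiteCloner::Downloader.new('https://example.com', './example-clone')

  # download_page returns the response body, following redirects and
  # raising via response.error! on other HTTP statuses.
  html = downloader.download_page('https://example.com/')
  puts "fetched #{html.bytesize} bytes"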
data/lib/website_cloner/parser.rb
ADDED
@@ -0,0 +1,147 @@
+require 'nokogiri'
+require 'uri'
+
+module WebsiteCloner
+  class Parser
+    def initialize(downloader)
+      @downloader = downloader
+      @file_mapping = {}
+    end
+
+    def parse_and_download(content, url)
+      doc = Nokogiri::HTML(content)
+      base_uri = URI.parse(url)
+
+      # Ensure the path is valid and not empty
+      path = base_uri.path.empty? || base_uri.path == '/' ? '/index.html' : base_uri.path
+
+      # Calculate the depth of the directory structure
+      depth = [path.count('/') - 1, 0].max
+      prefix = '../' * depth
+
+      new_pages = []
+
+      # Download and update image sources
+      doc.css('img').each do |img|
+        src = img['src']
+        next if src.nil? || src.empty?
+        new_src = @downloader.download_asset(src, 'image')
+        img['src'] = prefix + new_src # Add the correct prefix for assets in subdirs
+        @file_mapping[src] = new_src
+
+        # Remove srcset attribute to prevent loading from CDN
+        img.remove_attribute('srcset')
+        img.remove_attribute('imagesrcset')
+
+        # Update sizes attribute if present
+        img['sizes'] = '100vw' if img['sizes']
+      end
+
+      # Download and update stylesheet links
+      doc.css('link[rel="stylesheet"]').each do |link|
+        href = link['href']
+        next if href.nil? || href.empty?
+        new_href = @downloader.download_asset(href, 'css')
+        link['href'] = prefix + new_href # Add the correct prefix for assets in subdirs
+        @file_mapping[href] = new_href
+      end
+
+      # Download and update script sources
+      doc.css('script').each do |script|
+        src = script['src']
+        next if src.nil? || src.empty?
+        new_src = @downloader.download_asset(src, 'js')
+        script['src'] = prefix + new_src # Add the correct prefix for assets in subdirs
+        @file_mapping[src] = new_src
+      end
+
+      # Handle internal links starting with '/'
+      doc.css('a').each do |a|
+        href = a['href']
+        next if href.nil? || href.empty?
+
+        # Target only internal links that start with '/'
+        if href.start_with?('/')
+          # Add the new URL to new_pages for downloading before modification
+          new_pages << URI.join(base_uri, href).to_s
+
+          # Special handling for homepage
+          if href == '/'
+            a['href'] = prefix + 'index.html'
+          else
+            # Remove leading '/' for saving the local file
+            href.sub!(/^\//, '')
+
+            # Append '.html' if it's missing and not a file download (like .pdf)
+            href += '.html' unless href =~ /\.\w+$/
+
+            # Update the href attribute
+            a['href'] = href
+          end
+        end
+      end
+
+      # Save the updated HTML
+      save_html(doc.to_html, url)
+
+      new_pages
+    end
+
+    def organize_files
+      Dir.glob(File.join(@downloader.output_dir, '**', '*')).each do |file|
+        next if File.directory?(file)
+
+        relative_path = file.sub(@downloader.output_dir + '/', '')
+        dirname = File.dirname(relative_path)
+        basename = File.basename(relative_path)
+
+        if dirname.match?(/^[0-9a-f]+$/)
+          new_basename = URI.decode_www_form_component(basename).gsub('%20', '-')
+          new_path = case
+                     when new_basename.end_with?('.css')
+                       File.join(@downloader.output_dir, 'css', new_basename.gsub(/^[0-9a-f]+_/, ''))
+                     when new_basename.end_with?('.js')
+                       File.join(@downloader.output_dir, 'js', new_basename.gsub(/^[0-9a-f]+_/, ''))
+                     else
+                       File.join(@downloader.output_dir, 'assets', new_basename.gsub(/^[0-9a-f]+_/, ''))
+                     end
+
+          FileUtils.mv(file, new_path)
+          @file_mapping["/#{relative_path}"] = "#{new_path.sub(@downloader.output_dir + '/', '')}"
+        elsif !basename.include?('.') && !dirname.start_with?('css', 'js', 'assets')
+          # This is likely a subpage without an extension
+          new_path = "#{file}.html"
+          FileUtils.mv(file, new_path)
+          @file_mapping["/#{relative_path}"] = "#{new_path.sub(@downloader.output_dir + '/', '')}"
+        end
+      end
+
+      update_references
+    end
+
+    private
+
+    def save_html(content, url)
+      uri = URI.parse(url)
+      path = uri.path.empty? || uri.path == '/' ? '/index.html' : uri.path
+      path += '.html' unless path.end_with?('.html')
+      full_path = File.join(@downloader.output_dir, path)
+      FileUtils.mkdir_p(File.dirname(full_path))
+
+      File.open(full_path, 'w') do |file|
+        file.write(content)
+      end
+    end
+
+    def update_references
+      Dir.glob(File.join(@downloader.output_dir, '**', '*.html')).each do |html_file|
+        content = File.read(html_file)
+        @file_mapping.each do |old_path, new_path|
+          content.gsub!(old_path, new_path)
+          content.gsub!("//#{new_path}", new_path) # Remove any double slashes
+        end
+        File.write(html_file, content)
+      end
+    end
+  end
+end
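A small offline sketch of parse_and_download, fed a hypothetical page that contains only an internal link (no images, stylesheets or scripts), so no assets have to be fetched:

  require 'website_cloner'

  downloader = WebsiteCloner::Downloader.new('https://example.com', './example-clone')
  parser = WebsiteCloner::Parser.new(downloader)

  html = '<html><body><a href="/about">About</a></body></html>'

  # The internal link is queued for crawling and rewritten to a local path,
  # and the page itself is written to ./example-clone/index.html.
  new_pages = parser.parse_and_download(html, 'https://example.com/')
  puts new_pages.inspect # => ["https://example.com/about"]; index.html now links to "about.html"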
data/lib/website_cloner/utils.rb
ADDED
@@ -0,0 +1,20 @@
+require 'logger'
+require 'set'
+
+module WebsiteCloner
+  module Utils
+    def self.logger
+      @logger ||= Logger.new(STDOUT).tap do |log|
+        log.formatter = proc do |severity, datetime, progname, msg|
+          color = case severity
+                  when 'INFO' then "\e[32m" # Green
+                  when 'WARN' then "\e[33m" # Yellow
+                  when 'ERROR' then "\e[31m" # Red
+                  else "\e[0m" # Default
+                  end
+          "#{color}[#{severity}] #{msg}\e[0m\n"
+        end
+      end
+    end
+  end
+end
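A short sketch of the shared logger defined above; each severity is color-coded on STDOUT:

  require 'website_cloner'

  WebsiteCloner::Utils.logger.info  'Cloning started' # printed in green as "[INFO] Cloning started"
  WebsiteCloner::Utils.logger.warn  'Asset skipped'   # printed in yellow as "[WARN] Asset skipped"
  WebsiteCloner::Utils.logger.error 'Request failed'  # printed in red as "[ERROR] Request failed"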
data/lib/website_cloner.rb
ADDED
@@ -0,0 +1,38 @@
+require_relative 'website_cloner/downloader'
+require_relative 'website_cloner/parser'
+require_relative 'website_cloner/utils'
+require 'set'
+
+module WebsiteCloner
+  class Error < StandardError; end
+
+  def self.clone(url, output_dir, max_pages: 20, session_cookie: nil)
+    Utils.logger.info "Starting to clone #{url}"
+    downloader = Downloader.new(url, output_dir, session_cookie)
+    parser = Parser.new(downloader)
+
+    visited_pages = Set.new
+    pages_to_visit = [url]
+
+    while !pages_to_visit.empty? && visited_pages.size < max_pages
+      current_url = pages_to_visit.shift
+      next if visited_pages.include?(current_url)
+
+      visited_pages.add(current_url)
+      Utils.logger.info "Processing page #{visited_pages.size}/#{max_pages}: #{current_url}"
+
+      begin
+        content = downloader.download_page(current_url)
+        new_pages = parser.parse_and_download(content, current_url)
+        pages_to_visit.concat(new_pages - visited_pages.to_a)
+      rescue => e
+        Utils.logger.error "Error processing #{current_url}: #{e.message}"
+      end
+    end
+
+    Utils.logger.info "Finished cloning. Processed #{visited_pages.size} pages."
+    Utils.logger.info "Organizing files and updating references..."
+    parser.organize_files
+    Utils.logger.info "Done organizing files and updating references."
+  end
+end
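The same crawl can be driven from Ruby instead of the executable; this sketch mirrors the call made by bin/website-cloner, with a placeholder URL, output directory and cookie value:

  require 'website_cloner'

  # Crawls up to 10 pages breadth-first, downloads their assets, then
  # reorganizes files and rewrites references.
  WebsiteCloner.clone(
    'https://example.com',
    './example-clone',
    max_pages: 10,
    session_cookie: '_session_id=abc123' # optional; omit for public sites
  )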
metadata
ADDED
@@ -0,0 +1,49 @@
+--- !ruby/object:Gem::Specification
+name: website_cloner
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Bhavyansh Yadav
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2024-09-11 00:00:00.000000000 Z
+dependencies: []
+description: Website Cloner is a Ruby gem that allows you to create local copies of
+  websites, including all assets and linked pages. It's designed to be easy to use
+  while providing powerful features for customization.
+email: bhavyansh001@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- bin/website-cloner
+- lib/website_cloner.rb
+- lib/website_cloner/downloader.rb
+- lib/website_cloner/parser.rb
+- lib/website_cloner/utils.rb
+homepage: https://rubygems.org/gems/website_cloner
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.5.18
+signing_key:
+specification_version: 4
+summary: Create local copies of websites, including all assets and linked pages.
+test_files: []
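For readers more used to a .gemspec than to serialized metadata, a hypothetical specification that would produce roughly the listing above (field values are taken from it; the file itself is a reconstruction, not shipped with the gem):

  # website_cloner.gemspec (reconstructed sketch)
  Gem::Specification.new do |spec|
    spec.name     = 'website_cloner'
    spec.version  = '0.0.1'
    spec.authors  = ['Bhavyansh Yadav']
    spec.email    = 'bhavyansh001@gmail.com'
    spec.summary  = 'Create local copies of websites, including all assets and linked pages.'
    spec.homepage = 'https://rubygems.org/gems/website_cloner'
    spec.license  = 'MIT'
    spec.files    = ['bin/website-cloner'] + Dir['lib/**/*.rb']
    spec.require_paths = ['lib']
  end

Note that dependencies is empty even though parser.rb requires nokogiri, so nokogiri has to be installed separately.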