RubyGems - wayback_machine_downloader_straw - Versions diffs - 2.4.5 → 2.4.7 - Mend

wayback_machine_downloader_straw 2.4.5 → 2.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/bin/wayback_machine_downloader +127 -111
data/lib/wayback_machine_downloader/archive_api.rb +85 -61
data/lib/wayback_machine_downloader/page_requisites.rb +32 -32
data/lib/wayback_machine_downloader/subdom_processor.rb +237 -237
data/lib/wayback_machine_downloader/tidy_bytes.rb +77 -77
data/lib/wayback_machine_downloader/to_regex.rb +106 -106
data/lib/wayback_machine_downloader/url_rewrite.rb +84 -84
data/lib/wayback_machine_downloader.rb +1322 -1158
metadata +5 -5

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c5ba50bde9b0306e043eed8151b12f37f603f5cfd73013e53260543f7fc134a5
-  data.tar.gz: d3dbc1a0f6f894547fb39e56193c967dbf99685a8fa4d3cecaeaff62070aab4c
+  metadata.gz: 558d3187ee31faeadb08cf83e32a87307ae9d55a3327206598f27a78fb715e08
+  data.tar.gz: 9845999e0e618afde419869bb01b04277aca318aa80b238feb8252540fc16315
 SHA512:
-  metadata.gz: 854ec2ccbe2daf620178397bf2c620ed8cf01ca57f175cbb9a2c8e7a057b1495a382ccae64836d75cf08c9a7d5a8ab638859d4c7d34584dcfc51f5eda8b7e5b2
-  data.tar.gz: 1ed3f13c7aadcb097a174c7870730df0920c7b08e9476b786d287dfd2a3e29a50b690ea2fb16b0d762dac726648356788b4122e674db44e7c5a49ffc541f4098
+  metadata.gz: b323c1065ea1ab1d3c5909458cae726462ba1b88fd89effe8cd1efbdd1301d2022363c56b191f8ddbe28c0143fe87dfc94ca847865bb08e3564e29ba36f231a4
+  data.tar.gz: 00c4e775ee05e176e1048d6e5ddd9ddc436edaf9bf7fea61dbab33480e4adc499ea8a3a584e591858aad3a4c2f10096a3b6eb1ff6c8146f26ba6a3d42f104e32

data/bin/wayback_machine_downloader CHANGED Viewed

@@ -1,111 +1,127 @@
-#!/usr/bin/env ruby
-require_relative '../lib/wayback_machine_downloader'
-require 'optparse'
-require 'pp'
-options = {}
-option_parser = OptionParser.new do |opts|
-  opts.banner = "Usage: wayback_machine_downloader http://example.com"
-  opts.separator ""
-  opts.separator "Download an entire website from the Wayback Machine."
-  opts.separator ""
-  opts.separator "Optional options:"
-  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
-    options[:directory] = t
-  end
-  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
-    options[:all_timestamps] = true
-  end
-  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
-    options[:from_timestamp] = t
-  end
-  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20100916231334)") do |t|
-    options[:to_timestamp] = t
-  end
-  opts.on("-e", "--exact-url", "Download only the url provied and not the full site") do |t|
-    options[:exact_url] = t
-  end
-  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
-    options[:only_filter] = t
-  end
-  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
-    options[:exclude_filter] = t
-  end
-  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
-    options[:all] = true
-  end
-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
-    options[:threads_count] = t
-  end
-  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
-    options[:maximum_pages] = t
-  end
-  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
-    options[:list] = true
-  end
-  opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
-    options[:rewritten] = true
-  end
-  opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
-    options[:rewrite] = true
-  end
-  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
-    options[:reset] = true
-  end
-  opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
-    options[:keep] = true
-  end
-  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
-    options[:max_retries] = t
-  end
-  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
-    options[:recursive_subdomains] = true
-  end
-  opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
-    options[:subdomain_depth] = t
-  end
-  opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
-    options[:page_requisites] = true
-  end
-  opts.on("-v", "--version", "Display version") do |t|
-    options[:version] = t
-  end
-end.parse!
-if (base_url = ARGV[-1])
-  options[:base_url] = base_url
-  wayback_machine_downloader = WaybackMachineDownloader.new options
-  if options[:list]
-    wayback_machine_downloader.list_files
-  else
-    wayback_machine_downloader.download_files
-  end
-elsif options[:version]
-  puts WaybackMachineDownloader::VERSION
-else
-  puts "You need to specify a website to backup. (e.g., http://example.com)"
-  puts "Run `wayback_machine_downloader --help` for more help."
-end
+#!/usr/bin/env ruby
+$stdout.sync = true
+require_relative '../lib/wayback_machine_downloader'
+require 'optparse'
+require 'pp'
+options = {}
+option_parser = OptionParser.new do |opts|
+  opts.banner = "Usage: wayback_machine_downloader http://example.com"
+  opts.separator ""
+  opts.separator "Download an entire website from the Wayback Machine."
+  opts.separator ""
+  opts.separator "Optional options:"
+  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
+    options[:directory] = t
+  end
+  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+    options[:all_timestamps] = true
+  end
+  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
+    options[:from_timestamp] = t
+  end
+  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20100916231334)") do |t|
+    options[:to_timestamp] = t
+  end
+  opts.on("-e", "--exact-url", "Download only the url provied and not the full site") do |t|
+    options[:exact_url] = t
+  end
+  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+    options[:only_filter] = t
+  end
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+    options[:exclude_filter] = t
+  end
+  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
+    options[:all] = true
+  end
+  opts.on("--keep-duplicates", "Do not collapse duplicate CDX captures by digest") do |t|
+    options[:keep_duplicates] = true
+  end
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
+    options[:threads_count] = t
+  end
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
+    options[:maximum_pages] = t
+  end
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
+    options[:list] = true
+  end
+  opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
+    options[:rewritten] = true
+  end
+  opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
+    options[:rewrite] = true
+  end
+  opts.on("--local-only", "Only rewrite links in an already downloaded directory, doesn't download anything") do |t|
+    options[:local_only] = true
+  end
+  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
+    options[:reset] = true
+  end
+  opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
+    options[:keep] = true
+  end
+  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
+    options[:max_retries] = t
+  end
+  opts.on("--snapshot-at TIMESTAMP", Integer, "Build a composite snapshot at this timestamp") do |t|
+    options[:snapshot_at] = t
+  end
+  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
+    options[:recursive_subdomains] = true
+  end
+  opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
+    options[:subdomain_depth] = t
+  end
+  opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
+    options[:page_requisites] = true
+  end
+  opts.on("-v", "--version", "Display version") do |t|
+    options[:version] = t
+  end
+end.parse!
+if (base_url = ARGV[-1])
+  options[:base_url] = base_url
+  wayback_machine_downloader = WaybackMachineDownloader.new options
+  if options[:local_only]
+  wayback_machine_downloader.rewrite_local_files
+  elsif options[:list]
+    wayback_machine_downloader.list_files
+  else
+    wayback_machine_downloader.download_files
+  end
+elsif options[:version]
+  puts WaybackMachineDownloader::VERSION
+else
+  puts "You need to specify a website to backup. (e.g., http://example.com)"
+  puts "Run `wayback_machine_downloader --help` for more help."
+end

data/lib/wayback_machine_downloader/archive_api.rb CHANGED Viewed

@@ -1,61 +1,85 @@
-require 'json'
-require 'uri'
-module ArchiveAPI
-  def get_raw_list_from_api(url, page_index, http)
-    # Automatically append /* if the URL doesn't contain a path after the domain
-    # This is a workaround for an issue with the API and *some* domains.
-    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
-    # But don't do this when exact_url flag is set
-    if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
-      url = "#{url}/*"
-    end
-    request_url = URI("https://web.archive.org/cdx/search/cdx")
-    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
-    request_url.query = URI.encode_www_form(params)
-    retries = 0
-    max_retries = (@max_retries || 3)
-    delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
-    begin
-      response = http.get(request_url)
-      body = response.body.to_s.strip
-      return [] if body.empty?
-      json = JSON.parse(body)
-      # Check if the response contains the header ["timestamp", "original"]
-      json.shift if json.first == ["timestamp", "original"]
-      json
-    rescue JSON::ParserError => e
-      warn "Failed to parse JSON from API for #{url}: #{e.message}"
-      []
-    rescue Net::ReadTimeout, Net::OpenTimeout => e
-      if retries < max_retries
-        retries += 1
-        warn "Timeout talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
-        sleep(delay * retries)
-        retry
-      else
-        warn "Giving up on Wayback CDX API for #{url} after #{max_retries} timeouts."
-        []
-      end
-    rescue StandardError => e
-      # treat any other transient-ish error similarly, though without retries for now
-      warn "Error fetching CDX data for #{url}: #{e.message}"
-      []
-    end
-  end
-  def parameters_for_api(page_index)
-    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
-    parameters.push(["filter", "statuscode:200"]) unless @all
-    parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
-    parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
-    parameters.push(["page", page_index]) if page_index
-    parameters
-  end
-end
+require 'json'
+require 'uri'
+module ArchiveAPI
+  def get_raw_list_from_api(url, page_index, http)
+    # Automatically append /* for host-only URLs
+    # This is a workaround for an issue with the API and *some* domains.
+    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
+    # But don't do this when exact_url flag is set, and never append twice
+    if url && !@exact_url
+      normalized_url = url.to_s
+      has_wildcard = normalized_url.include?('*')
+      host_and_rest = normalized_url
+        .sub(/\Ahttps?:\/\//i, '')
+        .split(/[?#]/, 2)
+        .first
+      has_path = host_and_rest.include?('/')
+      unless has_wildcard || has_path
+        url = "#{normalized_url}/*"
+      end
+    end
+    request_url = URI("https://web.archive.org/cdx/search/cdx")
+    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
+    request_url.query = URI.encode_www_form(params)
+    retries = 0
+    max_retries = (@max_retries || 3)
+    delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
+    begin
+      request = Net::HTTP::Get.new(request_url)
+      request["User-Agent"] = "wmd-straw/#{WaybackMachineDownloader::VERSION}"
+      request["Connection"] = "keep-alive"
+      request["Accept-Encoding"] = "gzip"
+      response = http.request(request)
+      case response.code.to_i
+      when 200
+        body = if response['content-encoding'] == 'gzip'
+          Zlib::GzipReader.new(StringIO.new(response.body)).read
+        else
+          response.body.to_s.strip
+        end
+        return [] if body.empty?
+        begin
+          json = JSON.parse(body)
+          # check if the response contains the header ["timestamp", "original"]
+          json.shift if json.first == ["timestamp", "original"]
+          json
+        rescue JSON::ParserError => e
+          raise "Malformed JSON response: #{e.message}"
+        end
+      when 429, 500, 502, 503, 504
+        raise "Server error #{response.code}: #{response.message}"
+      else
+        warn "Unexpected API response #{response.code} for #{url}"
+        []
+      end
+    rescue Net::ReadTimeout, Net::OpenTimeout, StandardError => e
+      if retries < max_retries
+        retries += 1
+        warn "Error talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
+        sleep(delay * retries)
+        retry
+      else
+        warn "Giving up on Wayback CDX API for #{url} after #{max_retries} attempts. (Last error: #{e.message})"
+        []
+      end
+    end
+  end
+  def parameters_for_api(page_index)
+    parameters = [["fl", "timestamp,original"], ["gzip", "true"]]
+    parameters.push(["collapse", "digest"]) unless @keep_duplicates || @all_timestamps
+    parameters.push(["filter", "statuscode:2..|30[12378]"]) unless @all
+    parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
+    parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
+    parameters.push(["page", page_index]) if page_index
+    parameters
+  end
+end

data/lib/wayback_machine_downloader/page_requisites.rb CHANGED Viewed

@@ -1,33 +1,33 @@
-module PageRequisites
-  # regex to find links in href, src, url(), and srcset
-  # this ignores data: URIs, mailto:, and anchors
-  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
-  def self.extract(html_content)
-    assets = []
-    html_content.scan(ASSET_REGEX) do |match|
-      # match is an array of capture groups; find the one that matched
-      url = match.compact.first
-      next unless url
-      # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
-      if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
-        url.split(',').each do |src_def|
-          src_url = src_def.strip.split(' ').first
-          assets << src_url if valid_asset?(src_url)
-        end
-      else
-        assets << url if valid_asset?(url)
-      end
-    end
-    assets.uniq
-  end
-  def self.valid_asset?(url)
-    return false if url.strip.empty?
-    return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
-    true
-  end
+module PageRequisites
+  # regex to find links in href, src, url(), and srcset
+  # this ignores data: URIs, mailto:, and anchors
+  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
+  def self.extract(html_content)
+    assets = []
+    html_content.scan(ASSET_REGEX) do |match|
+      # match is an array of capture groups; find the one that matched
+      url = match.compact.first
+      next unless url
+      # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
+      if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
+        url.split(',').each do |src_def|
+          src_url = src_def.strip.split(' ').first
+          assets << src_url if valid_asset?(src_url)
+        end
+      else
+        assets << url if valid_asset?(url)
+      end
+    end
+    assets.uniq
+  end
+  def self.valid_asset?(url)
+    return false if url.strip.empty?
+    return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
+    true
+  end
 end