wayback_machine_downloader_straw 2.4.5 → 2.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +123 -111
- data/lib/wayback_machine_downloader/archive_api.rb +74 -61
- data/lib/wayback_machine_downloader/page_requisites.rb +32 -32
- data/lib/wayback_machine_downloader/subdom_processor.rb +237 -237
- data/lib/wayback_machine_downloader/tidy_bytes.rb +77 -77
- data/lib/wayback_machine_downloader/to_regex.rb +106 -106
- data/lib/wayback_machine_downloader/url_rewrite.rb +84 -84
- data/lib/wayback_machine_downloader.rb +1244 -1158
- metadata +3 -3
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e52df092b59b0eec27b390b5b00fcfc17fab271acd6cd9df774912f57cfc4dc1
+  data.tar.gz: ce170b42caad7e8136b07c2aa5cb6e751f57dd64bd40c0addcd42a31798d0047
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 558e9cdfc3d7d4d2081ccb49b12a96bdb64b7768697eb0a1b9a431ed1ad3017ce894975e046a6e50766928c3863797715c7b45d013b6ab7ad78bca59ea86c6d0
+  data.tar.gz: af3064f1489d32cf078fd5d87d2773700e9dfa498075f089029e0e7ec47c500c7815e84d51f426bb6fc3067bf02c9a9404da3b6f74d263c99b4ae96fc32dab35
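These published values can be checked against a local copy: a `.gem` file is a tar archive containing `metadata.gz` and `data.tar.gz`, so `tar -xf wayback_machine_downloader_straw-2.4.6.gem` extracts both. A minimal Ruby sketch (the local path is an assumption about where you extracted the archive):

```ruby
# Verify the extracted data.tar.gz against the SHA256 published in checksums.yaml.
require 'digest'

expected = "ce170b42caad7e8136b07c2aa5cb6e751f57dd64bd40c0addcd42a31798d0047"
actual   = Digest::SHA256.file("data.tar.gz").hexdigest # hypothetical local path

puts(actual == expected ? "data.tar.gz: OK" : "data.tar.gz: MISMATCH")
```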
data/bin/wayback_machine_downloader CHANGED

@@ -1,111 +1,123 @@
-#!/usr/bin/env ruby
-[old lines 2-111 not preserved in this extract; only fragments such as "opts.", "opts.separator \"\"", and "end" survive]
+#!/usr/bin/env ruby
+
+$stdout.sync = true
+
+require_relative '../lib/wayback_machine_downloader'
+require 'optparse'
+require 'pp'
+
+options = {}
+option_parser = OptionParser.new do |opts|
+  opts.banner = "Usage: wayback_machine_downloader http://example.com"
+
+  opts.separator ""
+  opts.separator "Download an entire website from the Wayback Machine."
+
+  opts.separator ""
+  opts.separator "Optional options:"
+
+  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
+    options[:directory] = t
+  end
+
+  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+    options[:all_timestamps] = true
+  end
+
+  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
+    options[:from_timestamp] = t
+  end
+
+  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20100916231334)") do |t|
+    options[:to_timestamp] = t
+  end
+
+  opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
+    options[:exact_url] = t
+  end
+
+  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+    options[:only_filter] = t
+  end
+
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+    options[:exclude_filter] = t
+  end
+
+  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
+    options[:all] = true
+  end
+
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
+    options[:threads_count] = t
+  end
+
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
+    options[:maximum_pages] = t
+  end
+
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
+    options[:list] = true
+  end
+
+  opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
+    options[:rewritten] = true
+  end
+
+  opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
+    options[:rewrite] = true
+  end
+
+  opts.on("--local-only", "Only rewrite links in an already downloaded directory, doesn't download anything") do |t|
+    options[:local_only] = true
+  end
+
+  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
+    options[:reset] = true
+  end
+
+  opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
+    options[:keep] = true
+  end
+
+  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
+    options[:max_retries] = t
+  end
+
+  opts.on("--snapshot-at TIMESTAMP", Integer, "Build a composite snapshot at this timestamp") do |t|
+    options[:snapshot_at] = t
+  end
+
+  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
+    options[:recursive_subdomains] = true
+  end
+
+  opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
+    options[:subdomain_depth] = t
+  end
+
+  opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
+    options[:page_requisites] = true
+  end
+
+  opts.on("-v", "--version", "Display version") do |t|
+    options[:version] = t
+  end
+end.parse!
+
+if (base_url = ARGV[-1])
+  options[:base_url] = base_url
+  wayback_machine_downloader = WaybackMachineDownloader.new options
+  if options[:local_only]
+    wayback_machine_downloader.rewrite_local_files
+  elsif options[:list]
+    wayback_machine_downloader.list_files
+  else
+    wayback_machine_downloader.download_files
+  end
+elsif options[:version]
+  puts WaybackMachineDownloader::VERSION
+else
+  puts "You need to specify a website to backup. (e.g., http://example.com)"
+  puts "Run `wayback_machine_downloader --help` for more help."
+end
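For illustration, here is a standalone sketch of the parse-then-dispatch shape the executable uses: OptionParser#parse! consumes the flags it recognizes from ARGV, and whatever remains at ARGV[-1] is treated as the base URL. Names here are demo-only, not the gem's code:

```ruby
require 'optparse'

options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: demo [options] URL"
  # one representative flag; the real script registers about twenty of these
  opts.on("-l", "--list", "List instead of downloading") { options[:list] = true }
end.parse! # mutates ARGV in place, removing recognized flags

if (base_url = ARGV[-1])
  puts(options[:list] ? "would list #{base_url}" : "would download #{base_url}")
else
  puts "You need to specify a URL."
end
```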
data/lib/wayback_machine_downloader/archive_api.rb CHANGED

@@ -1,61 +1,74 @@
-require 'json'
-require 'uri'
-
-module ArchiveAPI
-
-  def get_raw_list_from_api(url, page_index, http)
-    # Automatically append /* if the URL doesn't contain a path after the domain
-    # This is a workaround for an issue with the API and *some* domains.
-    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
-    # But don't do this when exact_url flag is set
-    if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
-      url = "#{url}/*"
-    end
-
-    request_url = URI("https://web.archive.org/cdx/search/cdx")
-    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
-    request_url.query = URI.encode_www_form(params)
-
-    retries = 0
-    max_retries = (@max_retries || 3)
-    delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
-
-    begin
-[old lines 24-60 not preserved in this extract]
-end
+require 'json'
+require 'uri'
+
+module ArchiveAPI
+
+  def get_raw_list_from_api(url, page_index, http)
+    # Automatically append /* if the URL doesn't contain a path after the domain
+    # This is a workaround for an issue with the API and *some* domains.
+    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
+    # But don't do this when exact_url flag is set
+    if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
+      url = "#{url}/*"
+    end
+
+    request_url = URI("https://web.archive.org/cdx/search/cdx")
+    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
+    request_url.query = URI.encode_www_form(params)
+
+    retries = 0
+    max_retries = (@max_retries || 3)
+    delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
+
+    begin
+      request = Net::HTTP::Get.new(request_url)
+      request["User-Agent"] = "wmd-straw/#{WaybackMachineDownloader::VERSION}"
+      request["Connection"] = "keep-alive"
+      request["Accept-Encoding"] = "gzip"
+      response = http.request(request)
+
+      case response.code.to_i
+      when 200
+        body = if response['content-encoding'] == 'gzip'
+          Zlib::GzipReader.new(StringIO.new(response.body)).read
+        else
+          response.body.to_s.strip
+        end
+        return [] if body.empty?
+        begin
+          json = JSON.parse(body)
+          # check if the response contains the header ["timestamp", "original"]
+          json.shift if json.first == ["timestamp", "original"]
+          json
+        rescue JSON::ParserError => e
+          raise "Malformed JSON response: #{e.message}"
+        end
+      when 429, 500, 502, 503, 504
+        raise "Server error #{response.code}: #{response.message}"
+      else
+        warn "Unexpected API response #{response.code} for #{url}"
+        []
+      end
+    rescue Net::ReadTimeout, Net::OpenTimeout, StandardError => e
+      if retries < max_retries
+        retries += 1
+        warn "Error talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
+        sleep(delay * retries)
+        retry
+      else
+        warn "Giving up on Wayback CDX API for #{url} after #{max_retries} attempts. (Last error: #{e.message})"
+        []
+      end
+    end
+  end
+
+  def parameters_for_api(page_index)
+    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "true"]]
+    parameters.push(["filter", "statuscode:200"]) unless @all
+    parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
+    parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
+    parameters.push(["page", page_index]) if page_index
+    parameters
+  end
+
+end
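Two things stand out in the rewritten method: gzip-encoded CDX responses are transparently inflated, and transient failures are retried with a linearly growing pause (sleep(delay * retries)). A standalone sketch of that retry pattern, with hypothetical names; unlike the method above, it re-raises once retries are exhausted rather than returning []:

```ruby
# Retry a block up to max_retries times, sleeping delay * attempt between tries.
def with_retries(max_retries: 3, delay: 2)
  retries = 0
  begin
    yield
  rescue StandardError => e
    raise if retries >= max_retries
    retries += 1
    warn "attempt #{retries}/#{max_retries} failed (#{e.class}: #{e.message}); sleeping #{delay * retries}s"
    sleep(delay * retries)
    retry
  end
end

# Usage: the block is re-run on any StandardError until it succeeds or gives up.
with_retries { raise "flaky" if rand < 0.5; :ok }
```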
data/lib/wayback_machine_downloader/page_requisites.rb CHANGED

@@ -1,33 +1,33 @@
-module PageRequisites
-  # regex to find links in href, src, url(), and srcset
-  # this ignores data: URIs, mailto:, and anchors
-  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
-
-  def self.extract(html_content)
-    assets = []
-
-    html_content.scan(ASSET_REGEX) do |match|
-      # match is an array of capture groups; find the one that matched
-      url = match.compact.first
-      next unless url
-
-      # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
-      if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
-        url.split(',').each do |src_def|
-          src_url = src_def.strip.split(' ').first
-          assets << src_url if valid_asset?(src_url)
-        end
-      else
-        assets << url if valid_asset?(url)
-      end
-    end
-
-    assets.uniq
-  end
-
-  def self.valid_asset?(url)
-    return false if url.strip.empty?
-    return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
-    true
-  end
+module PageRequisites
+  # regex to find links in href, src, url(), and srcset
+  # this ignores data: URIs, mailto:, and anchors
+  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
+
+  def self.extract(html_content)
+    assets = []
+
+    html_content.scan(ASSET_REGEX) do |match|
+      # match is an array of capture groups; find the one that matched
+      url = match.compact.first
+      next unless url
+
+      # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
+      if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
+        url.split(',').each do |src_def|
+          src_url = src_def.strip.split(' ').first
+          assets << src_url if valid_asset?(src_url)
+        end
+      else
+        assets << url if valid_asset?(url)
+      end
+    end
+
+    assets.uniq
+  end
+
+  def self.valid_asset?(url)
+    return false if url.strip.empty?
+    return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
+    true
+  end
 end
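As a usage example (assuming the module above has been loaded, e.g. via require_relative): extract pulls URLs out of href/src/url()/srcset attributes, expands srcset candidate lists, and drops data:, mailto:, anchor, and javascript: targets:

```ruby
html = <<~HTML
  <link href="style.css" rel="stylesheet">
  <img src="logo.png" srcset="logo.png 1x, logo@2x.png 2x">
  <a href="mailto:someone@example.com">mail</a>
  <div style="background: url('bg.jpg')"></div>
HTML

p PageRequisites.extract(html)
# => ["style.css", "logo.png", "logo@2x.png", "bg.jpg"]
```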