RubyGems - wayback_machine_downloader - Versions diffs - 2.2.1 → 2.3.0 - Mend

wayback_machine_downloader 2.2.1 → 2.3.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml +5 -5
data/bin/wayback_machine_downloader +1 -1
data/lib/wayback_machine_downloader.rb +13 -13
data/lib/wayback_machine_downloader/archive_api.rb +22 -12
data/lib/wayback_machine_downloader/tidy_bytes.rb +1 -1
metadata +6 -7

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: d037bdcdc516a9366f9d6181d63e61970f3a2ec1
-  data.tar.gz: 03eef551fbb7be1d6dfb29f05c380e517b2870f8
+SHA256:
+  metadata.gz: 57cbbb04b38525f6dd9c1a8f4022ee28ce45c76d1d26acc90076a4b8b6014b44
+  data.tar.gz: 4128b3ab753e91bea93ddebdafba133091663617e0c247c022076a8c11dfa5c2
 SHA512:
-  metadata.gz: ee25c7e833907143a08d9b6438482d1c9bc76219eb62b4ac179ef770c323b60c3956bb7c1c1bd33ce0e7f5b3cda620fda4c06258837066762541b4404fd4d2cc
-  data.tar.gz: cffca8734ae0ee449b35aae26f77032cd27bd749bbc34e925a64ea51f52a8a55c9b7fe85c5a3c2a8787715fba15f5c4138a04d4499ab170a0404d4fb0c2624d2
+  metadata.gz: bb08b6f6e9fa930b025fbf0c783476bd965e364ef46ccd2fecc8e5d0954be062b67b801acf4d168556f7d90f1d5c836a16184e371a5e5d47da7e804278d893ab
+  data.tar.gz: e750e04ab4e1f795e061f0fe91581abb60baecc7d5427d9ec8e724f931d1afba207512731f350eee7c04ceacf12b4a90823e9b3c750e3799810944430279c330

data/bin/wayback_machine_downloader CHANGED Viewed

@@ -46,7 +46,7 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end
-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time", "Default is one file at a time (ie. 20)") do |t|
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
     options[:threads_count] = t
   end

data/lib/wayback_machine_downloader.rb CHANGED Viewed

@@ -14,7 +14,7 @@ class WaybackMachineDownloader
   include ArchiveAPI
-  VERSION = "2.2.1"
+  VERSION = "2.3.0"
   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
@@ -84,7 +84,7 @@ class WaybackMachineDownloader
     # Note: Passing a page index parameter allow us to get more snapshots,
     # but from a less fresh index
     print "Getting snapshot pages"
-    snapshot_list_to_consider = ""
+    snapshot_list_to_consider = []
     snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
     print "."
     unless @exact_url
@@ -95,17 +95,15 @@ class WaybackMachineDownloader
         print "."
       end
     end
-    puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
+    puts " found #{snapshot_list_to_consider.length} snaphots to consider."
     puts
     snapshot_list_to_consider
   end
   def get_file_list_curated
     file_list_curated = Hash.new
-    get_all_snapshots_to_consider.each_line do |line|
-      next unless line.include?('/')
-      file_timestamp = line[0..13].to_i
-      file_url = line[15..-2]
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
       file_id = file_url.split('/')[3..-1].join('/')
       file_id = CGI::unescape file_id
       file_id = file_id.tidy_bytes unless file_id == ""
@@ -130,10 +128,8 @@ class WaybackMachineDownloader
   def get_file_list_all_timestamps
     file_list_curated = Hash.new
-    get_all_snapshots_to_consider.each_line do |line|
-      next unless line.include?('/')
-      file_timestamp = line[0..13].to_i
-      file_url = line[15..-2]
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
       file_id = file_url.split('/')[3..-1].join('/')
       file_id_and_timestamp = [file_timestamp, file_id].join('/')
       file_id_and_timestamp = CGI::unescape file_id_and_timestamp
@@ -176,11 +172,15 @@ class WaybackMachineDownloader
   def list_files
     # retrieval produces its own output
+    @orig_stdout = $stdout
+    $stdout = $stderr
     files = get_file_list_by_timestamp
+    $stdout = @orig_stdout
     puts "["
-    files.each do |file|
+    files[0...-1].each do |file|
       puts file.to_json + ","
     end
+    puts files[-1].to_json
     puts "]"
   end
@@ -268,7 +268,7 @@ class WaybackMachineDownloader
         structure_dir_path dir_path
         open(file_path, "wb") do |file|
           begin
-            open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
+            URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
               file.write(uri.read)
             end
           rescue OpenURI::HTTPError => e

data/lib/wayback_machine_downloader/archive_api.rb CHANGED Viewed

@@ -1,28 +1,38 @@
+require 'json'
+require 'uri'
 module ArchiveAPI
   def get_raw_list_from_api url, page_index
-    request_url = "http://web.archive.org/cdx/search/xd?url="
-    request_url += url
-    request_url += parameters_for_api page_index
+    request_url = URI("https://web.archive.org/cdx/search/xd")
+    params = [["output", "json"], ["url", url]]
+    params += parameters_for_api page_index
+    request_url.query = URI.encode_www_form(params)
-    open(request_url).read
+    begin
+      json = JSON.parse(URI(request_url).open.read)
+      if (json[0] <=> ["timestamp","original"]) == 0
+        json.shift
+      end
+      json
+    rescue JSON::ParserError
+      []
+    end
   end
   def parameters_for_api page_index
-    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
-    if @all
-      parameters += ""
-    else
-      parameters += "&filter=statuscode:200"
+    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
+    if !@all
+      parameters.push(["filter", "statuscode:200"])
     end
     if @from_timestamp and @from_timestamp != 0
-      parameters += "&from=" + @from_timestamp.to_s
+      parameters.push(["from", @from_timestamp.to_s])
     end
     if @to_timestamp and @to_timestamp != 0
-      parameters += "&to=" + @to_timestamp.to_s
+      parameters.push(["to", @to_timestamp.to_s])
     end
     if page_index
-      parameters += "&page=#{page_index}"
+      parameters.push(["page", page_index])
     end
     parameters
   end

data/lib/wayback_machine_downloader/tidy_bytes.rb CHANGED Viewed

@@ -70,7 +70,7 @@ module TibyBytes
         if is_unused || is_restricted
           bytes[i] = tidy_byte(byte)
         elsif is_cont
-          # Not expecting contination byte? Clean up. Otherwise, now expect one less.
+          # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
           conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
         else
           if conts_expected > 0

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 2.2.1
+  version: 2.3.0
 platform: ruby
 authors:
 - hartator
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-10-27 00:00:00.000000000 Z
+date: 2021-06-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -57,7 +57,7 @@ homepage: https://github.com/hartator/wayback-machine-downloader
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -72,9 +72,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.5.2
-signing_key:
+rubygems_version: 3.1.4
+signing_key:
 specification_version: 4
 summary: Download an entire website from the Wayback Machine.
 test_files: []