wayback_machine_downloader 2.2.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: d037bdcdc516a9366f9d6181d63e61970f3a2ec1
4
- data.tar.gz: 03eef551fbb7be1d6dfb29f05c380e517b2870f8
2
+ SHA256:
3
+ metadata.gz: 57cbbb04b38525f6dd9c1a8f4022ee28ce45c76d1d26acc90076a4b8b6014b44
4
+ data.tar.gz: 4128b3ab753e91bea93ddebdafba133091663617e0c247c022076a8c11dfa5c2
5
5
  SHA512:
6
- metadata.gz: ee25c7e833907143a08d9b6438482d1c9bc76219eb62b4ac179ef770c323b60c3956bb7c1c1bd33ce0e7f5b3cda620fda4c06258837066762541b4404fd4d2cc
7
- data.tar.gz: cffca8734ae0ee449b35aae26f77032cd27bd749bbc34e925a64ea51f52a8a55c9b7fe85c5a3c2a8787715fba15f5c4138a04d4499ab170a0404d4fb0c2624d2
6
+ metadata.gz: bb08b6f6e9fa930b025fbf0c783476bd965e364ef46ccd2fecc8e5d0954be062b67b801acf4d168556f7d90f1d5c836a16184e371a5e5d47da7e804278d893ab
7
+ data.tar.gz: e750e04ab4e1f795e061f0fe91581abb60baecc7d5427d9ec8e724f931d1afba207512731f350eee7c04ceacf12b4a90823e9b3c750e3799810944430279c330
@@ -46,7 +46,7 @@ option_parser = OptionParser.new do |opts|
46
46
  options[:all] = true
47
47
  end
48
48
 
49
- opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time", "Default is one file at a time (ie. 20)") do |t|
49
+ opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
50
50
  options[:threads_count] = t
51
51
  end
52
52
 
@@ -14,7 +14,7 @@ class WaybackMachineDownloader
14
14
 
15
15
  include ArchiveAPI
16
16
 
17
- VERSION = "2.2.1"
17
+ VERSION = "2.3.0"
18
18
 
19
19
  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
20
20
  :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
@@ -84,7 +84,7 @@ class WaybackMachineDownloader
84
84
  # Note: Passing a page index parameter allow us to get more snapshots,
85
85
  # but from a less fresh index
86
86
  print "Getting snapshot pages"
87
- snapshot_list_to_consider = ""
87
+ snapshot_list_to_consider = []
88
88
  snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
89
89
  print "."
90
90
  unless @exact_url
@@ -95,17 +95,15 @@ class WaybackMachineDownloader
95
95
  print "."
96
96
  end
97
97
  end
98
- puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
98
+ puts " found #{snapshot_list_to_consider.length} snaphots to consider."
99
99
  puts
100
100
  snapshot_list_to_consider
101
101
  end
102
102
 
103
103
  def get_file_list_curated
104
104
  file_list_curated = Hash.new
105
- get_all_snapshots_to_consider.each_line do |line|
106
- next unless line.include?('/')
107
- file_timestamp = line[0..13].to_i
108
- file_url = line[15..-2]
105
+ get_all_snapshots_to_consider.each do |file_timestamp, file_url|
106
+ next unless file_url.include?('/')
109
107
  file_id = file_url.split('/')[3..-1].join('/')
110
108
  file_id = CGI::unescape file_id
111
109
  file_id = file_id.tidy_bytes unless file_id == ""
@@ -130,10 +128,8 @@ class WaybackMachineDownloader
130
128
 
131
129
  def get_file_list_all_timestamps
132
130
  file_list_curated = Hash.new
133
- get_all_snapshots_to_consider.each_line do |line|
134
- next unless line.include?('/')
135
- file_timestamp = line[0..13].to_i
136
- file_url = line[15..-2]
131
+ get_all_snapshots_to_consider.each do |file_timestamp, file_url|
132
+ next unless file_url.include?('/')
137
133
  file_id = file_url.split('/')[3..-1].join('/')
138
134
  file_id_and_timestamp = [file_timestamp, file_id].join('/')
139
135
  file_id_and_timestamp = CGI::unescape file_id_and_timestamp
@@ -176,11 +172,15 @@ class WaybackMachineDownloader
176
172
 
177
173
  def list_files
178
174
  # retrieval produces its own output
175
+ @orig_stdout = $stdout
176
+ $stdout = $stderr
179
177
  files = get_file_list_by_timestamp
178
+ $stdout = @orig_stdout
180
179
  puts "["
181
- files.each do |file|
180
+ files[0...-1].each do |file|
182
181
  puts file.to_json + ","
183
182
  end
183
+ puts files[-1].to_json
184
184
  puts "]"
185
185
  end
186
186
 
@@ -268,7 +268,7 @@ class WaybackMachineDownloader
268
268
  structure_dir_path dir_path
269
269
  open(file_path, "wb") do |file|
270
270
  begin
271
- open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
271
+ URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
272
272
  file.write(uri.read)
273
273
  end
274
274
  rescue OpenURI::HTTPError => e
@@ -1,28 +1,38 @@
1
+ require 'json'
2
+ require 'uri'
3
+
1
4
  module ArchiveAPI
2
5
 
3
6
  def get_raw_list_from_api url, page_index
4
- request_url = "http://web.archive.org/cdx/search/xd?url="
5
- request_url += url
6
- request_url += parameters_for_api page_index
7
+ request_url = URI("https://web.archive.org/cdx/search/xd")
8
+ params = [["output", "json"], ["url", url]]
9
+ params += parameters_for_api page_index
10
+ request_url.query = URI.encode_www_form(params)
7
11
 
8
- open(request_url).read
12
+ begin
13
+ json = JSON.parse(URI(request_url).open.read)
14
+ if (json[0] <=> ["timestamp","original"]) == 0
15
+ json.shift
16
+ end
17
+ json
18
+ rescue JSON::ParserError
19
+ []
20
+ end
9
21
  end
10
22
 
11
23
  def parameters_for_api page_index
12
- parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
13
- if @all
14
- parameters += ""
15
- else
16
- parameters += "&filter=statuscode:200"
24
+ parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
25
+ if !@all
26
+ parameters.push(["filter", "statuscode:200"])
17
27
  end
18
28
  if @from_timestamp and @from_timestamp != 0
19
- parameters += "&from=" + @from_timestamp.to_s
29
+ parameters.push(["from", @from_timestamp.to_s])
20
30
  end
21
31
  if @to_timestamp and @to_timestamp != 0
22
- parameters += "&to=" + @to_timestamp.to_s
32
+ parameters.push(["to", @to_timestamp.to_s])
23
33
  end
24
34
  if page_index
25
- parameters += "&page=#{page_index}"
35
+ parameters.push(["page", page_index])
26
36
  end
27
37
  parameters
28
38
  end
@@ -70,7 +70,7 @@ module TibyBytes
70
70
  if is_unused || is_restricted
71
71
  bytes[i] = tidy_byte(byte)
72
72
  elsif is_cont
73
- # Not expecting contination byte? Clean up. Otherwise, now expect one less.
73
+ # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
74
74
  conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
75
75
  else
76
76
  if conts_expected > 0
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.1
4
+ version: 2.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-10-27 00:00:00.000000000 Z
11
+ date: 2021-06-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -57,7 +57,7 @@ homepage: https://github.com/hartator/wayback-machine-downloader
57
57
  licenses:
58
58
  - MIT
59
59
  metadata: {}
60
- post_install_message:
60
+ post_install_message:
61
61
  rdoc_options: []
62
62
  require_paths:
63
63
  - lib
@@ -72,9 +72,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
72
72
  - !ruby/object:Gem::Version
73
73
  version: '0'
74
74
  requirements: []
75
- rubyforge_project:
76
- rubygems_version: 2.5.2
77
- signing_key:
75
+ rubygems_version: 3.1.4
76
+ signing_key:
78
77
  specification_version: 4
79
78
  summary: Download an entire website from the Wayback Machine.
80
79
  test_files: []