wayback_machine_downloader 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 520f637efbb03d1e3ac87aadf1a937cc132f6c32
|
4
|
+
data.tar.gz: 90ec9079f5420153e1b7c149ce5aedb45bd4ba2c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6009139fbad22b7e269582d905956147df5cec565e376cbf1bdbb1125bb906fdea31c6501b3dfd9fa1b35d730444c976bcf187b8c869e8ed7db146ea155ba8fb
|
7
|
+
data.tar.gz: b4cff49e64c3ec528b544184e1426b76d1284c7f97ee9c787f370cc305b652f328c53ac48d3fc9bd9c9405b26edceb7a888e7f4a8f061ee704608e95a0642108
|
@@ -26,6 +26,10 @@ option_parser = OptionParser.new do |opts|
|
|
26
26
|
options[:to_timestamp] = t
|
27
27
|
end
|
28
28
|
|
29
|
+
opts.on("-e", "--exact_url", String, "Download only the url provied and not the full site") do |t|
|
30
|
+
options[:exact_url] = true # flag takes no argument; read as params[:exact_url] by WaybackMachineDownloader#initialize
|
31
|
+
end
|
32
|
+
|
29
33
|
opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
|
30
34
|
options[:only_filter] = t
|
31
35
|
end
|
@@ -42,11 +46,11 @@ option_parser = OptionParser.new do |opts|
|
|
42
46
|
options[:threads_count] = t
|
43
47
|
end
|
44
48
|
|
45
|
-
opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page
|
49
|
+
opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
|
46
50
|
options[:maximum_pages] = t
|
47
51
|
end
|
48
52
|
|
49
|
-
opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything
|
53
|
+
opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
|
50
54
|
options[:list] = true
|
51
55
|
end
|
52
56
|
|
@@ -58,7 +62,7 @@ end.parse!
|
|
58
62
|
if (base_url = ARGV[-1])
|
59
63
|
options[:base_url] = base_url
|
60
64
|
wayback_machine_downloader = WaybackMachineDownloader.new options
|
61
|
-
if
|
65
|
+
if options[:list]
|
62
66
|
wayback_machine_downloader.list_files
|
63
67
|
else
|
64
68
|
wayback_machine_downloader.download_files
|
@@ -14,19 +14,21 @@ class WaybackMachineDownloader
|
|
14
14
|
|
15
15
|
include ArchiveAPI
|
16
16
|
|
17
|
-
VERSION = "2.0.0"
|
17
|
+
VERSION = "2.1.0"
|
18
18
|
|
19
|
-
attr_accessor :base_url, :
|
19
|
+
attr_accessor :base_url, :exact_url, :directory,
|
20
|
+
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
21
|
+
:all, :maximum_pages, :threads_count
|
20
22
|
|
21
23
|
def initialize params
|
22
24
|
@base_url = params[:base_url]
|
25
|
+
@exact_url = params[:exact_url]
|
23
26
|
@directory = params[:directory]
|
24
27
|
@from_timestamp = params[:from_timestamp].to_i
|
25
28
|
@to_timestamp = params[:to_timestamp].to_i
|
26
29
|
@only_filter = params[:only_filter]
|
27
30
|
@exclude_filter = params[:exclude_filter]
|
28
31
|
@all = params[:all]
|
29
|
-
@list = params[:list]
|
30
32
|
@maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
|
31
33
|
@threads_count = params[:threads_count].to_i
|
32
34
|
end
|
@@ -78,18 +80,19 @@ class WaybackMachineDownloader
|
|
78
80
|
end
|
79
81
|
|
80
82
|
def get_all_snapshots_to_consider
|
81
|
-
# Note: Passing a page index parameter allow us to get more snapshots,
|
83
|
+
# Note: Passing a page index parameter allows us to get more snapshots,
|
84
|
+
# but from a less fresh index
|
82
85
|
print "Getting snapshot pages"
|
83
86
|
snapshot_list_to_consider = ""
|
84
87
|
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
|
85
88
|
print "."
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
89
|
+
unless @exact_url
|
90
|
+
@maximum_pages.times do |page_index|
|
91
|
+
snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
|
92
|
+
break if snapshot_list.empty?
|
93
|
+
snapshot_list_to_consider += snapshot_list
|
94
|
+
print "."
|
95
|
+
end
|
93
96
|
end
|
94
97
|
puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider."
|
95
98
|
puts
|
@@ -134,8 +137,10 @@ class WaybackMachineDownloader
|
|
134
137
|
end
|
135
138
|
|
136
139
|
def list_files
|
140
|
+
# retrieval produces its own output
|
141
|
+
files = get_file_list_by_timestamp
|
137
142
|
puts "["
|
138
|
-
|
143
|
+
files.each do |file|
|
139
144
|
puts file.to_json + ","
|
140
145
|
end
|
141
146
|
puts "]"
|
@@ -179,7 +184,7 @@ class WaybackMachineDownloader
|
|
179
184
|
|
180
185
|
def structure_dir_path dir_path
|
181
186
|
begin
|
182
|
-
FileUtils::mkdir_p dir_path unless File.
|
187
|
+
FileUtils::mkdir_p dir_path unless File.exist? dir_path
|
183
188
|
rescue Errno::EEXIST => e
|
184
189
|
error_to_string = e.to_s
|
185
190
|
puts "# #{error_to_string}"
|
@@ -219,7 +224,7 @@ class WaybackMachineDownloader
|
|
219
224
|
if Gem.win_platform?
|
220
225
|
file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
|
221
226
|
end
|
222
|
-
unless File.
|
227
|
+
unless File.exist? file_path
|
223
228
|
begin
|
224
229
|
structure_dir_path dir_path
|
225
230
|
open(file_path, "wb") do |file|
|
@@ -240,7 +245,7 @@ class WaybackMachineDownloader
|
|
240
245
|
rescue StandardError => e
|
241
246
|
puts "#{file_url} # #{e}"
|
242
247
|
ensure
|
243
|
-
if not @all and File.
|
248
|
+
if not @all and File.exist?(file_path) and File.size(file_path) == 0
|
244
249
|
File.delete(file_path)
|
245
250
|
puts "#{file_path} was empty and was removed."
|
246
251
|
end
|
@@ -1,15 +1,15 @@
|
|
1
1
|
module ArchiveAPI
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
def get_raw_list_from_api url, page_index
|
4
|
+
request_url = "http://web.archive.org/cdx/search/xd?url="
|
5
|
+
request_url += url
|
6
|
+
request_url += parameters_for_api page_index
|
7
7
|
|
8
8
|
open(request_url).read
|
9
|
-
|
9
|
+
end
|
10
10
|
|
11
|
-
|
12
|
-
|
11
|
+
def parameters_for_api page_index
|
12
|
+
parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
|
13
13
|
if @all
|
14
14
|
parameters += ""
|
15
15
|
else
|
@@ -60,7 +60,7 @@ module TibyBytes
|
|
60
60
|
bytes.each_index do |i|
|
61
61
|
|
62
62
|
byte = bytes[i]
|
63
|
-
|
63
|
+
_is_ascii = byte < 128
|
64
64
|
is_cont = byte > 127 && byte < 192
|
65
65
|
is_lead = byte > 191 && byte < 245
|
66
66
|
is_unused = byte > 240
|
@@ -78,7 +78,7 @@ module TibyBytes
|
|
78
78
|
# the leading byte.
|
79
79
|
begin
|
80
80
|
(1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
|
81
|
-
rescue NoMethodError
|
81
|
+
rescue NoMethodError
|
82
82
|
next
|
83
83
|
end
|
84
84
|
conts_expected = 0
|
@@ -98,7 +98,7 @@ module TibyBytes
|
|
98
98
|
end
|
99
99
|
begin
|
100
100
|
bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
101
|
-
rescue ArgumentError
|
101
|
+
rescue ArgumentError
|
102
102
|
nil
|
103
103
|
end
|
104
104
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-06-
|
11
|
+
date: 2017-06-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|