wayback_machine_downloader_hhr 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 89aa753a924055b41a371b5c616158dc5b65dfa63b136fff078588d839949f64
+  data.tar.gz: be143940de3f24c545a8bf202b1fb28f601b124f69927213e834b49bada36cf3
+SHA512:
+  metadata.gz: adf23257485832a2e6c4ccc443cf43583e59851e39d2e474bbad097ff9332f71ec63e1e0ade769b72485ca6e38234fac5f18b62ba3ca9858cf6bd46ebc1a4835
+  data.tar.gz: 14779e8b3bc933186d33671047411a10bdb27f2b04e311013b9c630849b07393251ee6db6445419cea71ad8c012bd1fc81aa2b94f6ad51129fef9702e8d4fa42
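The SHA-256 and SHA-512 digests above cover the metadata.gz and data.tar.gz members of the published .gem archive. As a minimal sketch (not part of the gem; it assumes the package has been fetched locally as wayback_machine_downloader_hhr-2.3.2.gem and that checksums.yaml sits alongside it), the SHA-256 values can be re-checked with Ruby's standard library:

require 'digest'
require 'yaml'
require 'rubygems/package'

# checksums.yaml parses to { "SHA256" => { "metadata.gz" => "...", ... }, "SHA512" => { ... } }
expected = YAML.safe_load(File.read('checksums.yaml'))

# A .gem file is a plain tar archive whose members include metadata.gz and data.tar.gz.
File.open('wayback_machine_downloader_hhr-2.3.2.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      actual = Digest::SHA256.hexdigest(entry.read)
      puts "#{entry.full_name}: #{actual == expected['SHA256'][entry.full_name] ? 'OK' : 'MISMATCH'}"
    end
  end
end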
bin/wayback_machine_downloader
ADDED
@@ -0,0 +1,79 @@
+#!/usr/bin/env ruby
+
+require_relative '../lib/wayback_machine_downloader'
+require 'optparse'
+require 'pp'
+
+options = {}
+option_parser = OptionParser.new do |opts|
+  opts.banner = "Usage: wayback_machine_downloader http://example.com"
+
+  opts.separator ""
+  opts.separator "Download an entire website from the Wayback Machine."
+
+  opts.separator ""
+  opts.separator "Optional options:"
+
+  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
+    options[:directory] = t
+  end
+
+  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+    options[:all_timestamps] = true
+  end
+
+  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
+    options[:from_timestamp] = t
+  end
+
+  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20100916231334)") do |t|
+    options[:to_timestamp] = t
+  end
+
+  opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
+    options[:exact_url] = t
+  end
+
+  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+    options[:only_filter] = t
+  end
+
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+    options[:exclude_filter] = t
+  end
+
+  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
+    options[:all] = true
+  end
+
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
+    options[:threads_count] = t
+  end
+
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
+    options[:maximum_pages] = t
+  end
+
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
+    options[:list] = true
+  end
+
+  opts.on("-v", "--version", "Display version") do |t|
+    options[:version] = t
+  end
+end.parse!
+
+if (base_url = ARGV[-1])
+  options[:base_url] = base_url
+  wayback_machine_downloader = WaybackMachineDownloader.new options
+  if options[:list]
+    wayback_machine_downloader.list_files
+  else
+    wayback_machine_downloader.download_files
+  end
+elsif options[:version]
+  puts WaybackMachineDownloader::VERSION
+else
+  puts "You need to specify a website to backup. (e.g., http://example.com)"
+  puts "Run `wayback_machine_downloader --help` for more help."
+end
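The executable is a thin wrapper: OptionParser fills the options hash, and the last remaining argument is taken as the site to back up. A hypothetical invocation and the Ruby it roughly reduces to (the URL and flag values are placeholders, not taken from the gem):

# Command line:
#   wayback_machine_downloader http://example.com --only "/\.html$/" -c 4 -t 20100916231334
#
# is roughly equivalent to:
options = {
  only_filter:   "/\\.html$/",     # --only, treated as a regex because of the // notation
  threads_count: 4,                # -c / --concurrency
  to_timestamp:  20100916231334,   # -t / --to
  base_url:      "http://example.com"
}
WaybackMachineDownloader.new(options).download_files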
lib/wayback_machine_downloader/archive_api.rb
ADDED
@@ -0,0 +1,40 @@
+require 'json'
+require 'uri'
+
+module ArchiveAPI
+
+  def get_raw_list_from_api url, page_index, http
+    request_url = URI("https://web.archive.org/cdx/search/xd")
+    params = [["output", "json"], ["url", url]]
+    params += parameters_for_api page_index
+    request_url.query = URI.encode_www_form(params)
+
+    begin
+      json = JSON.parse(http.get(URI(request_url)).body)
+      if (json[0] <=> ["timestamp","original"]) == 0
+        json.shift
+      end
+      json
+    rescue JSON::ParserError
+      []
+    end
+  end
+
+  def parameters_for_api page_index
+    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
+    if !@all
+      parameters.push(["filter", "statuscode:200"])
+    end
+    if @from_timestamp and @from_timestamp != 0
+      parameters.push(["from", @from_timestamp.to_s])
+    end
+    if @to_timestamp and @to_timestamp != 0
+      parameters.push(["to", @to_timestamp.to_s])
+    end
+    if page_index
+      parameters.push(["page", page_index])
+    end
+    parameters
+  end

+end
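get_raw_list_from_api glues these pieces into a single GET against the endpoint hard-coded above. An illustrative reconstruction of the query it would build for page 0 of http://example.com/* with the default status-code filter (the values are examples, not output captured from the gem):

require 'uri'

params = [
  ["output", "json"],
  ["url", "http://example.com/*"],
  ["fl", "timestamp,original"],
  ["collapse", "digest"],
  ["gzip", "false"],
  ["filter", "statuscode:200"],
  ["page", 0]
]
request_url = URI("https://web.archive.org/cdx/search/xd")
request_url.query = URI.encode_www_form(params)
puts request_url

# The JSON response is an array of [timestamp, original] pairs, e.g.
# [["20060716231334", "http://example.com/index.html"], ...],
# optionally preceded by a ["timestamp", "original"] header row that
# get_raw_list_from_api strips off before returning the list.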
lib/wayback_machine_downloader/tidy_bytes.rb
ADDED
@@ -0,0 +1,122 @@
+module TibyBytes
+
+  # CP-1252 decimal byte => UTF-8 approximation as an array of bytes
+  CP1252 = {
+    128 => [226, 130, 172],
+    129 => nil,
+    130 => [226, 128, 154],
+    131 => [198, 146],
+    132 => [226, 128, 158],
+    133 => [226, 128, 166],
+    134 => [226, 128, 160],
+    135 => [226, 128, 161],
+    136 => [203, 134],
+    137 => [226, 128, 176],
+    138 => [197, 160],
+    139 => [226, 128, 185],
+    140 => [197, 146],
+    141 => nil,
+    142 => [197, 189],
+    143 => nil,
+    144 => nil,
+    145 => [226, 128, 152],
+    146 => [226, 128, 153],
+    147 => [226, 128, 156],
+    148 => [226, 128, 157],
+    149 => [226, 128, 162],
+    150 => [226, 128, 147],
+    151 => [226, 128, 148],
+    152 => [203, 156],
+    153 => [226, 132, 162],
+    154 => [197, 161],
+    155 => [226, 128, 186],
+    156 => [197, 147],
+    157 => nil,
+    158 => [197, 190],
+    159 => [197, 184]
+  }
+
+  module StringMixin
+
+    # Attempt to replace invalid UTF-8 bytes with valid ones. This method
+    # naively assumes if you have invalid UTF8 bytes, they are either Windows
+    # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
+    # always work.
+    #
+    # Passing +true+ will forcibly tidy all bytes, assuming that the string's
+    # encoding is CP-1252 or ISO-8859-1.
+    def tidy_bytes(force = false)
+
+      if force
+        return unpack("C*").map do |b|
+          tidy_byte(b)
+        end.flatten.compact.pack("C*").unpack("U*").pack("U*")
+      end
+
+      bytes = unpack("C*")
+      conts_expected = 0
+      last_lead = 0
+
+      bytes.each_index do |i|
+
+        byte = bytes[i]
+        _is_ascii = byte < 128
+        is_cont = byte > 127 && byte < 192
+        is_lead = byte > 191 && byte < 245
+        is_unused = byte > 240
+        is_restricted = byte > 244
+
+        # Impossible or highly unlikely byte? Clean it.
+        if is_unused || is_restricted
+          bytes[i] = tidy_byte(byte)
+        elsif is_cont
+          # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
+          conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
+        else
+          if conts_expected > 0
+            # Expected continuation, but got ASCII or leading? Clean backwards up to
+            # the leading byte.
+            begin
+              (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
+            rescue NoMethodError
+              next
+            end
+            conts_expected = 0
+          end
+          if is_lead
+            # Final byte is leading? Clean it.
+            if i == bytes.length - 1
+              bytes[i] = tidy_byte(bytes.last)
+            else
+              # Valid leading byte? Expect continuations determined by position of
+              # first zero bit, with max of 3.
+              conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
+              last_lead = i
+            end
+          end
+        end
+      end
+      begin
+        bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
+      rescue ArgumentError
+        nil
+      end
+    end
+
+    # Tidy bytes in-place.
+    def tidy_bytes!(force = false)
+      replace tidy_bytes(force)
+    end
+
+    private
+
+    def tidy_byte(byte)
+      byte < 160 ? TibyBytes::CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
+    end
+
+  end
+end
+
+class String
+  include TibyBytes::StringMixin
+end
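The downloader uses this mixin to repair file ids recovered from CDX results before building local paths. A small usage sketch (the byte sequence is an invented example, not data from the gem):

# A string holding a stray CP-1252 / Latin-1 byte (0xE9, "é") that is not valid UTF-8:
broken = "caf\xE9"
# tidy_bytes rewrites the lone 0xE9 into the two-byte UTF-8 sequence 0xC3 0xA9:
fixed = broken.tidy_bytes
puts fixed                   # => café
puts fixed.valid_encoding?   # => true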
lib/wayback_machine_downloader/to_regex.rb
ADDED
@@ -0,0 +1,81 @@
+module ToRegex
+  module StringMixin
+    class << self
+      def literal?(str)
+        REGEXP_DELIMITERS.none? { |s, e| str.start_with?(s) and str =~ /#{e}#{INLINE_OPTIONS}\z/ }
+      end
+    end
+
+    INLINE_OPTIONS = /[imxnesu]*/
+    REGEXP_DELIMITERS = {
+      '%r{' => '}',
+      '/' => '/',
+    }
+
+    # Get a regex back
+    #
+    # Without :literal or :detect, `"foo".to_regex` will return nil.
+    #
+    # @param [optional, Hash] options
+    # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
+    # @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
+    # @option options [true,false] :ignore_case /foo/i
+    # @option options [true,false] :multiline /foo/m
+    # @option options [true,false] :extended /foo/x
+    # @option options [true,false] :lang /foo/[nesu]
+    def to_regex(options = {})
+      if args = as_regexp(options)
+        ::Regexp.new(*args)
+      end
+    end
+
+    # Return arguments that can be passed to `Regexp.new`
+    # @see to_regexp
+    def as_regexp(options = {})
+      unless options.is_a?(::Hash)
+        raise ::ArgumentError, "[to_regexp] Options must be a Hash"
+      end
+      str = self
+
+      return if options[:detect] and str == ''
+
+      if options[:literal] or (options[:detect] and ToRegexp::String.literal?(str))
+        content = ::Regexp.escape str
+      elsif delim_set = REGEXP_DELIMITERS.detect { |k, _| str.start_with?(k) }
+        delim_start, delim_end = delim_set
+        /\A#{delim_start}(.*)#{delim_end}(#{INLINE_OPTIONS})\z/u =~ str
+        content = $1
+        inline_options = $2
+        return unless content.is_a?(::String)
+        content.gsub! '\\/', '/'
+        if inline_options
+          options[:ignore_case] = true if inline_options.include?('i')
+          options[:multiline] = true if inline_options.include?('m')
+          options[:extended] = true if inline_options.include?('x')
+          # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
+          options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
+        end
+      else
+        return
+      end
+
+      ignore_case = options[:ignore_case] ? ::Regexp::IGNORECASE : 0
+      multiline = options[:multiline] ? ::Regexp::MULTILINE : 0
+      extended = options[:extended] ? ::Regexp::EXTENDED : 0
+      lang = options[:lang] || ''
+      if ::RUBY_VERSION > '1.9' and lang.include?('u')
+        lang = lang.delete 'u'
+      end
+
+      if lang.empty?
+        [ content, (ignore_case|multiline|extended) ]
+      else
+        [ content, (ignore_case|multiline|extended), lang ]
+      end
+    end
+  end
+end
+
+class String
+  include ToRegex::StringMixin
+end
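to_regex is what lets the --only and --exclude filters accept either a plain substring or a /regex/: a value wrapped in // or %r{} comes back as a Regexp, anything else returns nil so the downloader falls back to case-insensitive substring matching. A few illustrative calls (the input strings are invented):

puts "/\\.(gif|jpg)$/i".to_regex.inspect   # => /\.(gif|jpg)$/i  (inline i flag honoured)
puts "%r{^/blog/}".to_regex.inspect        # => /^\/blog\//     (%r{} delimiters recognised)
puts "plain-text-filter".to_regex.inspect  # => nil             (no delimiters, caller uses substring match)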
lib/wayback_machine_downloader.rb
ADDED
@@ -0,0 +1,323 @@
+# encoding: UTF-8
+
+require 'thread'
+require 'net/http'
+require 'open-uri'
+require 'fileutils'
+require 'cgi'
+require 'json'
+require_relative 'wayback_machine_downloader/tidy_bytes'
+require_relative 'wayback_machine_downloader/to_regex'
+require_relative 'wayback_machine_downloader/archive_api'
+
+class WaybackMachineDownloader
+
+  include ArchiveAPI
+
+  VERSION = "2.3.2"
+
+  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
+    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
+    :all, :maximum_pages, :threads_count
+
+  def initialize params
+    @base_url = params[:base_url]
+    @exact_url = params[:exact_url]
+    @directory = params[:directory]
+    @all_timestamps = params[:all_timestamps]
+    @from_timestamp = params[:from_timestamp].to_i
+    @to_timestamp = params[:to_timestamp].to_i
+    @only_filter = params[:only_filter]
+    @exclude_filter = params[:exclude_filter]
+    @all = params[:all]
+    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
+    @threads_count = params[:threads_count].to_i
+  end
+
+  def backup_name
+    if @base_url.include? '//'
+      @base_url.split('/')[2]
+    else
+      @base_url
+    end
+  end
+
+  def backup_path
+    if @directory
+      if @directory[-1] == '/'
+        @directory
+      else
+        @directory + '/'
+      end
+    else
+      'websites/' + backup_name + '/'
+    end
+  end
+
+  def match_only_filter file_url
+    if @only_filter
+      only_filter_regex = @only_filter.to_regex
+      if only_filter_regex
+        only_filter_regex =~ file_url
+      else
+        file_url.downcase.include? @only_filter.downcase
+      end
+    else
+      true
+    end
+  end
+
+  def match_exclude_filter file_url
+    if @exclude_filter
+      exclude_filter_regex = @exclude_filter.to_regex
+      if exclude_filter_regex
+        exclude_filter_regex =~ file_url
+      else
+        file_url.downcase.include? @exclude_filter.downcase
+      end
+    else
+      false
+    end
+  end
+
+  def get_all_snapshots_to_consider
+    # Note: Passing a page index parameter allows us to get more snapshots,
+    # but from a less fresh index
+    http = Net::HTTP.new("web.archive.org", 443)
+    http.use_ssl = true
+    http.start()
+    print "Getting snapshot pages"
+    snapshot_list_to_consider = []
+    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, http)
+    print "."
+    unless @exact_url
+      @maximum_pages.times do |page_index|
+        snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index, http)
+        break if snapshot_list.empty?
+        snapshot_list_to_consider += snapshot_list
+        print "."
+      end
+    end
+    http.finish()
+    puts " found #{snapshot_list_to_consider.length} snapshots to consider."
+    puts
+    snapshot_list_to_consider
+  end
+
+  def get_file_list_curated
+    file_list_curated = Hash.new
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = CGI::unescape file_id
+      file_id = file_id.tidy_bytes unless file_id == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id]
+          unless file_list_curated[file_id][:timestamp] > file_timestamp
+            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+          end
+        else
+          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+        end
+      end
+    end
+    file_list_curated
+  end
+
+  def get_file_list_all_timestamps
+    file_list_curated = Hash.new
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id_and_timestamp = [file_timestamp, file_id].join('/')
+      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
+      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id_and_timestamp]
+          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
+        else
+          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
+        end
+      end
+    end
+    puts "file_list_curated: " + file_list_curated.count.to_s
+    file_list_curated
+  end
+
+
+  def get_file_list_by_timestamp
+    if @all_timestamps
+      file_list_curated = get_file_list_all_timestamps
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    else
+      file_list_curated = get_file_list_curated
+      file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    end
+  end
+
+  def list_files
+    # retrieval produces its own output
+    @orig_stdout = $stdout
+    $stdout = $stderr
+    files = get_file_list_by_timestamp
+    $stdout = @orig_stdout
+    puts "["
+    files[0...-1].each do |file|
+      puts file.to_json + ","
+    end
+    puts files[-1].to_json
+    puts "]"
+  end
+
+  def download_files
+    start_time = Time.now
+    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
+    puts
+
+    if file_list_by_timestamp.count == 0
+      puts "No files to download."
+      puts "Possible reasons:"
+      puts "\t* Site is not in Wayback Machine Archive."
+      puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
+      puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
+      puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
+      puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
+      return
+    end
+
+    puts "#{file_list_by_timestamp.count} files to download:"
+
+    threads = []
+    @processed_file_count = 0
+    @threads_count = 1 unless @threads_count != 0
+    @threads_count.times do
+      http = Net::HTTP.new("web.archive.org", 443)
+      http.use_ssl = true
+      http.start()
+      threads << Thread.new do
+        until file_queue.empty?
+          file_remote_info = file_queue.pop(true) rescue nil
+          download_file(file_remote_info, http) if file_remote_info
+        end
+        http.finish()
+      end
+    end
+
+    threads.each(&:join)
+    end_time = Time.now
+    puts
+    puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
+  end
+
+  def structure_dir_path dir_path
+    begin
+      FileUtils::mkdir_p dir_path unless File.exist? dir_path
+    rescue Errno::EEXIST => e
+      error_to_string = e.to_s
+      puts "# #{error_to_string}"
+      if error_to_string.include? "File exists @ dir_s_mkdir - "
+        file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
+      elsif error_to_string.include? "File exists - "
+        file_already_existing = error_to_string.split("File exists - ")[-1]
+      else
+        raise "Unhandled directory restructure error # #{error_to_string}"
+      end
+      file_already_existing_temporary = file_already_existing + '.temp'
+      file_already_existing_permanent = file_already_existing + '/index.html'
+      FileUtils::mv file_already_existing, file_already_existing_temporary
+      FileUtils::mkdir_p file_already_existing
+      FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
+      puts "#{file_already_existing} -> #{file_already_existing_permanent}"
+      structure_dir_path dir_path
+    end
+  end
+
+  def download_file (file_remote_info, http)
+    current_encoding = "".encoding
+    file_url = file_remote_info[:file_url].encode(current_encoding)
+    file_id = file_remote_info[:file_id]
+    file_timestamp = file_remote_info[:timestamp]
+    file_path_elements = file_id.split('/')
+    if file_id == ""
+      dir_path = backup_path
+      file_path = backup_path + 'index.html'
+    elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
+      dir_path = backup_path + file_path_elements[0..-1].join('/')
+      file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
+    else
+      dir_path = backup_path + file_path_elements[0..-2].join('/')
+      file_path = backup_path + file_path_elements[0..-1].join('/')
+    end
+    if Gem.win_platform?
+      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
+      file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
+    end
+    unless File.exist? file_path
+      begin
+        structure_dir_path dir_path
+        open(file_path, "wb") do |file|
+          begin
+            http.get(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}")) do |body|
+              file.write(body)
+            end
+          rescue OpenURI::HTTPError => e
+            puts "#{file_url} # #{e}"
+            if @all
+              file.write(e.io.read)
+              puts "#{file_path} saved anyway."
+            end
+          rescue StandardError => e
+            puts "#{file_url} # #{e}"
+          end
+        end
+      rescue StandardError => e
+        puts "#{file_url} # #{e}"
+      ensure
+        if not @all and File.exist?(file_path) and File.size(file_path) == 0
+          File.delete(file_path)
+          puts "#{file_path} was empty and was removed."
+        end
+      end
+      semaphore.synchronize do
+        @processed_file_count += 1
+        puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
+      end
+    else
+      semaphore.synchronize do
+        @processed_file_count += 1
+        puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
+      end
+    end
+  end
+
+  def file_queue
+    @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
+  end
+
+  def file_list_by_timestamp
+    @file_list_by_timestamp ||= get_file_list_by_timestamp
+  end
+
+  def semaphore
+    @semaphore ||= Mutex.new
+  end
+end
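Beyond the bundled CLI, the class can be driven directly from Ruby. A minimal programmatic sketch (the URL, directory and option values are placeholders; any option the CLI accepts maps to a constructor key):

require 'wayback_machine_downloader'

downloader = WaybackMachineDownloader.new(
  base_url:       'http://example.com',
  directory:      './websites/example.com',
  from_timestamp: 20060101000000,
  to_timestamp:   20101231235959,
  only_filter:    '/\.html$/',
  threads_count:  4
)

downloader.download_files   # fetch the selected snapshots to disk
# or, to only inspect what would be downloaded:
# downloader.list_files     # prints a JSON array of {file_url:, timestamp:, file_id:} entries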
metadata
ADDED
@@ -0,0 +1,81 @@
+--- !ruby/object:Gem::Specification
+name: wayback_machine_downloader_hhr
+version: !ruby/object:Gem::Version
+  version: 2.3.2
+platform: ruby
+authors:
+- hehaorui
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2024-11-03 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.2'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.2'
+- !ruby/object:Gem::Dependency
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.2'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.2'
+description: Download an entire website from the Wayback Machine. Wayback Machine
+  by Internet Archive (archive.org) is an awesome tool to view any website at any
+  point of time but lacks an export feature. Wayback Machine Downloader brings exactly
+  this. This version bears minor fixes on original version. It is for hehaorui personal
+  use.
+email: mail@hehaorui.com
+executables:
+- wayback_machine_downloader
+extensions: []
+extra_rdoc_files: []
+files:
+- bin/wayback_machine_downloader
+- lib/wayback_machine_downloader.rb
+- lib/wayback_machine_downloader/archive_api.rb
+- lib/wayback_machine_downloader/tidy_bytes.rb
+- lib/wayback_machine_downloader/to_regex.rb
+homepage: https://github.com/hehaorui/wayback-machine-downloader
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 1.9.2
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.5.22
+signing_key:
+specification_version: 4
+summary: Download an entire website from the Wayback Machine, with minor fixes. For
+  hehaorui personal use.
+test_files: []
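Per the metadata, the gem ships a single executable named wayback_machine_downloader and loads its library code from lib/. A sketch of pulling this fork in through Bundler (assuming the default RubyGems source hosts it):

# Gemfile
source 'https://rubygems.org'

gem 'wayback_machine_downloader_hhr', '2.3.2'

After bundle install, `bundle exec wayback_machine_downloader --help` prints the options defined in bin/wayback_machine_downloader above.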