wayback_machine_downloader_hhr 2.3.2
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: 89aa753a924055b41a371b5c616158dc5b65dfa63b136fff078588d839949f64
  data.tar.gz: be143940de3f24c545a8bf202b1fb28f601b124f69927213e834b49bada36cf3
SHA512:
  metadata.gz: adf23257485832a2e6c4ccc443cf43583e59851e39d2e474bbad097ff9332f71ec63e1e0ade769b72485ca6e38234fac5f18b62ba3ca9858cf6bd46ebc1a4835
  data.tar.gz: 14779e8b3bc933186d33671047411a10bdb27f2b04e311013b9c630849b07393251ee6db6445419cea71ad8c012bd1fc81aa2b94f6ad51129fef9702e8d4fa42
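These digests are the SHA-256/SHA-512 checksums of the metadata.gz and data.tar.gz entries packed inside the .gem archive. As a rough sketch of how one of them could be recomputed locally (the .gem path is a placeholder and this snippet is only an illustration, not part of the gem):

    require 'digest'
    require 'rubygems/package'

    # Open the downloaded .gem (a plain tar archive) and hash its metadata.gz entry.
    tar = Gem::Package::TarReader.new(File.open("wayback_machine_downloader_hhr-2.3.2.gem", "rb"))
    tar.each do |entry|
      puts Digest::SHA256.hexdigest(entry.read) if entry.full_name == "metadata.gz"
    end
    tar.close

If the gem is intact, the printed value should match the SHA256 metadata.gz digest above.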
bin/wayback_machine_downloader
ADDED
@@ -0,0 +1,79 @@
#!/usr/bin/env ruby

require_relative '../lib/wayback_machine_downloader'
require 'optparse'
require 'pp'

options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: wayback_machine_downloader http://example.com"

  opts.separator ""
  opts.separator "Download an entire website from the Wayback Machine."

  opts.separator ""
  opts.separator "Optional options:"

  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
    options[:directory] = t
  end

  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
    options[:all_timestamps] = true
  end

  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after the timestamp supplied (i.e. 20060716231334)") do |t|
    options[:from_timestamp] = t
  end

  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before the timestamp supplied (i.e. 20100916231334)") do |t|
    options[:to_timestamp] = t
  end

  opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
    options[:exact_url] = t
  end

  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:only_filter] = t
  end

  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:exclude_filter] = t
  end

  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
    options[:all] = true
  end

  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (i.e. 20)") do |t|
    options[:threads_count] = t
  end

  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
    options[:maximum_pages] = t
  end

  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
    options[:list] = true
  end

  opts.on("-v", "--version", "Display version") do |t|
    options[:version] = t
  end
end.parse!

if (base_url = ARGV[-1])
  options[:base_url] = base_url
  wayback_machine_downloader = WaybackMachineDownloader.new options
  if options[:list]
    wayback_machine_downloader.list_files
  else
    wayback_machine_downloader.download_files
  end
elsif options[:version]
  puts WaybackMachineDownloader::VERSION
else
  puts "You need to specify a website to backup. (e.g., http://example.com)"
  puts "Run `wayback_machine_downloader --help` for more help."
end
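For reference, when given a URL the executable above simply builds an options hash from the parsed flags and hands it to the library class. A minimal sketch of the equivalent Ruby, assuming the gem is installed; the URL and concurrency value are placeholders:

    require 'wayback_machine_downloader'

    # Roughly what `wayback_machine_downloader https://example.com -c 20` does:
    options = {
      base_url: "https://example.com", # taken from ARGV[-1]
      threads_count: 20                # -c / --concurrency
    }
    WaybackMachineDownloader.new(options).download_files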
lib/wayback_machine_downloader/archive_api.rb
ADDED
@@ -0,0 +1,40 @@
require 'json'
require 'uri'

module ArchiveAPI

  def get_raw_list_from_api url, page_index, http
    request_url = URI("https://web.archive.org/cdx/search/xd")
    params = [["output", "json"], ["url", url]]
    params += parameters_for_api page_index
    request_url.query = URI.encode_www_form(params)

    begin
      json = JSON.parse(http.get(URI(request_url)).body)
      if (json[0] <=> ["timestamp", "original"]) == 0
        json.shift
      end
      json
    rescue JSON::ParserError
      []
    end
  end

  def parameters_for_api page_index
    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
    if !@all
      parameters.push(["filter", "statuscode:200"])
    end
    if @from_timestamp and @from_timestamp != 0
      parameters.push(["from", @from_timestamp.to_s])
    end
    if @to_timestamp and @to_timestamp != 0
      parameters.push(["to", @to_timestamp.to_s])
    end
    if page_index
      parameters.push(["page", page_index])
    end
    parameters
  end

end
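To make the request shape concrete, here is a rough illustration of the query string that get_raw_list_from_api ends up building via URI.encode_www_form; the URL and page index are placeholder values, not output from a real run:

    require 'uri'

    params = [["output", "json"], ["url", "example.com"],
              ["fl", "timestamp,original"], ["collapse", "digest"],
              ["gzip", "false"], ["filter", "statuscode:200"], ["page", 0]]
    puts URI.encode_www_form(params)
    # => output=json&url=example.com&fl=timestamp%2Coriginal&collapse=digest&gzip=false&filter=statuscode%3A200&page=0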
lib/wayback_machine_downloader/tidy_bytes.rb
ADDED
@@ -0,0 +1,122 @@
module TibyBytes

  # CP-1252 decimal byte => UTF-8 approximation as an array of bytes
  CP1252 = {
    128 => [226, 130, 172],
    129 => nil,
    130 => [226, 128, 154],
    131 => [198, 146],
    132 => [226, 128, 158],
    133 => [226, 128, 166],
    134 => [226, 128, 160],
    135 => [226, 128, 161],
    136 => [203, 134],
    137 => [226, 128, 176],
    138 => [197, 160],
    139 => [226, 128, 185],
    140 => [197, 146],
    141 => nil,
    142 => [197, 189],
    143 => nil,
    144 => nil,
    145 => [226, 128, 152],
    146 => [226, 128, 153],
    147 => [226, 128, 156],
    148 => [226, 128, 157],
    149 => [226, 128, 162],
    150 => [226, 128, 147],
    151 => [226, 128, 148],
    152 => [203, 156],
    153 => [226, 132, 162],
    154 => [197, 161],
    155 => [226, 128, 186],
    156 => [197, 147],
    157 => nil,
    158 => [197, 190],
    159 => [197, 184]
  }

  module StringMixin

    # Attempt to replace invalid UTF-8 bytes with valid ones. This method
    # naively assumes that if you have invalid UTF-8 bytes, they are either
    # Windows CP-1252 or ISO-8859-1. In practice this isn't a bad assumption,
    # but it may not always work.
    #
    # Passing +true+ will forcibly tidy all bytes, assuming that the string's
    # encoding is CP-1252 or ISO-8859-1.
    def tidy_bytes(force = false)

      if force
        return unpack("C*").map do |b|
          tidy_byte(b)
        end.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end

      bytes = unpack("C*")
      conts_expected = 0
      last_lead = 0

      bytes.each_index do |i|

        byte = bytes[i]
        _is_ascii = byte < 128
        is_cont = byte > 127 && byte < 192
        is_lead = byte > 191 && byte < 245
        is_unused = byte > 240
        is_restricted = byte > 244

        # Impossible or highly unlikely byte? Clean it.
        if is_unused || is_restricted
          bytes[i] = tidy_byte(byte)
        elsif is_cont
          # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
          conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
        else
          if conts_expected > 0
            # Expected continuation, but got ASCII or leading? Clean backwards up to
            # the leading byte.
            begin
              (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
            rescue NoMethodError
              next
            end
            conts_expected = 0
          end
          if is_lead
            # Final byte is leading? Clean it.
            if i == bytes.length - 1
              bytes[i] = tidy_byte(bytes.last)
            else
              # Valid leading byte? Expect continuations determined by position of
              # first zero bit, with max of 3.
              conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
              last_lead = i
            end
          end
        end
      end
      begin
        bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      rescue ArgumentError
        nil
      end
    end

    # Tidy bytes in-place.
    def tidy_bytes!(force = false)
      replace tidy_bytes(force)
    end

    private

    def tidy_byte(byte)
      byte < 160 ? TibyBytes::CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
    end

  end
end

class String
  include TibyBytes::StringMixin
end
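As a quick illustration of the mixin above (assuming the file has been required; the sample bytes are arbitrary), byte 0xE9 is repaired through the ISO-8859-1 branch and 0x99 through the CP-1252 table:

    str = "Caf\xE9 corp\x99".dup.force_encoding("UTF-8") # stray CP-1252 / ISO-8859-1 bytes
    puts str.valid_encoding? # => false
    puts str.tidy_bytes      # => "Café corp™"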
lib/wayback_machine_downloader/to_regex.rb
ADDED
@@ -0,0 +1,81 @@
module ToRegex
  module StringMixin
    class << self
      def literal?(str)
        REGEXP_DELIMITERS.none? { |s, e| str.start_with?(s) and str =~ /#{e}#{INLINE_OPTIONS}\z/ }
      end
    end

    INLINE_OPTIONS = /[imxnesu]*/
    REGEXP_DELIMITERS = {
      '%r{' => '}',
      '/' => '/',
    }

    # Get a regex back
    #
    # Without :literal or :detect, `"foo".to_regex` will return nil.
    #
    # @param [optional, Hash] options
    # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
    # @option options [true,false] :detect If the string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
    # @option options [true,false] :ignore_case /foo/i
    # @option options [true,false] :multiline /foo/m
    # @option options [true,false] :extended /foo/x
    # @option options [true,false] :lang /foo/[nesu]
    def to_regex(options = {})
      if args = as_regexp(options)
        ::Regexp.new(*args)
      end
    end

    # Return arguments that can be passed to `Regexp.new`
    # @see to_regex
    def as_regexp(options = {})
      unless options.is_a?(::Hash)
        raise ::ArgumentError, "[to_regex] Options must be a Hash"
      end
      str = self

      return if options[:detect] and str == ''

      if options[:literal] or (options[:detect] and ToRegex::StringMixin.literal?(str))
        content = ::Regexp.escape str
      elsif delim_set = REGEXP_DELIMITERS.detect { |k, _| str.start_with?(k) }
        delim_start, delim_end = delim_set
        /\A#{delim_start}(.*)#{delim_end}(#{INLINE_OPTIONS})\z/u =~ str
        content = $1
        inline_options = $2
        return unless content.is_a?(::String)
        content.gsub! '\\/', '/'
        if inline_options
          options[:ignore_case] = true if inline_options.include?('i')
          options[:multiline] = true if inline_options.include?('m')
          options[:extended] = true if inline_options.include?('x')
          # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
          options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
        end
      else
        return
      end

      ignore_case = options[:ignore_case] ? ::Regexp::IGNORECASE : 0
      multiline = options[:multiline] ? ::Regexp::MULTILINE : 0
      extended = options[:extended] ? ::Regexp::EXTENDED : 0
      lang = options[:lang] || ''
      if ::RUBY_VERSION > '1.9' and lang.include?('u')
        lang = lang.delete 'u'
      end

      if lang.empty?
        [ content, (ignore_case|multiline|extended) ]
      else
        [ content, (ignore_case|multiline|extended), lang ]
      end
    end
  end
end

class String
  include ToRegex::StringMixin
end
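This mixin is what lets the --only and --exclude filters accept the // notation mentioned in the executable's help text. A small illustration, with made-up filter values:

    puts "/\\.(gif|jpg)$/i".to_regex.inspect # => /\.(gif|jpg)$/i
    puts "wp-content".to_regex.inspect       # => nil; plain strings fall back to substring matching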
lib/wayback_machine_downloader.rb
ADDED
@@ -0,0 +1,323 @@
# encoding: UTF-8

require 'thread'
require 'net/http'
require 'open-uri'
require 'fileutils'
require 'cgi'
require 'json'
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'

class WaybackMachineDownloader

  include ArchiveAPI

  VERSION = "2.3.2"

  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
    :all, :maximum_pages, :threads_count

  def initialize params
    @base_url = params[:base_url]
    @exact_url = params[:exact_url]
    @directory = params[:directory]
    @all_timestamps = params[:all_timestamps]
    @from_timestamp = params[:from_timestamp].to_i
    @to_timestamp = params[:to_timestamp].to_i
    @only_filter = params[:only_filter]
    @exclude_filter = params[:exclude_filter]
    @all = params[:all]
    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
    @threads_count = params[:threads_count].to_i
  end

  def backup_name
    if @base_url.include? '//'
      @base_url.split('/')[2]
    else
      @base_url
    end
  end

  def backup_path
    if @directory
      if @directory[-1] == '/'
        @directory
      else
        @directory + '/'
      end
    else
      'websites/' + backup_name + '/'
    end
  end

  def match_only_filter file_url
    if @only_filter
      only_filter_regex = @only_filter.to_regex
      if only_filter_regex
        only_filter_regex =~ file_url
      else
        file_url.downcase.include? @only_filter.downcase
      end
    else
      true
    end
  end

  def match_exclude_filter file_url
    if @exclude_filter
      exclude_filter_regex = @exclude_filter.to_regex
      if exclude_filter_regex
        exclude_filter_regex =~ file_url
      else
        file_url.downcase.include? @exclude_filter.downcase
      end
    else
      false
    end
  end

  def get_all_snapshots_to_consider
    # Note: Passing a page index parameter allows us to get more snapshots,
    # but from a less fresh index
    http = Net::HTTP.new("web.archive.org", 443)
    http.use_ssl = true
    http.start()
    print "Getting snapshot pages"
    snapshot_list_to_consider = []
    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, http)
    print "."
    unless @exact_url
      @maximum_pages.times do |page_index|
        snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index, http)
        break if snapshot_list.empty?
        snapshot_list_to_consider += snapshot_list
        print "."
      end
    end
    http.finish()
    puts " found #{snapshot_list_to_consider.length} snapshots to consider."
    puts
    snapshot_list_to_consider
  end

  def get_file_list_curated
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
      file_id = file_url.split('/')[3..-1].join('/')
      file_id = CGI::unescape file_id
      file_id = file_id.tidy_bytes unless file_id == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id]
          unless file_list_curated[file_id][:timestamp] > file_timestamp
            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
          end
        else
          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
    file_list_curated
  end

  def get_file_list_all_timestamps
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
      file_id = file_url.split('/')[3..-1].join('/')
      file_id_and_timestamp = [file_timestamp, file_id].join('/')
      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id_and_timestamp]
          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
        else
          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
    puts "file_list_curated: " + file_list_curated.count.to_s
    file_list_curated
  end


  def get_file_list_by_timestamp
    if @all_timestamps
      file_list_curated = get_file_list_all_timestamps
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    else
      file_list_curated = get_file_list_curated
      file_list_curated = file_list_curated.sort_by { |k, v| v[:timestamp] }.reverse
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    end
  end

  def list_files
    # retrieval produces its own output
    @orig_stdout = $stdout
    $stdout = $stderr
    files = get_file_list_by_timestamp
    $stdout = @orig_stdout
    puts "["
    files[0...-1].each do |file|
      puts file.to_json + ","
    end
    puts files[-1].to_json
    puts "]"
  end

  def download_files
    start_time = Time.now
    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
    puts

    if file_list_by_timestamp.count == 0
      puts "No files to download."
      puts "Possible reasons:"
      puts "\t* Site is not in Wayback Machine Archive."
      puts "\t* From timestamp too far in the future." if @from_timestamp and @from_timestamp != 0
      puts "\t* To timestamp too far in the past." if @to_timestamp and @to_timestamp != 0
      puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
      puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
      return
    end

    puts "#{file_list_by_timestamp.count} files to download:"

    threads = []
    @processed_file_count = 0
    @threads_count = 1 if @threads_count == 0
    @threads_count.times do
      http = Net::HTTP.new("web.archive.org", 443)
      http.use_ssl = true
      http.start()
      threads << Thread.new do
        until file_queue.empty?
          file_remote_info = file_queue.pop(true) rescue nil
          download_file(file_remote_info, http) if file_remote_info
        end
        http.finish()
      end
    end

    threads.each(&:join)
    end_time = Time.now
    puts
    puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
  end

  def structure_dir_path dir_path
    begin
      FileUtils::mkdir_p dir_path unless File.exist? dir_path
    rescue Errno::EEXIST => e
      error_to_string = e.to_s
      puts "# #{error_to_string}"
      if error_to_string.include? "File exists @ dir_s_mkdir - "
        file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
      elsif error_to_string.include? "File exists - "
        file_already_existing = error_to_string.split("File exists - ")[-1]
      else
        raise "Unhandled directory restructure error # #{error_to_string}"
      end
      file_already_existing_temporary = file_already_existing + '.temp'
      file_already_existing_permanent = file_already_existing + '/index.html'
      FileUtils::mv file_already_existing, file_already_existing_temporary
      FileUtils::mkdir_p file_already_existing
      FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
      puts "#{file_already_existing} -> #{file_already_existing_permanent}"
      structure_dir_path dir_path
    end
  end

  def download_file (file_remote_info, http)
    current_encoding = "".encoding
    file_url = file_remote_info[:file_url].encode(current_encoding)
    file_id = file_remote_info[:file_id]
    file_timestamp = file_remote_info[:timestamp]
    file_path_elements = file_id.split('/')
    if file_id == ""
      dir_path = backup_path
      file_path = backup_path + 'index.html'
    elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
      dir_path = backup_path + file_path_elements[0..-1].join('/')
      file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
    else
      dir_path = backup_path + file_path_elements[0..-2].join('/')
      file_path = backup_path + file_path_elements[0..-1].join('/')
    end
    if Gem.win_platform?
      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
      file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
    end
    unless File.exist? file_path
      begin
        structure_dir_path dir_path
        open(file_path, "wb") do |file|
          begin
            http.get(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}")) do |body|
              file.write(body)
            end
          rescue OpenURI::HTTPError => e
            puts "#{file_url} # #{e}"
            if @all
              file.write(e.io.read)
              puts "#{file_path} saved anyway."
            end
          rescue StandardError => e
            puts "#{file_url} # #{e}"
          end
        end
      rescue StandardError => e
        puts "#{file_url} # #{e}"
      ensure
        if not @all and File.exist?(file_path) and File.size(file_path) == 0
          File.delete(file_path)
          puts "#{file_path} was empty and was removed."
        end
      end
      semaphore.synchronize do
        @processed_file_count += 1
        puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
      end
    else
      semaphore.synchronize do
        @processed_file_count += 1
        puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
      end
    end
  end

  def file_queue
    @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
  end

  def file_list_by_timestamp
    @file_list_by_timestamp ||= get_file_list_by_timestamp
  end

  def semaphore
    @semaphore ||= Mutex.new
  end
end
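The class can also be driven directly instead of through bin/wayback_machine_downloader. A minimal sketch, with a placeholder URL and option values mirroring the --from/--to/--only flags:

    downloader = WaybackMachineDownloader.new(
      base_url: "https://example.com",
      from_timestamp: 20060716231334, # same semantics as --from
      to_timestamp: 20100916231334,   # same semantics as --to
      only_filter: "/\\.html$/"       # same semantics as --only
    )
    downloader.list_files # prints a JSON array of {file_url, timestamp, file_id} entries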
metadata
ADDED
@@ -0,0 +1,81 @@
--- !ruby/object:Gem::Specification
name: wayback_machine_downloader_hhr
version: !ruby/object:Gem::Version
  version: 2.3.2
platform: ruby
authors:
- hehaorui
autorequire:
bindir: bin
cert_chain: []
date: 2024-11-03 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.2'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.2'
- !ruby/object:Gem::Dependency
  name: minitest
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '5.2'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '5.2'
description: Download an entire website from the Wayback Machine. Wayback Machine
  by Internet Archive (archive.org) is an awesome tool to view any website at any
  point in time, but it lacks an export feature. Wayback Machine Downloader provides
  exactly that. This version carries minor fixes on the original version and is for
  hehaorui's personal use.
email: mail@hehaorui.com
executables:
- wayback_machine_downloader
extensions: []
extra_rdoc_files: []
files:
- bin/wayback_machine_downloader
- lib/wayback_machine_downloader.rb
- lib/wayback_machine_downloader/archive_api.rb
- lib/wayback_machine_downloader/tidy_bytes.rb
- lib/wayback_machine_downloader/to_regex.rb
homepage: https://github.com/hehaorui/wayback-machine-downloader
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: 1.9.2
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubygems_version: 3.5.22
signing_key:
specification_version: 4
summary: Download an entire website from the Wayback Machine, with minor fixes. For
  hehaorui's personal use.
test_files: []