wayback_machine_downloader_straw 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: a8d577ca08cca3858efd95bfd879b198a57aa6262fa8e0a7f83ab4f3a362f1fc
+  data.tar.gz: ef73d81d745e7b3e9226458a66b5d54c2410db646ea85cc7145813bc26789dc7
+SHA512:
+  metadata.gz: 938e8544bb16b4afc6c81d0e4da602b5d3cd3e05482b3cc945ad3405681278fc03c7ccc1b84992b1ecafe66cf202aa71f306226b92f4b93b30b1c5c7edcbc86e
+  data.tar.gz: 8c236877be6274b9bb3c474fde9ad5a72a30abd5db3eadfc11eeae997488a2ddf27a72388b6d7d47455235ed0e084b1d14a7782911d0dd69e41f2fdcada713e2
bin/wayback_machine_downloader
ADDED
@@ -0,0 +1,83 @@
+#!/usr/bin/env ruby
+
+require_relative '../lib/wayback_machine_downloader'
+require 'optparse'
+require 'pp'
+
+options = {}
+option_parser = OptionParser.new do |opts|
+  opts.banner = "Usage: wayback_machine_downloader http://example.com"
+
+  opts.separator ""
+  opts.separator "Download an entire website from the Wayback Machine."
+
+  opts.separator ""
+  opts.separator "Optional options:"
+
+  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
+    options[:directory] = t
+  end
+
+  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+    options[:all_timestamps] = true
+  end
+
+  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
+    options[:from_timestamp] = t
+  end
+
+  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20100916231334)") do |t|
+    options[:to_timestamp] = t
+  end
+
+  opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
+    options[:exact_url] = t
+  end
+
+  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+    options[:only_filter] = t
+  end
+
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+    options[:exclude_filter] = t
+  end
+
+  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
+    options[:all] = true
+  end
+
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
+    options[:threads_count] = t
+  end
+
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
+    options[:maximum_pages] = t
+  end
+
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
+    options[:list] = true
+  end
+
+  opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
+    options[:rewritten] = t
+  end
+
+  opts.on("-v", "--version", "Display version") do |t|
+    options[:version] = t
+  end
+end.parse!
+
+if (base_url = ARGV[-1])
+  options[:base_url] = base_url
+  wayback_machine_downloader = WaybackMachineDownloader.new options
+  if options[:list]
+    wayback_machine_downloader.list_files
+  else
+    wayback_machine_downloader.download_files
+  end
+elsif options[:version]
+  puts WaybackMachineDownloader::VERSION
+else
+  puts "You need to specify a website to backup. (e.g., http://example.com)"
+  puts "Run `wayback_machine_downloader --help` for more help."
+end
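The options hash built by this script is handed directly to WaybackMachineDownloader.new, so each CLI flag maps one-to-one onto a constructor key. A minimal sketch of the equivalent programmatic call, with an illustrative URL and values (not taken from the diff):

require 'wayback_machine_downloader'

# hypothetical values mirroring the -d, -f and -c flags
downloader = WaybackMachineDownloader.new(
  base_url: "http://example.com",
  directory: "websites/example",
  from_timestamp: 20060716231334,
  threads_count: 4
)
downloader.download_files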
lib/wayback_machine_downloader/archive_api.rb
ADDED
@@ -0,0 +1,35 @@
+require 'json'
+require 'uri'
+
+module ArchiveAPI
+
+  def get_raw_list_from_api(url, page_index, http)
+    request_url = URI("https://web.archive.org/cdx/search/cdx")
+    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
+    request_url.query = URI.encode_www_form(params)
+
+    begin
+      response = http.get(request_url)
+      body = response.body.to_s.strip
+      return [] if body.empty?
+      json = JSON.parse(body)
+
+      # Check if the response contains the header ["timestamp", "original"]
+      json.shift if json.first == ["timestamp", "original"]
+      json
+    rescue JSON::ParserError, StandardError => e
+      warn "Failed to fetch data from API: #{e.message}"
+      []
+    end
+  end
+
+  def parameters_for_api(page_index)
+    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
+    parameters.push(["filter", "statuscode:200"]) unless @all
+    parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
+    parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
+    parameters.push(["page", page_index]) if page_index
+    parameters
+  end
+
+end
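For orientation, the CDX query that get_raw_list_from_api assembles looks roughly like the following sketch; the target URL and page index are illustrative, and the parameter list mirrors parameters_for_api above:

require 'uri'

params = [["output", "json"], ["url", "example.com/*"],
          ["fl", "timestamp,original"], ["collapse", "digest"],
          ["gzip", "false"], ["filter", "statuscode:200"], ["page", 0]]
request_url = URI("https://web.archive.org/cdx/search/cdx")
request_url.query = URI.encode_www_form(params)
puts request_url   # prints the fully encoded CDX query URL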
lib/wayback_machine_downloader/tidy_bytes.rb
ADDED
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+module TidyBytes
+  # precomputing CP1252 to UTF-8 mappings for bytes 128-159
+  CP1252_MAP = (128..159).map do |byte|
+    case byte
+    when 128 then [226, 130, 172] # EURO SIGN
+    when 130 then [226, 128, 154] # SINGLE LOW-9 QUOTATION MARK
+    when 131 then [198, 146] # LATIN SMALL LETTER F WITH HOOK
+    when 132 then [226, 128, 158] # DOUBLE LOW-9 QUOTATION MARK
+    when 133 then [226, 128, 166] # HORIZONTAL ELLIPSIS
+    when 134 then [226, 128, 160] # DAGGER
+    when 135 then [226, 128, 161] # DOUBLE DAGGER
+    when 136 then [203, 134] # MODIFIER LETTER CIRCUMFLEX ACCENT
+    when 137 then [226, 128, 176] # PER MILLE SIGN
+    when 138 then [197, 160] # LATIN CAPITAL LETTER S WITH CARON
+    when 139 then [226, 128, 185] # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+    when 140 then [197, 146] # LATIN CAPITAL LIGATURE OE
+    when 142 then [197, 189] # LATIN CAPITAL LETTER Z WITH CARON
+    when 145 then [226, 128, 152] # LEFT SINGLE QUOTATION MARK
+    when 146 then [226, 128, 153] # RIGHT SINGLE QUOTATION MARK
+    when 147 then [226, 128, 156] # LEFT DOUBLE QUOTATION MARK
+    when 148 then [226, 128, 157] # RIGHT DOUBLE QUOTATION MARK
+    when 149 then [226, 128, 162] # BULLET
+    when 150 then [226, 128, 147] # EN DASH
+    when 151 then [226, 128, 148] # EM DASH
+    when 152 then [203, 156] # SMALL TILDE
+    when 153 then [226, 132, 162] # TRADE MARK SIGN
+    when 154 then [197, 161] # LATIN SMALL LETTER S WITH CARON
+    when 155 then [226, 128, 186] # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+    when 156 then [197, 147] # LATIN SMALL LIGATURE OE
+    when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
+    when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS
+    end
+  end.freeze
+
+  # precomputing all possible byte conversions
+  CP1252_TO_UTF8 = Array.new(256) do |b|
+    if (128..159).cover?(b)
+      CP1252_MAP[b - 128]&.pack('C*')
+    elsif b < 128
+      b.chr
+    else
+      b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*')
+    end
+  end.freeze
+
+  def self.included(base)
+    base.class_eval do
+      def tidy_bytes(force = false)
+        return nil if empty?
+
+        if force
+          buffer = String.new(capacity: bytesize)
+          each_byte { |b| buffer << CP1252_TO_UTF8[b] }
+          return buffer.force_encoding(Encoding::UTF_8)
+        end
+
+        begin
+          encode('UTF-8')
+        rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+          buffer = String.new(capacity: bytesize)
+          scrub { |b| CP1252_TO_UTF8[b.ord] }
+        end
+      end
+
+      def tidy_bytes!(force = false)
+        result = tidy_bytes(force)
+        result ? replace(result) : self
+      end
+    end
+  end
+end
+
+class String
+  include TidyBytes
+end
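TidyBytes is mixed into String, so file identifiers pulled from snapshot URLs can be forced into valid UTF-8. A small usage sketch, assuming the gem is on the load path; the byte string is a contrived CP1252 example:

require 'wayback_machine_downloader/tidy_bytes'

raw = "caf\xE9".b             # "café" encoded as CP1252/Latin-1 bytes
puts raw.tidy_bytes(true)      # remaps every byte and returns a UTF-8 "café"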
lib/wayback_machine_downloader/to_regex.rb
ADDED
@@ -0,0 +1,107 @@
+# frozen_string_literal: true
+
+module ToRegex
+  module StringMixin
+    INLINE_OPTIONS = /[imxnesu]*/i.freeze
+    REGEXP_DELIMITERS = {
+      '%r{' => '}'.freeze,
+      '/' => '/'.freeze
+    }.freeze
+
+    REGEX_FLAGS = {
+      ignore_case: Regexp::IGNORECASE,
+      multiline: Regexp::MULTILINE,
+      extended: Regexp::EXTENDED
+    }.freeze
+
+    class << self
+      def literal?(str)
+        REGEXP_DELIMITERS.none? { |start, ending| str.start_with?(start) && str.match?(/#{ending}#{INLINE_OPTIONS}\z/) }
+      end
+    end
+
+    # Get a regex back
+    #
+    # Without :literal or :detect, `"foo".to_regex` will return nil.
+    #
+    # @param [optional, Hash] options
+    # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
+    # @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
+    # @option options [true,false] :ignore_case /foo/i
+    # @option options [true,false] :multiline /foo/m
+    # @option options [true,false] :extended /foo/x
+    # @option options [true,false] :lang /foo/[nesu]
+    def to_regex(options = {})
+      args = as_regexp(options)
+      args ? Regexp.new(*args) : nil
+    end
+    # Return arguments that can be passed to `Regexp.new`
+    # @see to_regexp
+    def as_regexp(options = {})
+      raise ArgumentError, '[to_regexp] Options must be a Hash' unless options.is_a?(Hash)
+
+      str = self
+      return if options[:detect] && str.empty?
+
+      if should_treat_as_literal?(str, options)
+        content = Regexp.escape(str)
+      elsif (delim_set = extract_delimiters(str))
+        content, options = parse_regexp_string(str, delim_set, options)
+        return unless content
+      else
+        return
+      end
+
+      build_regexp_args(content, options)
+    end
+
+    private
+
+    def should_treat_as_literal?(str, options)
+      options[:literal] || (options[:detect] && ToRegex::StringMixin.literal?(str))
+    end
+
+    def extract_delimiters(str)
+      REGEXP_DELIMITERS.find { |start, _| str.start_with?(start) }
+    end
+
+    def parse_regexp_string(str, delim_set, options)
+      start_delim, end_delim = delim_set
+      match = /\A#{start_delim}(.*)#{end_delim}(#{INLINE_OPTIONS})\z/u.match(str)
+      return unless match
+
+      content = match[1].gsub('\\/', '/')
+      parse_inline_options(match[2], options)
+      [content, options]
+    end
+
+    def parse_inline_options(inline_options, options)
+      return unless inline_options
+      options[:ignore_case] = true if inline_options.include?('i')
+      options[:multiline] = true if inline_options.include?('m')
+      options[:extended] = true if inline_options.include?('x')
+      # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
+      options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
+    end
+
+    def build_regexp_args(content, options)
+      flags = calculate_flags(options)
+      lang = normalize_lang_option(options[:lang])
+
+      lang.empty? ? [content, flags] : [content, flags, lang]
+    end
+
+    def calculate_flags(options)
+      REGEX_FLAGS.sum { |key, value| options[key] ? value : 0 }
+    end
+
+    def normalize_lang_option(lang)
+      return '' unless lang
+      RUBY_VERSION >= '1.9' ? lang.delete('u') : lang
+    end
+  end
+end
+
+class String
+  include ToRegex::StringMixin
+end
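ToRegex::StringMixin is what lets the --only and --exclude filters accept either a plain substring or a /.../ pattern. In the downloader, to_regex is called with no options, so only //-delimited strings become regexps and anything else returns nil and falls back to substring matching. A short sketch with illustrative filter strings:

require 'wayback_machine_downloader/to_regex'

'/\.(png|jpe?g)$/i'.to_regex   # => /\.(png|jpe?g)$/i, used directly for matching
'wp-content'.to_regex          # => nil, so the caller falls back to downcased substring matching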
lib/wayback_machine_downloader.rb
ADDED
@@ -0,0 +1,491 @@
+# encoding: UTF-8
+
+require 'thread'
+require 'net/http'
+require 'open-uri'
+require 'fileutils'
+require 'cgi'
+require 'json'
+require 'time'
+require 'concurrent-ruby'
+require 'logger'
+require_relative 'wayback_machine_downloader/tidy_bytes'
+require_relative 'wayback_machine_downloader/to_regex'
+require_relative 'wayback_machine_downloader/archive_api'
+
+class ConnectionPool
+  MAX_AGE = 300
+  CLEANUP_INTERVAL = 60
+  DEFAULT_TIMEOUT = 30
+  MAX_RETRIES = 3
+
+  def initialize(size)
+    @size = size
+    @pool = Concurrent::Map.new
+    @creation_times = Concurrent::Map.new
+    @cleanup_thread = schedule_cleanup
+  end
+
+  def with_connection(&block)
+    conn = acquire_connection
+    begin
+      yield conn
+    ensure
+      release_connection(conn)
+    end
+  end
+
+  def shutdown
+    @cleanup_thread&.exit
+    @pool.each_value { |conn| conn.finish if conn&.started? }
+    @pool.clear
+    @creation_times.clear
+  end
+
+  private
+
+  def acquire_connection
+    thread_id = Thread.current.object_id
+    conn = @pool[thread_id]
+
+    if should_create_new?(conn)
+      conn&.finish if conn&.started?
+      conn = create_connection
+      @pool[thread_id] = conn
+      @creation_times[thread_id] = Time.now
+    end
+
+    conn
+  end
+
+  def release_connection(conn)
+    return unless conn
+    if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
+      conn.finish
+      @pool.delete(Thread.current.object_id)
+      @creation_times.delete(Thread.current.object_id)
+    end
+  end
+
+  def should_create_new?(conn)
+    return true if conn.nil?
+    return true unless conn.started?
+    return true if Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
+    false
+  end
+
+  def create_connection
+    http = Net::HTTP.new("web.archive.org", 443)
+    http.use_ssl = true
+    http.read_timeout = DEFAULT_TIMEOUT
+    http.open_timeout = DEFAULT_TIMEOUT
+    http.keep_alive_timeout = 30
+    http.max_retries = MAX_RETRIES
+    http.start
+    http
+  end
+
+  def schedule_cleanup
+    Thread.new do
+      loop do
+        cleanup_old_connections
+        sleep CLEANUP_INTERVAL
+      end
+    end
+  end
+
+  def cleanup_old_connections
+    current_time = Time.now
+    @creation_times.each do |thread_id, creation_time|
+      if current_time - creation_time > MAX_AGE
+        conn = @pool[thread_id]
+        conn&.finish if conn&.started?
+        @pool.delete(thread_id)
+        @creation_times.delete(thread_id)
+      end
+    end
+  end
+end
+
+class WaybackMachineDownloader
+
+  include ArchiveAPI
+
+  VERSION = "2.3.3"
+  DEFAULT_TIMEOUT = 30
+  MAX_RETRIES = 3
+  RETRY_DELAY = 2
+  RATE_LIMIT = 0.25 # Delay between requests in seconds
+  CONNECTION_POOL_SIZE = 10
+  MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
+
+  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
+    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
+    :all, :maximum_pages, :threads_count, :logger
+
+  def initialize params
+    validate_params(params)
+    @base_url = params[:base_url]
+    @exact_url = params[:exact_url]
+    @directory = params[:directory]
+    @all_timestamps = params[:all_timestamps]
+    @from_timestamp = params[:from_timestamp].to_i
+    @to_timestamp = params[:to_timestamp].to_i
+    @only_filter = params[:only_filter]
+    @exclude_filter = params[:exclude_filter]
+    @all = params[:all]
+    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
+    @threads_count = [params[:threads_count].to_i, 1].max
+    @rewritten = params[:rewritten]
+    @timeout = params[:timeout] || DEFAULT_TIMEOUT
+    @logger = setup_logger
+    @failed_downloads = Concurrent::Array.new
+    @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
+  end
+
+  def backup_name
+    if @base_url.include? '//'
+      @base_url.split('/')[2]
+    else
+      @base_url
+    end
+  end
+
+  def backup_path
+    if @directory
+      if @directory[-1] == '/'
+        @directory
+      else
+        @directory + '/'
+      end
+    else
+      'websites/' + backup_name + '/'
+    end
+  end
+
+  def match_only_filter file_url
+    if @only_filter
+      only_filter_regex = @only_filter.to_regex
+      if only_filter_regex
+        only_filter_regex =~ file_url
+      else
+        file_url.downcase.include? @only_filter.downcase
+      end
+    else
+      true
+    end
+  end
+
+  def match_exclude_filter file_url
+    if @exclude_filter
+      exclude_filter_regex = @exclude_filter.to_regex
+      if exclude_filter_regex
+        exclude_filter_regex =~ file_url
+      else
+        file_url.downcase.include? @exclude_filter.downcase
+      end
+    else
+      false
+    end
+  end
+
+  def get_all_snapshots_to_consider
+    snapshot_list_to_consider = []
+
+    @connection_pool.with_connection do |connection|
+      puts "Getting snapshot pages"
+
+      # Fetch the initial set of snapshots
+      snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, connection)
+      print "."
+
+      # Fetch additional pages if the exact URL flag is not set
+      unless @exact_url
+        @maximum_pages.times do |page_index|
+          snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, connection)
+          break if snapshot_list.empty?
+
+          snapshot_list_to_consider += snapshot_list
+          print "."
+        end
+      end
+    end
+
+    puts " found #{snapshot_list_to_consider.length} snapshots to consider."
+    puts
+
+    snapshot_list_to_consider
+  end
+
+  def get_file_list_curated
+    file_list_curated = Hash.new
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = CGI::unescape file_id
+      file_id = file_id.tidy_bytes unless file_id == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id]
+          unless file_list_curated[file_id][:timestamp] > file_timestamp
+            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+          end
+        else
+          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+        end
+      end
+    end
+    file_list_curated
+  end
+
+  def get_file_list_all_timestamps
+    file_list_curated = Hash.new
+    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+      next unless file_url.include?('/')
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id_and_timestamp = [file_timestamp, file_id].join('/')
+      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
+      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id_and_timestamp]
+          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
+        else
+          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
+        end
+      end
+    end
+    puts "file_list_curated: " + file_list_curated.count.to_s
+    file_list_curated
+  end
+
+
+  def get_file_list_by_timestamp
+    if @all_timestamps
+      file_list_curated = get_file_list_all_timestamps
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    else
+      file_list_curated = get_file_list_curated
+      file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
+      file_list_curated.map do |file_remote_info|
+        file_remote_info[1][:file_id] = file_remote_info[0]
+        file_remote_info[1]
+      end
+    end
+  end
+
+  def list_files
+    # retrieval produces its own output
+    @orig_stdout = $stdout
+    $stdout = $stderr
+    files = get_file_list_by_timestamp
+    $stdout = @orig_stdout
+    puts "["
+    files[0...-1].each do |file|
+      puts file.to_json + ","
+    end
+    puts files[-1].to_json
+    puts "]"
+  end
+
+  def download_files
+    start_time = Time.now
+    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
+
+    if file_list_by_timestamp.empty?
+      puts "No files to download."
+      return
+    end
+
+    total_files = file_list_by_timestamp.count
+    puts "#{total_files} files to download:"
+
+    @processed_file_count = 0
+    @download_mutex = Mutex.new
+
+    thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
+    pool = Concurrent::FixedThreadPool.new(thread_count)
+
+    file_list_by_timestamp.each do |file_remote_info|
+      pool.post do
+        @connection_pool.with_connection do |connection|
+          result = download_file(file_remote_info, connection)
+          @download_mutex.synchronize do
+            @processed_file_count += 1
+            puts result if result
+          end
+        end
+        sleep(RATE_LIMIT)
+      end
+    end
+
+    pool.shutdown
+    pool.wait_for_termination
+
+    end_time = Time.now
+    puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
+    cleanup
+  end
+
+  def structure_dir_path dir_path
+    begin
+      FileUtils::mkdir_p dir_path unless File.exist? dir_path
+    rescue Errno::EEXIST => e
+      error_to_string = e.to_s
+      puts "# #{error_to_string}"
+      if error_to_string.include? "File exists @ dir_s_mkdir - "
+        file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
+      elsif error_to_string.include? "File exists - "
+        file_already_existing = error_to_string.split("File exists - ")[-1]
+      else
+        raise "Unhandled directory restructure error # #{error_to_string}"
+      end
+      file_already_existing_temporary = file_already_existing + '.temp'
+      file_already_existing_permanent = file_already_existing + '/index.html'
+      FileUtils::mv file_already_existing, file_already_existing_temporary
+      FileUtils::mkdir_p file_already_existing
+      FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
+      puts "#{file_already_existing} -> #{file_already_existing_permanent}"
+      structure_dir_path dir_path
+    end
+  end
+
+  def download_file (file_remote_info, http)
+    current_encoding = "".encoding
+    file_url = file_remote_info[:file_url].encode(current_encoding)
+    file_id = file_remote_info[:file_id]
+    file_timestamp = file_remote_info[:timestamp]
+    file_path_elements = file_id.split('/')
+
+    if file_id == ""
+      dir_path = backup_path
+      file_path = backup_path + 'index.html'
+    elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
+      dir_path = backup_path + file_path_elements[0..-1].join('/')
+      file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
+    else
+      dir_path = backup_path + file_path_elements[0..-2].join('/')
+      file_path = backup_path + file_path_elements[0..-1].join('/')
+    end
+    if Gem.win_platform?
+      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
+      file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
+    end
+    unless File.exist? file_path
+      begin
+        structure_dir_path dir_path
+        download_with_retry(file_path, file_url, file_timestamp, http)
+        "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
+      rescue StandardError => e
+        msg = "#{file_url} # #{e}"
+        if not @all and File.exist?(file_path) and File.size(file_path) == 0
+          File.delete(file_path)
+          msg += "\n#{file_path} was empty and was removed."
+        end
+        msg
+      end
+    else
+      "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
+    end
+  end
+
+  def file_queue
+    @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
+  end
+
+  def file_list_by_timestamp
+    @file_list_by_timestamp ||= get_file_list_by_timestamp
+  end
+
+  private
+
+  def validate_params(params)
+    raise ArgumentError, "Base URL is required" unless params[:base_url]
+    raise ArgumentError, "Maximum pages must be positive" if params[:maximum_pages] && params[:maximum_pages].to_i <= 0
+  end
+
+  def setup_logger
+    logger = Logger.new(STDOUT)
+    logger.level = ENV['DEBUG'] ? Logger::DEBUG : Logger::INFO
+    logger.formatter = proc do |severity, datetime, progname, msg|
+      "#{datetime.strftime('%Y-%m-%d %H:%M:%S')} [#{severity}] #{msg}\n"
+    end
+    logger
+  end
+
+  def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
+    retries = 0
+    begin
+      wayback_url = if @rewritten
+        "https://web.archive.org/web/#{file_timestamp}/#{file_url}"
+      else
+        "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
+      end
+
+      request = Net::HTTP::Get.new(URI(wayback_url))
+      request["Connection"] = "keep-alive"
+      request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
+
+      response = connection.request(request)
+
+      case response
+      when Net::HTTPSuccess
+        File.open(file_path, "wb") do |file|
+          if block_given?
+            yield(response, file)
+          else
+            file.write(response.body)
+          end
+        end
+      when Net::HTTPRedirection
+        raise "Too many redirects for #{file_url}" if redirect_count >= 2
+        location = response['location']
+        @logger.warn("Redirect found for #{file_url} -> #{location}")
+        return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
+      when Net::HTTPTooManyRequests
+        sleep(RATE_LIMIT * 2)
+        raise "Rate limited, retrying..."
+      when Net::HTTPNotFound
+        @logger.warn("File not found, skipping: #{file_url}")
+        return
+      else
+        raise "HTTP Error: #{response.code} #{response.message}"
+      end
+
+    rescue StandardError => e
+      if retries < MAX_RETRIES
+        retries += 1
+        @logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
+        sleep(RETRY_DELAY * retries)
+        retry
+      else
+        @failed_downloads << {url: file_url, error: e.message}
+        raise e
+      end
+    end
+  end
+
+  def cleanup
+    @connection_pool.shutdown
+
+    if @failed_downloads.any?
+      @logger.error("Failed downloads summary:")
+      @failed_downloads.each do |failure|
+        @logger.error(" #{failure[:url]} - #{failure[:error]}")
+      end
+    end
+  end
+end
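download_with_retry builds one of two archive URLs depending on the --rewritten flag; the id_ suffix asks the Wayback Machine for the original capture bytes rather than the rewritten page. A sketch with a made-up timestamp and URL:

file_timestamp = 20100916231334                # illustrative values
file_url = "http://example.com/page.html"
original  = "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"  # default
rewritten = "https://web.archive.org/web/#{file_timestamp}/#{file_url}"     # with --rewritten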
metadata
ADDED
@@ -0,0 +1,100 @@
+--- !ruby/object:Gem::Specification
+name: wayback_machine_downloader_straw
+version: !ruby/object:Gem::Version
+  version: 2.3.3
+platform: ruby
+authors:
+- strawberrymaster
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2025-03-08 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: concurrent-ruby
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.3.4
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.3.4
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '12.2'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '12.2'
+- !ruby/object:Gem::Dependency
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.2'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.2'
+description: Download complete websites from the Internet Archive's Wayback Machine.
+  While the Wayback Machine (archive.org) excellently preserves web history, it lacks
+  a built-in export functionality; this gem does just that, allowing you to download
+  entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader
+  gem by hartator, with enhanced features and performance improvements.)
+email: strawberrymaster@vivaldi.net
+executables:
+- wayback_machine_downloader
+extensions: []
+extra_rdoc_files: []
+files:
+- bin/wayback_machine_downloader
+- lib/wayback_machine_downloader.rb
+- lib/wayback_machine_downloader/archive_api.rb
+- lib/wayback_machine_downloader/tidy_bytes.rb
+- lib/wayback_machine_downloader/to_regex.rb
+homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 1.9.2
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.5.11
+signing_key:
+specification_version: 4
+summary: Download an entire website from the Wayback Machine.
+test_files: []
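For completeness, a minimal Gemfile entry matching this specification would look like the sketch below; the version and the single runtime dependency are taken from the metadata above:

source 'https://rubygems.org'

gem 'wayback_machine_downloader_straw', '2.3.3'   # pulls in concurrent-ruby ~> 1.3, >= 1.3.4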