wayback_machine_downloader_straw 2.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: a8d577ca08cca3858efd95bfd879b198a57aa6262fa8e0a7f83ab4f3a362f1fc
+   data.tar.gz: ef73d81d745e7b3e9226458a66b5d54c2410db646ea85cc7145813bc26789dc7
+ SHA512:
+   metadata.gz: 938e8544bb16b4afc6c81d0e4da602b5d3cd3e05482b3cc945ad3405681278fc03c7ccc1b84992b1ecafe66cf202aa71f306226b92f4b93b30b1c5c7edcbc86e
+   data.tar.gz: 8c236877be6274b9bb3c474fde9ad5a72a30abd5db3eadfc11eeae997488a2ddf27a72388b6d7d47455235ed0e084b1d14a7782911d0dd69e41f2fdcada713e2
bin/wayback_machine_downloader ADDED
@@ -0,0 +1,83 @@
+ #!/usr/bin/env ruby
+
+ require_relative '../lib/wayback_machine_downloader'
+ require 'optparse'
+ require 'pp'
+
+ options = {}
+ option_parser = OptionParser.new do |opts|
+   opts.banner = "Usage: wayback_machine_downloader http://example.com"
+
+   opts.separator ""
+   opts.separator "Download an entire website from the Wayback Machine."
+
+   opts.separator ""
+   opts.separator "Optional options:"
+
+   opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
+     options[:directory] = t
+   end
+
+   opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+     options[:all_timestamps] = true
+   end
+
+   opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (e.g. 20060716231334)") do |t|
+     options[:from_timestamp] = t
+   end
+
+   opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (e.g. 20100916231334)") do |t|
+     options[:to_timestamp] = t
+   end
+
+   opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
+     options[:exact_url] = t
+   end
+
+   opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+     options[:only_filter] = t
+   end
+
+   opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+     options[:exclude_filter] = t
+   end
+
+   opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
+     options[:all] = true
+   end
+
+   opts.on("-c", "--concurrency NUMBER", Integer, "Number of files to download at a time (e.g. 20)", "Default is one file at a time") do |t|
+     options[:threads_count] = t
+   end
+
+   opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Each page holds an average of 150,000 snapshots") do |t|
+     options[:maximum_pages] = t
+   end
+
+   opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
+     options[:list] = true
+   end
+
+   opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
+     options[:rewritten] = t
+   end
+
+   opts.on("-v", "--version", "Display version") do |t|
+     options[:version] = t
+   end
+ end.parse!
+
+ if (base_url = ARGV[-1])
+   options[:base_url] = base_url
+   wayback_machine_downloader = WaybackMachineDownloader.new options
+   if options[:list]
+     wayback_machine_downloader.list_files
+   else
+     wayback_machine_downloader.download_files
+   end
+ elsif options[:version]
+   puts WaybackMachineDownloader::VERSION
+ else
+   puts "You need to specify a website to back up. (e.g., http://example.com)"
+   puts "Run `wayback_machine_downloader --help` for more help."
+ end
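
For reference, the executable is a thin wrapper over the library class it requires above. A minimal sketch of the equivalent programmatic use (the URL and option values here are placeholders, not part of the package):

  require 'wayback_machine_downloader'

  # same option keys the CLI parser collects above
  downloader = WaybackMachineDownloader.new(
    base_url: 'http://example.com',   # positional argument in the CLI
    directory: 'websites/example',    # -d / --directory
    from_timestamp: 20060716231334,   # -f / --from
    threads_count: 4                  # -c / --concurrency
  )
  downloader.download_files
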
lib/wayback_machine_downloader/archive_api.rb ADDED
@@ -0,0 +1,35 @@
+ require 'json'
+ require 'uri'
+
+ module ArchiveAPI
+
+   def get_raw_list_from_api(url, page_index, http)
+     request_url = URI("https://web.archive.org/cdx/search/cdx")
+     params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
+     request_url.query = URI.encode_www_form(params)
+
+     begin
+       response = http.get(request_url)
+       body = response.body.to_s.strip
+       return [] if body.empty?
+       json = JSON.parse(body)
+
+       # drop the header row ["timestamp", "original"] if the API returned one
+       json.shift if json.first == ["timestamp", "original"]
+       json
+     rescue StandardError => e # covers JSON::ParserError and network errors
+       warn "Failed to fetch data from API: #{e.message}"
+       []
+     end
+   end
+
+   def parameters_for_api(page_index)
+     parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
+     parameters.push(["filter", "statuscode:200"]) unless @all
+     parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
+     parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
+     parameters.push(["page", page_index]) if page_index
+     parameters
+   end
+
+ end
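
To make the request shape concrete, this is roughly the CDX query that get_raw_list_from_api assembles for a paged fetch with the default status filter (a sketch; "example.com" and the page number are placeholder values):

  require 'uri'

  params = [["output", "json"], ["url", "example.com/*"],
            ["fl", "timestamp,original"], ["collapse", "digest"],
            ["gzip", "false"], ["filter", "statuscode:200"], ["page", 0]]
  request_url = URI("https://web.archive.org/cdx/search/cdx")
  request_url.query = URI.encode_www_form(params)
  request_url.to_s
  # => "https://web.archive.org/cdx/search/cdx?output=json&url=example.com%2F*&fl=timestamp%2Coriginal&collapse=digest&gzip=false&filter=statuscode%3A200&page=0"
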
lib/wayback_machine_downloader/tidy_bytes.rb ADDED
@@ -0,0 +1,77 @@
+ # frozen_string_literal: true
+
+ module TidyBytes
+   # precomputed CP1252 to UTF-8 mappings for bytes 128-159
+   CP1252_MAP = (128..159).map do |byte|
+     case byte
+     when 128 then [226, 130, 172] # EURO SIGN
+     when 130 then [226, 128, 154] # SINGLE LOW-9 QUOTATION MARK
+     when 131 then [198, 146] # LATIN SMALL LETTER F WITH HOOK
+     when 132 then [226, 128, 158] # DOUBLE LOW-9 QUOTATION MARK
+     when 133 then [226, 128, 166] # HORIZONTAL ELLIPSIS
+     when 134 then [226, 128, 160] # DAGGER
+     when 135 then [226, 128, 161] # DOUBLE DAGGER
+     when 136 then [203, 134] # MODIFIER LETTER CIRCUMFLEX ACCENT
+     when 137 then [226, 128, 176] # PER MILLE SIGN
+     when 138 then [197, 160] # LATIN CAPITAL LETTER S WITH CARON
+     when 139 then [226, 128, 185] # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+     when 140 then [197, 146] # LATIN CAPITAL LIGATURE OE
+     when 142 then [197, 189] # LATIN CAPITAL LETTER Z WITH CARON
+     when 145 then [226, 128, 152] # LEFT SINGLE QUOTATION MARK
+     when 146 then [226, 128, 153] # RIGHT SINGLE QUOTATION MARK
+     when 147 then [226, 128, 156] # LEFT DOUBLE QUOTATION MARK
+     when 148 then [226, 128, 157] # RIGHT DOUBLE QUOTATION MARK
+     when 149 then [226, 128, 162] # BULLET
+     when 150 then [226, 128, 147] # EN DASH
+     when 151 then [226, 128, 148] # EM DASH
+     when 152 then [203, 156] # SMALL TILDE
+     when 153 then [226, 132, 162] # TRADE MARK SIGN
+     when 154 then [197, 161] # LATIN SMALL LETTER S WITH CARON
+     when 155 then [226, 128, 186] # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+     when 156 then [197, 147] # LATIN SMALL LIGATURE OE
+     when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON
+     when 159 then [197, 184] # LATIN CAPITAL LETTER Y WITH DIAERESIS
+     end
+   end.freeze
+
+   # precomputed conversions for all 256 byte values
+   CP1252_TO_UTF8 = Array.new(256) do |b|
+     if (128..159).cover?(b)
+       CP1252_MAP[b - 128]&.pack('C*') || "\xEF\xBF\xBD".b # bytes 129, 141, 143, 144 and 157 are undefined in CP1252; use U+FFFD
+     elsif b < 128
+       b.chr
+     else
+       b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*')
+     end
+   end.freeze
+
+   def self.included(base)
+     base.class_eval do
+       def tidy_bytes(force = false)
+         return nil if empty?
+
+         if force
+           buffer = String.new(capacity: bytesize)
+           each_byte { |b| buffer << CP1252_TO_UTF8[b] }
+           return buffer.force_encoding(Encoding::UTF_8)
+         end
+
+         begin
+           encode('UTF-8')
+         rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+           # replace each invalid sequence using the CP1252 table, byte by byte
+           scrub { |bytes| bytes.bytes.map { |b| CP1252_TO_UTF8[b] }.join.force_encoding(Encoding::UTF_8) }
+         end
+       end
+
+       def tidy_bytes!(force = false)
+         result = tidy_bytes(force)
+         result ? replace(result) : self
+       end
+     end
+   end
+ end
+
+ class String
+   include TidyBytes
+ end
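
A quick sanity check of the mapping, assuming the gem is loaded: bytes 0x93 and 0x94 are CP1252 curly quotes but invalid UTF-8 on their own, so the forced conversion rewrites them byte by byte:

  raw = "\x93Hello\x94".b   # binary string containing CP1252 quote bytes
  raw.tidy_bytes(true)      # => "“Hello”" (forced byte-wise CP1252 -> UTF-8)
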
lib/wayback_machine_downloader/to_regex.rb ADDED
@@ -0,0 +1,107 @@
+ # frozen_string_literal: true
+
+ module ToRegex
+   module StringMixin
+     INLINE_OPTIONS = /[imxnesu]*/i.freeze
+     REGEXP_DELIMITERS = {
+       '%r{' => '}'.freeze,
+       '/' => '/'.freeze
+     }.freeze
+
+     REGEX_FLAGS = {
+       ignore_case: Regexp::IGNORECASE,
+       multiline: Regexp::MULTILINE,
+       extended: Regexp::EXTENDED
+     }.freeze
+
+     class << self
+       def literal?(str)
+         REGEXP_DELIMITERS.none? { |start, ending| str.start_with?(start) && str.match?(/#{Regexp.escape(ending)}#{INLINE_OPTIONS}\z/) }
+       end
+     end
+
+     # Get a regex back
+     #
+     # Without :literal or :detect, `"foo".to_regex` will return nil.
+     #
+     # @param [optional, Hash] options
+     # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
+     # @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
+     # @option options [true,false] :ignore_case /foo/i
+     # @option options [true,false] :multiline /foo/m
+     # @option options [true,false] :extended /foo/x
+     # @option options [true,false] :lang /foo/[nesu]
+     def to_regex(options = {})
+       args = as_regexp(options)
+       args ? Regexp.new(*args) : nil
+     end
+     # Return arguments that can be passed to `Regexp.new`
+     # @see #to_regex
+     def as_regexp(options = {})
+       raise ArgumentError, '[to_regex] Options must be a Hash' unless options.is_a?(Hash)
+
+       str = self
+       return if options[:detect] && str.empty?
+
+       if should_treat_as_literal?(str, options)
+         content = Regexp.escape(str)
+       elsif (delim_set = extract_delimiters(str))
+         content, options = parse_regexp_string(str, delim_set, options)
+         return unless content
+       else
+         return
+       end
+
+       build_regexp_args(content, options)
+     end
+
+     private
+
+     def should_treat_as_literal?(str, options)
+       options[:literal] || (options[:detect] && ToRegex::StringMixin.literal?(str))
+     end
+
+     def extract_delimiters(str)
+       REGEXP_DELIMITERS.find { |start, _| str.start_with?(start) }
+     end
+
+     def parse_regexp_string(str, delim_set, options)
+       start_delim, end_delim = delim_set
+       match = /\A#{Regexp.escape(start_delim)}(.*)#{Regexp.escape(end_delim)}(#{INLINE_OPTIONS})\z/u.match(str)
+       return unless match
+
+       content = match[1].gsub('\\/', '/')
+       parse_inline_options(match[2], options)
+       [content, options]
+     end
+
+     def parse_inline_options(inline_options, options)
+       return unless inline_options
+       options[:ignore_case] = true if inline_options.include?('i')
+       options[:multiline] = true if inline_options.include?('m')
+       options[:extended] = true if inline_options.include?('x')
+       # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
+       options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
+     end
+
+     def build_regexp_args(content, options)
+       flags = calculate_flags(options)
+       lang = normalize_lang_option(options[:lang])
+
+       lang.empty? ? [content, flags] : [content, flags, lang]
+     end
+
+     def calculate_flags(options)
+       REGEX_FLAGS.sum { |key, value| options[key] ? value : 0 }
+     end
+
+     def normalize_lang_option(lang)
+       return '' unless lang
+       RUBY_VERSION >= '1.9' ? lang.delete('u') : lang
+     end
+   end
+ end
+
+ class String
+   include ToRegex::StringMixin
+ end
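
The intended behaviour, as described in the comments above (a sketch, not package test output):

  "/ab+c/i".to_regex              # => /ab+c/i   (delimiters and inline flag parsed)
  "foo".to_regex                  # => nil       (no delimiters, no :literal or :detect)
  "1.5".to_regex(literal: true)   # => /1\.5/    (metacharacters escaped)
  "foo".to_regex(detect: true)    # => /foo/     (no delimiters, so treated as literal text)
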
lib/wayback_machine_downloader.rb ADDED
@@ -0,0 +1,491 @@
+ # encoding: UTF-8
+
+ require 'thread'
+ require 'net/http'
+ require 'open-uri'
+ require 'fileutils'
+ require 'cgi'
+ require 'json'
+ require 'time'
+ require 'concurrent-ruby'
+ require 'logger'
+ require_relative 'wayback_machine_downloader/tidy_bytes'
+ require_relative 'wayback_machine_downloader/to_regex'
+ require_relative 'wayback_machine_downloader/archive_api'
+
+ class ConnectionPool
+   MAX_AGE = 300
+   CLEANUP_INTERVAL = 60
+   DEFAULT_TIMEOUT = 30
+   MAX_RETRIES = 3
+
+   def initialize(size)
+     @size = size
+     @pool = Concurrent::Map.new
+     @creation_times = Concurrent::Map.new
+     @cleanup_thread = schedule_cleanup
+   end
+
+   def with_connection(&block)
+     conn = acquire_connection
+     begin
+       yield conn
+     ensure
+       release_connection(conn)
+     end
+   end
+
+   def shutdown
+     @cleanup_thread&.exit
+     @pool.each_value { |conn| conn.finish if conn&.started? }
+     @pool.clear
+     @creation_times.clear
+   end
+
+   private
+
+   def acquire_connection
+     thread_id = Thread.current.object_id
+     conn = @pool[thread_id]
+
+     if should_create_new?(conn)
+       conn&.finish if conn&.started?
+       conn = create_connection
+       @pool[thread_id] = conn
+       @creation_times[thread_id] = Time.now
+     end
+
+     conn
+   end
+
+   def release_connection(conn)
+     return unless conn
+     if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
+       conn.finish
+       @pool.delete(Thread.current.object_id)
+       @creation_times.delete(Thread.current.object_id)
+     end
+   end
+
+   def should_create_new?(conn)
+     return true if conn.nil?
+     return true unless conn.started?
+     return true if Time.now - @creation_times[Thread.current.object_id] > MAX_AGE
+     false
+   end
+
+   def create_connection
+     http = Net::HTTP.new("web.archive.org", 443)
+     http.use_ssl = true
+     http.read_timeout = DEFAULT_TIMEOUT
+     http.open_timeout = DEFAULT_TIMEOUT
+     http.keep_alive_timeout = 30
+     http.max_retries = MAX_RETRIES
+     http.start
+     http
+   end
+
+   def schedule_cleanup
+     Thread.new do
+       loop do
+         cleanup_old_connections
+         sleep CLEANUP_INTERVAL
+       end
+     end
+   end
+
+   def cleanup_old_connections
+     current_time = Time.now
+     @creation_times.each do |thread_id, creation_time|
+       if current_time - creation_time > MAX_AGE
+         conn = @pool[thread_id]
+         conn&.finish if conn&.started?
+         @pool.delete(thread_id)
+         @creation_times.delete(thread_id)
+       end
+     end
+   end
+ end
+
+ class WaybackMachineDownloader
+
+   include ArchiveAPI
+
+   VERSION = "2.3.3"
+   DEFAULT_TIMEOUT = 30
+   MAX_RETRIES = 3
+   RETRY_DELAY = 2
+   RATE_LIMIT = 0.25 # delay between requests, in seconds
+   CONNECTION_POOL_SIZE = 10
+   MEMORY_BUFFER_SIZE = 16384 # 16KB chunks
+
+   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
+     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
+     :all, :maximum_pages, :threads_count, :logger
+
+   def initialize params
+     validate_params(params)
+     @base_url = params[:base_url]
+     @exact_url = params[:exact_url]
+     @directory = params[:directory]
+     @all_timestamps = params[:all_timestamps]
+     @from_timestamp = params[:from_timestamp].to_i
+     @to_timestamp = params[:to_timestamp].to_i
+     @only_filter = params[:only_filter]
+     @exclude_filter = params[:exclude_filter]
+     @all = params[:all]
+     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
+     @threads_count = [params[:threads_count].to_i, 1].max
+     @rewritten = params[:rewritten]
+     @timeout = params[:timeout] || DEFAULT_TIMEOUT
+     @logger = setup_logger
+     @failed_downloads = Concurrent::Array.new
+     @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE)
+   end
+
+   def backup_name
+     if @base_url.include? '//'
+       @base_url.split('/')[2]
+     else
+       @base_url
+     end
+   end
+
+   def backup_path
+     if @directory
+       if @directory[-1] == '/'
+         @directory
+       else
+         @directory + '/'
+       end
+     else
+       'websites/' + backup_name + '/'
+     end
+   end
+
+   def match_only_filter file_url
+     if @only_filter
+       only_filter_regex = @only_filter.to_regex
+       if only_filter_regex
+         only_filter_regex =~ file_url
+       else
+         file_url.downcase.include? @only_filter.downcase
+       end
+     else
+       true
+     end
+   end
+
+   def match_exclude_filter file_url
+     if @exclude_filter
+       exclude_filter_regex = @exclude_filter.to_regex
+       if exclude_filter_regex
+         exclude_filter_regex =~ file_url
+       else
+         file_url.downcase.include? @exclude_filter.downcase
+       end
+     else
+       false
+     end
+   end
+
+   def get_all_snapshots_to_consider
+     snapshot_list_to_consider = []
+
+     @connection_pool.with_connection do |connection|
+       puts "Getting snapshot pages"
+
+       # Fetch the initial set of snapshots
+       snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, connection)
+       print "."
+
+       # Fetch additional pages if the exact URL flag is not set
+       unless @exact_url
+         @maximum_pages.times do |page_index|
+           snapshot_list = get_raw_list_from_api("#{@base_url}/*", page_index, connection)
+           break if snapshot_list.empty?
+
+           snapshot_list_to_consider += snapshot_list
+           print "."
+         end
+       end
+     end
+
+     puts " found #{snapshot_list_to_consider.length} snapshots to consider."
+     puts
+
+     snapshot_list_to_consider
+   end
+
+   def get_file_list_curated
+     file_list_curated = Hash.new
+     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+       next unless file_url.include?('/')
+       file_id = file_url.split('/')[3..-1].join('/')
+       file_id = CGI::unescape file_id
+       file_id = file_id.tidy_bytes unless file_id == ""
+       if file_id.nil?
+         puts "Malformed file url, ignoring: #{file_url}"
+       else
+         if match_exclude_filter(file_url)
+           puts "File url matches exclude filter, ignoring: #{file_url}"
+         elsif not match_only_filter(file_url)
+           puts "File url doesn't match only filter, ignoring: #{file_url}"
+         elsif file_list_curated[file_id]
+           unless file_list_curated[file_id][:timestamp] > file_timestamp
+             file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+           end
+         else
+           file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+         end
+       end
+     end
+     file_list_curated
+   end
+
+   def get_file_list_all_timestamps
+     file_list_curated = Hash.new
+     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+       next unless file_url.include?('/')
+       file_id = file_url.split('/')[3..-1].join('/')
+       file_id_and_timestamp = [file_timestamp, file_id].join('/')
+       file_id_and_timestamp = CGI::unescape file_id_and_timestamp
+       file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
+       if file_id.nil?
+         puts "Malformed file url, ignoring: #{file_url}"
+       else
+         if match_exclude_filter(file_url)
+           puts "File url matches exclude filter, ignoring: #{file_url}"
+         elsif not match_only_filter(file_url)
+           puts "File url doesn't match only filter, ignoring: #{file_url}"
+         elsif file_list_curated[file_id_and_timestamp]
+           puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
+         else
+           file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
+         end
+       end
+     end
+     puts "file_list_curated: " + file_list_curated.count.to_s
+     file_list_curated
+   end
+
+
+   def get_file_list_by_timestamp
+     if @all_timestamps
+       file_list_curated = get_file_list_all_timestamps
+       file_list_curated.map do |file_remote_info|
+         file_remote_info[1][:file_id] = file_remote_info[0]
+         file_remote_info[1]
+       end
+     else
+       file_list_curated = get_file_list_curated
+       file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
+       file_list_curated.map do |file_remote_info|
+         file_remote_info[1][:file_id] = file_remote_info[0]
+         file_remote_info[1]
+       end
+     end
+   end
+
+   def list_files
+     # retrieval produces its own output
+     @orig_stdout = $stdout
+     $stdout = $stderr
+     files = get_file_list_by_timestamp
+     $stdout = @orig_stdout
+     puts "["
+     files.each_with_index do |file, index|
+       separator = index == files.length - 1 ? "" : ","
+       puts file.to_json + separator
+     end
+     puts "]"
+   end
+
+   def download_files
+     start_time = Time.now
+     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
+
+     if file_list_by_timestamp.empty?
+       puts "No files to download."
+       return
+     end
+
+     total_files = file_list_by_timestamp.count
+     puts "#{total_files} files to download:"
+
+     @processed_file_count = 0
+     @download_mutex = Mutex.new
+
+     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
+     pool = Concurrent::FixedThreadPool.new(thread_count)
+
+     file_list_by_timestamp.each do |file_remote_info|
+       pool.post do
+         @connection_pool.with_connection do |connection|
+           result = download_file(file_remote_info, connection)
+           @download_mutex.synchronize do
+             @processed_file_count += 1
+             puts result if result
+           end
+         end
+         sleep(RATE_LIMIT)
+       end
+     end
+
+     pool.shutdown
+     pool.wait_for_termination
+
+     end_time = Time.now
+     puts "\nDownload completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path}"
+     cleanup
+   end
+
+   def structure_dir_path dir_path
+     begin
+       FileUtils::mkdir_p dir_path unless File.exist? dir_path
+     rescue Errno::EEXIST => e
+       error_to_string = e.to_s
+       puts "# #{error_to_string}"
+       if error_to_string.include? "File exists @ dir_s_mkdir - "
+         file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
+       elsif error_to_string.include? "File exists - "
+         file_already_existing = error_to_string.split("File exists - ")[-1]
+       else
+         raise "Unhandled directory restructure error # #{error_to_string}"
+       end
+       file_already_existing_temporary = file_already_existing + '.temp'
+       file_already_existing_permanent = file_already_existing + '/index.html'
+       FileUtils::mv file_already_existing, file_already_existing_temporary
+       FileUtils::mkdir_p file_already_existing
+       FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
+       puts "#{file_already_existing} -> #{file_already_existing_permanent}"
+       structure_dir_path dir_path
+     end
+   end
+
+   def download_file(file_remote_info, http)
+     current_encoding = "".encoding
+     file_url = file_remote_info[:file_url].encode(current_encoding)
+     file_id = file_remote_info[:file_id]
+     file_timestamp = file_remote_info[:timestamp]
+     file_path_elements = file_id.split('/')
+
+     if file_id == ""
+       dir_path = backup_path
+       file_path = backup_path + 'index.html'
+     elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
+       dir_path = backup_path + file_path_elements[0..-1].join('/')
+       file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
+     else
+       dir_path = backup_path + file_path_elements[0..-2].join('/')
+       file_path = backup_path + file_path_elements[0..-1].join('/')
+     end
+     if Gem.win_platform?
+       dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
+       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
+     end
+     unless File.exist? file_path
+       begin
+         structure_dir_path dir_path
+         download_with_retry(file_path, file_url, file_timestamp, http)
+         "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
+       rescue StandardError => e
+         msg = "#{file_url} # #{e}"
+         if not @all and File.exist?(file_path) and File.size(file_path) == 0
+           File.delete(file_path)
+           msg += "\n#{file_path} was empty and was removed."
+         end
+         msg
+       end
+     else
+       "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{file_list_by_timestamp.size})"
+     end
+   end
+
+   def file_queue
+     @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
+   end
+
+   def file_list_by_timestamp
+     @file_list_by_timestamp ||= get_file_list_by_timestamp
+   end
+
+   private
+
+   def validate_params(params)
+     raise ArgumentError, "Base URL is required" unless params[:base_url]
+     raise ArgumentError, "Maximum pages must be positive" if params[:maximum_pages] && params[:maximum_pages].to_i <= 0
+   end
+
+   def setup_logger
+     logger = Logger.new(STDOUT)
+     logger.level = ENV['DEBUG'] ? Logger::DEBUG : Logger::INFO
+     logger.formatter = proc do |severity, datetime, progname, msg|
+       "#{datetime.strftime('%Y-%m-%d %H:%M:%S')} [#{severity}] #{msg}\n"
+     end
+     logger
+   end
+
+   def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0)
+     retries = 0
+     begin
+       wayback_url = if @rewritten
+         "https://web.archive.org/web/#{file_timestamp}/#{file_url}"
+       else
+         "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}"
+       end
+
+       request = Net::HTTP::Get.new(URI(wayback_url))
+       request["Connection"] = "keep-alive"
+       request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}"
+
+       response = connection.request(request)
+
+       case response
+       when Net::HTTPSuccess
+         File.open(file_path, "wb") do |file|
+           if block_given?
+             yield(response, file)
+           else
+             file.write(response.body)
+           end
+         end
+       when Net::HTTPRedirection
+         raise "Too many redirects for #{file_url}" if redirect_count >= 2
+         location = response['location']
+         @logger.warn("Redirect found for #{file_url} -> #{location}")
+         return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1)
+       when Net::HTTPTooManyRequests
+         sleep(RATE_LIMIT * 2)
+         raise "Rate limited, retrying..."
+       when Net::HTTPNotFound
+         @logger.warn("File not found, skipping: #{file_url}")
+         return
+       else
+         raise "HTTP Error: #{response.code} #{response.message}"
+       end
+
+     rescue StandardError => e
+       if retries < MAX_RETRIES
+         retries += 1
+         @logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}")
+         sleep(RETRY_DELAY * retries)
+         retry
+       else
+         @failed_downloads << {url: file_url, error: e.message}
+         raise e
+       end
+     end
+   end
+
+   def cleanup
+     @connection_pool.shutdown
+
+     if @failed_downloads.any?
+       @logger.error("Failed downloads summary:")
+       @failed_downloads.each do |failure|
+         @logger.error(" #{failure[:url]} - #{failure[:error]}")
+       end
+     end
+   end
+ end
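
For orientation, the curated list keys each file by the path portion of its archived URL; the derivation used in get_file_list_curated reduces to this (the URL below is a placeholder):

  file_url = "http://example.com/assets/app.js"
  file_id  = file_url.split('/')[3..-1].join('/')  # drops "http:", "", and the host
  # => "assets/app.js"
  # CGI.unescape and tidy_bytes are then applied before the id is used as a hash key
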
metadata ADDED
@@ -0,0 +1,100 @@
+ --- !ruby/object:Gem::Specification
+ name: wayback_machine_downloader_straw
+ version: !ruby/object:Gem::Version
+   version: 2.3.3
+ platform: ruby
+ authors:
+ - strawberrymaster
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2025-03-08 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: concurrent-ruby
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.3'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.3.4
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.3'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.3.4
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '12.2'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '12.2'
+ - !ruby/object:Gem::Dependency
+   name: minitest
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '5.2'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '5.2'
+ description: Download complete websites from the Internet Archive's Wayback Machine.
+   While the Wayback Machine (archive.org) excellently preserves web history, it lacks
+   built-in export functionality; this gem fills that gap, allowing you to download
+   entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader
+   gem by hartator, with enhanced features and performance improvements.)
+ email: strawberrymaster@vivaldi.net
+ executables:
+ - wayback_machine_downloader
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/wayback_machine_downloader
+ - lib/wayback_machine_downloader.rb
+ - lib/wayback_machine_downloader/archive_api.rb
+ - lib/wayback_machine_downloader/tidy_bytes.rb
+ - lib/wayback_machine_downloader/to_regex.rb
+ homepage: https://github.com/StrawberryMaster/wayback-machine-downloader
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 1.9.2
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.5.11
+ signing_key:
+ specification_version: 4
+ summary: Download an entire website from the Wayback Machine.
+ test_files: []