wayback_machine_downloader_hhr 2.3.2

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 89aa753a924055b41a371b5c616158dc5b65dfa63b136fff078588d839949f64
+   data.tar.gz: be143940de3f24c545a8bf202b1fb28f601b124f69927213e834b49bada36cf3
+ SHA512:
+   metadata.gz: adf23257485832a2e6c4ccc443cf43583e59851e39d2e474bbad097ff9332f71ec63e1e0ade769b72485ca6e38234fac5f18b62ba3ca9858cf6bd46ebc1a4835
+   data.tar.gz: 14779e8b3bc933186d33671047411a10bdb27f2b04e311013b9c630849b07393251ee6db6445419cea71ad8c012bd1fc81aa2b94f6ad51129fef9702e8d4fa42
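These digests let a consumer verify the gem payload before installing it. A minimal sketch of that check, assuming metadata.gz and data.tar.gz have been extracted from the .gem archive into the current directory (the local file paths are assumptions, not part of the gem):

    require 'digest'

    # Hypothetical verification: compare each payload file against checksums.yaml.
    %w[metadata.gz data.tar.gz].each do |name|
      puts "#{name}: #{Digest::SHA256.file(name).hexdigest}"
    end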
bin/wayback_machine_downloader ADDED
@@ -0,0 +1,79 @@
+ #!/usr/bin/env ruby
+ 
+ require_relative '../lib/wayback_machine_downloader'
+ require 'optparse'
+ require 'pp'
+ 
+ options = {}
+ option_parser = OptionParser.new do |opts|
+   opts.banner = "Usage: wayback_machine_downloader http://example.com"
+ 
+   opts.separator ""
+   opts.separator "Download an entire website from the Wayback Machine."
+ 
+   opts.separator ""
+   opts.separator "Options:"
+ 
+   opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
+     options[:directory] = t
+   end
+ 
+   opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+     options[:all_timestamps] = true
+   end
+ 
+   opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after the supplied timestamp (e.g. 20060716231334)") do |t|
+     options[:from_timestamp] = t
+   end
+ 
+   opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before the supplied timestamp (e.g. 20100916231334)") do |t|
+     options[:to_timestamp] = t
+   end
+ 
+   opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
+     options[:exact_url] = t
+   end
+ 
+   opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+     options[:only_filter] = t
+   end
+ 
+   opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+     options[:exclude_filter] = t
+   end
+ 
+   opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
+     options[:all] = true
+   end
+ 
+   opts.on("-c", "--concurrency NUMBER", Integer, "Number of files to download at a time (e.g. 20)", "Default is one file at a time") do |t|
+     options[:threads_count] = t
+   end
+ 
+   opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Each page holds an average of 150,000 snapshots") do |t|
+     options[:maximum_pages] = t
+   end
+ 
+   opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
+     options[:list] = true
+   end
+ 
+   opts.on("-v", "--version", "Display version") do |t|
+     options[:version] = t
+   end
+ end.parse!
+ 
+ if (base_url = ARGV[-1])
+   options[:base_url] = base_url
+   wayback_machine_downloader = WaybackMachineDownloader.new options
+   if options[:list]
+     wayback_machine_downloader.list_files
+   else
+     wayback_machine_downloader.download_files
+   end
+ elsif options[:version]
+   puts WaybackMachineDownloader::VERSION
+ else
+   puts "You need to specify a website to back up. (e.g., http://example.com)"
+   puts "Run `wayback_machine_downloader --help` for more help."
+ end
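The executable is a thin wrapper: it collects the flags into an options hash and hands it to the WaybackMachineDownloader class defined below. A hedged sketch of the equivalent programmatic call (the URL and option values are placeholders):

    require 'wayback_machine_downloader'

    # Roughly what `wayback_machine_downloader http://example.com -d ./backup -c 4` does:
    downloader = WaybackMachineDownloader.new(
      base_url: 'http://example.com',  # ARGV[-1] in the CLI
      directory: './backup',           # -d / --directory
      threads_count: 4                 # -c / --concurrency
    )
    downloader.download_files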
lib/wayback_machine_downloader/archive_api.rb ADDED
@@ -0,0 +1,40 @@
+ require 'json'
+ require 'uri'
+ 
+ module ArchiveAPI
+ 
+   # Fetch one page of [timestamp, original_url] rows from the CDX search API.
+   def get_raw_list_from_api url, page_index, http
+     request_url = URI("https://web.archive.org/cdx/search/cdx")
+     params = [["output", "json"], ["url", url]]
+     params += parameters_for_api page_index
+     request_url.query = URI.encode_www_form(params)
+ 
+     begin
+       json = JSON.parse(http.get(request_url).body)
+       if json[0] == ["timestamp", "original"]
+         json.shift
+       end
+       json
+     rescue JSON::ParserError
+       []
+     end
+   end
+ 
+   # Build the CDX query parameters from the downloader's filters.
+   def parameters_for_api page_index
+     parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
+     if !@all
+       parameters.push(["filter", "statuscode:200"])
+     end
+     if @from_timestamp and @from_timestamp != 0
+       parameters.push(["from", @from_timestamp.to_s])
+     end
+     if @to_timestamp and @to_timestamp != 0
+       parameters.push(["to", @to_timestamp.to_s])
+     end
+     if page_index
+       parameters.push(["page", page_index])
+     end
+     parameters
+   end
+ 
+ end
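For reference, a sketch of the query this module builds and the shape of the rows it returns; the example.com URL and the sample row are illustrative, not real data:

    require 'net/http'
    require 'json'
    require 'uri'

    uri = URI("https://web.archive.org/cdx/search/cdx")
    uri.query = URI.encode_www_form(
      [["output", "json"], ["url", "http://example.com/*"],
       ["fl", "timestamp,original"], ["collapse", "digest"],
       ["gzip", "false"], ["filter", "statuscode:200"], ["page", 0]]
    )
    rows = JSON.parse(Net::HTTP.get(uri))
    rows.shift if rows[0] == ["timestamp", "original"]  # drop the header row
    # rows now looks like [["20060716231334", "http://example.com:80/"], ...]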
lib/wayback_machine_downloader/tidy_bytes.rb ADDED
@@ -0,0 +1,122 @@
+ module TibyBytes
+ 
+   # CP-1252 decimal byte => UTF-8 approximation as an array of bytes
+   CP1252 = {
+     128 => [226, 130, 172],
+     129 => nil,
+     130 => [226, 128, 154],
+     131 => [198, 146],
+     132 => [226, 128, 158],
+     133 => [226, 128, 166],
+     134 => [226, 128, 160],
+     135 => [226, 128, 161],
+     136 => [203, 134],
+     137 => [226, 128, 176],
+     138 => [197, 160],
+     139 => [226, 128, 185],
+     140 => [197, 146],
+     141 => nil,
+     142 => [197, 189],
+     143 => nil,
+     144 => nil,
+     145 => [226, 128, 152],
+     146 => [226, 128, 153],
+     147 => [226, 128, 156],
+     148 => [226, 128, 157],
+     149 => [226, 128, 162],
+     150 => [226, 128, 147],
+     151 => [226, 128, 148],
+     152 => [203, 156],
+     153 => [226, 132, 162],
+     154 => [197, 161],
+     155 => [226, 128, 186],
+     156 => [197, 147],
+     157 => nil,
+     158 => [197, 190],
+     159 => [197, 184]
+   }
+ 
+   module StringMixin
+ 
+     # Attempt to replace invalid UTF-8 bytes with valid ones. This method
+     # naively assumes that invalid UTF-8 bytes are either Windows CP-1252
+     # or ISO-8859-1. In practice this isn't a bad assumption, but it may
+     # not always work.
+     #
+     # Passing +true+ will forcibly tidy all bytes, assuming that the string's
+     # encoding is CP-1252 or ISO-8859-1.
+     def tidy_bytes(force = false)
+ 
+       if force
+         return unpack("C*").map do |b|
+           tidy_byte(b)
+         end.flatten.compact.pack("C*").unpack("U*").pack("U*")
+       end
+ 
+       bytes = unpack("C*")
+       conts_expected = 0
+       last_lead = 0
+ 
+       bytes.each_index do |i|
+ 
+         byte = bytes[i]
+         _is_ascii = byte < 128
+         is_cont = byte > 127 && byte < 192
+         is_lead = byte > 191 && byte < 245
+         is_unused = byte > 240
+         is_restricted = byte > 244
+ 
+         # Impossible or highly unlikely byte? Clean it.
+         if is_unused || is_restricted
+           bytes[i] = tidy_byte(byte)
+         elsif is_cont
+           # Not expecting a continuation byte? Clean it up. Otherwise, expect one less.
+           conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
+         else
+           if conts_expected > 0
+             # Expected a continuation, but got ASCII or a leading byte? Clean
+             # backwards up to the leading byte.
+             begin
+               (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
+             rescue NoMethodError
+               next
+             end
+             conts_expected = 0
+           end
+           if is_lead
+             # Final byte is a leading byte? Clean it.
+             if i == bytes.length - 1
+               bytes[i] = tidy_byte(bytes.last)
+             else
+               # Valid leading byte? Expect continuations determined by the position
+               # of the first zero bit, with a max of 3.
+               conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
+               last_lead = i
+             end
+           end
+         end
+       end
+       begin
+         bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
+       rescue ArgumentError
+         nil
+       end
+     end
+ 
+     # Tidy bytes in-place.
+     def tidy_bytes!(force = false)
+       replace tidy_bytes(force)
+     end
+ 
+     private
+ 
+     def tidy_byte(byte)
+       byte < 160 ? TibyBytes::CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
+     end
+ 
+   end
+ end
+ 
+ class String
+   include TibyBytes::StringMixin
+ end
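Downloaded file ids pass through tidy_bytes after percent-decoding, so stray single-byte CP-1252 or Latin-1 characters survive as UTF-8. A quick sketch of the non-force path (the sample strings are illustrative):

    puts "it\x92s".tidy_bytes  # => "it’s"  (0x92 mapped through the CP1252 table)
    puts "caf\xE9".tidy_bytes  # => "café"  (0xE9 treated as ISO-8859-1)
    puts "déjà vu".tidy_bytes  # valid UTF-8 passes through unchanged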
lib/wayback_machine_downloader/to_regex.rb ADDED
@@ -0,0 +1,81 @@
+ module ToRegex
+   module StringMixin
+     class << self
+       def literal?(str)
+         REGEXP_DELIMITERS.none? { |s, e| str.start_with?(s) and str =~ /#{e}#{INLINE_OPTIONS}\z/ }
+       end
+     end
+ 
+     INLINE_OPTIONS = /[imxnesu]*/
+     REGEXP_DELIMITERS = {
+       '%r{' => '}',
+       '/' => '/',
+     }
+ 
+     # Get a regex back
+     #
+     # Without :literal or :detect, `"foo".to_regex` will return nil.
+     #
+     # @param [optional, Hash] options
+     # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
+     # @option options [true,false] :detect If the string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
+     # @option options [true,false] :ignore_case /foo/i
+     # @option options [true,false] :multiline /foo/m
+     # @option options [true,false] :extended /foo/x
+     # @option options [true,false] :lang /foo/[nesu]
+     def to_regex(options = {})
+       if args = as_regexp(options)
+         ::Regexp.new(*args)
+       end
+     end
+ 
+     # Return arguments that can be passed to `Regexp.new`
+     # @see to_regex
+     def as_regexp(options = {})
+       unless options.is_a?(::Hash)
+         raise ::ArgumentError, "[to_regex] options must be a Hash"
+       end
+       str = self
+ 
+       return if options[:detect] and str == ''
+ 
+       if options[:literal] or (options[:detect] and ToRegex::StringMixin.literal?(str))
+         content = ::Regexp.escape str
+       elsif delim_set = REGEXP_DELIMITERS.detect { |k, _| str.start_with?(k) }
+         delim_start, delim_end = delim_set
+         /\A#{delim_start}(.*)#{delim_end}(#{INLINE_OPTIONS})\z/u =~ str
+         content = $1
+         inline_options = $2
+         return unless content.is_a?(::String)
+         content.gsub! '\\/', '/'
+         if inline_options
+           options[:ignore_case] = true if inline_options.include?('i')
+           options[:multiline] = true if inline_options.include?('m')
+           options[:extended] = true if inline_options.include?('x')
+           # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
+           options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
+         end
+       else
+         return
+       end
+ 
+       ignore_case = options[:ignore_case] ? ::Regexp::IGNORECASE : 0
+       multiline = options[:multiline] ? ::Regexp::MULTILINE : 0
+       extended = options[:extended] ? ::Regexp::EXTENDED : 0
+       lang = options[:lang] || ''
+       if ::RUBY_VERSION > '1.9' and lang.include?('u')
+         lang = lang.delete 'u'
+       end
+ 
+       if lang.empty?
+         [ content, (ignore_case|multiline|extended) ]
+       else
+         [ content, (ignore_case|multiline|extended), lang ]
+       end
+     end
+   end
+ end
+ 
+ class String
+   include ToRegex::StringMixin
+ end
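This mixin is what makes the --only and --exclude filters regex-aware: slash-delimited strings become real Regexp objects, while anything else falls back to substring matching in the downloader. A brief sketch:

    puts "/\\.pdf$/i".to_regex.inspect          # => /\.pdf$/i
    puts "wp-content".to_regex.inspect          # => nil (plain strings yield no regex)
    puts "a.b".to_regex(literal: true).inspect  # => /a\.b/ (metacharacters escaped)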
lib/wayback_machine_downloader.rb ADDED
@@ -0,0 +1,323 @@
+ # encoding: UTF-8
+ 
+ require 'thread'
+ require 'net/http'
+ require 'open-uri'
+ require 'fileutils'
+ require 'cgi'
+ require 'json'
+ require_relative 'wayback_machine_downloader/tidy_bytes'
+ require_relative 'wayback_machine_downloader/to_regex'
+ require_relative 'wayback_machine_downloader/archive_api'
+ 
+ class WaybackMachineDownloader
+ 
+   include ArchiveAPI
+ 
+   VERSION = "2.3.2"
+ 
+   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
+     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
+     :all, :maximum_pages, :threads_count
+ 
+   def initialize params
+     @base_url = params[:base_url]
+     @exact_url = params[:exact_url]
+     @directory = params[:directory]
+     @all_timestamps = params[:all_timestamps]
+     @from_timestamp = params[:from_timestamp].to_i
+     @to_timestamp = params[:to_timestamp].to_i
+     @only_filter = params[:only_filter]
+     @exclude_filter = params[:exclude_filter]
+     @all = params[:all]
+     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
+     @threads_count = params[:threads_count].to_i
+   end
+ 
+   def backup_name
+     if @base_url.include? '//'
+       @base_url.split('/')[2]
+     else
+       @base_url
+     end
+   end
+ 
+   def backup_path
+     if @directory
+       if @directory[-1] == '/'
+         @directory
+       else
+         @directory + '/'
+       end
+     else
+       'websites/' + backup_name + '/'
+     end
+   end
+ 
+   def match_only_filter file_url
+     if @only_filter
+       only_filter_regex = @only_filter.to_regex
+       if only_filter_regex
+         only_filter_regex =~ file_url
+       else
+         file_url.downcase.include? @only_filter.downcase
+       end
+     else
+       true
+     end
+   end
+ 
+   def match_exclude_filter file_url
+     if @exclude_filter
+       exclude_filter_regex = @exclude_filter.to_regex
+       if exclude_filter_regex
+         exclude_filter_regex =~ file_url
+       else
+         file_url.downcase.include? @exclude_filter.downcase
+       end
+     else
+       false
+     end
+   end
+ 
+   def get_all_snapshots_to_consider
+     # Note: Passing a page index parameter allows us to get more snapshots,
+     # but from a less fresh index
+     http = Net::HTTP.new("web.archive.org", 443)
+     http.use_ssl = true
+     http.start
+     print "Getting snapshot pages"
+     snapshot_list_to_consider = []
+     snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, http)
+     print "."
+     unless @exact_url
+       @maximum_pages.times do |page_index|
+         snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index, http)
+         break if snapshot_list.empty?
+         snapshot_list_to_consider += snapshot_list
+         print "."
+       end
+     end
+     http.finish
+     puts " found #{snapshot_list_to_consider.length} snapshots to consider."
+     puts
+     snapshot_list_to_consider
+   end
+ 
+   def get_file_list_curated
+     file_list_curated = Hash.new
+     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+       next unless file_url.include?('/')
+       file_id = file_url.split('/')[3..-1].join('/')
+       file_id = CGI::unescape file_id
+       file_id = file_id.tidy_bytes unless file_id == ""
+       if file_id.nil?
+         puts "Malformed file url, ignoring: #{file_url}"
+       else
+         if match_exclude_filter(file_url)
+           puts "File url matches exclude filter, ignoring: #{file_url}"
+         elsif not match_only_filter(file_url)
+           puts "File url doesn't match only filter, ignoring: #{file_url}"
+         elsif file_list_curated[file_id]
+           unless file_list_curated[file_id][:timestamp] > file_timestamp
+             file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+           end
+         else
+           file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+         end
+       end
+     end
+     file_list_curated
+   end
+ 
+   def get_file_list_all_timestamps
+     file_list_curated = Hash.new
+     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+       next unless file_url.include?('/')
+       file_id = file_url.split('/')[3..-1].join('/')
+       file_id_and_timestamp = [file_timestamp, file_id].join('/')
+       file_id_and_timestamp = CGI::unescape file_id_and_timestamp
+       file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
+       if file_id.nil?
+         puts "Malformed file url, ignoring: #{file_url}"
+       else
+         if match_exclude_filter(file_url)
+           puts "File url matches exclude filter, ignoring: #{file_url}"
+         elsif not match_only_filter(file_url)
+           puts "File url doesn't match only filter, ignoring: #{file_url}"
+         elsif file_list_curated[file_id_and_timestamp]
+           puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
+         else
+           file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
+         end
+       end
+     end
+     puts "file_list_curated: #{file_list_curated.count}"
+     file_list_curated
+   end
+ 
+ 
+   def get_file_list_by_timestamp
+     if @all_timestamps
+       file_list_curated = get_file_list_all_timestamps
+       file_list_curated.map do |file_remote_info|
+         file_remote_info[1][:file_id] = file_remote_info[0]
+         file_remote_info[1]
+       end
+     else
+       file_list_curated = get_file_list_curated
+       file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
+       file_list_curated.map do |file_remote_info|
+         file_remote_info[1][:file_id] = file_remote_info[0]
+         file_remote_info[1]
+       end
+     end
+   end
+ 
+   def list_files
+     # File list retrieval prints its own progress output, so divert it to
+     # stderr while the JSON listing goes to stdout.
+     @orig_stdout = $stdout
+     $stdout = $stderr
+     files = get_file_list_by_timestamp
+     $stdout = @orig_stdout
+     puts "["
+     files[0...-1].each do |file|
+       puts file.to_json + ","
+     end
+     puts files[-1].to_json
+     puts "]"
+   end
+ 
+   def download_files
+     start_time = Time.now
+     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
+     puts
+ 
+     if file_list_by_timestamp.count == 0
+       puts "No files to download."
+       puts "Possible reasons:"
+       puts "\t* Site is not in Wayback Machine Archive."
+       puts "\t* From timestamp too far in the future." if @from_timestamp and @from_timestamp != 0
+       puts "\t* To timestamp too far in the past." if @to_timestamp and @to_timestamp != 0
+       puts "\t* Only filter too restrictive (#{only_filter})" if @only_filter
+       puts "\t* Exclude filter too wide (#{exclude_filter})" if @exclude_filter
+       return
+     end
+ 
+     puts "#{file_list_by_timestamp.count} files to download:"
+ 
+     threads = []
+     @processed_file_count = 0
+     @threads_count = 1 if @threads_count == 0
+     @threads_count.times do
+       http = Net::HTTP.new("web.archive.org", 443)
+       http.use_ssl = true
+       http.start
+       threads << Thread.new do
+         until file_queue.empty?
+           file_remote_info = file_queue.pop(true) rescue nil
+           download_file(file_remote_info, http) if file_remote_info
+         end
+         http.finish
+       end
+     end
+ 
+     threads.each(&:join)
+     end_time = Time.now
+     puts
+     puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
+   end
+ 
+   def structure_dir_path dir_path
+     begin
+       FileUtils::mkdir_p dir_path unless File.exist? dir_path
+     rescue Errno::EEXIST => e
+       # A file is blocking the directory path; move it aside as index.html and retry.
+       error_to_string = e.to_s
+       puts "# #{error_to_string}"
+       if error_to_string.include? "File exists @ dir_s_mkdir - "
+         file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
+       elsif error_to_string.include? "File exists - "
+         file_already_existing = error_to_string.split("File exists - ")[-1]
+       else
+         raise "Unhandled directory restructure error # #{error_to_string}"
+       end
+       file_already_existing_temporary = file_already_existing + '.temp'
+       file_already_existing_permanent = file_already_existing + '/index.html'
+       FileUtils::mv file_already_existing, file_already_existing_temporary
+       FileUtils::mkdir_p file_already_existing
+       FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
+       puts "#{file_already_existing} -> #{file_already_existing_permanent}"
+       structure_dir_path dir_path
+     end
+   end
+ 
+   def download_file(file_remote_info, http)
+     current_encoding = "".encoding
+     file_url = file_remote_info[:file_url].encode(current_encoding)
+     file_id = file_remote_info[:file_id]
+     file_timestamp = file_remote_info[:timestamp]
+     file_path_elements = file_id.split('/')
+     if file_id == ""
+       dir_path = backup_path
+       file_path = backup_path + 'index.html'
+     elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
+       dir_path = backup_path + file_path_elements[0..-1].join('/')
+       file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
+     else
+       dir_path = backup_path + file_path_elements[0..-2].join('/')
+       file_path = backup_path + file_path_elements[0..-1].join('/')
+     end
+     if Gem.win_platform?
+       dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
+       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
+     end
+     unless File.exist? file_path
+       begin
+         structure_dir_path dir_path
+         File.open(file_path, "wb") do |file|
+           begin
+             http.get(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}")) do |chunk|
+               file.write(chunk)
+             end
+           rescue OpenURI::HTTPError => e
+             puts "#{file_url} # #{e}"
+             if @all
+               file.write(e.io.read)
+               puts "#{file_path} saved anyway."
+             end
+           rescue StandardError => e
+             puts "#{file_url} # #{e}"
+           end
+         end
+       rescue StandardError => e
+         puts "#{file_url} # #{e}"
+       ensure
+         if not @all and File.exist?(file_path) and File.size(file_path) == 0
+           File.delete(file_path)
+           puts "#{file_path} was empty and was removed."
+         end
+       end
+       semaphore.synchronize do
+         @processed_file_count += 1
+         puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
+       end
+     else
+       semaphore.synchronize do
+         @processed_file_count += 1
+         puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
+       end
+     end
+   end
+ 
+   def file_queue
+     @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
+   end
+ 
+   def file_list_by_timestamp
+     @file_list_by_timestamp ||= get_file_list_by_timestamp
+   end
+ 
+   def semaphore
+     @semaphore ||= Mutex.new
+   end
+ end
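Taken together, the class can be driven from Ruby as well as from the CLI; list_files keeps stdout as pure JSON by routing progress chatter to stderr. A hedged sketch (the URL and filter values are placeholders):

    require 'wayback_machine_downloader'

    downloader = WaybackMachineDownloader.new(
      base_url: 'http://example.com',
      only_filter: '/\.html$/i',  # honored as a regex via to_regex
      threads_count: 4
    )
    downloader.list_files  # prints a JSON array of {file_url, timestamp, file_id} rows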
metadata ADDED
@@ -0,0 +1,81 @@
+ --- !ruby/object:Gem::Specification
+ name: wayback_machine_downloader_hhr
+ version: !ruby/object:Gem::Version
+   version: 2.3.2
+ platform: ruby
+ authors:
+ - hehaorui
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2024-11-03 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.2'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.2'
+ - !ruby/object:Gem::Dependency
+   name: minitest
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '5.2'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '5.2'
+ description: Download an entire website from the Wayback Machine. The Wayback Machine
+   by Internet Archive (archive.org) is an awesome tool to view any website at any
+   point in time, but it lacks an export feature. Wayback Machine Downloader brings
+   exactly this. This version carries minor fixes to the original and is for hehaorui's
+   personal use.
+ email: mail@hehaorui.com
+ executables:
+ - wayback_machine_downloader
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/wayback_machine_downloader
+ - lib/wayback_machine_downloader.rb
+ - lib/wayback_machine_downloader/archive_api.rb
+ - lib/wayback_machine_downloader/tidy_bytes.rb
+ - lib/wayback_machine_downloader/to_regex.rb
+ homepage: https://github.com/hehaorui/wayback-machine-downloader
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 1.9.2
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.5.22
+ signing_key:
+ specification_version: 4
+ summary: Download an entire website from the Wayback Machine, with minor fixes. For
+   hehaorui's personal use.
+ test_files: []