wayback_machine_downloader_hhr 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 89aa753a924055b41a371b5c616158dc5b65dfa63b136fff078588d839949f64
+   data.tar.gz: be143940de3f24c545a8bf202b1fb28f601b124f69927213e834b49bada36cf3
+ SHA512:
+   metadata.gz: adf23257485832a2e6c4ccc443cf43583e59851e39d2e474bbad097ff9332f71ec63e1e0ade769b72485ca6e38234fac5f18b62ba3ca9858cf6bd46ebc1a4835
+   data.tar.gz: 14779e8b3bc933186d33671047411a10bdb27f2b04e311013b9c630849b07393251ee6db6445419cea71ad8c012bd1fc81aa2b94f6ad51129fef9702e8d4fa42
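
These are the SHA256/SHA512 digests of the two archives packed inside the published .gem. A minimal sketch of recomputing the SHA256 values locally, assuming the gem has been fetched as wayback_machine_downloader_hhr-2.3.2.gem (the filename is an assumption, not part of this diff):

# Sketch only: a .gem file is a tar archive containing metadata.gz and data.tar.gz.
require 'digest'
require 'rubygems/package'

File.open("wayback_machine_downloader_hhr-2.3.2.gem", "rb") do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      next unless ["metadata.gz", "data.tar.gz"].include?(entry.full_name)
      puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
    end
  end
end
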
bin/wayback_machine_downloader ADDED
@@ -0,0 +1,79 @@
+ #!/usr/bin/env ruby
+
+ require_relative '../lib/wayback_machine_downloader'
+ require 'optparse'
+ require 'pp'
+
+ options = {}
+ option_parser = OptionParser.new do |opts|
+   opts.banner = "Usage: wayback_machine_downloader http://example.com"
+
+   opts.separator ""
+   opts.separator "Download an entire website from the Wayback Machine."
+
+   opts.separator ""
+   opts.separator "Optional options:"
+
+   opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
+     options[:directory] = t
+   end
+
+   opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+     options[:all_timestamps] = true
+   end
+
+   opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
+     options[:from_timestamp] = t
+   end
+
+   opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20100916231334)") do |t|
+     options[:to_timestamp] = t
+   end
+
+   opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
+     options[:exact_url] = t
+   end
+
+   opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+     options[:only_filter] = t
+   end
+
+   opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+     options[:exclude_filter] = t
+   end
+
+   opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
+     options[:all] = true
+   end
+
+   opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
+     options[:threads_count] = t
+   end
+
+   opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
+     options[:maximum_pages] = t
+   end
+
+   opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
+     options[:list] = true
+   end
+
+   opts.on("-v", "--version", "Display version") do |t|
+     options[:version] = t
+   end
+ end.parse!
+
+ if (base_url = ARGV[-1])
+   options[:base_url] = base_url
+   wayback_machine_downloader = WaybackMachineDownloader.new options
+   if options[:list]
+     wayback_machine_downloader.list_files
+   else
+     wayback_machine_downloader.download_files
+   end
+ elsif options[:version]
+   puts WaybackMachineDownloader::VERSION
+ else
+   puts "You need to specify a website to backup. (e.g., http://example.com)"
+   puts "Run `wayback_machine_downloader --help` for more help."
+ end
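
To make the flag-to-option mapping above concrete, here is a hedged sketch of the equivalent programmatic call for an invocation like wayback_machine_downloader http://example.com -c 20 -o "/\.pdf$/" (the URL, concurrency, and filter are illustrative values, not defaults):

# Illustrative only: mirrors what the script does once OptionParser has run.
require 'wayback_machine_downloader'

options = {
  base_url:      "http://example.com",  # taken from the last ARGV entry
  threads_count: 20,                    # -c / --concurrency
  only_filter:   "/\\.pdf$/"            # -o / --only; the // form is treated as a regex
}
WaybackMachineDownloader.new(options).download_files
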
lib/wayback_machine_downloader/archive_api.rb ADDED
@@ -0,0 +1,40 @@
+ require 'json'
+ require 'uri'
+
+ module ArchiveAPI
+
+   def get_raw_list_from_api url, page_index, http
+     request_url = URI("https://web.archive.org/cdx/search/xd")
+     params = [["output", "json"], ["url", url]]
+     params += parameters_for_api page_index
+     request_url.query = URI.encode_www_form(params)
+
+     begin
+       json = JSON.parse(http.get(URI(request_url)).body)
+       if (json[0] <=> ["timestamp","original"]) == 0
+         json.shift
+       end
+       json
+     rescue JSON::ParserError
+       []
+     end
+   end
+
+   def parameters_for_api page_index
+     parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
+     if !@all
+       parameters.push(["filter", "statuscode:200"])
+     end
+     if @from_timestamp and @from_timestamp != 0
+       parameters.push(["from", @from_timestamp.to_s])
+     end
+     if @to_timestamp and @to_timestamp != 0
+       parameters.push(["to", @to_timestamp.to_s])
+     end
+     if page_index
+       parameters.push(["page", page_index])
+     end
+     parameters
+   end
+
+ end
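
As an illustration of the request get_raw_list_from_api builds (assuming @all, @from_timestamp, and @to_timestamp are unset, and example.com as the queried url), the assembled URL looks like this:

# Sketch only: rebuilds the query string the same way the module above does.
require 'uri'

request_url = URI("https://web.archive.org/cdx/search/xd")
params  = [["output", "json"], ["url", "example.com"]]
params += [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"],
           ["filter", "statuscode:200"], ["page", 0]]
request_url.query = URI.encode_www_form(params)
puts request_url
# => https://web.archive.org/cdx/search/xd?output=json&url=example.com&fl=timestamp%2Coriginal&collapse=digest&gzip=false&filter=statuscode%3A200&page=0
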
lib/wayback_machine_downloader/tidy_bytes.rb ADDED
@@ -0,0 +1,122 @@
+ module TibyBytes
+
+   # CP-1252 decimal byte => UTF-8 approximation as an array of bytes
+   CP1252 = {
+     128 => [226, 130, 172],
+     129 => nil,
+     130 => [226, 128, 154],
+     131 => [198, 146],
+     132 => [226, 128, 158],
+     133 => [226, 128, 166],
+     134 => [226, 128, 160],
+     135 => [226, 128, 161],
+     136 => [203, 134],
+     137 => [226, 128, 176],
+     138 => [197, 160],
+     139 => [226, 128, 185],
+     140 => [197, 146],
+     141 => nil,
+     142 => [197, 189],
+     143 => nil,
+     144 => nil,
+     145 => [226, 128, 152],
+     146 => [226, 128, 153],
+     147 => [226, 128, 156],
+     148 => [226, 128, 157],
+     149 => [226, 128, 162],
+     150 => [226, 128, 147],
+     151 => [226, 128, 148],
+     152 => [203, 156],
+     153 => [226, 132, 162],
+     154 => [197, 161],
+     155 => [226, 128, 186],
+     156 => [197, 147],
+     157 => nil,
+     158 => [197, 190],
+     159 => [197, 184]
+   }
+
+   module StringMixin
+
+     # Attempt to replace invalid UTF-8 bytes with valid ones. This method
+     # naively assumes if you have invalid UTF8 bytes, they are either Windows
+     # CP-1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
+     # always work.
+     #
+     # Passing +true+ will forcibly tidy all bytes, assuming that the string's
+     # encoding is CP-1252 or ISO-8859-1.
+     def tidy_bytes(force = false)
+
+       if force
+         return unpack("C*").map do |b|
+           tidy_byte(b)
+         end.flatten.compact.pack("C*").unpack("U*").pack("U*")
+       end
+
+       bytes = unpack("C*")
+       conts_expected = 0
+       last_lead = 0
+
+       bytes.each_index do |i|
+
+         byte = bytes[i]
+         _is_ascii = byte < 128
+         is_cont = byte > 127 && byte < 192
+         is_lead = byte > 191 && byte < 245
+         is_unused = byte > 240
+         is_restricted = byte > 244
+
+         # Impossible or highly unlikely byte? Clean it.
+         if is_unused || is_restricted
+           bytes[i] = tidy_byte(byte)
+         elsif is_cont
+           # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
+           conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
+         else
+           if conts_expected > 0
+             # Expected continuation, but got ASCII or leading? Clean backwards up to
+             # the leading byte.
+             begin
+               (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
+             rescue NoMethodError
+               next
+             end
+             conts_expected = 0
+           end
+           if is_lead
+             # Final byte is leading? Clean it.
+             if i == bytes.length - 1
+               bytes[i] = tidy_byte(bytes.last)
+             else
+               # Valid leading byte? Expect continuations determined by position of
+               # first zero bit, with max of 3.
+               conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
+               last_lead = i
+             end
+           end
+         end
+       end
+       begin
+         bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
+       rescue ArgumentError
+         nil
+       end
+     end
+
+     # Tidy bytes in-place.
+     def tidy_bytes!(force = false)
+       replace tidy_bytes(force)
+     end
+
+     private
+
+     def tidy_byte(byte)
+       byte < 160 ? TibyBytes::CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
+     end
+
+   end
+ end
+
+ class String
+   include TibyBytes::StringMixin
+ end
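
A minimal usage sketch for the mixin above (the sample string is illustrative): 0xE9 is "é" in CP-1252/ISO-8859-1 but invalid on its own in UTF-8, so tidy_bytes rewrites it as the corresponding two-byte UTF-8 sequence.

# Illustrative only; the downloader applies the same repair to decoded file ids.
raw   = "Caf\xE9 menu"   # contains a lone CP-1252 byte, invalid as UTF-8
fixed = raw.tidy_bytes    # => "Café menu" (0xE9 becomes the UTF-8 bytes 0xC3 0xA9)
raw.tidy_bytes!           # same repair, in place
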
lib/wayback_machine_downloader/to_regex.rb ADDED
@@ -0,0 +1,81 @@
+ module ToRegex
+   module StringMixin
+     class << self
+       def literal?(str)
+         REGEXP_DELIMITERS.none? { |s, e| str.start_with?(s) and str =~ /#{e}#{INLINE_OPTIONS}\z/ }
+       end
+     end
+
+     INLINE_OPTIONS = /[imxnesu]*/
+     REGEXP_DELIMITERS = {
+       '%r{' => '}',
+       '/' => '/',
+     }
+
+     # Get a regex back
+     #
+     # Without :literal or :detect, `"foo".to_regex` will return nil.
+     #
+     # @param [optional, Hash] options
+     # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
+     # @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
+     # @option options [true,false] :ignore_case /foo/i
+     # @option options [true,false] :multiline /foo/m
+     # @option options [true,false] :extended /foo/x
+     # @option options [true,false] :lang /foo/[nesu]
+     def to_regex(options = {})
+       if args = as_regexp(options)
+         ::Regexp.new(*args)
+       end
+     end
+
+     # Return arguments that can be passed to `Regexp.new`
+     # @see to_regexp
+     def as_regexp(options = {})
+       unless options.is_a?(::Hash)
+         raise ::ArgumentError, "[to_regexp] Options must be a Hash"
+       end
+       str = self
+
+       return if options[:detect] and str == ''
+
+       if options[:literal] or (options[:detect] and ToRegex::StringMixin.literal?(str))
+         content = ::Regexp.escape str
+       elsif delim_set = REGEXP_DELIMITERS.detect { |k, _| str.start_with?(k) }
+         delim_start, delim_end = delim_set
+         /\A#{delim_start}(.*)#{delim_end}(#{INLINE_OPTIONS})\z/u =~ str
+         content = $1
+         inline_options = $2
+         return unless content.is_a?(::String)
+         content.gsub! '\\/', '/'
+         if inline_options
+           options[:ignore_case] = true if inline_options.include?('i')
+           options[:multiline] = true if inline_options.include?('m')
+           options[:extended] = true if inline_options.include?('x')
+           # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
+           options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
+         end
+       else
+         return
+       end
+
+       ignore_case = options[:ignore_case] ? ::Regexp::IGNORECASE : 0
+       multiline = options[:multiline] ? ::Regexp::MULTILINE : 0
+       extended = options[:extended] ? ::Regexp::EXTENDED : 0
+       lang = options[:lang] || ''
+       if ::RUBY_VERSION > '1.9' and lang.include?('u')
+         lang = lang.delete 'u'
+       end
+
+       if lang.empty?
+         [ content, (ignore_case|multiline|extended) ]
+       else
+         [ content, (ignore_case|multiline|extended), lang ]
+       end
+     end
+   end
+ end
+
+ class String
+   include ToRegex::StringMixin
+ end
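
A minimal usage sketch showing how --only/--exclude filter strings are interpreted through the mixin above (the patterns are illustrative):

# Illustrative only: the // notation yields a Regexp; anything else yields nil,
# and the downloader then falls back to a case-insensitive substring match.
"/\\.(jpe?g|png)$/i".to_regex  # => /\.(jpe?g|png)$/i
"wp-content".to_regex          # => nil
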
lib/wayback_machine_downloader.rb ADDED
@@ -0,0 +1,323 @@
+ # encoding: UTF-8
+
+ require 'thread'
+ require 'net/http'
+ require 'open-uri'
+ require 'fileutils'
+ require 'cgi'
+ require 'json'
+ require_relative 'wayback_machine_downloader/tidy_bytes'
+ require_relative 'wayback_machine_downloader/to_regex'
+ require_relative 'wayback_machine_downloader/archive_api'
+
+ class WaybackMachineDownloader
+
+   include ArchiveAPI
+
+   VERSION = "2.3.2"
+
+   attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
+     :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
+     :all, :maximum_pages, :threads_count
+
+   def initialize params
+     @base_url = params[:base_url]
+     @exact_url = params[:exact_url]
+     @directory = params[:directory]
+     @all_timestamps = params[:all_timestamps]
+     @from_timestamp = params[:from_timestamp].to_i
+     @to_timestamp = params[:to_timestamp].to_i
+     @only_filter = params[:only_filter]
+     @exclude_filter = params[:exclude_filter]
+     @all = params[:all]
+     @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
+     @threads_count = params[:threads_count].to_i
+   end
+
+   def backup_name
+     if @base_url.include? '//'
+       @base_url.split('/')[2]
+     else
+       @base_url
+     end
+   end
+
+   def backup_path
+     if @directory
+       if @directory[-1] == '/'
+         @directory
+       else
+         @directory + '/'
+       end
+     else
+       'websites/' + backup_name + '/'
+     end
+   end
+
+   def match_only_filter file_url
+     if @only_filter
+       only_filter_regex = @only_filter.to_regex
+       if only_filter_regex
+         only_filter_regex =~ file_url
+       else
+         file_url.downcase.include? @only_filter.downcase
+       end
+     else
+       true
+     end
+   end
+
+   def match_exclude_filter file_url
+     if @exclude_filter
+       exclude_filter_regex = @exclude_filter.to_regex
+       if exclude_filter_regex
+         exclude_filter_regex =~ file_url
+       else
+         file_url.downcase.include? @exclude_filter.downcase
+       end
+     else
+       false
+     end
+   end
+
+   def get_all_snapshots_to_consider
+     # Note: Passing a page index parameter allows us to get more snapshots,
+     # but from a less fresh index
+     http = Net::HTTP.new("web.archive.org", 443)
+     http.use_ssl = true
+     http.start()
+     print "Getting snapshot pages"
+     snapshot_list_to_consider = []
+     snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, http)
+     print "."
+     unless @exact_url
+       @maximum_pages.times do |page_index|
+         snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index, http)
+         break if snapshot_list.empty?
+         snapshot_list_to_consider += snapshot_list
+         print "."
+       end
+     end
+     http.finish()
+     puts " found #{snapshot_list_to_consider.length} snapshots to consider."
+     puts
+     snapshot_list_to_consider
+   end
+
+   def get_file_list_curated
+     file_list_curated = Hash.new
+     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+       next unless file_url.include?('/')
+       file_id = file_url.split('/')[3..-1].join('/')
+       file_id = CGI::unescape file_id
+       file_id = file_id.tidy_bytes unless file_id == ""
+       if file_id.nil?
+         puts "Malformed file url, ignoring: #{file_url}"
+       else
+         if match_exclude_filter(file_url)
+           puts "File url matches exclude filter, ignoring: #{file_url}"
+         elsif not match_only_filter(file_url)
+           puts "File url doesn't match only filter, ignoring: #{file_url}"
+         elsif file_list_curated[file_id]
+           unless file_list_curated[file_id][:timestamp] > file_timestamp
+             file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+           end
+         else
+           file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
+         end
+       end
+     end
+     file_list_curated
+   end
+
+   def get_file_list_all_timestamps
+     file_list_curated = Hash.new
+     get_all_snapshots_to_consider.each do |file_timestamp, file_url|
+       next unless file_url.include?('/')
+       file_id = file_url.split('/')[3..-1].join('/')
+       file_id_and_timestamp = [file_timestamp, file_id].join('/')
+       file_id_and_timestamp = CGI::unescape file_id_and_timestamp
+       file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
+       if file_id.nil?
+         puts "Malformed file url, ignoring: #{file_url}"
+       else
+         if match_exclude_filter(file_url)
+           puts "File url matches exclude filter, ignoring: #{file_url}"
+         elsif not match_only_filter(file_url)
+           puts "File url doesn't match only filter, ignoring: #{file_url}"
+         elsif file_list_curated[file_id_and_timestamp]
+           puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
+         else
+           file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
+         end
+       end
+     end
+     puts "file_list_curated: " + file_list_curated.count.to_s
+     file_list_curated
+   end
+
+
+   def get_file_list_by_timestamp
+     if @all_timestamps
+       file_list_curated = get_file_list_all_timestamps
+       file_list_curated.map do |file_remote_info|
+         file_remote_info[1][:file_id] = file_remote_info[0]
+         file_remote_info[1]
+       end
+     else
+       file_list_curated = get_file_list_curated
+       file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
+       file_list_curated.map do |file_remote_info|
+         file_remote_info[1][:file_id] = file_remote_info[0]
+         file_remote_info[1]
+       end
+     end
+   end
+
+   def list_files
+     # retrieval produces its own output
+     @orig_stdout = $stdout
+     $stdout = $stderr
+     files = get_file_list_by_timestamp
+     $stdout = @orig_stdout
+     puts "["
+     files[0...-1].each do |file|
+       puts file.to_json + ","
+     end
+     puts files[-1].to_json
+     puts "]"
+   end
+
+   def download_files
+     start_time = Time.now
+     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
+     puts
+
+     if file_list_by_timestamp.count == 0
+       puts "No files to download."
+       puts "Possible reasons:"
+       puts "\t* Site is not in Wayback Machine Archive."
+       puts "\t* From timestamp too much in the future." if @from_timestamp and @from_timestamp != 0
+       puts "\t* To timestamp too much in the past." if @to_timestamp and @to_timestamp != 0
+       puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
+       puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
+       return
+     end
+
+     puts "#{file_list_by_timestamp.count} files to download:"
+
+     threads = []
+     @processed_file_count = 0
+     @threads_count = 1 unless @threads_count != 0
+     @threads_count.times do
+       http = Net::HTTP.new("web.archive.org", 443)
+       http.use_ssl = true
+       http.start()
+       threads << Thread.new do
+         until file_queue.empty?
+           file_remote_info = file_queue.pop(true) rescue nil
+           download_file(file_remote_info, http) if file_remote_info
+         end
+         http.finish()
+       end
+     end
+
+     threads.each(&:join)
+     end_time = Time.now
+     puts
+     puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
+   end
+
+   def structure_dir_path dir_path
+     begin
+       FileUtils::mkdir_p dir_path unless File.exist? dir_path
+     rescue Errno::EEXIST => e
+       error_to_string = e.to_s
+       puts "# #{error_to_string}"
+       if error_to_string.include? "File exists @ dir_s_mkdir - "
+         file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
+       elsif error_to_string.include? "File exists - "
+         file_already_existing = error_to_string.split("File exists - ")[-1]
+       else
+         raise "Unhandled directory restructure error # #{error_to_string}"
+       end
+       file_already_existing_temporary = file_already_existing + '.temp'
+       file_already_existing_permanent = file_already_existing + '/index.html'
+       FileUtils::mv file_already_existing, file_already_existing_temporary
+       FileUtils::mkdir_p file_already_existing
+       FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
+       puts "#{file_already_existing} -> #{file_already_existing_permanent}"
+       structure_dir_path dir_path
+     end
+   end
+
+   def download_file (file_remote_info, http)
+     current_encoding = "".encoding
+     file_url = file_remote_info[:file_url].encode(current_encoding)
+     file_id = file_remote_info[:file_id]
+     file_timestamp = file_remote_info[:timestamp]
+     file_path_elements = file_id.split('/')
+     if file_id == ""
+       dir_path = backup_path
+       file_path = backup_path + 'index.html'
+     elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
+       dir_path = backup_path + file_path_elements[0..-1].join('/')
+       file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
+     else
+       dir_path = backup_path + file_path_elements[0..-2].join('/')
+       file_path = backup_path + file_path_elements[0..-1].join('/')
+     end
+     if Gem.win_platform?
+       dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
+       file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
+     end
+     unless File.exist? file_path
+       begin
+         structure_dir_path dir_path
+         open(file_path, "wb") do |file|
+           begin
+             http.get(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}")) do |body|
+               file.write(body)
+             end
+           rescue OpenURI::HTTPError => e
+             puts "#{file_url} # #{e}"
+             if @all
+               file.write(e.io.read)
+               puts "#{file_path} saved anyway."
+             end
+           rescue StandardError => e
+             puts "#{file_url} # #{e}"
+           end
+         end
+       rescue StandardError => e
+         puts "#{file_url} # #{e}"
+       ensure
+         if not @all and File.exist?(file_path) and File.size(file_path) == 0
+           File.delete(file_path)
+           puts "#{file_path} was empty and was removed."
+         end
+       end
+       semaphore.synchronize do
+         @processed_file_count += 1
+         puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
+       end
+     else
+       semaphore.synchronize do
+         @processed_file_count += 1
+         puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
+       end
+     end
+   end
+
+   def file_queue
+     @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
+   end
+
+   def file_list_by_timestamp
+     @file_list_by_timestamp ||= get_file_list_by_timestamp
+   end
+
+   def semaphore
+     @semaphore ||= Mutex.new
+   end
+ end
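
A hedged sketch of driving the class above directly, without the CLI wrapper (the URL and option values are illustrative):

downloader = WaybackMachineDownloader.new(
  base_url:       "http://example.com",
  from_timestamp: 20060101000000,
  to_timestamp:   20101231235959,
  threads_count:  4
)
downloader.list_files       # prints a JSON array of entries with file_url, timestamp, and file_id keys
# downloader.download_files # or fetch everything into ./websites/example.com/
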
metadata ADDED
@@ -0,0 +1,81 @@
+ --- !ruby/object:Gem::Specification
+ name: wayback_machine_downloader_hhr
+ version: !ruby/object:Gem::Version
+   version: 2.3.2
+ platform: ruby
+ authors:
+ - hehaorui
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2024-11-03 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.2'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.2'
+ - !ruby/object:Gem::Dependency
+   name: minitest
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '5.2'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '5.2'
+ description: Download an entire website from the Wayback Machine. Wayback Machine
+   by Internet Archive (archive.org) is an awesome tool to view any website at any
+   point of time but lacks an export feature. Wayback Machine Downloader brings exactly
+   this. This version bears minor fixes on original version. It is for hehaorui personal
+   use.
+ email: mail@hehaorui.com
+ executables:
+ - wayback_machine_downloader
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/wayback_machine_downloader
+ - lib/wayback_machine_downloader.rb
+ - lib/wayback_machine_downloader/archive_api.rb
+ - lib/wayback_machine_downloader/tidy_bytes.rb
+ - lib/wayback_machine_downloader/to_regex.rb
+ homepage: https://github.com/hehaorui/wayback-machine-downloader
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 1.9.2
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.5.22
+ signing_key:
+ specification_version: 4
+ summary: Download an entire website from the Wayback Machine, with minor fixes. For
+   hehaorui personal use.
+ test_files: []