wayback_machine_downloader_straw 2.4.5 → 2.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +123 -111
- data/lib/wayback_machine_downloader/archive_api.rb +74 -61
- data/lib/wayback_machine_downloader/page_requisites.rb +32 -32
- data/lib/wayback_machine_downloader/subdom_processor.rb +237 -237
- data/lib/wayback_machine_downloader/tidy_bytes.rb +77 -77
- data/lib/wayback_machine_downloader/to_regex.rb +106 -106
- data/lib/wayback_machine_downloader/url_rewrite.rb +84 -84
- data/lib/wayback_machine_downloader.rb +1244 -1158
- metadata +3 -3
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e52df092b59b0eec27b390b5b00fcfc17fab271acd6cd9df774912f57cfc4dc1
+  data.tar.gz: ce170b42caad7e8136b07c2aa5cb6e751f57dd64bd40c0addcd42a31798d0047
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 558e9cdfc3d7d4d2081ccb49b12a96bdb64b7768697eb0a1b9a431ed1ad3017ce894975e046a6e50766928c3863797715c7b45d013b6ab7ad78bca59ea86c6d0
+  data.tar.gz: af3064f1489d32cf078fd5d87d2773700e9dfa498075f089029e0e7ec47c500c7815e84d51f426bb6fc3067bf02c9a9404da3b6f74d263c99b4ae96fc32dab35
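These published values can be checked against a local copy: a `.gem` file is a tar archive containing `metadata.gz` and `data.tar.gz`, so `tar -xf wayback_machine_downloader_straw-2.4.6.gem` extracts both. A minimal Ruby sketch (the local path is an assumption about where you extracted the archive):

```ruby
# Verify the extracted data.tar.gz against the SHA256 published in checksums.yaml.
require 'digest'

expected = "ce170b42caad7e8136b07c2aa5cb6e751f57dd64bd40c0addcd42a31798d0047"
actual   = Digest::SHA256.file("data.tar.gz").hexdigest # hypothetical local path

puts(actual == expected ? "data.tar.gz: OK" : "data.tar.gz: MISMATCH")
```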
data/bin/wayback_machine_downloader CHANGED

@@ -1,111 +1,123 @@
-#!/usr/bin/env ruby
-[old lines 2-111 not preserved in this extract; only fragments such as "opts.", "opts.separator \"\"", and "end" survive]
+#!/usr/bin/env ruby
+
+$stdout.sync = true
+
+require_relative '../lib/wayback_machine_downloader'
+require 'optparse'
+require 'pp'
+
+options = {}
+option_parser = OptionParser.new do |opts|
+  opts.banner = "Usage: wayback_machine_downloader http://example.com"
+
+  opts.separator ""
+  opts.separator "Download an entire website from the Wayback Machine."
+
+  opts.separator ""
+  opts.separator "Optional options:"
+
+  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
+    options[:directory] = t
+  end
+
+  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+    options[:all_timestamps] = true
+  end
+
+  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
+    options[:from_timestamp] = t
+  end
+
+  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20100916231334)") do |t|
+    options[:to_timestamp] = t
+  end
+
+  opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
+    options[:exact_url] = t
+  end
+
+  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+    options[:only_filter] = t
+  end
+
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+    options[:exclude_filter] = t
+  end
+
+  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
+    options[:all] = true
+  end
+
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
+    options[:threads_count] = t
+  end
+
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
+    options[:maximum_pages] = t
+  end
+
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
+    options[:list] = true
+  end
+
+  opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
+    options[:rewritten] = true
+  end
+
+  opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
+    options[:rewrite] = true
+  end
+
+  opts.on("--local-only", "Only rewrite links in an already downloaded directory, doesn't download anything") do |t|
+    options[:local_only] = true
+  end
+
+  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
+    options[:reset] = true
+  end
+
+  opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
+    options[:keep] = true
+  end
+
+  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
+    options[:max_retries] = t
+  end
+
+  opts.on("--snapshot-at TIMESTAMP", Integer, "Build a composite snapshot at this timestamp") do |t|
+    options[:snapshot_at] = t
+  end
+
+  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
+    options[:recursive_subdomains] = true
+  end
+
+  opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
+    options[:subdomain_depth] = t
+  end
+
+  opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
+    options[:page_requisites] = true
+  end
+
+  opts.on("-v", "--version", "Display version") do |t|
+    options[:version] = t
+  end
+end.parse!
+
+if (base_url = ARGV[-1])
+  options[:base_url] = base_url
+  wayback_machine_downloader = WaybackMachineDownloader.new options
+  if options[:local_only]
+    wayback_machine_downloader.rewrite_local_files
+  elsif options[:list]
+    wayback_machine_downloader.list_files
+  else
+    wayback_machine_downloader.download_files
+  end
+elsif options[:version]
+  puts WaybackMachineDownloader::VERSION
+else
+  puts "You need to specify a website to backup. (e.g., http://example.com)"
+  puts "Run `wayback_machine_downloader --help` for more help."
+end
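For illustration, here is a standalone sketch of the parse-then-dispatch shape the executable uses: OptionParser#parse! consumes the flags it recognizes from ARGV, and whatever remains at ARGV[-1] is treated as the base URL. Names here are demo-only, not the gem's code:

```ruby
require 'optparse'

options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: demo [options] URL"
  # one representative flag; the real script registers about twenty of these
  opts.on("-l", "--list", "List instead of downloading") { options[:list] = true }
end.parse! # mutates ARGV in place, removing recognized flags

if (base_url = ARGV[-1])
  puts(options[:list] ? "would list #{base_url}" : "would download #{base_url}")
else
  puts "You need to specify a URL."
end
```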
data/lib/wayback_machine_downloader/archive_api.rb CHANGED

@@ -1,61 +1,74 @@
-require 'json'
-require 'uri'
-
-module ArchiveAPI
-
-  def get_raw_list_from_api(url, page_index, http)
-    # Automatically append /* if the URL doesn't contain a path after the domain
-    # This is a workaround for an issue with the API and *some* domains.
-    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
-    # But don't do this when exact_url flag is set
-    if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
-      url = "#{url}/*"
-    end
-
-    request_url = URI("https://web.archive.org/cdx/search/cdx")
-    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
-    request_url.query = URI.encode_www_form(params)
-
-    retries = 0
-    max_retries = (@max_retries || 3)
-    delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
-
-    begin
-[old lines 24-60 not preserved in this extract]
-end
+require 'json'
+require 'uri'
+
+module ArchiveAPI
+
+  def get_raw_list_from_api(url, page_index, http)
+    # Automatically append /* if the URL doesn't contain a path after the domain
+    # This is a workaround for an issue with the API and *some* domains.
+    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
+    # But don't do this when exact_url flag is set
+    if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
+      url = "#{url}/*"
+    end
+
+    request_url = URI("https://web.archive.org/cdx/search/cdx")
+    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
+    request_url.query = URI.encode_www_form(params)
+
+    retries = 0
+    max_retries = (@max_retries || 3)
+    delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
+
+    begin
+      request = Net::HTTP::Get.new(request_url)
+      request["User-Agent"] = "wmd-straw/#{WaybackMachineDownloader::VERSION}"
+      request["Connection"] = "keep-alive"
+      request["Accept-Encoding"] = "gzip"
+      response = http.request(request)
+
+      case response.code.to_i
+      when 200
+        body = if response['content-encoding'] == 'gzip'
+          Zlib::GzipReader.new(StringIO.new(response.body)).read
+        else
+          response.body.to_s.strip
+        end
+        return [] if body.empty?
+        begin
+          json = JSON.parse(body)
+          # check if the response contains the header ["timestamp", "original"]
+          json.shift if json.first == ["timestamp", "original"]
+          json
+        rescue JSON::ParserError => e
+          raise "Malformed JSON response: #{e.message}"
+        end
+      when 429, 500, 502, 503, 504
+        raise "Server error #{response.code}: #{response.message}"
+      else
+        warn "Unexpected API response #{response.code} for #{url}"
+        []
+      end
+    rescue Net::ReadTimeout, Net::OpenTimeout, StandardError => e
+      if retries < max_retries
+        retries += 1
+        warn "Error talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
+        sleep(delay * retries)
+        retry
+      else
+        warn "Giving up on Wayback CDX API for #{url} after #{max_retries} attempts. (Last error: #{e.message})"
+        []
+      end
+    end
+  end
+
+  def parameters_for_api(page_index)
+    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "true"]]
+    parameters.push(["filter", "statuscode:200"]) unless @all
+    parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
+    parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
+    parameters.push(["page", page_index]) if page_index
+    parameters
+  end
+
+end
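Two things stand out in the rewritten method: gzip-encoded CDX responses are transparently inflated, and transient failures are retried with a linearly growing pause (sleep(delay * retries)). A standalone sketch of that retry pattern, with hypothetical names; unlike the method above, it re-raises once retries are exhausted rather than returning []:

```ruby
# Retry a block up to max_retries times, sleeping delay * attempt between tries.
def with_retries(max_retries: 3, delay: 2)
  retries = 0
  begin
    yield
  rescue StandardError => e
    raise if retries >= max_retries
    retries += 1
    warn "attempt #{retries}/#{max_retries} failed (#{e.class}: #{e.message}); sleeping #{delay * retries}s"
    sleep(delay * retries)
    retry
  end
end

# Usage: the block is re-run on any StandardError until it succeeds or gives up.
with_retries { raise "flaky" if rand < 0.5; :ok }
```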
data/lib/wayback_machine_downloader/page_requisites.rb CHANGED

@@ -1,33 +1,33 @@
-module PageRequisites
-  # regex to find links in href, src, url(), and srcset
-  # this ignores data: URIs, mailto:, and anchors
-  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
-
-  def self.extract(html_content)
-    assets = []
-
-    html_content.scan(ASSET_REGEX) do |match|
-      # match is an array of capture groups; find the one that matched
-      url = match.compact.first
-      next unless url
-
-      # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
-      if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
-        url.split(',').each do |src_def|
-          src_url = src_def.strip.split(' ').first
-          assets << src_url if valid_asset?(src_url)
-        end
-      else
-        assets << url if valid_asset?(url)
-      end
-    end
-
-    assets.uniq
-  end
-
-  def self.valid_asset?(url)
-    return false if url.strip.empty?
-    return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
-    true
-  end
+module PageRequisites
+  # regex to find links in href, src, url(), and srcset
+  # this ignores data: URIs, mailto:, and anchors
+  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
+
+  def self.extract(html_content)
+    assets = []
+
+    html_content.scan(ASSET_REGEX) do |match|
+      # match is an array of capture groups; find the one that matched
+      url = match.compact.first
+      next unless url
+
+      # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
+      if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
+        url.split(',').each do |src_def|
+          src_url = src_def.strip.split(' ').first
+          assets << src_url if valid_asset?(src_url)
+        end
+      else
+        assets << url if valid_asset?(url)
+      end
+    end
+
+    assets.uniq
+  end
+
+  def self.valid_asset?(url)
+    return false if url.strip.empty?
+    return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
+    true
+  end
 end
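As a usage example (assuming the module above has been loaded, e.g. via require_relative): extract pulls URLs out of href/src/url()/srcset attributes, expands srcset candidate lists, and drops data:, mailto:, anchor, and javascript: targets:

```ruby
html = <<~HTML
  <link href="style.css" rel="stylesheet">
  <img src="logo.png" srcset="logo.png 1x, logo@2x.png 2x">
  <a href="mailto:someone@example.com">mail</a>
  <div style="background: url('bg.jpg')"></div>
HTML

p PageRequisites.extract(html)
# => ["style.css", "logo.png", "logo@2x.png", "bg.jpg"]
```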