wayback_machine_downloader_straw 2.4.5 → 2.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c5ba50bde9b0306e043eed8151b12f37f603f5cfd73013e53260543f7fc134a5
4
- data.tar.gz: d3dbc1a0f6f894547fb39e56193c967dbf99685a8fa4d3cecaeaff62070aab4c
3
+ metadata.gz: 558d3187ee31faeadb08cf83e32a87307ae9d55a3327206598f27a78fb715e08
4
+ data.tar.gz: 9845999e0e618afde419869bb01b04277aca318aa80b238feb8252540fc16315
5
5
  SHA512:
6
- metadata.gz: 854ec2ccbe2daf620178397bf2c620ed8cf01ca57f175cbb9a2c8e7a057b1495a382ccae64836d75cf08c9a7d5a8ab638859d4c7d34584dcfc51f5eda8b7e5b2
7
- data.tar.gz: 1ed3f13c7aadcb097a174c7870730df0920c7b08e9476b786d287dfd2a3e29a50b690ea2fb16b0d762dac726648356788b4122e674db44e7c5a49ffc541f4098
6
+ metadata.gz: b323c1065ea1ab1d3c5909458cae726462ba1b88fd89effe8cd1efbdd1301d2022363c56b191f8ddbe28c0143fe87dfc94ca847865bb08e3564e29ba36f231a4
7
+ data.tar.gz: 00c4e775ee05e176e1048d6e5ddd9ddc436edaf9bf7fea61dbab33480e4adc499ea8a3a584e591858aad3a4c2f10096a3b6eb1ff6c8146f26ba6a3d42f104e32
@@ -1,111 +1,127 @@
1
- #!/usr/bin/env ruby
2
-
3
- require_relative '../lib/wayback_machine_downloader'
4
- require 'optparse'
5
- require 'pp'
6
-
7
- options = {}
8
- option_parser = OptionParser.new do |opts|
9
- opts.banner = "Usage: wayback_machine_downloader http://example.com"
10
-
11
- opts.separator ""
12
- opts.separator "Download an entire website from the Wayback Machine."
13
-
14
- opts.separator ""
15
- opts.separator "Optional options:"
16
-
17
- opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
18
- options[:directory] = t
19
- end
20
-
21
- opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
22
- options[:all_timestamps] = true
23
- end
24
-
25
- opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
26
- options[:from_timestamp] = t
27
- end
28
-
29
- opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20100916231334)") do |t|
30
- options[:to_timestamp] = t
31
- end
32
-
33
- opts.on("-e", "--exact-url", "Download only the url provied and not the full site") do |t|
34
- options[:exact_url] = t
35
- end
36
-
37
- opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
38
- options[:only_filter] = t
39
- end
40
-
41
- opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
42
- options[:exclude_filter] = t
43
- end
44
-
45
- opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
46
- options[:all] = true
47
- end
48
-
49
- opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
50
- options[:threads_count] = t
51
- end
52
-
53
- opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
54
- options[:maximum_pages] = t
55
- end
56
-
57
- opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
58
- options[:list] = true
59
- end
60
-
61
- opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
62
- options[:rewritten] = true
63
- end
64
-
65
- opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
66
- options[:rewrite] = true
67
- end
68
-
69
- opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
70
- options[:reset] = true
71
- end
72
-
73
- opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
74
- options[:keep] = true
75
- end
76
-
77
- opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
78
- options[:max_retries] = t
79
- end
80
-
81
- opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
82
- options[:recursive_subdomains] = true
83
- end
84
-
85
- opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
86
- options[:subdomain_depth] = t
87
- end
88
-
89
- opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
90
- options[:page_requisites] = true
91
- end
92
-
93
- opts.on("-v", "--version", "Display version") do |t|
94
- options[:version] = t
95
- end
96
- end.parse!
97
-
98
- if (base_url = ARGV[-1])
99
- options[:base_url] = base_url
100
- wayback_machine_downloader = WaybackMachineDownloader.new options
101
- if options[:list]
102
- wayback_machine_downloader.list_files
103
- else
104
- wayback_machine_downloader.download_files
105
- end
106
- elsif options[:version]
107
- puts WaybackMachineDownloader::VERSION
108
- else
109
- puts "You need to specify a website to backup. (e.g., http://example.com)"
110
- puts "Run `wayback_machine_downloader --help` for more help."
111
- end
1
#!/usr/bin/env ruby

# Command-line entry point: parses the options below into a Hash, then hands
# that Hash to WaybackMachineDownloader and runs the requested action.

# Flush progress output immediately, even when stdout is piped.
$stdout.sync = true

require_relative '../lib/wayback_machine_downloader'
require 'optparse'
require 'pp'

options = {}

# Every handler only records its value in `options`; nothing runs until the
# whole command line has been parsed.
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: wayback_machine_downloader http://example.com"

  opts.separator ""
  opts.separator "Download an entire website from the Wayback Machine."

  opts.separator ""
  opts.separator "Optional options:"

  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
    options[:directory] = t
  end

  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do
    options[:all_timestamps] = true
  end

  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
    options[:from_timestamp] = t
  end

  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20100916231334)") do |t|
    options[:to_timestamp] = t
  end

  opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
    options[:exact_url] = t
  end

  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:only_filter] = t
  end

  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:exclude_filter] = t
  end

  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do
    options[:all] = true
  end

  opts.on("--keep-duplicates", "Do not collapse duplicate CDX captures by digest") do
    options[:keep_duplicates] = true
  end

  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
    options[:threads_count] = t
  end

  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
    options[:maximum_pages] = t
  end

  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do
    options[:list] = true
  end

  opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do
    options[:rewritten] = true
  end

  opts.on("--local", "Rewrite URLs to make them relative for local browsing") do
    options[:rewrite] = true
  end

  opts.on("--local-only", "Only rewrite links in an already downloaded directory, doesn't download anything") do
    options[:local_only] = true
  end

  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do
    options[:reset] = true
  end

  opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do
    options[:keep] = true
  end

  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
    options[:max_retries] = t
  end

  opts.on("--snapshot-at TIMESTAMP", Integer, "Build a composite snapshot at this timestamp") do |t|
    options[:snapshot_at] = t
  end

  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do
    options[:recursive_subdomains] = true
  end

  opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
    options[:subdomain_depth] = t
  end

  opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do
    options[:page_requisites] = true
  end

  opts.on("-v", "--version", "Display version") do |t|
    options[:version] = t
  end
end

# parse! strips the recognised switches out of ARGV, leaving positional args.
# An unknown switch or a malformed value is reported as a short message
# instead of a backtrace; the exit status stays 1, as with the uncaught error.
begin
  option_parser.parse!
rescue OptionParser::ParseError => e
  warn e.message
  warn "Run `wayback_machine_downloader --help` for more help."
  exit 1
end

# The last remaining positional argument is taken as the site to process.
if (base_url = ARGV[-1])
  options[:base_url] = base_url
  wayback_machine_downloader = WaybackMachineDownloader.new options
  if options[:local_only]
    wayback_machine_downloader.rewrite_local_files
  elsif options[:list]
    wayback_machine_downloader.list_files
  else
    wayback_machine_downloader.download_files
  end
elsif options[:version]
  puts WaybackMachineDownloader::VERSION
else
  puts "You need to specify a website to backup. (e.g., http://example.com)"
  puts "Run `wayback_machine_downloader --help` for more help."
end
@@ -1,61 +1,85 @@
1
- require 'json'
2
- require 'uri'
3
-
4
- module ArchiveAPI
5
-
6
- def get_raw_list_from_api(url, page_index, http)
7
- # Automatically append /* if the URL doesn't contain a path after the domain
8
- # This is a workaround for an issue with the API and *some* domains.
9
- # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
10
- # But don't do this when exact_url flag is set
11
- if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
12
- url = "#{url}/*"
13
- end
14
-
15
- request_url = URI("https://web.archive.org/cdx/search/cdx")
16
- params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
17
- request_url.query = URI.encode_www_form(params)
18
-
19
- retries = 0
20
- max_retries = (@max_retries || 3)
21
- delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
22
-
23
- begin
24
- response = http.get(request_url)
25
- body = response.body.to_s.strip
26
- return [] if body.empty?
27
- json = JSON.parse(body)
28
-
29
- # Check if the response contains the header ["timestamp", "original"]
30
- json.shift if json.first == ["timestamp", "original"]
31
- json
32
- rescue JSON::ParserError => e
33
- warn "Failed to parse JSON from API for #{url}: #{e.message}"
34
- []
35
- rescue Net::ReadTimeout, Net::OpenTimeout => e
36
- if retries < max_retries
37
- retries += 1
38
- warn "Timeout talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
39
- sleep(delay * retries)
40
- retry
41
- else
42
- warn "Giving up on Wayback CDX API for #{url} after #{max_retries} timeouts."
43
- []
44
- end
45
- rescue StandardError => e
46
- # treat any other transient-ish error similarly, though without retries for now
47
- warn "Error fetching CDX data for #{url}: #{e.message}"
48
- []
49
- end
50
- end
51
-
52
- def parameters_for_api(page_index)
53
- parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
54
- parameters.push(["filter", "statuscode:200"]) unless @all
55
- parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
56
- parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
57
- parameters.push(["page", page_index]) if page_index
58
- parameters
59
- end
60
-
61
- end
1
+ require 'json'
2
+ require 'uri'
3
+
4
# Helpers for querying the Wayback Machine CDX API.
#
# Written to be mixed into an object. The methods read these instance
# variables, each of which is optional and treated as "not set" when nil:
# @exact_url, @max_retries, @all, @keep_duplicates, @all_timestamps,
# @from_timestamp, @to_timestamp.
module ArchiveAPI

  # Fetches one page of captures for +url+ from the CDX API.
  #
  # url        - URL or bare host to look up. Host-only values get "/*"
  #              appended unless @exact_url is set.
  # page_index - page number to request, or nil to omit the "page" parameter.
  # http       - object responding to #request(Net::HTTPRequest) and returning
  #              a response with #code, #message, #body and #[].
  #
  # Returns the parsed JSON rows with the ["timestamp", "original"] header row
  # removed. Returns [] for an empty body, for an unexpected status code, and
  # once the retry budget is used up.
  def get_raw_list_from_api(url, page_index, http)
    url = cdx_query_url(url)

    request_url = URI("https://web.archive.org/cdx/search/cdx")
    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
    request_url.query = URI.encode_www_form(params)

    retries = 0
    max_retries = (@max_retries || 3)
    # Fall back to 2 only when the constant is missing, without masking any
    # other error the way an inline `rescue` modifier would.
    delay = defined?(WaybackMachineDownloader::RETRY_DELAY) ? WaybackMachineDownloader::RETRY_DELAY : 2

    begin
      request = Net::HTTP::Get.new(request_url)
      request["User-Agent"] = "wmd-straw/#{WaybackMachineDownloader::VERSION}"
      request["Connection"] = "keep-alive"
      request["Accept-Encoding"] = "gzip"
      response = http.request(request)

      case response.code.to_i
      when 200
        body = cdx_response_body(response)
        return [] if body.empty?
        begin
          json = JSON.parse(body)
          # check if the response contains the header ["timestamp", "original"]
          json.shift if json.first == ["timestamp", "original"]
          json
        rescue JSON::ParserError => e
          # re-raised as a plain error so the retry logic below handles it
          raise "Malformed JSON response: #{e.message}"
        end
      when 429, 500, 502, 503, 504
        raise "Server error #{response.code}: #{response.message}"
      else
        warn "Unexpected API response #{response.code} for #{url}"
        []
      end
    rescue StandardError => e
      # Timeouts (Net::ReadTimeout / Net::OpenTimeout) are StandardErrors too,
      # so a single clause covers them together with the errors raised above.
      if retries < max_retries
        retries += 1
        warn "Error talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
        sleep(delay * retries) # wait longer after each failed attempt
        retry
      else
        warn "Giving up on Wayback CDX API for #{url} after #{max_retries} attempts. (Last error: #{e.message})"
        []
      end
    end
  end

  # Builds the CDX query parameters that do not depend on the URL.
  #
  # page_index - page number, or nil to leave the "page" parameter out.
  #
  # Returns an Array of [name, value] pairs.
  def parameters_for_api(page_index)
    parameters = [["fl", "timestamp,original"], ["gzip", "true"]]
    parameters.push(["collapse", "digest"]) unless @keep_duplicates || @all_timestamps
    parameters.push(["filter", "statuscode:2..|30[12378]"]) unless @all
    parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
    parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
    parameters.push(["page", page_index]) if page_index
    parameters
  end

  private

  # Automatically append /* for host-only URLs.
  # This is a workaround for an issue with the API and *some* domains.
  # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
  # The URL is returned untouched when @exact_url is set, when it already
  # contains a wildcard or a path, or when no host is left after removing the
  # scheme and the query/fragment.
  def cdx_query_url(url)
    return url if !url || @exact_url

    text = url.to_s
    return url if text.include?('*')

    # `.first` is nil for "" or "http://", hence the to_s
    host_and_rest = text.sub(/\Ahttps?:\/\//i, '').split(/[?#]/, 2).first.to_s
    return url if host_and_rest.empty? || host_and_rest.include?('/')

    "#{text}/*"
  end

  # Returns the response body as a stripped String, gunzipping it first when
  # the response declares a gzip content encoding. A missing body yields "".
  def cdx_response_body(response)
    raw = response.body.to_s
    return "" if raw.empty?

    encoding = response['content-encoding'].to_s.strip.downcase
    if encoding == 'gzip' || encoding == 'x-gzip'
      # the block form closes the reader once the data has been read
      raw = Zlib::GzipReader.wrap(StringIO.new(raw)) { |gz| gz.read }.to_s
    end
    raw.strip
  end

end
@@ -1,33 +1,33 @@
1
- module PageRequisites
2
- # regex to find links in href, src, url(), and srcset
3
- # this ignores data: URIs, mailto:, and anchors
4
- ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
5
-
6
- def self.extract(html_content)
7
- assets = []
8
-
9
- html_content.scan(ASSET_REGEX) do |match|
10
- # match is an array of capture groups; find the one that matched
11
- url = match.compact.first
12
- next unless url
13
-
14
- # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
15
- if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
16
- url.split(',').each do |src_def|
17
- src_url = src_def.strip.split(' ').first
18
- assets << src_url if valid_asset?(src_url)
19
- end
20
- else
21
- assets << url if valid_asset?(url)
22
- end
23
- end
24
-
25
- assets.uniq
26
- end
27
-
28
- def self.valid_asset?(url)
29
- return false if url.strip.empty?
30
- return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
31
- true
32
- end
1
# Finds references to page assets (stylesheets, scripts, images, ...) in
# HTML or CSS text.
module PageRequisites
  # Regex to find links in href, src, url(), and srcset. Capture groups:
  #   1 - value of an href / src / data-src / data-url attribute
  #   2 - target of a CSS url(...) reference
  #   3 - value of a srcset attribute (a comma separated candidate list)
  # The regex itself does not filter anything; data:, mailto:, javascript:
  # and fragment-only values are rejected afterwards by valid_asset?.
  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i

  # A srcset width/density descriptor ("480w", "2x", "1.5x") that ends a
  # candidate. Used to spot candidate lists stored in non-srcset attributes.
  SRCSET_DESCRIPTOR = /\s\d+(?:\.\d+)?[wx](?=\s*(?:,|\z))/i

  # Extracts the asset URLs referenced by +html_content+.
  #
  # html_content - text to scan; nil is treated as empty.
  #
  # Returns an Array of unique URL strings, whitespace-trimmed, in order of
  # first appearance.
  def self.extract(html_content)
    assets = []

    html_content.to_s.scan(ASSET_REGEX) do |attribute_value, css_url, srcset|
      value = attribute_value || css_url || srcset
      next unless value

      # Anything captured from a srcset attribute is a candidate list, even
      # with a single entry or descriptors other than "1x"/"2w". Other values
      # are split only when they clearly look like one.
      candidate_list = srcset || (value.include?(',') && value.match?(SRCSET_DESCRIPTOR))

      urls = if candidate_list
               # "image.jpg 1x, image2.jpg 2x" -> keep the URL of each entry;
               # an empty entry yields nil, which valid_asset? rejects
               value.split(',').map { |candidate| candidate.strip.split(/\s+/).first }
             else
               [value.strip]
             end

      urls.each { |url| assets << url if valid_asset?(url) }
    end

    assets.uniq
  end

  # Tells whether +url+ points at something that can be downloaded.
  #
  # Returns false for nil, blank values, fragment-only links and data:,
  # mailto: or javascript: URIs (scheme compared case-insensitively, leading
  # whitespace ignored); true otherwise.
  def self.valid_asset?(url)
    candidate = url.to_s.strip
    return false if candidate.empty?

    !candidate.downcase.start_with?('data:', 'mailto:', '#', 'javascript:')
  end
end