wayback_machine_downloader_straw 2.4.5 → 2.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c5ba50bde9b0306e043eed8151b12f37f603f5cfd73013e53260543f7fc134a5
-  data.tar.gz: d3dbc1a0f6f894547fb39e56193c967dbf99685a8fa4d3cecaeaff62070aab4c
+  metadata.gz: e52df092b59b0eec27b390b5b00fcfc17fab271acd6cd9df774912f57cfc4dc1
+  data.tar.gz: ce170b42caad7e8136b07c2aa5cb6e751f57dd64bd40c0addcd42a31798d0047
 SHA512:
-  metadata.gz: 854ec2ccbe2daf620178397bf2c620ed8cf01ca57f175cbb9a2c8e7a057b1495a382ccae64836d75cf08c9a7d5a8ab638859d4c7d34584dcfc51f5eda8b7e5b2
-  data.tar.gz: 1ed3f13c7aadcb097a174c7870730df0920c7b08e9476b786d287dfd2a3e29a50b690ea2fb16b0d762dac726648356788b4122e674db44e7c5a49ffc541f4098
+  metadata.gz: 558e9cdfc3d7d4d2081ccb49b12a96bdb64b7768697eb0a1b9a431ed1ad3017ce894975e046a6e50766928c3863797715c7b45d013b6ab7ad78bca59ea86c6d0
+  data.tar.gz: af3064f1489d32cf078fd5d87d2773700e9dfa498075f089029e0e7ec47c500c7815e84d51f426bb6fc3067bf02c9a9404da3b6f74d263c99b4ae96fc32dab35
bin/wayback_machine_downloader CHANGED
@@ -1,111 +1,123 @@
-#!/usr/bin/env ruby
-
-require_relative '../lib/wayback_machine_downloader'
-require 'optparse'
-require 'pp'
-
-options = {}
-option_parser = OptionParser.new do |opts|
-  opts.banner = "Usage: wayback_machine_downloader http://example.com"
-
-  opts.separator ""
-  opts.separator "Download an entire website from the Wayback Machine."
-
-  opts.separator ""
-  opts.separator "Optional options:"
-
-  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
-    options[:directory] = t
-  end
-
-  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
-    options[:all_timestamps] = true
-  end
-
-  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
-    options[:from_timestamp] = t
-  end
-
-  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20100916231334)") do |t|
-    options[:to_timestamp] = t
-  end
-
-  opts.on("-e", "--exact-url", "Download only the url provied and not the full site") do |t|
-    options[:exact_url] = t
-  end
-
-  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
-    options[:only_filter] = t
-  end
-
-  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
-    options[:exclude_filter] = t
-  end
-
-  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
-    options[:all] = true
-  end
-
-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
-    options[:threads_count] = t
-  end
-
-  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
-    options[:maximum_pages] = t
-  end
-
-  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
-    options[:list] = true
-  end
-
-  opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
-    options[:rewritten] = true
-  end
-
-  opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
-    options[:rewrite] = true
-  end
-
-  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
-    options[:reset] = true
-  end
-
-  opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
-    options[:keep] = true
-  end
-
-  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
-    options[:max_retries] = t
-  end
-
-  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
-    options[:recursive_subdomains] = true
-  end
-
-  opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
-    options[:subdomain_depth] = t
-  end
-
-  opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
-    options[:page_requisites] = true
-  end
-
-  opts.on("-v", "--version", "Display version") do |t|
-    options[:version] = t
-  end
-end.parse!
-
-if (base_url = ARGV[-1])
-  options[:base_url] = base_url
-  wayback_machine_downloader = WaybackMachineDownloader.new options
-  if options[:list]
-    wayback_machine_downloader.list_files
-  else
-    wayback_machine_downloader.download_files
-  end
-elsif options[:version]
-  puts WaybackMachineDownloader::VERSION
-else
-  puts "You need to specify a website to backup. (e.g., http://example.com)"
-  puts "Run `wayback_machine_downloader --help` for more help."
-end
+#!/usr/bin/env ruby
+
+$stdout.sync = true
+
+require_relative '../lib/wayback_machine_downloader'
+require 'optparse'
+require 'pp'
+
+options = {}
+option_parser = OptionParser.new do |opts|
+  opts.banner = "Usage: wayback_machine_downloader http://example.com"
+
+  opts.separator ""
+  opts.separator "Download an entire website from the Wayback Machine."
+
+  opts.separator ""
+  opts.separator "Optional options:"
+
+  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
+    options[:directory] = t
+  end
+
+  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
+    options[:all_timestamps] = true
+  end
+
+  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
+    options[:from_timestamp] = t
+  end
+
+  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20100916231334)") do |t|
+    options[:to_timestamp] = t
+  end
+
+  opts.on("-e", "--exact-url", "Download only the url provied and not the full site") do |t|
+    options[:exact_url] = t
+  end
+
+  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+    options[:only_filter] = t
+  end
+
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
+    options[:exclude_filter] = t
+  end
+
+  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
+    options[:all] = true
+  end
+
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
+    options[:threads_count] = t
+  end
+
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
+    options[:maximum_pages] = t
+  end
+
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
+    options[:list] = true
+  end
+
+  opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
+    options[:rewritten] = true
+  end
+
+  opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
+    options[:rewrite] = true
+  end
+
+  opts.on("--local-only", "Only rewrite links in an already downloaded directory, doesn't download anything") do |t|
+    options[:local_only] = true
+  end
+
+  opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
+    options[:reset] = true
+  end
+
+  opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
+    options[:keep] = true
+  end
+
+  opts.on("--rt", "--retry N", Integer, "Maximum number of retries for failed downloads (default: 3)") do |t|
+    options[:max_retries] = t
+  end
+
+  opts.on("--snapshot-at TIMESTAMP", Integer, "Build a composite snapshot at this timestamp") do |t|
+    options[:snapshot_at] = t
+  end
+
+  opts.on("--recursive-subdomains", "Recursively download content from subdomains") do |t|
+    options[:recursive_subdomains] = true
+  end
+
+  opts.on("--subdomain-depth DEPTH", Integer, "Maximum depth for subdomain recursion (default: 1)") do |t|
+    options[:subdomain_depth] = t
+  end
+
+  opts.on("--page-requisites", "Download related assets (images, css, js) for downloaded HTML pages") do |t|
+    options[:page_requisites] = true
+  end
+
+  opts.on("-v", "--version", "Display version") do |t|
+    options[:version] = t
+  end
+end.parse!
+
+if (base_url = ARGV[-1])
+  options[:base_url] = base_url
+  wayback_machine_downloader = WaybackMachineDownloader.new options
+  if options[:local_only]
+    wayback_machine_downloader.rewrite_local_files
+  elsif options[:list]
+    wayback_machine_downloader.list_files
+  else
+    wayback_machine_downloader.download_files
+  end
+elsif options[:version]
+  puts WaybackMachineDownloader::VERSION
+else
+  puts "You need to specify a website to backup. (e.g., http://example.com)"
+  puts "Run `wayback_machine_downloader --help` for more help."
+end
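
Worth noting in this hunk: $stdout.sync = true disables Ruby's stdout buffering, so progress output appears immediately even when piped, and the new --local-only and --snapshot-at flags flow through the same options hash as everything else. A minimal sketch of equivalent programmatic use, assuming the public class API matches what the executable calls above (the URL and timestamp values are placeholders, not from the diff):

    require 'wayback_machine_downloader'

    # Same option keys the CLI sets above; :snapshot_at mirrors --snapshot-at.
    downloader = WaybackMachineDownloader.new(
      base_url: "http://example.com",
      snapshot_at: 20060716231334,
      threads_count: 20
    )
    downloader.download_files

    # --local-only maps to rewrite_local_files on an already-downloaded site:
    # WaybackMachineDownloader.new(base_url: "http://example.com").rewrite_local_files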
lib/wayback_machine_downloader/archive_api.rb CHANGED
@@ -1,61 +1,74 @@
-require 'json'
-require 'uri'
-
-module ArchiveAPI
-
-  def get_raw_list_from_api(url, page_index, http)
-    # Automatically append /* if the URL doesn't contain a path after the domain
-    # This is a workaround for an issue with the API and *some* domains.
-    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
-    # But don't do this when exact_url flag is set
-    if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
-      url = "#{url}/*"
-    end
-
-    request_url = URI("https://web.archive.org/cdx/search/cdx")
-    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
-    request_url.query = URI.encode_www_form(params)
-
-    retries = 0
-    max_retries = (@max_retries || 3)
-    delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
-
-    begin
-      response = http.get(request_url)
-      body = response.body.to_s.strip
-      return [] if body.empty?
-      json = JSON.parse(body)
-
-      # Check if the response contains the header ["timestamp", "original"]
-      json.shift if json.first == ["timestamp", "original"]
-      json
-    rescue JSON::ParserError => e
-      warn "Failed to parse JSON from API for #{url}: #{e.message}"
-      []
-    rescue Net::ReadTimeout, Net::OpenTimeout => e
-      if retries < max_retries
-        retries += 1
-        warn "Timeout talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
-        sleep(delay * retries)
-        retry
-      else
-        warn "Giving up on Wayback CDX API for #{url} after #{max_retries} timeouts."
-        []
-      end
-    rescue StandardError => e
-      # treat any other transient-ish error similarly, though without retries for now
-      warn "Error fetching CDX data for #{url}: #{e.message}"
-      []
-    end
-  end
-
-  def parameters_for_api(page_index)
-    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
-    parameters.push(["filter", "statuscode:200"]) unless @all
-    parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
-    parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
-    parameters.push(["page", page_index]) if page_index
-    parameters
-  end
-
-end
+require 'json'
+require 'uri'
+
+module ArchiveAPI
+
+  def get_raw_list_from_api(url, page_index, http)
+    # Automatically append /* if the URL doesn't contain a path after the domain
+    # This is a workaround for an issue with the API and *some* domains.
+    # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
+    # But don't do this when exact_url flag is set
+    if url && !url.match(/^https?:\/\/.*\//i) && !@exact_url
+      url = "#{url}/*"
+    end
+
+    request_url = URI("https://web.archive.org/cdx/search/cdx")
+    params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
+    request_url.query = URI.encode_www_form(params)
+
+    retries = 0
+    max_retries = (@max_retries || 3)
+    delay = WaybackMachineDownloader::RETRY_DELAY rescue 2
+
+    begin
+      request = Net::HTTP::Get.new(request_url)
+      request["User-Agent"] = "wmd-straw/#{WaybackMachineDownloader::VERSION}"
+      request["Connection"] = "keep-alive"
+      request["Accept-Encoding"] = "gzip"
+      response = http.request(request)
+
+      case response.code.to_i
+      when 200
+        body = if response['content-encoding'] == 'gzip'
+          Zlib::GzipReader.new(StringIO.new(response.body)).read
+        else
+          response.body.to_s.strip
+        end
+        return [] if body.empty?
+        begin
+          json = JSON.parse(body)
+          # check if the response contains the header ["timestamp", "original"]
+          json.shift if json.first == ["timestamp", "original"]
+          json
+        rescue JSON::ParserError => e
+          raise "Malformed JSON response: #{e.message}"
+        end
+      when 429, 500, 502, 503, 504
+        raise "Server error #{response.code}: #{response.message}"
+      else
+        warn "Unexpected API response #{response.code} for #{url}"
+        []
+      end
+    rescue Net::ReadTimeout, Net::OpenTimeout, StandardError => e
+      if retries < max_retries
+        retries += 1
+        warn "Error talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
+        sleep(delay * retries)
+        retry
+      else
+        warn "Giving up on Wayback CDX API for #{url} after #{max_retries} attempts. (Last error: #{e.message})"
+        []
+      end
+    end
+  end
+
+  def parameters_for_api(page_index)
+    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "true"]]
+    parameters.push(["filter", "statuscode:200"]) unless @all
+    parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0
+    parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0
+    parameters.push(["page", page_index]) if page_index
+    parameters
+  end
+
+end
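
Two things change in the request path here: the CDX call now sends an explicit Net::HTTP::Get with User-Agent, Connection, and Accept-Encoding headers, and server errors (429/5xx) and malformed JSON are raised so they re-enter the same linear-backoff retry loop (sleep(delay * retries)) that previously caught only timeouts. (Zlib and StringIO are presumably required elsewhere in the gem, since this file still requires only json and uri.) The gzip branch is the least obvious part; a self-contained sketch of that decode step, using a helper name of our own rather than anything from the gem:

    require 'zlib'
    require 'stringio'

    # Mirrors the body-decoding branch in get_raw_list_from_api:
    # inflate only if the server actually replied with a gzipped body.
    def decode_cdx_body(response) # hypothetical helper, for illustration
      if response['content-encoding'] == 'gzip'
        Zlib::GzipReader.new(StringIO.new(response.body)).read
      else
        response.body.to_s.strip
      end
    end

With gzip=true now also set in parameters_for_api, a typical page-0 query assembled by URI.encode_www_form looks roughly like this (percent-encoding omitted for readability):

    https://web.archive.org/cdx/search/cdx?output=json&url=example.com/*&fl=timestamp,original&collapse=digest&gzip=true&filter=statuscode:200&page=0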
lib/wayback_machine_downloader/page_requisites.rb CHANGED
@@ -1,33 +1,33 @@
-module PageRequisites
-  # regex to find links in href, src, url(), and srcset
-  # this ignores data: URIs, mailto:, and anchors
-  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
-
-  def self.extract(html_content)
-    assets = []
-
-    html_content.scan(ASSET_REGEX) do |match|
-      # match is an array of capture groups; find the one that matched
-      url = match.compact.first
-      next unless url
-
-      # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
-      if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
-        url.split(',').each do |src_def|
-          src_url = src_def.strip.split(' ').first
-          assets << src_url if valid_asset?(src_url)
-        end
-      else
-        assets << url if valid_asset?(url)
-      end
-    end
-
-    assets.uniq
-  end
-
-  def self.valid_asset?(url)
-    return false if url.strip.empty?
-    return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
-    true
-  end
+module PageRequisites
+  # regex to find links in href, src, url(), and srcset
+  # this ignores data: URIs, mailto:, and anchors
+  ASSET_REGEX = /(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
+
+  def self.extract(html_content)
+    assets = []
+
+    html_content.scan(ASSET_REGEX) do |match|
+      # match is an array of capture groups; find the one that matched
+      url = match.compact.first
+      next unless url
+
+      # handle srcset (e.g. comma separated values like "image.jpg 1x, image2.jpg 2x")
+      if url.include?(',') && (url.include?(' 1x') || url.include?(' 2w'))
+        url.split(',').each do |src_def|
+          src_url = src_def.strip.split(' ').first
+          assets << src_url if valid_asset?(src_url)
+        end
+      else
+        assets << url if valid_asset?(url)
+      end
+    end
+
+    assets.uniq
+  end
+
+  def self.valid_asset?(url)
+    return false if url.strip.empty?
+    return false if url.start_with?('data:', 'mailto:', '#', 'javascript:')
+    true
+  end
 end
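
The removed and added lines of this hunk are byte-identical, which suggests a whitespace or line-ending normalization rather than a behavioral change. The module is small enough that a usage sketch shows its whole behavior; the expected output below is hand-traced from the code above, not taken from the gem's documentation:

    html = <<~HTML
      <img src="/logo.png" srcset="/logo.png 1x, /logo@2x.png 2x">
      <a href="mailto:team@example.com">contact</a>
      <div style="background: url('/bg.jpg')"></div>
    HTML

    PageRequisites.extract(html)
    # => ["/logo.png", "/logo@2x.png", "/bg.jpg"]
    # The src value and the 1x srcset entry collapse via assets.uniq,
    # and the mailto: link is rejected by valid_asset?.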