archaeo 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +78 -3
- data/lib/archaeo/archive_url.rb +12 -0
- data/lib/archaeo/asset_extractor.rb +117 -8
- data/lib/archaeo/asset_list.rb +24 -1
- data/lib/archaeo/availability_api.rb +3 -1
- data/lib/archaeo/availability_result.rb +16 -2
- data/lib/archaeo/bulk_downloader.rb +81 -13
- data/lib/archaeo/cdx_api.rb +7 -0
- data/lib/archaeo/cdx_filter.rb +21 -1
- data/lib/archaeo/cli.rb +134 -58
- data/lib/archaeo/download_state.rb +17 -3
- data/lib/archaeo/http_client.rb +96 -14
- data/lib/archaeo/page.rb +29 -0
- data/lib/archaeo/page_bundle.rb +14 -0
- data/lib/archaeo/save_api.rb +3 -3
- data/lib/archaeo/save_result.rb +3 -2
- data/lib/archaeo/snapshot.rb +40 -0
- data/lib/archaeo/timestamp.rb +22 -0
- data/lib/archaeo/version.rb +1 -1
- metadata +1 -1
data/lib/archaeo/cli.rb
CHANGED
|
@@ -9,6 +9,10 @@ module Archaeo
|
|
|
9
9
|
class Cli < Thor
|
|
10
10
|
map %w[--version -v] => :version
|
|
11
11
|
|
|
12
|
+
def self.exit_on_failure?
|
|
13
|
+
true
|
|
14
|
+
end
|
|
15
|
+
|
|
12
16
|
desc "version", "Show archaeo version"
|
|
13
17
|
def version
|
|
14
18
|
puts "archaeo #{VERSION}"
|
|
@@ -26,51 +30,64 @@ module Archaeo
|
|
|
26
30
|
option :format, desc: "Output format (table, json, csv)",
|
|
27
31
|
default: "table"
|
|
28
32
|
def snapshots(url)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
when "json" then output_json(snaps)
|
|
34
|
-
when "csv" then output_csv(snaps)
|
|
35
|
-
else output_table(snaps)
|
|
33
|
+
fmt = validate_output_format
|
|
34
|
+
handle_errors do
|
|
35
|
+
snaps = fetch_snapshots(url)
|
|
36
|
+
output_formatted(snaps, fmt)
|
|
36
37
|
end
|
|
37
38
|
end
|
|
38
39
|
|
|
39
40
|
desc "near URL TIMESTAMP",
|
|
40
41
|
"Find the snapshot closest to a timestamp"
|
|
42
|
+
option :format, desc: "Output format (url, json)", default: "url"
|
|
41
43
|
def near(url, timestamp)
|
|
42
|
-
|
|
43
|
-
|
|
44
|
+
handle_errors do
|
|
45
|
+
snap = CdxApi.new.near(url, timestamp: timestamp)
|
|
46
|
+
output_snapshot(snap)
|
|
47
|
+
end
|
|
44
48
|
end
|
|
45
49
|
|
|
46
50
|
desc "oldest URL", "Find the oldest snapshot of a URL"
|
|
51
|
+
option :format, desc: "Output format (url, json)", default: "url"
|
|
47
52
|
def oldest(url)
|
|
48
|
-
|
|
49
|
-
|
|
53
|
+
handle_errors do
|
|
54
|
+
snap = CdxApi.new.oldest(url)
|
|
55
|
+
output_snapshot(snap)
|
|
56
|
+
end
|
|
50
57
|
end
|
|
51
58
|
|
|
52
59
|
desc "newest URL", "Find the newest snapshot of a URL"
|
|
60
|
+
option :format, desc: "Output format (url, json)", default: "url"
|
|
53
61
|
def newest(url)
|
|
54
|
-
|
|
55
|
-
|
|
62
|
+
handle_errors do
|
|
63
|
+
snap = CdxApi.new.newest(url)
|
|
64
|
+
output_snapshot(snap)
|
|
65
|
+
end
|
|
56
66
|
end
|
|
57
67
|
|
|
58
68
|
desc "available URL", "Check if a URL is archived"
|
|
69
|
+
option :timestamp, desc: "Check near this timestamp (YYYYMMDDHHmmss)"
|
|
59
70
|
def available(url)
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
71
|
+
handle_errors do
|
|
72
|
+
result = AvailabilityApi.new.near(
|
|
73
|
+
url, timestamp: options[:timestamp]
|
|
74
|
+
)
|
|
75
|
+
if result.available?
|
|
76
|
+
puts "Available: #{result.archive_url}"
|
|
77
|
+
else
|
|
78
|
+
puts "Not available"
|
|
79
|
+
exit 1
|
|
80
|
+
end
|
|
66
81
|
end
|
|
67
82
|
end
|
|
68
83
|
|
|
69
84
|
desc "save URL", "Save a URL to the Wayback Machine"
|
|
70
85
|
def save(url)
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
86
|
+
handle_errors do
|
|
87
|
+
result = SaveApi.new.save(url)
|
|
88
|
+
label = result.cached? ? "Cached" : "Saved"
|
|
89
|
+
puts "#{label}: #{result.archive_url}"
|
|
90
|
+
end
|
|
74
91
|
end
|
|
75
92
|
|
|
76
93
|
desc "fetch URL TIMESTAMP",
|
|
@@ -79,15 +96,12 @@ module Archaeo
|
|
|
79
96
|
desc: "Fetch raw (identity) content"
|
|
80
97
|
option :output, desc: "Write content to file"
|
|
81
98
|
def fetch(url, timestamp)
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
write_output(options[:output], page.content)
|
|
89
|
-
else
|
|
90
|
-
$stdout.write(page.content)
|
|
99
|
+
handle_errors do
|
|
100
|
+
page = Fetcher.new.fetch(
|
|
101
|
+
url, timestamp: timestamp,
|
|
102
|
+
identity: options[:identity]
|
|
103
|
+
)
|
|
104
|
+
output_page(page)
|
|
91
105
|
end
|
|
92
106
|
end
|
|
93
107
|
|
|
@@ -97,38 +111,34 @@ module Archaeo
|
|
|
97
111
|
option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
|
|
98
112
|
option :resume, type: :boolean, default: false,
|
|
99
113
|
desc: "Resume interrupted download"
|
|
114
|
+
option :concurrency, type: :numeric, default: 1,
|
|
115
|
+
desc: "Number of parallel downloads"
|
|
100
116
|
def download(url)
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
resume: options[:resume],
|
|
108
|
-
) do |current, total, snap|
|
|
109
|
-
warn "[#{current}/#{total}] " \
|
|
110
|
-
"#{snap.timestamp} #{snap.original_url}"
|
|
117
|
+
handle_errors do
|
|
118
|
+
downloader = BulkDownloader.new(
|
|
119
|
+
output_dir: options[:output],
|
|
120
|
+
concurrency: options[:concurrency],
|
|
121
|
+
)
|
|
122
|
+
download_with_progress(downloader, url)
|
|
111
123
|
end
|
|
112
124
|
end
|
|
113
125
|
|
|
114
126
|
desc "known_urls DOMAIN",
|
|
115
127
|
"List all known URLs for a domain"
|
|
116
128
|
def known_urls(domain)
|
|
117
|
-
|
|
118
|
-
|
|
129
|
+
handle_errors do
|
|
130
|
+
CdxApi.new.known_urls(domain).each do |u|
|
|
131
|
+
puts u
|
|
132
|
+
end
|
|
119
133
|
end
|
|
120
134
|
end
|
|
121
135
|
|
|
122
136
|
desc "num_pages URL",
|
|
123
137
|
"Show number of CDX result pages for a URL"
|
|
124
138
|
def num_pages(url)
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
exit 1
|
|
129
|
-
rescue Error => e
|
|
130
|
-
warn "Error: #{e.message}"
|
|
131
|
-
exit 1
|
|
139
|
+
handle_errors do
|
|
140
|
+
puts CdxApi.new.num_pages(url)
|
|
141
|
+
end
|
|
132
142
|
end
|
|
133
143
|
|
|
134
144
|
CDX_OPTION_MAP = {
|
|
@@ -143,6 +153,79 @@ module Archaeo
|
|
|
143
153
|
|
|
144
154
|
private
|
|
145
155
|
|
|
156
|
+
def handle_errors
|
|
157
|
+
yield
|
|
158
|
+
rescue RateLimitError => e
|
|
159
|
+
warn "Rate limited: #{e.message}"
|
|
160
|
+
exit 1
|
|
161
|
+
rescue NoSnapshotFound => e
|
|
162
|
+
warn "Not found: #{e.message}"
|
|
163
|
+
exit 1
|
|
164
|
+
rescue BlockedSiteError => e
|
|
165
|
+
warn "Blocked: #{e.message}"
|
|
166
|
+
exit 1
|
|
167
|
+
rescue Error => e
|
|
168
|
+
warn "Error: #{e.message}"
|
|
169
|
+
exit 1
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
def validate_output_format
|
|
173
|
+
fmt = options[:format].to_s
|
|
174
|
+
fmt = "table" if fmt.empty?
|
|
175
|
+
unless %w[table json csv].include?(fmt)
|
|
176
|
+
warn "Unknown format '#{fmt}'. Use: table, json, csv"
|
|
177
|
+
exit 1
|
|
178
|
+
end
|
|
179
|
+
fmt
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
def fetch_snapshots(url)
|
|
183
|
+
cdx = CdxApi.new
|
|
184
|
+
opts = build_cdx_options(options)
|
|
185
|
+
cdx.snapshots(url, **opts).to_a
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
def output_formatted(snaps, fmt)
|
|
189
|
+
case fmt
|
|
190
|
+
when "json" then output_json(snaps)
|
|
191
|
+
when "csv" then output_csv(snaps)
|
|
192
|
+
else output_table(snaps)
|
|
193
|
+
end
|
|
194
|
+
end
|
|
195
|
+
|
|
196
|
+
def output_snapshot(snap)
|
|
197
|
+
case options[:format]
|
|
198
|
+
when "json"
|
|
199
|
+
puts JSON.generate(snap.as_json)
|
|
200
|
+
else
|
|
201
|
+
puts snap.archive_url
|
|
202
|
+
end
|
|
203
|
+
end
|
|
204
|
+
|
|
205
|
+
def output_page(page)
|
|
206
|
+
if options[:output]
|
|
207
|
+
write_output(options[:output], page.content)
|
|
208
|
+
elsif page.text? || page.json?
|
|
209
|
+
$stdout.write(page.content)
|
|
210
|
+
else
|
|
211
|
+
warn "Binary content (#{page.content_type}). " \
|
|
212
|
+
"Use --output FILE to save."
|
|
213
|
+
exit 1
|
|
214
|
+
end
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
def download_with_progress(downloader, url)
|
|
218
|
+
downloader.download(
|
|
219
|
+
url,
|
|
220
|
+
from: options[:from],
|
|
221
|
+
to: options[:to],
|
|
222
|
+
resume: options[:resume],
|
|
223
|
+
) do |current, total, snap|
|
|
224
|
+
warn "[#{current}/#{total}] " \
|
|
225
|
+
"#{snap.timestamp} #{snap.original_url}"
|
|
226
|
+
end
|
|
227
|
+
end
|
|
228
|
+
|
|
146
229
|
def build_cdx_options(opts)
|
|
147
230
|
CDX_OPTION_MAP.each_with_object({}) do |(cli_key, api_key), result|
|
|
148
231
|
value = opts[cli_key]
|
|
@@ -158,14 +241,7 @@ module Archaeo
|
|
|
158
241
|
end
|
|
159
242
|
|
|
160
243
|
def output_json(snaps)
|
|
161
|
-
data = snaps.map
|
|
162
|
-
{
|
|
163
|
-
timestamp: snap.timestamp.to_s,
|
|
164
|
-
status_code: snap.status_code,
|
|
165
|
-
url: snap.original_url,
|
|
166
|
-
archive_url: snap.archive_url,
|
|
167
|
-
}
|
|
168
|
-
end
|
|
244
|
+
data = snaps.map(&:as_json)
|
|
169
245
|
puts JSON.generate(data)
|
|
170
246
|
end
|
|
171
247
|
|
data/lib/archaeo/download_state.rb
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "set"
|
|
4
|
+
|
|
3
5
|
module Archaeo
|
|
4
6
|
# Tracks download progress for resume support.
|
|
5
7
|
#
|
|
@@ -17,16 +19,21 @@ module Archaeo
|
|
|
17
19
|
end
|
|
18
20
|
|
|
19
21
|
def completed?(timestamp)
|
|
20
|
-
|
|
22
|
+
timestamps_set.include?(timestamp.to_s)
|
|
21
23
|
end
|
|
22
24
|
|
|
23
25
|
def mark_completed(timestamp)
|
|
24
|
-
|
|
26
|
+
ts = timestamp.to_s
|
|
27
|
+
return if timestamps_set.include?(ts)
|
|
28
|
+
|
|
29
|
+
timestamps << ts
|
|
30
|
+
@timestamps_set = nil
|
|
25
31
|
save
|
|
26
32
|
end
|
|
27
33
|
|
|
28
34
|
def clear
|
|
29
35
|
@timestamps = []
|
|
36
|
+
@timestamps_set = nil
|
|
30
37
|
FileUtils.rm_f(@path)
|
|
31
38
|
end
|
|
32
39
|
|
|
@@ -36,6 +43,10 @@ module Archaeo
|
|
|
36
43
|
@timestamps ||= load_timestamps
|
|
37
44
|
end
|
|
38
45
|
|
|
46
|
+
def timestamps_set
|
|
47
|
+
@timestamps_set ||= timestamps.to_set
|
|
48
|
+
end
|
|
49
|
+
|
|
39
50
|
def load_timestamps
|
|
40
51
|
return [] unless File.exist?(@path)
|
|
41
52
|
|
|
@@ -43,7 +54,10 @@ module Archaeo
|
|
|
43
54
|
end
|
|
44
55
|
|
|
45
56
|
def save
|
|
46
|
-
|
|
57
|
+
content = "#{timestamps.sort.join("\n")}\n"
|
|
58
|
+
tmp_path = "#{@path}.tmp"
|
|
59
|
+
File.write(tmp_path, content)
|
|
60
|
+
File.rename(tmp_path, @path)
|
|
47
61
|
end
|
|
48
62
|
end
|
|
49
63
|
end
|
data/lib/archaeo/http_client.rb
CHANGED
|
@@ -15,6 +15,10 @@ module Archaeo
|
|
|
15
15
|
DEFAULT_TIMEOUT = 30
|
|
16
16
|
DEFAULT_MAX_RETRIES = 3
|
|
17
17
|
DEFAULT_RETRY_DELAY = 2
|
|
18
|
+
MAX_POOL_SIZE = 8
|
|
19
|
+
MAX_IDLE_TIME = 60
|
|
20
|
+
|
|
21
|
+
RETRIABLE_STATUSES = [429, 502, 503, 504].freeze
|
|
18
22
|
|
|
19
23
|
TRANSIENT_ERRORS = [
|
|
20
24
|
Net::ReadTimeout,
|
|
@@ -64,7 +68,16 @@ module Archaeo
|
|
|
64
68
|
@retry_delay = retry_delay
|
|
65
69
|
@user_agent = user_agent
|
|
66
70
|
@connections = {}
|
|
71
|
+
@last_used = {}
|
|
67
72
|
@mutex = Mutex.new
|
|
73
|
+
@shutdown = false
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
def self.open(**opts)
|
|
77
|
+
client = new(**opts)
|
|
78
|
+
yield client
|
|
79
|
+
ensure
|
|
80
|
+
client&.shutdown
|
|
68
81
|
end
|
|
69
82
|
|
|
70
83
|
def get(url, headers: {})
|
|
@@ -81,6 +94,9 @@ module Archaeo
|
|
|
81
94
|
|
|
82
95
|
def shutdown
|
|
83
96
|
@mutex.synchronize do
|
|
97
|
+
return if @shutdown
|
|
98
|
+
|
|
99
|
+
@shutdown = true
|
|
84
100
|
@connections.each_value do |http|
|
|
85
101
|
http.finish
|
|
86
102
|
rescue StandardError
|
|
@@ -103,16 +119,26 @@ module Archaeo
|
|
|
103
119
|
def connection_for(uri)
|
|
104
120
|
key = connection_key(uri)
|
|
105
121
|
@mutex.synchronize do
|
|
122
|
+
evict_if_pool_full(key)
|
|
106
123
|
http = @connections[key]
|
|
107
124
|
if http && !http.active?
|
|
108
|
-
|
|
125
|
+
close_connection(key)
|
|
109
126
|
http = nil
|
|
110
127
|
end
|
|
111
128
|
@connections[key] = build_connection(uri) unless http
|
|
129
|
+
@last_used[key] = Time.now
|
|
112
130
|
@connections[key]
|
|
113
131
|
end
|
|
114
132
|
end
|
|
115
133
|
|
|
134
|
+
def evict_if_pool_full(key)
|
|
135
|
+
evict_stale_connections
|
|
136
|
+
return unless @connections.size >= MAX_POOL_SIZE &&
|
|
137
|
+
!@connections.key?(key)
|
|
138
|
+
|
|
139
|
+
evict_lru
|
|
140
|
+
end
|
|
141
|
+
|
|
116
142
|
def build_connection(uri)
|
|
117
143
|
http = Net::HTTP.new(uri.host, uri.port)
|
|
118
144
|
http.use_ssl = uri.scheme == "https"
|
|
@@ -124,26 +150,82 @@ module Archaeo
|
|
|
124
150
|
|
|
125
151
|
def invalidate_connection(uri)
|
|
126
152
|
key = connection_key(uri)
|
|
127
|
-
@mutex.synchronize
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
153
|
+
@mutex.synchronize { close_connection(key) }
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
def close_connection(key)
|
|
157
|
+
http = @connections.delete(key)
|
|
158
|
+
@last_used.delete(key)
|
|
159
|
+
begin
|
|
160
|
+
http&.finish
|
|
161
|
+
rescue StandardError
|
|
162
|
+
nil
|
|
163
|
+
end
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
def evict_stale_connections
|
|
167
|
+
now = Time.now
|
|
168
|
+
@connections.each_key do |key|
|
|
169
|
+
idle = now - (@last_used[key] || now)
|
|
170
|
+
close_connection(key) if idle > MAX_IDLE_TIME
|
|
171
|
+
end
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
def evict_lru
|
|
175
|
+
lru_key = @last_used.min_by { |_, t| t }&.first
|
|
176
|
+
close_connection(lru_key) if lru_key
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
# Internal error class for HTTP status retry signaling
|
|
180
|
+
class RetriableStatusError < StandardError
|
|
181
|
+
attr_reader :response
|
|
182
|
+
|
|
183
|
+
def initialize(response)
|
|
184
|
+
@response = response
|
|
185
|
+
super("Retriable HTTP status: #{response.status}")
|
|
134
186
|
end
|
|
135
187
|
end
|
|
136
188
|
|
|
137
189
|
def attempt_with_retries(uri, headers, request_class)
|
|
138
190
|
retries = 0
|
|
139
191
|
begin
|
|
140
|
-
|
|
192
|
+
execute_and_check(uri, headers, request_class)
|
|
193
|
+
rescue RetriableStatusError => e
|
|
194
|
+
retry_status(e, retries += 1) && retry
|
|
141
195
|
rescue *TRANSIENT_ERRORS => e
|
|
142
|
-
retries += 1
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
196
|
+
retry_transient(e, uri, retries += 1) && retry
|
|
197
|
+
end
|
|
198
|
+
end
|
|
199
|
+
|
|
200
|
+
def retry_status(error, retries)
|
|
201
|
+
raise_if_exhausted(retries,
|
|
202
|
+
RateLimitError.new("HTTP #{error.response.status}"))
|
|
203
|
+
sleep(extract_retry_after(error.response) || (@retry_delay * retries))
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
def retry_transient(error, uri, retries)
|
|
207
|
+
raise_if_exhausted(retries, error)
|
|
208
|
+
invalidate_connection(uri)
|
|
209
|
+
sleep(@retry_delay * retries)
|
|
210
|
+
end
|
|
211
|
+
|
|
212
|
+
def execute_and_check(uri, headers, request_class)
|
|
213
|
+
response = execute_with_connection(uri, headers, request_class)
|
|
214
|
+
if RETRIABLE_STATUSES.include?(response.status)
|
|
215
|
+
raise RetriableStatusError, response
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
response
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
def extract_retry_after(response)
|
|
222
|
+
value = response.headers["retry-after"]
|
|
223
|
+
return nil unless value
|
|
224
|
+
|
|
225
|
+
begin
|
|
226
|
+
Integer(value)
|
|
227
|
+
rescue StandardError
|
|
228
|
+
nil
|
|
147
229
|
end
|
|
148
230
|
end
|
|
149
231
|
|
data/lib/archaeo/page.rb
CHANGED
|
@@ -34,6 +34,35 @@ module Archaeo
|
|
|
34
34
|
@encoding ||= detect_encoding
|
|
35
35
|
end
|
|
36
36
|
|
|
37
|
+
def html?
|
|
38
|
+
@content_type&.include?("text/html")
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def json?
|
|
42
|
+
@content_type&.include?("application/json")
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def image?
|
|
46
|
+
@content_type&.start_with?("image/")
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
def text?
|
|
50
|
+
@content_type&.start_with?("text/")
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def binary?
|
|
54
|
+
!(text? || json? || html?)
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def title
|
|
58
|
+
@title ||= begin
|
|
59
|
+
doc = Nokogiri::HTML(@raw_content)
|
|
60
|
+
doc.at_css("title")&.text&.strip
|
|
61
|
+
rescue StandardError
|
|
62
|
+
nil
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
|
|
37
66
|
private
|
|
38
67
|
|
|
39
68
|
def detect_encoding
|
data/lib/archaeo/page_bundle.rb
CHANGED
|
@@ -6,11 +6,25 @@ module Archaeo
|
|
|
6
6
|
# Bundles a Page with the AssetList discovered from its HTML,
|
|
7
7
|
# providing a single object for complete page archival.
|
|
8
8
|
class PageBundle
|
|
9
|
+
include Enumerable
|
|
10
|
+
|
|
9
11
|
attr_reader :page, :assets
|
|
10
12
|
|
|
11
13
|
def initialize(page:, assets:)
|
|
12
14
|
@page = page
|
|
13
15
|
@assets = assets
|
|
14
16
|
end
|
|
17
|
+
|
|
18
|
+
def each(&block)
|
|
19
|
+
assets.each(&block)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def size
|
|
23
|
+
assets.size + 1
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def asset_count
|
|
27
|
+
assets.size
|
|
28
|
+
end
|
|
15
29
|
end
|
|
16
30
|
end
|
data/lib/archaeo/save_api.rb
CHANGED
|
@@ -32,7 +32,7 @@ module Archaeo
|
|
|
32
32
|
response = @client.get(save_url)
|
|
33
33
|
check_response_errors!(response, url)
|
|
34
34
|
|
|
35
|
-
result = process_save_response(response, start_time)
|
|
35
|
+
result = process_save_response(response, start_time, url)
|
|
36
36
|
return result if result
|
|
37
37
|
end
|
|
38
38
|
|
|
@@ -40,13 +40,13 @@ module Archaeo
|
|
|
40
40
|
"Failed to save #{url} after #{@max_tries} attempts"
|
|
41
41
|
end
|
|
42
42
|
|
|
43
|
-
def process_save_response(response, start_time)
|
|
43
|
+
def process_save_response(response, start_time, url)
|
|
44
44
|
archive_url = extract_archive_url(response)
|
|
45
45
|
return nil unless archive_url
|
|
46
46
|
|
|
47
47
|
ts = Timestamp.parse(extract_timestamp(archive_url))
|
|
48
48
|
cached = ts.to_time < start_time - 2700
|
|
49
|
-
SaveResult.new(archive_url: archive_url,
|
|
49
|
+
SaveResult.new(url: url, archive_url: archive_url,
|
|
50
50
|
timestamp: ts, cached: cached)
|
|
51
51
|
end
|
|
52
52
|
|
data/lib/archaeo/save_result.rb
CHANGED
|
@@ -6,9 +6,10 @@ module Archaeo
|
|
|
6
6
|
# Contains the resulting archive URL, timestamp, and whether
|
|
7
7
|
# the page was already cached in the archive.
|
|
8
8
|
class SaveResult
|
|
9
|
-
attr_reader :archive_url, :timestamp
|
|
9
|
+
attr_reader :url, :archive_url, :timestamp
|
|
10
10
|
|
|
11
|
-
def initialize(archive_url:, timestamp:, cached:)
|
|
11
|
+
def initialize(url:, archive_url:, timestamp:, cached:)
|
|
12
|
+
@url = url
|
|
12
13
|
@archive_url = archive_url
|
|
13
14
|
@timestamp = Timestamp.coerce(timestamp)
|
|
14
15
|
@cached = cached
|
data/lib/archaeo/snapshot.rb
CHANGED
|
@@ -37,6 +37,34 @@ module Archaeo
|
|
|
37
37
|
@status_code == 200
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
+
def redirect?
|
|
41
|
+
@status_code.between?(300, 399)
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def client_error?
|
|
45
|
+
@status_code.between?(400, 499)
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def server_error?
|
|
49
|
+
@status_code.between?(500, 599)
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
def error?
|
|
53
|
+
client_error? || server_error?
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def fetch(client: HttpClient.new, identity: false)
|
|
57
|
+
Fetcher.new(client: client).fetch(
|
|
58
|
+
original_url, timestamp: @timestamp, identity: identity
|
|
59
|
+
)
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def fetch_with_assets(client: HttpClient.new)
|
|
63
|
+
Fetcher.new(client: client).fetch_page_with_assets(
|
|
64
|
+
original_url, timestamp: @timestamp
|
|
65
|
+
)
|
|
66
|
+
end
|
|
67
|
+
|
|
40
68
|
def to_a
|
|
41
69
|
[@urlkey, @timestamp, @original_url, @mimetype,
|
|
42
70
|
@status_code, @digest, @length]
|
|
@@ -54,6 +82,18 @@ module Archaeo
|
|
|
54
82
|
}
|
|
55
83
|
end
|
|
56
84
|
|
|
85
|
+
def as_json(*)
|
|
86
|
+
{
|
|
87
|
+
urlkey: @urlkey,
|
|
88
|
+
timestamp: @timestamp.to_s,
|
|
89
|
+
original_url: @original_url,
|
|
90
|
+
mimetype: @mimetype,
|
|
91
|
+
status_code: @status_code,
|
|
92
|
+
digest: @digest,
|
|
93
|
+
length: @length,
|
|
94
|
+
}
|
|
95
|
+
end
|
|
96
|
+
|
|
57
97
|
def ==(other)
|
|
58
98
|
other.is_a?(self.class) && to_a == other.to_a
|
|
59
99
|
end
|
data/lib/archaeo/timestamp.rb
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "date"
|
|
4
|
+
|
|
3
5
|
module Archaeo
|
|
4
6
|
# Value object representing a Wayback Machine timestamp (YYYYMMDDHHmmss).
|
|
5
7
|
#
|
|
@@ -70,6 +72,26 @@ module Archaeo
|
|
|
70
72
|
@to_time.to_i
|
|
71
73
|
end
|
|
72
74
|
|
|
75
|
+
def to_iso8601
|
|
76
|
+
@to_time.iso8601
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
def to_rfc3339
|
|
80
|
+
@to_time.rfc3339
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
def +(seconds)
|
|
84
|
+
self.class.from_time(@to_time + seconds)
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
def -(other)
|
|
88
|
+
if other.is_a?(self.class)
|
|
89
|
+
@to_time - other.to_time
|
|
90
|
+
else
|
|
91
|
+
self.class.from_time(@to_time - other)
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
|
|
73
95
|
def <=>(other)
|
|
74
96
|
return nil unless other.is_a?(self.class)
|
|
75
97
|
|
data/lib/archaeo/version.rb
CHANGED