archaeo 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +160 -6
- data/lib/archaeo/archive_url.rb +16 -0
- data/lib/archaeo/asset_extractor.rb +19 -0
- data/lib/archaeo/asset_list.rb +31 -0
- data/lib/archaeo/availability_result.rb +24 -0
- data/lib/archaeo/bulk_downloader.rb +97 -28
- data/lib/archaeo/cdx_api.rb +4 -0
- data/lib/archaeo/cdx_filter.rb +12 -0
- data/lib/archaeo/cli.rb +96 -10
- data/lib/archaeo/download_state.rb +46 -15
- data/lib/archaeo/fetcher.rb +16 -1
- data/lib/archaeo/http_client.rb +43 -11
- data/lib/archaeo/page.rb +32 -0
- data/lib/archaeo/page_bundle.rb +28 -0
- data/lib/archaeo/save_result.rb +19 -0
- data/lib/archaeo/snapshot.rb +22 -0
- data/lib/archaeo/timestamp.rb +14 -0
- data/lib/archaeo/url_normalizer.rb +7 -1
- data/lib/archaeo/url_rewriter.rb +46 -0
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo.rb +1 -0
- metadata +1 -1
data/lib/archaeo/cli.rb
CHANGED
|
@@ -9,6 +9,9 @@ module Archaeo
|
|
|
9
9
|
class Cli < Thor
|
|
10
10
|
map %w[--version -v] => :version
|
|
11
11
|
|
|
12
|
+
class_option :quiet, type: :boolean, default: false,
|
|
13
|
+
desc: "Suppress progress messages"
|
|
14
|
+
|
|
12
15
|
def self.exit_on_failure?
|
|
13
16
|
true
|
|
14
17
|
end
|
|
@@ -65,6 +68,39 @@ module Archaeo
|
|
|
65
68
|
end
|
|
66
69
|
end
|
|
67
70
|
|
|
71
|
+
desc "before URL TIMESTAMP",
|
|
72
|
+
"Find the nearest snapshot before a timestamp"
|
|
73
|
+
option :format, desc: "Output format (url, json)", default: "url"
|
|
74
|
+
def before(url, timestamp)
|
|
75
|
+
handle_errors do
|
|
76
|
+
snap = CdxApi.new.before(url, timestamp: timestamp)
|
|
77
|
+
output_snapshot(snap)
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
desc "after URL TIMESTAMP",
|
|
82
|
+
"Find the nearest snapshot after a timestamp"
|
|
83
|
+
option :format, desc: "Output format (url, json)", default: "url"
|
|
84
|
+
def after(url, timestamp)
|
|
85
|
+
handle_errors do
|
|
86
|
+
snap = CdxApi.new.after(url, timestamp: timestamp)
|
|
87
|
+
output_snapshot(snap)
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
desc "between URL FROM TO",
|
|
92
|
+
"List snapshots in a date range"
|
|
93
|
+
option :format, desc: "Output format (table, json, csv)",
|
|
94
|
+
default: "table"
|
|
95
|
+
def between(url, from, to)
|
|
96
|
+
fmt = validate_output_format
|
|
97
|
+
handle_errors do
|
|
98
|
+
cdx = CdxApi.new
|
|
99
|
+
snaps = cdx.between(url, from: from, to: to).to_a
|
|
100
|
+
output_formatted(snaps, fmt)
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
68
104
|
desc "available URL", "Check if a URL is archived"
|
|
69
105
|
option :timestamp, desc: "Check near this timestamp (YYYYMMDDHHmmss)"
|
|
70
106
|
def available(url)
|
|
@@ -105,6 +141,18 @@ module Archaeo
|
|
|
105
141
|
end
|
|
106
142
|
end
|
|
107
143
|
|
|
144
|
+
desc "fetch-assets URL TIMESTAMP",
|
|
145
|
+
"Fetch a page and list its extracted assets"
|
|
146
|
+
option :format, desc: "Output format (json, table)", default: "table"
|
|
147
|
+
def fetch_assets(url, timestamp)
|
|
148
|
+
handle_errors do
|
|
149
|
+
bundle = Fetcher.new.fetch_page_with_assets(
|
|
150
|
+
url, timestamp: timestamp
|
|
151
|
+
)
|
|
152
|
+
output_assets(bundle)
|
|
153
|
+
end
|
|
154
|
+
end
|
|
155
|
+
|
|
108
156
|
desc "download URL", "Download all archived snapshots of a URL"
|
|
109
157
|
option :output, desc: "Output directory", default: "archive"
|
|
110
158
|
option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
|
|
@@ -113,6 +161,8 @@ module Archaeo
|
|
|
113
161
|
desc: "Resume interrupted download"
|
|
114
162
|
option :concurrency, type: :numeric, default: 1,
|
|
115
163
|
desc: "Number of parallel downloads"
|
|
164
|
+
option :dry_run, type: :boolean, default: false,
|
|
165
|
+
desc: "Preview downloads without fetching"
|
|
116
166
|
def download(url)
|
|
117
167
|
handle_errors do
|
|
118
168
|
downloader = BulkDownloader.new(
|
|
@@ -141,6 +191,14 @@ module Archaeo
|
|
|
141
191
|
end
|
|
142
192
|
end
|
|
143
193
|
|
|
194
|
+
desc "count URL",
|
|
195
|
+
"Count snapshots for a URL"
|
|
196
|
+
def count(url)
|
|
197
|
+
handle_errors do
|
|
198
|
+
puts CdxApi.new.count(url)
|
|
199
|
+
end
|
|
200
|
+
end
|
|
201
|
+
|
|
144
202
|
CDX_OPTION_MAP = {
|
|
145
203
|
from: :from,
|
|
146
204
|
to: :to,
|
|
@@ -153,6 +211,10 @@ module Archaeo
|
|
|
153
211
|
|
|
154
212
|
private
|
|
155
213
|
|
|
214
|
+
def quiet?
|
|
215
|
+
options[:quiet]
|
|
216
|
+
end
|
|
217
|
+
|
|
156
218
|
def handle_errors
|
|
157
219
|
yield
|
|
158
220
|
rescue RateLimitError => e
|
|
@@ -214,18 +276,42 @@ module Archaeo
|
|
|
214
276
|
end
|
|
215
277
|
end
|
|
216
278
|
|
|
217
|
-
def
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
279
|
+
def output_assets(bundle)
|
|
280
|
+
case options[:format]
|
|
281
|
+
when "json"
|
|
282
|
+
puts bundle.assets.to_json
|
|
283
|
+
else
|
|
284
|
+
bundle.assets.to_h.each do |type, urls|
|
|
285
|
+
next if urls.empty?
|
|
286
|
+
|
|
287
|
+
puts "#{type}:"
|
|
288
|
+
urls.each { |url| puts " #{url}" }
|
|
289
|
+
end
|
|
226
290
|
end
|
|
227
291
|
end
|
|
228
292
|
|
|
293
|
+
def download_with_progress(downloader, url)
|
|
294
|
+
summary = downloader.download(
|
|
295
|
+
url, from: options[:from], to: options[:to],
|
|
296
|
+
resume: options[:resume], dry_run: options[:dry_run]
|
|
297
|
+
) { |c, t, s| print_progress(c, t, s) }
|
|
298
|
+
print_summary(summary)
|
|
299
|
+
end
|
|
300
|
+
|
|
301
|
+
def print_progress(current, total, snap)
|
|
302
|
+
return if quiet?
|
|
303
|
+
|
|
304
|
+
warn "[#{current}/#{total}] #{snap.timestamp} #{snap.original_url}"
|
|
305
|
+
end
|
|
306
|
+
|
|
307
|
+
def print_summary(summary)
|
|
308
|
+
return if quiet?
|
|
309
|
+
|
|
310
|
+
warn "Downloaded #{summary.downloaded}/#{summary.total} " \
|
|
311
|
+
"(#{summary.bytes_written} bytes) in " \
|
|
312
|
+
"#{summary.elapsed.round(1)}s"
|
|
313
|
+
end
|
|
314
|
+
|
|
229
315
|
def build_cdx_options(opts)
|
|
230
316
|
CDX_OPTION_MAP.each_with_object({}) do |(cli_key, api_key), result|
|
|
231
317
|
value = opts[cli_key]
|
|
@@ -259,7 +345,7 @@ module Archaeo
|
|
|
259
345
|
def write_output(path, content)
|
|
260
346
|
FileUtils.mkdir_p(File.dirname(path))
|
|
261
347
|
File.binwrite(path, content)
|
|
262
|
-
warn "Written to #{path}"
|
|
348
|
+
warn "Written to #{path}" unless quiet?
|
|
263
349
|
end
|
|
264
350
|
end
|
|
265
351
|
end
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "json"
|
|
3
4
|
require "set"
|
|
4
5
|
|
|
5
6
|
module Archaeo
|
|
6
7
|
# Tracks download progress for resume support.
|
|
7
8
|
#
|
|
8
|
-
# Persists completed snapshot
|
|
9
|
+
# Persists completed snapshot metadata to a JSONL state file within
|
|
9
10
|
# the output directory, allowing interrupted downloads to resume
|
|
10
11
|
# without re-fetching already downloaded snapshots.
|
|
11
12
|
class DownloadState
|
|
@@ -19,42 +20,72 @@ module Archaeo
|
|
|
19
20
|
end
|
|
20
21
|
|
|
21
22
|
def completed?(timestamp)
|
|
22
|
-
|
|
23
|
+
entries_key.include?(timestamp.to_s)
|
|
23
24
|
end
|
|
24
25
|
|
|
25
|
-
def mark_completed(timestamp)
|
|
26
|
+
def mark_completed(timestamp, url: nil, bytes: nil)
|
|
26
27
|
ts = timestamp.to_s
|
|
27
|
-
return if
|
|
28
|
+
return if entries_key.include?(ts)
|
|
28
29
|
|
|
29
|
-
|
|
30
|
-
|
|
30
|
+
entry = { "ts" => ts, "at" => Time.now.utc.iso8601 }
|
|
31
|
+
entry["url"] = url if url
|
|
32
|
+
entry["bytes"] = bytes if bytes
|
|
33
|
+
entries << entry
|
|
34
|
+
@entries_key = nil
|
|
31
35
|
save
|
|
32
36
|
end
|
|
33
37
|
|
|
38
|
+
def entry_for(timestamp)
|
|
39
|
+
entries.find { |e| e["ts"] == timestamp.to_s }
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def total_bytes
|
|
43
|
+
entries.sum { |e| e["bytes"].to_i }
|
|
44
|
+
end
|
|
45
|
+
|
|
34
46
|
def clear
|
|
35
|
-
@
|
|
36
|
-
@
|
|
47
|
+
@entries = []
|
|
48
|
+
@entries_key = nil
|
|
37
49
|
FileUtils.rm_f(@path)
|
|
38
50
|
end
|
|
39
51
|
|
|
40
52
|
private
|
|
41
53
|
|
|
42
|
-
def
|
|
43
|
-
@
|
|
54
|
+
def entries
|
|
55
|
+
@entries ||= load_entries
|
|
44
56
|
end
|
|
45
57
|
|
|
46
|
-
def
|
|
47
|
-
@
|
|
58
|
+
def entries_key
|
|
59
|
+
@entries_key ||= entries.each_with_object(Set.new) { |e, s| s << e["ts"] }
|
|
48
60
|
end
|
|
49
61
|
|
|
50
|
-
def
|
|
62
|
+
def load_entries
|
|
51
63
|
return [] unless File.exist?(@path)
|
|
52
64
|
|
|
53
|
-
File.
|
|
65
|
+
first_line = File.open(@path, &:readline).strip
|
|
66
|
+
if first_line.start_with?("{")
|
|
67
|
+
parse_jsonl
|
|
68
|
+
else
|
|
69
|
+
migrate_legacy(first_line)
|
|
70
|
+
end
|
|
71
|
+
rescue EOFError
|
|
72
|
+
[]
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
# Reads the JSONL state file at @path and returns its entries as an
# Array of Hashes (one per line).
#
# Lines that are blank, are not valid JSON, or decode to something other
# than a JSON object are skipped instead of raising, so a single damaged
# line does not make the whole state unreadable. A skipped entry simply
# means that snapshot is not reported as completed.
#
# @return [Array<Hash>] decoded entries in file order
def parse_jsonl
  File.readlines(@path, chomp: true).each_with_object([]) do |line, parsed|
    next if line.strip.empty?

    record = JSON.parse(line)
    # Callers index entries with string keys (e["ts"]), so only objects
    # are usable here.
    parsed << record if record.is_a?(Hash)
  rescue JSON::ParserError
    next
  end
end
|
|
80
|
+
|
|
81
|
+
def migrate_legacy(_first_line)
|
|
82
|
+
File.readlines(@path, chomp: true).reject(&:empty?).map do |ts|
|
|
83
|
+
{ "ts" => ts }
|
|
84
|
+
end
|
|
54
85
|
end
|
|
55
86
|
|
|
56
87
|
def save
|
|
57
|
-
content = "#{
|
|
88
|
+
content = "#{entries.map { |e| JSON.generate(e) }.join("\n")}\n"
|
|
58
89
|
tmp_path = "#{@path}.tmp"
|
|
59
90
|
File.write(tmp_path, content)
|
|
60
91
|
File.rename(tmp_path, @path)
|
data/lib/archaeo/fetcher.rb
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "digest"
|
|
4
|
+
|
|
3
5
|
module Archaeo
|
|
4
6
|
# Downloads archived content from the Wayback Machine.
|
|
5
7
|
#
|
|
@@ -13,12 +15,13 @@ module Archaeo
|
|
|
13
15
|
@client = client
|
|
14
16
|
end
|
|
15
17
|
|
|
16
|
-
def fetch(url, timestamp:, identity: false)
|
|
18
|
+
def fetch(url, timestamp:, identity: false, snapshot: nil)
|
|
17
19
|
url = UrlNormalizer.normalize(url)
|
|
18
20
|
ts = Timestamp.coerce(timestamp)
|
|
19
21
|
archive_url = ArchiveUrl.new(url, timestamp: ts,
|
|
20
22
|
identity: identity)
|
|
21
23
|
response = follow_redirects(archive_url.to_s)
|
|
24
|
+
verify_integrity!(response, snapshot) if snapshot
|
|
22
25
|
build_page(response, archive_url.to_s, url, ts)
|
|
23
26
|
end
|
|
24
27
|
|
|
@@ -31,6 +34,18 @@ module Archaeo
|
|
|
31
34
|
|
|
32
35
|
private
|
|
33
36
|
|
|
37
|
+
# Checks the response body against the SHA-1 digest recorded on the
# snapshot and raises when they differ.
#
# The recorded digest may be Base32 (the form used by the CDX index,
# 32 characters) or hexadecimal (40 characters); an optional "SHA1-" or
# "sha1:" prefix is ignored. The body's SHA-1 is rendered in the same
# form as the recorded digest before comparing, case-insensitively.
# Nothing is checked when the snapshot has no digest.
#
# NOTE(review): the digest describes the originally captured payload; a
# body fetched without the identity flag may have been rewritten by the
# archive and would not match — confirm callers pass identity content.
#
# @param response [#body] fetched response
# @param snapshot [#digest, #original_url] snapshot metadata
# @return [nil] when the digest is absent or matches
# @raise [IntegrityError] when the digests differ
def verify_integrity!(response, snapshot)
  recorded = snapshot.digest.to_s
  return if recorded.empty?

  expected = recorded.delete_prefix("SHA1-").delete_prefix("sha1:")
  raw = Digest::SHA1.digest(response.body.to_s)
  actual =
    if expected.match?(/\A\h{40}\z/)
      raw.unpack1("H*")
    else
      # A SHA-1 is 160 bits, i.e. exactly 32 five-bit groups, so the
      # RFC 4648 Base32 form needs no padding.
      alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"
      raw.unpack1("B*").scan(/.{5}/).map { |bits| alphabet[bits.to_i(2)] }.join
    end
  return if expected.casecmp?(actual)

  raise IntegrityError,
        "Digest mismatch for #{snapshot.original_url}: " \
        "expected #{expected}, got #{actual}"
end
|
|
48
|
+
|
|
34
49
|
def build_page(response, archive_url, url, timestamp)
|
|
35
50
|
Page.new(
|
|
36
51
|
content: response.body,
|
data/lib/archaeo/http_client.rb
CHANGED
|
@@ -62,11 +62,13 @@ module Archaeo
|
|
|
62
62
|
def initialize(timeout: DEFAULT_TIMEOUT,
|
|
63
63
|
max_retries: DEFAULT_MAX_RETRIES,
|
|
64
64
|
retry_delay: DEFAULT_RETRY_DELAY,
|
|
65
|
-
user_agent: nil
|
|
65
|
+
user_agent: nil,
|
|
66
|
+
on_request: nil)
|
|
66
67
|
@timeout = timeout
|
|
67
68
|
@max_retries = max_retries
|
|
68
69
|
@retry_delay = retry_delay
|
|
69
70
|
@user_agent = user_agent
|
|
71
|
+
@on_request = on_request
|
|
70
72
|
@connections = {}
|
|
71
73
|
@last_used = {}
|
|
72
74
|
@mutex = Mutex.new
|
|
@@ -106,6 +108,18 @@ module Archaeo
|
|
|
106
108
|
end
|
|
107
109
|
end
|
|
108
110
|
|
|
111
|
+
def pool_stats
|
|
112
|
+
now = Time.now
|
|
113
|
+
@mutex.synchronize do
|
|
114
|
+
{
|
|
115
|
+
active_connections: @connections.size,
|
|
116
|
+
max_pool_size: MAX_POOL_SIZE,
|
|
117
|
+
hosts: @connections.keys,
|
|
118
|
+
idle_times: @last_used.transform_values { |t| (now - t).round },
|
|
119
|
+
}.freeze
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
109
123
|
private
|
|
110
124
|
|
|
111
125
|
def select_user_agent
|
|
@@ -222,11 +236,16 @@ module Archaeo
|
|
|
222
236
|
value = response.headers["retry-after"]
|
|
223
237
|
return nil unless value
|
|
224
238
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
239
|
+
Integer(value)
|
|
240
|
+
rescue ArgumentError
|
|
241
|
+
parse_retry_after_date(value)
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
# Interprets a Retry-After value given as an HTTP date.
#
# @param value [String] header value expected to be an HTTP-date
# @return [Integer, nil] whole seconds from now until that date (never
#   negative), or nil when the value is not a valid HTTP date
def parse_retry_after_date(value)
  deadline = begin
    Time.httpdate(value)
  rescue ArgumentError
    nil
  end
  return nil unless deadline

  wait = (deadline - Time.now).to_i
  # A date already in the past means no waiting is required.
  wait.negative? ? 0 : wait
end
|
|
231
250
|
|
|
232
251
|
def raise_if_exhausted(retries, error)
|
|
@@ -237,11 +256,8 @@ module Archaeo
|
|
|
237
256
|
end
|
|
238
257
|
|
|
239
258
|
def execute_with_connection(uri, headers, request_class)
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
headers.each { |k, v| request[k] = v }
|
|
243
|
-
raw = http.request(request)
|
|
244
|
-
build_response(raw)
|
|
259
|
+
request = build_request(uri, headers, request_class)
|
|
260
|
+
execute_tracked_request(uri, request)
|
|
245
261
|
rescue *TRANSIENT_ERRORS
|
|
246
262
|
raise
|
|
247
263
|
rescue StandardError
|
|
@@ -249,6 +265,22 @@ module Archaeo
|
|
|
249
265
|
raise
|
|
250
266
|
end
|
|
251
267
|
|
|
268
|
+
def build_request(uri, headers, request_class)
|
|
269
|
+
request = request_class.new(uri)
|
|
270
|
+
headers.each { |k, v| request[k] = v }
|
|
271
|
+
request
|
|
272
|
+
end
|
|
273
|
+
|
|
274
|
+
def execute_tracked_request(uri, request)
|
|
275
|
+
http = connection_for(uri)
|
|
276
|
+
start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
277
|
+
raw = http.request(request)
|
|
278
|
+
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
|
|
279
|
+
response = build_response(raw)
|
|
280
|
+
@on_request&.call(uri, elapsed, response.status, 0)
|
|
281
|
+
response
|
|
282
|
+
end
|
|
283
|
+
|
|
252
284
|
def default_headers
|
|
253
285
|
{
|
|
254
286
|
"User-Agent" => select_user_agent,
|
data/lib/archaeo/page.rb
CHANGED
|
@@ -50,6 +50,10 @@ module Archaeo
|
|
|
50
50
|
@content_type&.start_with?("text/")
|
|
51
51
|
end
|
|
52
52
|
|
|
53
|
+
def css?
|
|
54
|
+
@content_type&.include?("text/css")
|
|
55
|
+
end
|
|
56
|
+
|
|
53
57
|
def binary?
|
|
54
58
|
!(text? || json? || html?)
|
|
55
59
|
end
|
|
@@ -63,6 +67,34 @@ module Archaeo
|
|
|
63
67
|
end
|
|
64
68
|
end
|
|
65
69
|
|
|
70
|
+
def to_h
|
|
71
|
+
{
|
|
72
|
+
content_type: @content_type,
|
|
73
|
+
status_code: @status_code,
|
|
74
|
+
archive_url: @archive_url,
|
|
75
|
+
original_url: @original_url,
|
|
76
|
+
timestamp: @timestamp,
|
|
77
|
+
size: size,
|
|
78
|
+
encoding: encoding.to_s,
|
|
79
|
+
}
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def as_json(*)
|
|
83
|
+
{
|
|
84
|
+
content_type: @content_type,
|
|
85
|
+
status_code: @status_code,
|
|
86
|
+
archive_url: @archive_url,
|
|
87
|
+
original_url: @original_url,
|
|
88
|
+
timestamp: @timestamp.to_s,
|
|
89
|
+
size: size,
|
|
90
|
+
encoding: encoding.to_s,
|
|
91
|
+
}
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
def inspect
|
|
95
|
+
"#<#{self.class.name} #{@content_type} #{size} bytes>"
|
|
96
|
+
end
|
|
97
|
+
|
|
66
98
|
private
|
|
67
99
|
|
|
68
100
|
def detect_encoding
|
data/lib/archaeo/page_bundle.rb
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
3
5
|
module Archaeo
|
|
4
6
|
# A fetched page together with all its extracted asset URLs.
|
|
5
7
|
#
|
|
@@ -26,5 +28,31 @@ module Archaeo
|
|
|
26
28
|
def asset_count
|
|
27
29
|
assets.size
|
|
28
30
|
end
|
|
31
|
+
|
|
32
|
+
def to_h
|
|
33
|
+
{ page: @page.to_h, assets: @assets.to_h }
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def as_json(*)
|
|
37
|
+
{ page: @page.as_json, assets: @assets.to_h }
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def to_json(*args)
|
|
41
|
+
JSON.generate(as_json, *args)
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
# Downloads every asset URL of this bundle into +output_dir+.
#
# Each asset is stored under the basename of its URL path, so assets
# sharing a basename overwrite one another. The body is first written to
# "<file>.tmp" and then renamed, so a failed download never leaves a
# partial file under the final name.
#
# This is best effort: any StandardError raised for one asset (invalid
# URL, failed request, write error) is swallowed, its temporary file is
# removed, and the next asset is processed.
#
# @param output_dir [String] directory to write into (created if missing)
# @param client [#get] HTTP client; +get(url)+ must return an object
#   responding to +body+
# @return the collection returned by +@assets.all+
def download_assets(output_dir:, client: HttpClient.new)
  FileUtils.mkdir_p(output_dir)
  @assets.all.each do |url|
    # Reset per asset: a local assigned later in the block is always
    # "defined" to Ruby, so cleanup has to test its value, not defined?.
    tmp_path = nil
    filename = File.join(output_dir,
                         File.basename(URI.parse(url).path))
    tmp_path = "#{filename}.tmp"
    response = client.get(url)
    File.binwrite(tmp_path, response.body)
    File.rename(tmp_path, filename)
  rescue StandardError
    FileUtils.rm_f(tmp_path) if tmp_path
  end
end
|
|
29
57
|
end
|
|
30
58
|
end
|
data/lib/archaeo/save_result.rb
CHANGED
|
@@ -18,5 +18,24 @@ module Archaeo
|
|
|
18
18
|
def cached?
|
|
19
19
|
@cached
|
|
20
20
|
end
|
|
21
|
+
|
|
22
|
+
def to_h
|
|
23
|
+
{ url: @url, archive_url: @archive_url,
|
|
24
|
+
timestamp: @timestamp, cached: @cached }
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def as_json(*)
|
|
28
|
+
{ url: @url, archive_url: @archive_url,
|
|
29
|
+
timestamp: @timestamp.to_s, cached: @cached }
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def to_s
|
|
33
|
+
label = @cached ? "Cached" : "Saved"
|
|
34
|
+
"#{label}: #{@archive_url}"
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def inspect
|
|
38
|
+
"#<#{self.class.name} #{@url} cached=#{@cached}>"
|
|
39
|
+
end
|
|
21
40
|
end
|
|
22
41
|
end
|
data/lib/archaeo/snapshot.rb
CHANGED
|
@@ -29,6 +29,11 @@ module Archaeo
|
|
|
29
29
|
ArchiveUrl.new(original_url, timestamp: @timestamp).to_s
|
|
30
30
|
end
|
|
31
31
|
|
|
32
|
+
def identity_url
|
|
33
|
+
ArchiveUrl.new(original_url, timestamp: @timestamp,
|
|
34
|
+
identity: true).to_s
|
|
35
|
+
end
|
|
36
|
+
|
|
32
37
|
def blocked?
|
|
33
38
|
@status_code == BLOCKED_STATUS
|
|
34
39
|
end
|
|
@@ -53,6 +58,18 @@ module Archaeo
|
|
|
53
58
|
client_error? || server_error?
|
|
54
59
|
end
|
|
55
60
|
|
|
61
|
+
def age
|
|
62
|
+
Time.now - @timestamp.to_time
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
def older_than?(seconds)
|
|
66
|
+
age > seconds
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
def newer_than?(seconds)
|
|
70
|
+
age <= seconds
|
|
71
|
+
end
|
|
72
|
+
|
|
56
73
|
def fetch(client: HttpClient.new, identity: false)
|
|
57
74
|
Fetcher.new(client: client).fetch(
|
|
58
75
|
original_url, timestamp: @timestamp, identity: identity
|
|
@@ -102,5 +119,10 @@ module Archaeo
|
|
|
102
119
|
def hash
|
|
103
120
|
to_a.hash
|
|
104
121
|
end
|
|
122
|
+
|
|
123
|
+
def inspect
|
|
124
|
+
"#<#{self.class.name} #{timestamp} " \
|
|
125
|
+
"#{original_url} status=#{status_code}>"
|
|
126
|
+
end
|
|
105
127
|
end
|
|
106
128
|
end
|
data/lib/archaeo/timestamp.rb
CHANGED
|
@@ -17,6 +17,7 @@ module Archaeo
|
|
|
17
17
|
def initialize(year:, month: 1, day: 1,
|
|
18
18
|
hour: 0, minute: 0, second: 0)
|
|
19
19
|
@to_time = Time.utc(year, month, day, hour, minute, second)
|
|
20
|
+
freeze
|
|
20
21
|
end
|
|
21
22
|
|
|
22
23
|
def self.parse(string)
|
|
@@ -129,5 +130,18 @@ module Archaeo
|
|
|
129
130
|
def second
|
|
130
131
|
@to_time.sec
|
|
131
132
|
end
|
|
133
|
+
|
|
134
|
+
def to_h
|
|
135
|
+
{ year: year, month: month, day: day,
|
|
136
|
+
hour: hour, minute: minute, second: second }
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
def to_a
|
|
140
|
+
[year, month, day, hour, minute, second]
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
def inspect
|
|
144
|
+
"#<#{self.class.name} #{self}>"
|
|
145
|
+
end
|
|
132
146
|
end
|
|
133
147
|
end
|
|
@@ -49,7 +49,8 @@ module Archaeo
|
|
|
49
49
|
url = strip_whitespace(url)
|
|
50
50
|
url = strip_surrounding_quotes(url)
|
|
51
51
|
url = fix_double_percent_encoding(url)
|
|
52
|
-
normalize_percent_encoding(url)
|
|
52
|
+
url = normalize_percent_encoding(url)
|
|
53
|
+
remove_default_port(url)
|
|
53
54
|
end
|
|
54
55
|
|
|
55
56
|
def strip_whitespace(url)
|
|
@@ -69,5 +70,10 @@ module Archaeo
|
|
|
69
70
|
def normalize_percent_encoding(url)
|
|
70
71
|
url.gsub(/%[0-9a-f]{2}/i, &:upcase)
|
|
71
72
|
end
|
|
73
|
+
|
|
74
|
+
# Strips an explicit default port (":443" for https, ":80" for http)
# from the authority of +url+.
#
# Only the URL's own leading scheme://host is considered (anchored at the
# start of the string), so a URL embedded in the path or query is left
# untouched. The port must be followed by "/", "?", "#" or the end of the
# string. The scheme is matched case-insensitively.
#
# @param url [String]
# @return [String] the URL without a redundant default port
def remove_default_port(url)
  url.sub(%r{\A(https://[^/?#:@]+):443(?=[/?#]|\z)}i, '\1')
     .sub(%r{\A(http://[^/?#:@]+):80(?=[/?#]|\z)}i, '\1')
end
|
|
72
78
|
end
|
|
73
79
|
end
|
data/lib/archaeo/url_rewriter.rb
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
|
|
3
5
|
module Archaeo
|
|
4
6
|
# Rewrites Wayback Machine archive URLs to local file paths.
|
|
5
7
|
#
|
|
@@ -7,6 +9,8 @@ module Archaeo
|
|
|
7
9
|
# browsing. Converts absolute archive URLs into relative paths
|
|
8
10
|
# rooted at a configurable local directory.
|
|
9
11
|
class UrlRewriter
|
|
12
|
+
URL_ATTRS = %w[src href data-src poster].freeze
|
|
13
|
+
|
|
10
14
|
def initialize(archive_prefix, local_prefix)
|
|
11
15
|
@archive_prefix = archive_prefix.to_s
|
|
12
16
|
@local_prefix = local_prefix.to_s
|
|
@@ -18,5 +22,47 @@ module Archaeo
|
|
|
18
22
|
relative = url.sub(@archive_prefix, "")
|
|
19
23
|
File.join(@local_prefix, relative)
|
|
20
24
|
end
|
|
25
|
+
|
|
26
|
+
def rewrite_batch(urls)
|
|
27
|
+
urls.map { |url| rewrite(url) }
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def rewrite_html(html_content)
|
|
31
|
+
doc = Nokogiri::HTML(html_content)
|
|
32
|
+
rewrite_url_attrs(doc)
|
|
33
|
+
rewrite_srcset_attrs(doc)
|
|
34
|
+
doc.to_html
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def rewrite_url_attrs(doc)
|
|
38
|
+
URL_ATTRS.each do |attr|
|
|
39
|
+
doc.css("[#{attr}]").each do |el|
|
|
40
|
+
original = el[attr]
|
|
41
|
+
next unless original&.start_with?(@archive_prefix)
|
|
42
|
+
|
|
43
|
+
el[attr] = rewrite(original)
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def rewrite_srcset_attrs(doc)
|
|
49
|
+
doc.css("[srcset]").each do |el|
|
|
50
|
+
el["srcset"] = rewrite_srcset(el["srcset"])
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
private
|
|
55
|
+
|
|
56
|
+
# Rewrites the URLs inside a srcset attribute value.
#
# The value is split on commas into candidates of the form
# "URL [descriptor]". URLs that start with @archive_prefix are passed
# through #rewrite; all others are kept as they are. Blank candidates
# (for example from a trailing or doubled comma) are dropped rather than
# treated as a URL.
#
# @param srcset [String, nil] raw attribute value
# @return [String, nil] rewritten value joined with ", ", or nil when
#   +srcset+ is nil
def rewrite_srcset(srcset)
  return srcset unless srcset

  candidates = srcset.split(",").map(&:strip).reject(&:empty?)
  candidates.map do |candidate|
    url, descriptor = candidate.split(/\s+/, 2)
    rewritten = url.start_with?(@archive_prefix) ? rewrite(url) : url
    descriptor ? "#{rewritten} #{descriptor}" : rewritten
  end.join(", ")
end
|
|
21
67
|
end
|
|
22
68
|
end
|
data/lib/archaeo/version.rb
CHANGED
data/lib/archaeo.rb
CHANGED