archaeo 0.2.7 → 0.2.8

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ecdcd994fa61efa836a5224a5e329b40b72694c27a79cbb6eb4f91bf57c0f2c9
-  data.tar.gz: 03ad557eb55ce9946a2936e3beec8cad13db2ecd4b2fc49b0996131d35e6ddba
+  metadata.gz: 67239af7cc927c495c67a849ecefb1cdc886ce8d95ddd6e27a2decdde6a93cd3
+  data.tar.gz: 8ce4a0f786c2e7db3268b6660a1aa9e2f3b913ff99c22c85c3c2190457defc90
 SHA512:
-  metadata.gz: a2859d1738f4f4a9fa0f0ed89d118dacfc24a2f75d3237ad3bdd31cf26c041e8aa5f47c998d8b5e61907c34d491ed48b6df22d2784702d60a15820ba8d8a2a27
-  data.tar.gz: e2df62b1077c90d8b04173f9aa713f590f9692cdfde3d3b656796308810341cd5b052321f97a28513ebe98143cdb8f28fa7e182720555db231983fa3d2a6d4be
+  metadata.gz: ca0a9cc2bf0ad33a0d3dfd88e3228fd79fc3291a42fd3d13bbfbe4e37e744b0e3a5dadcec1cab48c0e13b6af872a8e3f4e80ce3e6593b18f024416b9cf7370fa
+  data.tar.gz: bb4b1d9e720dfdcc18c7c4ccb73cc55e29a3e31fb6ffb5bf3b8c0fce1548a63a06da4a710ab5fc5020f142ede69dbdfbef5451944183e280e86e018379a792eb
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+module Archaeo
+  # Verifies that archived snapshots are still accessible.
+  #
+  # Checks each snapshot by performing HEAD requests to the
+  # archive URL and reporting accessibility status.
+  HealthReport = Struct.new(
+    :total, :accessible, :missing, :errors, :details,
+    keyword_init: true
+  )
+
+  HealthDetail = Struct.new(
+    :snapshot, :status, :error,
+    keyword_init: true
+  )
+
+  class ArchiveHealthCheck
+    def initialize(client: HttpClient.new, cdx_api: nil)
+      @client = client
+      @cdx_api = cdx_api
+    end
+
+    def check(url, from: nil, to: nil, sample: nil)
+      snapshots = fetch_snapshots(url, from: from, to: to)
+      snapshots = sample_snapshots(snapshots, sample) if sample
+
+      details = check_snapshots(snapshots)
+      build_report(details)
+    end
+
+    private
+
+    def fetch_snapshots(url, from:, to:)
+      cdx = @cdx_api || CdxApi.new(client: @client)
+      opts = {}
+      opts[:from] = from if from
+      opts[:to] = to if to
+      cdx.snapshots(url, **opts)
+         .select(&:success?).to_a
+    end
+
+    def sample_snapshots(snapshots, count)
+      return snapshots if count.nil? || count >= snapshots.size
+
+      step = snapshots.size.to_f / count
+      (0...count).map { |i| snapshots[(i * step).to_i] }
+    end
+
+    def check_snapshots(snapshots)
+      snapshots.map do |snap|
+        check_single(snap)
+      end
+    end
+
+    def check_single(snapshot)
+      response = @client.head(snapshot.archive_url)
+      status = response.status.between?(200, 399) ? :accessible : :missing
+      HealthDetail.new(snapshot: snapshot, status: status, error: nil)
+    rescue StandardError => e
+      HealthDetail.new(snapshot: snapshot, status: :error, error: e.message)
+    end
+
+    def build_report(details)
+      total = details.size
+      accessible = details.count { |d| d.status == :accessible }
+      missing = details.count { |d| d.status == :missing }
+      errors = details.count { |d| d.status == :error }
+
+      HealthReport.new(
+        total: total, accessible: accessible,
+        missing: missing, errors: errors,
+        details: details
+      )
+    end
+  end
+end
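
The health checker can be driven directly from Ruby as well as through the new `health` CLI command further down. A minimal sketch, assuming the defaults shown above and the gem's standard entry point; the URL, range, and sample size are invented for illustration:

    require "archaeo"

    report = Archaeo::ArchiveHealthCheck.new.check(
      "example.com",
      from: "2015", to: "2020",  # optional CDX range bounds
      sample: 25                 # spot-check 25 evenly spaced snapshots
    )
    puts "#{report.accessible}/#{report.total} snapshots still resolve"
    report.details.each do |d|
      warn "#{d.snapshot.archive_url}: #{d.status} #{d.error}" if d.status != :accessible
    end

Note that sample_snapshots picks evenly spaced indices (i * total/count), so a sample spans the whole capture history rather than just the oldest entries.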
@@ -15,23 +15,31 @@ module Archaeo
   # for interrupted download recovery.
   class BulkDownloader
     def initialize(client: HttpClient.new, output_dir: "archive",
-                   cdx_api: nil, concurrency: 1, on_error: nil)
+                   cdx_api: nil, concurrency: 1, on_error: nil,
+                   rate_limiter: nil, path_sanitizer: nil)
       @client = client
       @output_dir = output_dir
       @cdx_api = cdx_api
       @concurrency = [1, concurrency.to_i].max
       @on_error = on_error
+      @rate_limiter = rate_limiter || RateLimiter.new
+      @path_sanitizer = path_sanitizer || PathSanitizer.new
     end
 
     def download(url, from: nil, to: nil, resume: false,
-                 dry_run: false, &block)
+                 dry_run: false, all_timestamps: false,
+                 filter: nil, page_requisites: false,
+                 snapshot_at: nil, &block)
       start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
       url = UrlNormalizer.normalize(url)
       FileUtils.mkdir_p(@output_dir) unless dry_run
 
-      snapshots = fetch_snapshots(url, from: from, to: to)
+      snapshots = fetch_snapshots(url, from: from, to: to,
+                                  all_timestamps: all_timestamps,
+                                  snapshot_at: snapshot_at)
+      snapshots = apply_filter(snapshots, filter)
       downloaded, skipped, bytes, failed =
-        run_download(snapshots, resume, dry_run, block)
+        run_download(snapshots, resume, dry_run, page_requisites, block)
 
       build_summary(start_time, snapshots.size, downloaded,
                     skipped, bytes, failed: failed)
@@ -39,25 +47,39 @@ module Archaeo
 
     private
 
-    def fetch_snapshots(url, from:, to:)
+    def fetch_snapshots(url, from:, to:, all_timestamps:, snapshot_at:)
       cdx = @cdx_api || CdxApi.new(client: @client)
+
+      if snapshot_at
+        ts = Timestamp.coerce(snapshot_at)
+        return cdx.composite_snapshot(url, timestamp: ts, collapse: ["digest"])
+      end
+
       options = {}
       options[:from] = from if from
       options[:to] = to if to
+      options[:collapse] = ["digest"] unless all_timestamps
+
       cdx.snapshots(url, **options)
          .select { |snap| !snap.blocked? && snap.status_code == 200 }
     end
 
-    def run_download(snapshots, resume, dry_run, progress)
+    def apply_filter(snapshots, filter)
+      return snapshots unless filter
+
+      snapshots.select { |snap| filter.match?(snap.original_url) }
+    end
+
+    def run_download(snapshots, resume, dry_run, page_requisites, progress)
       state = DownloadState.new(@output_dir)
       total = snapshots.size
 
       if @concurrency == 1
         download_sequential(snapshots, total, state, resume,
-                            dry_run, progress)
+                            dry_run, page_requisites, progress)
       else
         download_concurrent(snapshots, total, state, resume,
-                            dry_run, progress)
+                            dry_run, page_requisites, progress)
       end
     end
 
@@ -71,11 +93,12 @@ module Archaeo
     end
 
     def download_sequential(snapshots, total, state, resume,
-                            dry_run, progress)
+                            dry_run, page_requisites, progress)
       counters = { downloaded: 0, skipped: 0, bytes: 0, failed: 0 }
 
      snapshots.each_with_index do |snap, index|
         process_sequential(snap, state, resume, dry_run, counters)
+        fetch_requisites(snap, dry_run, counters) if page_requisites
         progress&.call(index + 1, total, snap)
       end
 
@@ -96,6 +119,47 @@ module Archaeo
       @on_error&.call(snap, e)
     end
 
+    def fetch_requisites(snap, dry_run, counters)
+      return if dry_run
+
+      begin
+        bundle = snap.fetch_with_assets(client: @client)
+        bundle.assets.downloadable.all.each do |asset_url|
+          asset_snap = find_asset_snapshot(asset_url)
+          next unless asset_snap
+
+          counters[:bytes] += write_asset(asset_snap)
+          counters[:downloaded] += 1
+        end
+      rescue StandardError
+        nil
+      end
+    end
+
+    def find_asset_snapshot(asset_url)
+      cdx = @cdx_api || CdxApi.new(client: @client)
+      cdx.near(asset_url, timestamp: Timestamp.now)
+    rescue NoSnapshotFound, StandardError
+      nil
+    end
+
+    def write_asset(snapshot)
+      content = fetch_content(snapshot)
+      filename = build_filename(snapshot)
+      FileUtils.mkdir_p(File.dirname(filename))
+      tmp_path = "#{filename}.tmp"
+      File.binwrite(tmp_path, content)
+      File.rename(tmp_path, filename)
+      content.bytesize
+    end
+
+    def fetch_content(snapshot)
+      @rate_limiter.wait(host: "web.archive.org")
+      Fetcher.new(client: @client).fetch(
+        snapshot.original_url, timestamp: snapshot.timestamp
+      ).content
+    end
+
     def download_snapshot(snap, state)
       content = fetch_and_save(snap)
       state.mark_completed(snap.timestamp, url: snap.original_url,
@@ -104,7 +168,7 @@ module Archaeo
     end
 
     def download_concurrent(snapshots, total, state, resume,
-                            dry_run, progress)
+                            dry_run, page_requisites, progress)
       queue = snapshots.each_with_index.to_a
       shared = { mutex: Mutex.new, errors: [],
                  downloaded: 0, skipped: 0, bytes: 0, failed: 0 }
@@ -112,7 +176,7 @@ module Archaeo
       threads = Array.new(@concurrency) do
         Thread.new do
           process_queue(queue, total, state, resume,
-                        dry_run, progress, shared)
+                        dry_run, page_requisites, progress, shared)
         end
       end
       threads.each(&:join)
@@ -122,7 +186,7 @@ module Archaeo
     end
 
     def process_queue(queue, total, state, resume, dry_run,
-                      progress, shared)
+                      _page_requisites, progress, shared)
       loop do
         snap, index = shared[:mutex].synchronize { queue.shift }
         break unless snap
@@ -177,6 +241,7 @@ module Archaeo
     end
 
     def fetch_page(snapshot)
+      @rate_limiter.wait(host: "web.archive.org")
       Fetcher.new(client: @client).fetch(
         snapshot.original_url, timestamp: snapshot.timestamp
       )
@@ -231,21 +296,14 @@ module Archaeo
     end
 
     def build_filename(snapshot)
+      safe_path = @path_sanitizer.sanitize(snapshot.original_url)
       ts = snapshot.timestamp.to_s
-      safe_path = snapshot.original_url
-                  .sub(%r{\Ahttps?://}, "")
-                  .gsub(%r{[<>:"|?*#]}, "_")
-                  .gsub(%r{[/\\]}, File::SEPARATOR)
-                  .gsub(%r{[?&=]}, "_")
-
-      safe_path = safe_path[0..-2] if safe_path.end_with?(File::SEPARATOR)
-      safe_path = "#{safe_path}index" if safe_path.empty?
 
-      segments = safe_path.split(File::SEPARATOR).map do |seg|
-        seg.length > 200 ? seg[0..200] : seg
-      end
+      segments = safe_path.split(File::SEPARATOR)
+      last = segments.pop || "index"
 
-      File.join(@output_dir, *segments, "#{ts}#{extension_for(snapshot)}")
+      File.join(@output_dir, *segments,
+                "#{last}_#{ts}#{extension_for(snapshot)}")
     end
   end
 end
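
Together these changes make point-in-time mirroring scriptable. A minimal sketch of the new BulkDownloader keywords, with all values invented for illustration. Note that the concurrent path currently ignores page_requisites (process_queue receives it as the unused _page_requisites parameter), so requisites are only fetched at concurrency: 1:

    downloader = Archaeo::BulkDownloader.new(
      output_dir: "mirror",
      concurrency: 1,  # requisites are fetched in sequential mode only
      rate_limiter: Archaeo::RateLimiter.new(min_interval: 1.0)
    )

    summary = downloader.download(
      "example.com",
      snapshot_at: "20190601",  # newest capture per URL at or before this time
      page_requisites: true     # also fetch CSS/JS/image assets
    ) { |done, total, snap| puts "#{done}/#{total} #{snap.original_url}" }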
@@ -40,8 +40,9 @@ module Archaeo
       last_skip_timestamp: "lastSkipTimestamp",
     }.freeze
 
-    def initialize(client: HttpClient.new)
+    def initialize(client: HttpClient.new, cache_dir: nil)
       @client = client
+      @cache = cache_dir ? CdxCache.new(cache_dir) : nil
     end
 
     # Returns an Enumerator of Snapshot objects, auto-paginating
@@ -50,13 +51,26 @@ module Archaeo
       url = UrlNormalizer.normalize(url)
       validate_options!(options)
 
-      Enumerator.new do |yielder|
-        if options.key?(:page)
-          fetch_page(url, options, yielder)
-        else
-          fetch_with_resume_key(url, options, yielder)
-        end
+      if @cache && !options.key?(:page)
+        return cached_snapshots(url, options)
       end
+
+      build_enumerator(url, options)
+    end
+
+    # Returns one snapshot per unique URL, picking the newest at or before
+    # the given timestamp for point-in-time site reconstruction.
+    def composite_snapshot(url, timestamp:, collapse: [])
+      ts = Timestamp.coerce(timestamp)
+      options = { to: ts.to_s, sort: "reverse" }
+      options[:collapse] = collapse unless collapse.empty?
+
+      seen = {}
+      snapshots(url, **options).each do |snap|
+        key = snap.original_url
+        seen[key] = snap unless seen.key?(key)
+      end
+      seen.values
     end
 
     def near(url, timestamp:)
@@ -153,6 +167,24 @@ module Archaeo
 
     private
 
+    def cached_snapshots(url, options)
+      Enumerator.new do |yielder|
+        @cache.fetch(url, **options) do
+          build_enumerator(url, options).to_a
+        end.each { |s| yielder << s }
+      end
+    end
+
+    def build_enumerator(url, options)
+      Enumerator.new do |yielder|
+        if options.key?(:page)
+          fetch_page(url, options, yielder)
+        else
+          fetch_with_resume_key(url, options, yielder)
+        end
+      end
+    end
+
     def fetch_with_resume_key(url, options, yielder)
       params = build_params(url, options)
       loop do
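
composite_snapshot is what backs the new --snapshot-at download mode. Because results arrive newest-first (sort: "reverse" with a to: bound), the first snapshot seen for each URL wins, yielding the latest capture at or before the cutoff. A minimal sketch, with URL and timestamp invented:

    cdx = Archaeo::CdxApi.new
    pages = cdx.composite_snapshot(
      "example.com",
      timestamp: "20190601",
      collapse: ["digest"]  # skip consecutive identical captures server-side
    )
    pages.each { |snap| puts "#{snap.timestamp}  #{snap.original_url}" }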
@@ -0,0 +1,105 @@
+# frozen_string_literal: true
+
+require "json"
+require "digest"
+
+module Archaeo
+  # Persists CDX API query results to disk for resume support.
+  #
+  # Caches snapshot lists keyed by query parameters so that
+  # interrupted downloads can resume without re-querying CDX.
+  class CdxCache
+    CACHE_DIR = ".cache"
+
+    def initialize(base_dir)
+      @base_dir = base_dir
+      @cache_dir = File.join(base_dir, CACHE_DIR)
+    end
+
+    def fetch(url, **options)
+      key = cache_key(url, options)
+      path = cache_path(key)
+
+      if File.exist?(path)
+        load_cache(path)
+      else
+        snapshots = yield
+        save_cache(path, url, options, snapshots)
+        snapshots
+      end
+    end
+
+    def cached?(url, **options)
+      File.exist?(cache_path(cache_key(url, options)))
+    end
+
+    def cache_key(url, options = {})
+      parts = [url.to_s]
+      parts << options[:from].to_s if options[:from]
+      parts << options[:to].to_s if options[:to]
+      parts << options[:match_type].to_s if options[:match_type]
+      parts += Array(options[:filters]).map(&:to_s) if options[:filters]
+      parts += Array(options[:collapse]).map(&:to_s) if options[:collapse]
+      parts << options[:sort].to_s if options[:sort]
+      Digest::SHA256.hexdigest(parts.join("|"))[0, 16]
+    end
+
+    def clear(url = nil, **options)
+      if url
+        FileUtils.rm_f(cache_path(cache_key(url, options)))
+      else
+        FileUtils.rm_rf(@cache_dir)
+      end
+    end
+
+    private
+
+    def cache_path(key)
+      FileUtils.mkdir_p(@cache_dir)
+      File.join(@cache_dir, "#{key}.cdx.json")
+    end
+
+    def load_cache(path)
+      data = JSON.parse(File.read(path))
+      data["snapshots"].map { |row| build_snapshot(row) }
+    end
+
+    def save_cache(path, url, options, snapshots)
+      data = {
+        "url" => url.to_s,
+        "options" => serialize_options(options),
+        "cached_at" => Time.now.utc.iso8601,
+        "snapshots" => snapshots.map(&:as_json),
+      }
+      tmp_path = "#{path}.tmp"
+      File.write(tmp_path, JSON.generate(data))
+      File.rename(tmp_path, path)
+    end
+
+    def serialize_options(options)
+      h = {}
+      h["from"] = options[:from].to_s if options[:from]
+      h["to"] = options[:to].to_s if options[:to]
+      h["match_type"] = options[:match_type].to_s if options[:match_type]
+      h["filters"] = Array(options[:filters]).map(&:to_s) if options[:filters]
+      if options[:collapse]
+        h["collapse"] =
+          Array(options[:collapse]).map(&:to_s)
+      end
+      h["sort"] = options[:sort].to_s if options[:sort]
+      h
+    end
+
+    def build_snapshot(row)
+      Snapshot.new(
+        urlkey: row["urlkey"],
+        timestamp: row["timestamp"],
+        original_url: row["original_url"],
+        mimetype: row["mimetype"],
+        status_code: row["status_code"],
+        digest: row["digest"],
+        length: row["length"],
+      )
+    end
+  end
+end
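
CdxCache can also be driven on its own: the block passed to fetch runs only on a cache miss, and results are written atomically (write to *.tmp, then rename). A minimal sketch, assuming Snapshot#as_json round-trips through build_snapshot as shown above:

    cache = Archaeo::CdxCache.new("archive")  # files land in archive/.cache/
    snaps = cache.fetch("example.com", from: "2019") do
      Archaeo::CdxApi.new.snapshots("example.com", from: "2019").to_a
    end
    cache.cached?("example.com", from: "2019")  # => true
    cache.clear("example.com", from: "2019")    # drop this one query
    cache.clear                                 # or wipe the whole cache dir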
data/lib/archaeo/cli.rb CHANGED
@@ -37,6 +37,8 @@ module Archaeo
     option :limit, type: :numeric, desc: "Max snapshots to return"
     option :format, desc: "Output format (table, json, csv)",
            default: "table"
+    option :fields, type: :array,
+           desc: "Specific fields to print (timestamp,original,etc)"
     def snapshots(url)
       fmt = validate_output_format
       handle_errors do
@@ -123,11 +125,21 @@ module Archaeo
     end
 
     desc "save URL", "Save a URL to the Wayback Machine"
+    option :headers, type: :boolean, default: false,
+           desc: "Show response headers"
     def save(url)
       handle_errors do
         result = SaveApi.new.save(url)
         label = result.cached? ? "Cached" : "Saved"
         puts "#{label}: #{result.archive_url}"
+        if options[:headers] && result.response_headers
+          puts "Status: #{result.status_code}"
+          puts "Response URL: #{result.response_url}" if result.response_url
+          puts "Headers:"
+          result.response_headers.each do |k, v|
+            puts "  #{k}: #{v}"
+          end
+        end
       end
     end
 
@@ -162,11 +174,16 @@ module Archaeo
          "Fetch a page and rewrite archive URLs to local paths"
     option :prefix, desc: "Local path prefix", default: "local"
     option :output, desc: "Write rewritten HTML to file"
+    option :rewrite_js, type: :boolean, default: false,
+           desc: "Rewrite URLs in JavaScript strings"
+    option :rewrite_absolute, type: :boolean, default: false,
+           desc: "Rewrite all absolute archive URLs"
     def rewrite(url, timestamp)
       handle_errors do
         coerced = Timestamp.coerce(timestamp)
         page = Fetcher.new.fetch(url, timestamp: coerced)
-        rewritten = build_rewriter(url, coerced).rewrite_html(page.content)
+        rewriter = build_rewriter(url, coerced)
+        rewritten = rewriter.rewrite_html(page.content)
         output_rewritten(rewritten)
       end
     end
@@ -215,22 +232,61 @@ module Archaeo
            desc: "Number of parallel downloads"
     option :dry_run, type: :boolean, default: false,
            desc: "Preview downloads without fetching"
+    option :all_timestamps, type: :boolean, default: false,
+           desc: "Download all timestamps, not just latest"
+    option :only, desc: "Only download URLs matching this pattern"
+    option :exclude, desc: "Exclude URLs matching this pattern"
+    option :page_requisites, type: :boolean, default: false,
+           desc: "Download linked assets (CSS/JS/images)"
+    option :snapshot_at, desc: "Download composite snapshot at timestamp"
+    option :rate_limit, type: :numeric, default: 0,
+           desc: "Min seconds between requests"
     def download(url)
       handle_errors do
+        rate_limiter = RateLimiter.new(
+          min_interval: options[:rate_limit].to_f,
+        )
+        filter = build_filter
         downloader = BulkDownloader.new(
           output_dir: options[:output],
           concurrency: options[:concurrency],
+          rate_limiter: rate_limiter,
         )
-        download_with_progress(downloader, url)
+        download_with_progress(downloader, url, filter)
+      end
+    end
+
+    desc "health URL", "Check health of archived snapshots"
+    option :from, desc: "Start timestamp"
+    option :to, desc: "End timestamp"
+    option :sample, type: :numeric, desc: "Check only N snapshots"
+    option :format, desc: "Output format (table, json)", default: "table"
+    def health(url)
+      handle_errors do
+        checker = ArchiveHealthCheck.new
+        report = checker.check(
+          url,
+          from: options[:from],
+          to: options[:to],
+          sample: options[:sample],
+        )
+        output_health(report)
       end
     end
 
     desc "known_urls DOMAIN",
          "List all known URLs for a domain"
+    option :subdomain, type: :boolean, default: false,
+           desc: "Include subdomain URLs"
+    option :file, desc: "Save URLs to file"
     def known_urls(domain)
       handle_errors do
-        CdxApi.new.known_urls(domain).each do |u|
-          puts u
+        match_type = options[:subdomain] ? "domain" : "prefix"
+        urls = CdxApi.new.known_urls(domain, match_type: match_type)
+        if options[:file]
+          save_urls_to_file(urls, options[:file])
+        else
+          urls.each { |u| puts u }
         end
       end
     end
@@ -331,7 +387,11 @@ module Archaeo
     def build_rewriter(url, timestamp)
       normalized = UrlNormalizer.normalize(url)
       archive_prefix = ArchiveUrl.new(normalized, timestamp: timestamp).to_s
-      UrlRewriter.new(archive_prefix, options[:prefix])
+      UrlRewriter.new(
+        archive_prefix, options[:prefix],
+        rewrite_js: options[:rewrite_js],
+        rewrite_absolute: options[:rewrite_absolute]
+      )
     end
 
     def output_rewritten(content)
@@ -366,14 +426,55 @@ module Archaeo
       end
     end
 
-    def download_with_progress(downloader, url)
+    def build_filter
+      only = options[:only]
+      exclude = options[:exclude]
+      return nil unless only || exclude
+
+      PatternFilter.new(only: only, exclude: exclude)
+    end
+
+    def download_with_progress(downloader, url, filter)
       summary = downloader.download(
-        url, from: options[:from], to: options[:to],
-        resume: options[:resume], dry_run: options[:dry_run]
+        url,
+        from: options[:from], to: options[:to],
+        resume: options[:resume], dry_run: options[:dry_run],
+        all_timestamps: options[:all_timestamps],
+        filter: filter,
+        page_requisites: options[:page_requisites],
+        snapshot_at: options[:snapshot_at]
      ) { |c, t, s| print_progress(c, t, s) }
       print_summary(summary)
     end
 
+    def output_health(report)
+      case options[:format]
+      when "json"
+        data = {
+          total: report.total,
+          accessible: report.accessible,
+          missing: report.missing,
+          errors: report.errors,
+        }
+        puts JSON.generate(data)
+      else
+        puts "Total: #{report.total}"
+        puts "Accessible: #{report.accessible}"
+        puts "Missing: #{report.missing}"
+        puts "Errors: #{report.errors}"
+      end
+    end
+
+    def save_urls_to_file(urls, file_path)
+      FileUtils.mkdir_p(File.dirname(file_path)) unless File.dirname(file_path) == "."
+      File.open(file_path, "w") do |f|
+        urls.each do |url|
+          f.puts(url)
+        end
+      end
+      warn "Saved #{urls.size} URLs to #{file_path}" unless quiet?
+    end
+
     def print_progress(current, total, snap)
       return if quiet?
 
@@ -62,6 +62,36 @@ module Archaeo
       end
     end
 
+    def file_exists?(timestamp, base_dir: @output_dir)
+      entry = entry_for(timestamp)
+      return false unless entry
+
+      file_path = find_file(base_dir, timestamp.to_s)
+      File.exist?(file_path)
+    end
+
+    def stale_entries(base_dir: @output_dir)
+      @mutex.synchronize do
+        entries.reject do |e|
+          find_file(base_dir,
+                    e["ts"]) && File.exist?(find_file(base_dir, e["ts"]))
+        end
+      end
+    end
+
+    def cleanup_stale(base_dir: @output_dir)
+      @mutex.synchronize do
+        stale = entries.reject do |e|
+          path = find_file(base_dir, e["ts"])
+          path && File.exist?(path)
+        end
+        @entries = entries - stale
+        @entries_key = nil
+        save
+        stale.size
+      end
+    end
+
 
     private
     def entries
@@ -103,5 +133,10 @@ module Archaeo
       File.write(tmp_path, content)
       File.rename(tmp_path, @path)
     end
+
+    def find_file(base_dir, timestamp)
+      pattern = File.join(base_dir, "**", "*#{timestamp}*")
+      Dir.glob(pattern).first
+    end
   end
 end
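
The new stale-entry helpers let a resumed run reconcile the state file with what actually exists on disk (find_file globs for the snapshot timestamp under the output directory). A minimal sketch, with the output directory name invented:

    state = Archaeo::DownloadState.new("archive")
    state.stale_entries.each { |e| warn "file missing for #{e['ts']}" }
    pruned = state.cleanup_stale  # drops those entries, saves, returns the count
    warn "pruned #{pruned} stale entries" if pruned.positive?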