archaeo 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e318dfb4a6478af2e663418fda9952308323be35ef9fc6582a5fa3a327cdbb6d
4
- data.tar.gz: 2f745ac2ea371e6b64d4f83ca39d0f247991882d3104bcff90e523b73e421f9b
3
+ metadata.gz: dff73d8ab14a3b75bf98281d20b5427b55757b330d57e8899a2ffb04d9046c6d
4
+ data.tar.gz: d92c2f8e77d6ba5c51283f0581bb51182ebe84aab74ffe4e4971e0d405eab2cc
5
5
  SHA512:
6
- metadata.gz: dc20f6483c99aba0059a224dba1758cec00d3d5921e7f8296b9826554f8d45780981571df3bdd8c05d0704066e14163c7e1a192339da30b3b98a367b0860a669
7
- data.tar.gz: f4ca21a9c5d5f68d29bfe24ff5caf598a9e8819c7bfc920e46cba3d3a9980f4a086433c0305897fc7506e8cc002d943b94b5c4ea15e12372d0b92389df30f3c3
6
+ metadata.gz: ed1a823e5f358e53ec653e5eee902f95787b9bdedc5670a214f3c1784f4c1829f705d5a53ded5cfb6777f16ed33e32430f2aca13cb9c1fad628885daa55a60a1
7
+ data.tar.gz: 3b809e1aad60db5e04a356dff5ab450333ff88f56bf481630887f45b3b3f09035ee1ba959807d22129b827b30324b7730e7cf5c248656b62c38a0beab8ad2581
data/README.adoc CHANGED
@@ -54,6 +54,9 @@ cdx.between("example.com", from: "20220101", to: "20221231").each do |snap|
54
54
  puts snap.timestamp
55
55
  end
56
56
 
57
+ # Count snapshots
58
+ cdx.count("example.com") # => Integer
59
+
57
60
  # Filter by status code, mimetype, or URL pattern
58
61
  cdx.snapshots("example.com",
59
62
  filters: [Archaeo::CdxFilter.by_status(200)],
@@ -69,6 +72,11 @@ filters = Archaeo::CdxFilter.combine(
69
72
  )
70
73
  cdx.snapshots("example.com", filters: filters)
71
74
 
75
+ # Convenience filter factories
76
+ Archaeo::CdxFilter.only_html # text/html only
77
+ Archaeo::CdxFilter.by_mimetype_prefix("image") # any image/*
78
+ Archaeo::CdxFilter.excluding_redirects # exclude 301/302/303/307/308 redirects
79
+
72
80
  # Page-based pagination
73
81
  cdx.snapshots("example.com", page: 0)
74
82
 
@@ -90,6 +98,8 @@ result.available? # => true/false
90
98
  result.archive_url # => "https://web.archive.org/web/..."
91
99
  result.timestamp # => Archaeo::Timestamp
92
100
  result.archived_status # => HTTP status code of the archived page
101
+ result.to_h # => Hash representation
102
+ result.as_json # => JSON-serializable Hash
93
103
 
94
104
  api.available?("example.com") # => true/false
95
105
  ----
@@ -104,6 +114,8 @@ result.url # => "https://example.com/"
104
114
  result.archive_url # => "https://web.archive.org/web/..."
105
115
  result.timestamp # => Archaeo::Timestamp
106
116
  result.cached? # => true if already archived
117
+ result.to_h # => Hash representation
118
+ result.as_json # => JSON-serializable Hash
107
119
  ----
108
120
 
109
121
  === Fetch Archived Content
@@ -120,13 +132,22 @@ page.status_code # => 200
120
132
  page.archive_url # => full archive URL
121
133
  page.title # => "Example Domain"
122
134
  page.html? # => true
135
+ page.css? # => true for text/css
123
136
  page.json? # => false
124
137
  page.size # => content length in bytes
138
+ page.to_h # => Hash with all fields
139
+ page.as_json # => JSON-serializable Hash
140
+ page.inspect # => "#<Archaeo::Page text/html 1234 bytes>"
125
141
 
126
142
  # Raw (identity) mode -- no Wayback Machine rewriting
127
143
  page = fetcher.fetch("https://example.com/",
128
144
  timestamp: "20220615000000",
129
145
  identity: true)
146
+
147
+ # With digest verification (raises IntegrityError on mismatch)
148
+ page = fetcher.fetch("https://example.com/",
149
+ timestamp: "20220615000000",
150
+ snapshot: snap)
130
151
  ----
131
152
 
132
153
  === Fetch Page with Assets
@@ -146,10 +167,25 @@ bundle.assets.fonts
146
167
  bundle.assets.media
147
168
  bundle.size # => total count (page + assets)
148
169
  bundle.asset_count # => number of assets
170
+ bundle.to_h # => Hash representation
171
+ bundle.to_json # => JSON string
149
172
 
150
173
  # Serialize asset list
151
174
  bundle.assets.to_json
152
175
  bundle.assets.counts # => { css: 1, js: 2, image: 3, font: 0, media: 1 }
176
+
177
+ # Filter assets by type
178
+ css_only = bundle.assets.filter(:css)
179
+ images_and_fonts = bundle.assets.filter(:image, :font)
180
+
181
+ # Merge asset lists (deduplicates)
182
+ merged = bundle.assets.merge(other_assets)
183
+
184
+ # Reconstruct from JSON
185
+ restored = Archaeo::AssetList.from_json(json_string)
186
+
187
+ # Safe type access
188
+ bundle.assets.urls_by_type(:image) # works for any type key
153
189
  ----
154
190
 
155
191
  === Bulk Download with Resume
@@ -157,13 +193,22 @@ bundle.assets.counts # => { css: 1, js: 2, image: 3, font: 0, media: 1 }
157
193
  [source,ruby]
158
194
  ----
159
195
  downloader = Archaeo::BulkDownloader.new(output_dir: "archive")
160
- downloader.download("example.com") do |current, total, snapshot|
196
+ summary = downloader.download("example.com") do |current, total, snapshot|
161
197
  puts "[#{current}/#{total}] #{snapshot.original_url}"
162
198
  end
163
199
 
200
+ summary.total # => total snapshots found
201
+ summary.downloaded # => successfully downloaded
202
+ summary.skipped # => skipped (already downloaded with resume)
203
+ summary.bytes_written # => total bytes written
204
+ summary.elapsed # => seconds elapsed
205
+
164
206
  # Resume interrupted download
165
207
  downloader.download("example.com", resume: true)
166
208
 
209
+ # Dry run (preview without fetching)
210
+ summary = downloader.download("example.com", dry_run: true)
211
+
167
212
  # Filter by date range
168
213
  downloader.download("example.com",
169
214
  from: "20220101", to: "20221231")
@@ -175,6 +220,27 @@ downloader = Archaeo::BulkDownloader.new(
175
220
  downloader.download("example.com")
176
221
  ----
177
222
 
223
+ === Download State (Resume Tracking)
224
+
225
+ [source,ruby]
226
+ ----
227
+ state = Archaeo::DownloadState.new("archive")
228
+
229
+ # Check if a snapshot was already downloaded
230
+ state.completed?("20220615000000") # => true/false
231
+
232
+ # Get metadata for a completed snapshot
233
+ entry = state.entry_for("20220615000000")
234
+ # => { "ts" => "20220615000000", "at" => "2022-06-15T12:00:00Z",
235
+ # "url" => "https://example.com/", "bytes" => 12345 }
236
+
237
+ # Total bytes downloaded
238
+ state.total_bytes # => Integer
239
+
240
+ # Clear state for a fresh download
241
+ state.clear
242
+ ----
243
+
178
244
  === URL Normalization
179
245
 
180
246
  [source,ruby]
@@ -187,6 +253,10 @@ Archaeo::UrlNormalizer.normalize('"https://example.com/%252F"')
187
253
 
188
254
  Archaeo::UrlNormalizer.with_scheme("example.com")
189
255
  # => "https://example.com"
256
+
257
+ # Default ports are stripped
258
+ Archaeo::UrlNormalizer.normalize("https://example.com:443/path")
259
+ # => "https://example.com/path"
190
260
  ----
191
261
 
192
262
  === CDX Filters
@@ -202,6 +272,33 @@ Archaeo::CdxFilter.by_url("example.com") # => "original:example.com"
202
272
  # Compose filters
203
273
  filters = Archaeo::CdxFilter.only_successful
204
274
  error_filters = Archaeo::CdxFilter.excluding_errors
275
+
276
+ # Mimetype prefix matching
277
+ Archaeo::CdxFilter.by_mimetype_prefix("image") # => matches image/*
278
+
279
+ # Convenience factories
280
+ Archaeo::CdxFilter.only_html # => text/html only
281
+ Archaeo::CdxFilter.excluding_redirects # => excludes 301, 302, 303, 307, 308
282
+ ----
283
+
284
+ === URL Rewriting
285
+
286
+ [source,ruby]
287
+ ----
288
+ rewriter = Archaeo::UrlRewriter.new(
289
+ "https://web.archive.org/web/20220615000000/",
290
+ "local",
291
+ )
292
+
293
+ # Rewrite single URL
294
+ rewriter.rewrite("https://web.archive.org/web/20220615000000/style.css")
295
+ # => "local/style.css"
296
+
297
+ # Rewrite batch
298
+ rewriter.rewrite_batch(["url1", "url2"])
299
+
300
+ # Rewrite URLs within HTML (src, href, srcset, data-src, poster)
301
+ rewritten_html = rewriter.rewrite_html(html_content)
205
302
  ----
206
303
 
207
304
  === Snapshot Convenience
@@ -217,6 +314,14 @@ snap.client_error? # => true for 4xx
217
314
  snap.server_error? # => true for 5xx
218
315
  snap.error? # => true for 4xx/5xx
219
316
 
317
+ # Age helpers
318
+ snap.age # => seconds since capture
319
+ snap.older_than?(3600) # => true if older than 1 hour
320
+ snap.newer_than?(3600) # => true if newer than 1 hour
321
+
322
+ # Identity URL (raw content, no Wayback rewriting)
323
+ snap.identity_url
324
+
220
325
  # Fetch content directly from a snapshot
221
326
  page = snap.fetch
222
327
 
@@ -225,6 +330,7 @@ bundle = snap.fetch_with_assets
225
330
 
226
331
  # JSON-serializable representation
227
332
  snap.as_json # => Hash with primitive values only
333
+ snap.inspect # => "#<Archaeo::Snapshot 20220101 ...>"
228
334
  ----
229
335
 
230
336
  === Timestamps
@@ -250,6 +356,10 @@ ts.to_s # => "20220615000000"
250
356
  ts.to_iso8601 # => "2022-06-15T00:00:00Z"
251
357
  ts.to_rfc3339 # => "2022-06-15T00:00:00+00:00"
252
358
 
359
+ # Decompose
360
+ ts.to_h # => { year: 2022, month: 6, day: 15, hour: 0, minute: 0, second: 0 }
361
+ ts.to_a # => [2022, 6, 15, 0, 0, 0]
362
+
253
363
  # Arithmetic
254
364
  ts + 3600 # => Timestamp one hour later
255
365
  ts - 3600 # => Timestamp one hour earlier
@@ -257,6 +367,27 @@ ts1 - ts2 # => seconds between timestamps
257
367
 
258
368
  # Comparison
259
369
  ts1 < ts2 # => true/false
370
+
371
+ # Immutable -- frozen on creation
372
+ ts.frozen? # => true
373
+ ----
374
+
375
+ === HTTP Client Observability
376
+
377
+ [source,ruby]
378
+ ----
379
+ # Track every request with a callback
380
+ client = Archaeo::HttpClient.new(
381
+ on_request: ->(uri, elapsed, status, retries) {
382
+ puts "#{status} #{uri} (#{elapsed.round(3)}s, #{retries} retries)"
383
+ },
384
+ )
385
+
386
+ # Inspect connection pool state
387
+ client.pool_stats
388
+ # => { active_connections: 2, max_pool_size: 8,
389
+ # hosts: ["web.archive.org"],
390
+ # idle_times: { "web.archive.org": 12 } }
260
391
  ----
261
392
 
262
393
  === Command-Line Interface
@@ -279,6 +410,16 @@ archaeo near --format json example.com 20220101
279
410
  archaeo oldest example.com
280
411
  archaeo newest --format json example.com
281
412
 
413
+ # Find before/after a timestamp
414
+ archaeo before example.com 20220101
415
+ archaeo after example.com 20220101
416
+
417
+ # List snapshots in a date range
418
+ archaeo between example.com 20220101 20221231
419
+
420
+ # Count snapshots
421
+ archaeo count example.com
422
+
282
423
  # Check availability (with optional timestamp)
283
424
  archaeo available example.com
284
425
  archaeo available --timestamp 20220101 example.com
@@ -295,15 +436,25 @@ archaeo fetch --output page.html https://example.com/ 20220615120000
295
436
  # Fetch raw (identity) content
296
437
  archaeo fetch --identity https://example.com/ 20220615120000
297
438
 
439
+ # Fetch a page and list its extracted assets
440
+ archaeo fetch-assets https://example.com/ 20220615120000
441
+ archaeo fetch-assets --format json https://example.com/ 20220615120000
442
+
298
443
  # Download all snapshots
299
444
  archaeo download example.com --output ./archive
300
445
 
446
+ # Dry run (preview without fetching)
447
+ archaeo download --dry_run example.com
448
+
301
449
  # Parallel downloads
302
450
  archaeo download --concurrency 4 example.com --output ./archive
303
451
 
304
452
  # Resume interrupted download
305
453
  archaeo download example.com --resume
306
454
 
455
+ # Suppress progress messages
456
+ archaeo --quiet download example.com
457
+
307
458
  # Discover all known URLs for a domain
308
459
  archaeo known_urls example.com
309
460
  ----
@@ -326,6 +477,9 @@ Archaeo::MaximumRetriesExceeded
326
477
 
327
478
  # SavePageNow session limit
328
479
  Archaeo::SaveFailed
480
+
481
+ # Content digest mismatch
482
+ Archaeo::IntegrityError
329
483
  ----
330
484
 
331
485
  == Architecture
@@ -338,15 +492,15 @@ Archaeo follows a model-driven, OOP design:
338
492
 
339
493
  | *Models*
340
494
  | `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`
341
- | Domain value objects
495
+ | Domain value objects with `to_h`, `as_json`, `inspect` support
342
496
 
343
497
  | *URL Processing*
344
498
  | `UrlNormalizer`, `CdxFilter`, `UrlRewriter`
345
- | URL sanitization, validated filtering with composition, and rewriting
499
+ | URL sanitization, validated filtering with composition, and HTML URL rewriting
346
500
 
347
501
  | *Asset Extraction*
348
502
  | `AssetExtractor`, `AssetList`
349
- | Parse HTML for resource URLs
503
+ | Parse HTML for resource URLs, including `preload` and `modulepreload` links
350
504
 
351
505
  | *APIs*
352
506
  | `CdxApi`, `AvailabilityApi`, `SaveApi`
@@ -354,11 +508,11 @@ Archaeo follows a model-driven, OOP design:
354
508
 
355
509
  | *Operations*
356
510
  | `Fetcher`, `BulkDownloader`, `DownloadState`
357
- | Download content with resume support
511
+ | Download content with resume, dry-run, digest verification, and download summaries
358
512
 
359
513
  | *Infrastructure*
360
514
  | `HttpClient`
361
- | HTTP transport with retries, gzip, 429/503 handling, connection pooling with eviction
515
+ | HTTP transport with retries, gzip, 429/503 handling, connection pooling, and per-request observability
362
516
  |===
363
517
 
364
518
  All API classes accept an `HttpClient` via dependency injection for testability.
@@ -53,6 +53,22 @@ module Archaeo
53
53
  "#{BASE}/#{@timestamp}#{suffix}/#{@original_url}"
54
54
  end
55
55
 
56
+ def identity_url
57
+ return to_s if identity?
58
+
59
+ self.class.new(@original_url, timestamp: @timestamp, identity: true).to_s
60
+ end
61
+
62
+ def to_h
63
+ { original_url: @original_url, timestamp: @timestamp,
64
+ identity: @identity }
65
+ end
66
+
67
+ def as_json(*)
68
+ { original_url: @original_url, timestamp: @timestamp.to_s,
69
+ identity: @identity, url: to_s }
70
+ end
71
+
56
72
  def self.extract_original_url(string, ts_str, identity)
57
73
  marker = identity ? "#{ts_str}id_/" : "#{ts_str}/"
58
74
  idx = string.index(marker)
@@ -24,6 +24,12 @@ module Archaeo
24
24
  "\\s*:[^;]*#{CSS_URL_PATTERN.source}",
25
25
  )
26
26
 
27
# Maps the `as` attribute of a <link rel="preload"> element to the asset
# category the URL is recorded under. `as` values without an entry here
# are not collected.
PRELOAD_TYPE_MAP = {
  "style" => :css,
  "script" => :js,
  "image" => :image,
  "font" => :font,
  "audio" => :media,
  "video" => :media,
}.freeze
32
+
27
33
  def initialize(html, base_url: nil)
28
34
  @doc = Nokogiri::HTML(html.to_s)
29
35
  @base_url = base_url
@@ -38,6 +44,7 @@ module Archaeo
38
44
  extract_media(list)
39
45
  extract_inline_css(list)
40
46
  extract_inline_styles(list)
47
+ extract_preloads(list)
41
48
  list
42
49
  end
43
50
 
@@ -53,6 +60,9 @@ module Archaeo
53
60
  @doc.css("script[src]").each do |el|
54
61
  list.add(resolve(el["src"]), type: :js)
55
62
  end
63
+ @doc.css('link[rel="modulepreload"]').each do |el|
64
+ list.add(resolve(el["href"]), type: :js)
65
+ end
56
66
  end
57
67
 
58
68
  def extract_images(list)
@@ -202,5 +212,14 @@ module Archaeo
202
212
  rescue URI::InvalidURIError
203
213
  url
204
214
  end
215
+
216
# Collects resources announced through <link rel="preload" as="..."> tags.
#
# The `as` value is trimmed and lowercased before it is looked up in
# PRELOAD_TYPE_MAP. Links whose `as` value has no mapping, or that carry
# no usable href, are skipped.
#
# @param list [#add] collector that receives `add(url, type:)` calls
def extract_preloads(list)
  @doc.css('link[rel="preload"][as]').each do |el|
    type = PRELOAD_TYPE_MAP[el["as"].to_s.strip.downcase]
    next unless type

    # A preload link may omit href (e.g. when only imagesrcset is given);
    # there is nothing to resolve in that case.
    href = el["href"]
    next if href.nil? || href.strip.empty?

    list.add(resolve(href), type: type)
  end
end
205
224
  end
206
225
  end
@@ -40,6 +40,10 @@ module Archaeo
40
40
  @urls_by_type[:image]
41
41
  end
42
42
 
43
+ def urls_by_type(type)
44
+ @urls_by_type[type] || []
45
+ end
46
+
43
47
  def fonts
44
48
  @urls_by_type[:font]
45
49
  end
@@ -71,5 +75,32 @@ module Archaeo
71
75
  def counts
72
76
  @urls_by_type.transform_values(&:size)
73
77
  end
78
+
79
+ def filter(*types)
80
+ result = self.class.new
81
+ types.each do |type|
82
+ @urls_by_type[type]&.each { |url| result.add(url, type: type) }
83
+ end
84
+ result
85
+ end
86
+
87
+ def merge(other)
88
+ CATEGORIES.each do |type|
89
+ other.urls_by_type(type).each { |url| add(url, type: type) }
90
+ end
91
+ self
92
+ end
93
+
94
# Rebuilds an asset list from its JSON form: an object that maps category
# names to a URL or an array of URLs.
#
# Keys that are not listed in CATEGORIES are ignored.
#
# @param json_string [String] JSON document to parse
# @return a new list populated through `add`
# @raise [JSON::ParserError] when json_string is not valid JSON
# @raise [ArgumentError] when the document is valid JSON but not an object
def self.from_json(json_string)
  data = JSON.parse(json_string)
  # Fail with a clear message instead of an unrelated NoMethodError when
  # the document is an array, number, string or null.
  unless data.is_a?(Hash)
    raise ArgumentError,
          "expected a JSON object of asset types, got #{data.class}"
  end

  list = new
  data.each do |type, urls|
    sym = type.to_sym
    next unless CATEGORIES.include?(sym)

    Array(urls).each { |url| list.add(url, type: sym) }
  end
  list
end
74
105
  end
75
106
  end
@@ -32,5 +32,29 @@ module Archaeo
32
32
  "#{url} -> not available"
33
33
  end
34
34
  end
35
+
36
+ def to_h
37
+ {
38
+ url: @url,
39
+ available: @available,
40
+ archive_url: @archive_url,
41
+ timestamp: @timestamp,
42
+ archived_status: @archived_status,
43
+ }
44
+ end
45
+
46
+ def as_json(*)
47
+ {
48
+ url: @url,
49
+ available: @available,
50
+ archive_url: @archive_url,
51
+ timestamp: @timestamp&.to_s,
52
+ archived_status: @archived_status,
53
+ }
54
+ end
55
+
56
+ def inspect
57
+ "#<#{self.class.name} #{@url} available=#{@available}>"
58
+ end
35
59
  end
36
60
  end
@@ -3,6 +3,11 @@
3
3
  require "fileutils"
4
4
 
5
5
  module Archaeo
6
+ DownloadSummary = Struct.new(
7
+ :total, :downloaded, :skipped, :failed, :bytes_written, :elapsed,
8
+ keyword_init: true
9
+ )
10
+
6
11
  # Downloads all archived snapshots of a URL with resume support.
7
12
  #
8
13
  # Queries the CDX API for matching snapshots, fetches each page,
@@ -17,20 +22,17 @@ module Archaeo
17
22
  @concurrency = [1, concurrency.to_i].max
18
23
  end
19
24
 
20
- def download(url, from: nil, to: nil, resume: false, &block)
25
+ def download(url, from: nil, to: nil, resume: false,
26
+ dry_run: false, &block)
27
+ start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
21
28
  url = UrlNormalizer.normalize(url)
22
- FileUtils.mkdir_p(@output_dir)
23
- state = DownloadState.new(@output_dir)
29
+ FileUtils.mkdir_p(@output_dir) unless dry_run
24
30
 
25
31
  snapshots = fetch_snapshots(url, from: from, to: to)
26
- total = snapshots.size
27
- progress = block
32
+ downloaded, skipped, bytes =
33
+ run_download(snapshots, resume, dry_run, block)
28
34
 
29
- if @concurrency == 1
30
- download_sequential(snapshots, total, state, resume, progress)
31
- else
32
- download_concurrent(snapshots, total, state, resume, progress)
33
- end
35
+ build_summary(start_time, snapshots.size, downloaded, skipped, bytes)
34
36
  end
35
37
 
36
38
  private
@@ -44,29 +46,75 @@ module Archaeo
44
46
  .select { |snap| !snap.blocked? && snap.status_code == 200 }
45
47
  end
46
48
 
47
- def download_sequential(snapshots, total, state, resume, progress)
48
- snapshots.each_with_index do |snap, index|
49
- next if resume && state.completed?(snap.timestamp)
49
+ def run_download(snapshots, resume, dry_run, progress)
50
+ state = DownloadState.new(@output_dir)
51
+ total = snapshots.size
50
52
 
51
- fetch_and_save(snap)
52
- state.mark_completed(snap.timestamp)
53
+ if @concurrency == 1
54
+ download_sequential(snapshots, total, state, resume,
55
+ dry_run, progress)
56
+ else
57
+ download_concurrent(snapshots, total, state, resume,
58
+ dry_run, progress)
59
+ end
60
+ end
53
61
 
62
+ def build_summary(start_time, total, downloaded, skipped, bytes)
63
+ elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
64
+ DownloadSummary.new(
65
+ total: total, downloaded: downloaded, skipped: skipped,
66
+ failed: 0, bytes_written: bytes, elapsed: elapsed
67
+ )
68
+ end
69
+
70
+ def download_sequential(snapshots, total, state, resume,
71
+ dry_run, progress)
72
+ counters = { downloaded: 0, skipped: 0, bytes: 0 }
73
+
74
+ snapshots.each_with_index do |snap, index|
75
+ process_sequential(snap, state, resume, dry_run, counters)
54
76
  progress&.call(index + 1, total, snap)
55
77
  end
78
+
79
+ [counters[:downloaded], counters[:skipped], counters[:bytes]]
80
+ end
81
+
82
+ def process_sequential(snap, state, resume, dry_run, counters)
83
+ if resume && state.completed?(snap.timestamp)
84
+ counters[:skipped] += 1
85
+ return
86
+ end
87
+
88
+ counters[:bytes] += download_snapshot(snap, state) unless dry_run
89
+ counters[:downloaded] += 1
56
90
  end
57
91
 
58
- def download_concurrent(snapshots, total, state, resume, progress)
92
+ def download_snapshot(snap, state)
93
+ content = fetch_and_save(snap)
94
+ state.mark_completed(snap.timestamp, url: snap.original_url,
95
+ bytes: content.bytesize)
96
+ content.bytesize
97
+ end
98
+
99
+ def download_concurrent(snapshots, total, state, resume,
100
+ dry_run, progress)
59
101
  queue = snapshots.each_with_index.to_a
60
- mutex = Mutex.new
61
- errors = []
102
+ shared = { mutex: Mutex.new, errors: [],
103
+ downloaded: 0, skipped: 0, bytes: 0 }
62
104
 
63
105
  threads = Array.new(@concurrency) do
64
106
  Thread.new do
65
- process_queue(queue, total, state, resume, progress, mutex, errors)
107
+ process_queue(queue, total, state, resume,
108
+ dry_run, progress, shared)
66
109
  end
67
110
  end
68
111
  threads.each(&:join)
112
+ raise_on_errors(shared[:errors])
113
+
114
+ [shared[:downloaded], shared[:skipped], shared[:bytes]]
115
+ end
69
116
 
117
+ def raise_on_errors(errors)
70
118
  return unless errors.any?
71
119
 
72
120
  raise Error,
@@ -74,24 +122,44 @@ module Archaeo
74
122
  "#{errors.map { |s, _| s.timestamp }.join(', ')}"
75
123
  end
76
124
 
77
- def process_queue(queue, total, state, resume, progress, mutex, errors)
125
+ def process_queue(queue, total, state, resume, dry_run,
126
+ progress, shared)
78
127
  loop do
79
- snap, index = mutex.synchronize { queue.shift }
128
+ snap, index = shared[:mutex].synchronize { queue.shift }
80
129
  break unless snap
81
130
 
82
- next if resume && state.completed?(snap.timestamp)
83
-
84
- begin
85
- fetch_and_save(snap)
86
- state.mark_completed(snap.timestamp)
87
- rescue StandardError => e
88
- mutex.synchronize { errors << [snap, e] }
131
+ if skip_snapshot?(snap, state, resume, shared)
132
+ progress&.call(index + 1, total, snap)
133
+ next
89
134
  end
90
135
 
136
+ concurrent_fetch(snap, state, dry_run, shared)
91
137
  progress&.call(index + 1, total, snap)
92
138
  end
93
139
  end
94
140
 
141
+ def skip_snapshot?(snap, state, resume, shared)
142
+ return false unless resume && state.completed?(snap.timestamp)
143
+
144
+ shared[:mutex].synchronize { shared[:skipped] += 1 }
145
+ true
146
+ end
147
+
148
+ def concurrent_fetch(snap, state, dry_run, shared)
149
+ unless dry_run
150
+ content = fetch_and_save(snap)
151
+ shared[:mutex].synchronize do
152
+ state.mark_completed(snap.timestamp,
153
+ url: snap.original_url,
154
+ bytes: content.bytesize)
155
+ shared[:bytes] += content.bytesize
156
+ end
157
+ end
158
+ shared[:mutex].synchronize { shared[:downloaded] += 1 }
159
+ rescue StandardError => e
160
+ shared[:mutex].synchronize { shared[:errors] << [snap, e] }
161
+ end
162
+
95
163
  def fetch_and_save(snapshot)
96
164
  fetcher = Fetcher.new(client: @client)
97
165
  page = fetcher.fetch(snapshot.original_url,
@@ -102,6 +170,7 @@ module Archaeo
102
170
  tmp_path = "#{filename}.tmp"
103
171
  File.binwrite(tmp_path, page.content)
104
172
  File.rename(tmp_path, filename)
173
+ page.content
105
174
  rescue StandardError
106
175
  FileUtils.rm_f(tmp_path) if defined?(tmp_path)
107
176
  raise
@@ -106,6 +106,10 @@ module Archaeo
106
106
  **options)
107
107
  end
108
108
 
109
+ def count(url, **options)
110
+ snapshots(url, **options).count
111
+ end
112
+
109
113
  # Returns the number of pages for a paginated query.
110
114
  def num_pages(url, **options)
111
115
  url = UrlNormalizer.normalize(url)
@@ -76,6 +76,18 @@ module Archaeo
76
76
  excluding_status(502), excluding_status(503)]
77
77
  end
78
78
 
79
+ def self.only_html
80
+ [by_mimetype("text/html")]
81
+ end
82
+
83
+ def self.by_mimetype_prefix(prefix)
84
+ new("mimetype:#{Regexp.escape(prefix)}.*")
85
+ end
86
+
87
# Builds exclusion filters for the redirect statuses 301, 302, 303, 307
# and 308.
#
# Codes are passed to excluding_status as Integers, the same way the
# error-status helper in this class passes 502 and 503.
#
# @return [Array] one exclusion filter per redirect status
def self.excluding_redirects
  [301, 302, 303, 307, 308].map { |code| excluding_status(code) }
end
90
+
79
91
  private
80
92
 
81
93
  def validate!