archaeo 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 721131f1413aaacb26685abe006fdd243e3ef53e1d5f23764b2774717aae93ec
4
- data.tar.gz: f3c90969cf684e06a6cdd1e0025a605141e1ee543430488fbf960e787ca1ba7d
3
+ metadata.gz: e318dfb4a6478af2e663418fda9952308323be35ef9fc6582a5fa3a327cdbb6d
4
+ data.tar.gz: 2f745ac2ea371e6b64d4f83ca39d0f247991882d3104bcff90e523b73e421f9b
5
5
  SHA512:
6
- metadata.gz: 494ba22650c56df4a2ae119f0b6965679441bf988f013fe22f54c8c02e285d9df6ade6db4c2574ec23ba3e95f917e541e96dbd79a5b6deae178d7b6eaa5fd1a7
7
- data.tar.gz: cbaf296d51ecae3ad77eee66100f6ca6aa40d0ddf0abd3b6c73b7c61b5cf92131b991437c28537a4db97de0182b7a9133df54ad07c7017d6f613760bcadf3cca
6
+ metadata.gz: dc20f6483c99aba0059a224dba1758cec00d3d5921e7f8296b9826554f8d45780981571df3bdd8c05d0704066e14163c7e1a192339da30b3b98a367b0860a669
7
+ data.tar.gz: f4ca21a9c5d5f68d29bfe24ff5caf598a9e8819c7bfc920e46cba3d3a9980f4a086433c0305897fc7506e8cc002d943b94b5c4ea15e12372d0b92389df30f3c3
data/README.adoc CHANGED
@@ -49,6 +49,11 @@ near = cdx.near("example.com", timestamp: "20220101")
49
49
  before = cdx.before("example.com", timestamp: "20220101")
50
50
  after = cdx.after("example.com", timestamp: "20220101")
51
51
 
52
+ # Time range query
53
+ cdx.between("example.com", from: "20220101", to: "20221231").each do |snap|
54
+ puts snap.timestamp
55
+ end
56
+
52
57
  # Filter by status code, mimetype, or URL pattern
53
58
  cdx.snapshots("example.com",
54
59
  filters: [Archaeo::CdxFilter.by_status(200)],
@@ -57,6 +62,13 @@ cdx.snapshots("example.com",
57
62
  sort: "reverse",
58
63
  )
59
64
 
65
+ # Compose multiple filters
66
+ filters = Archaeo::CdxFilter.combine(
67
+ Archaeo::CdxFilter.only_successful,
68
+ Archaeo::CdxFilter.excluding_mimetype("text/css"),
69
+ )
70
+ cdx.snapshots("example.com", filters: filters)
71
+
60
72
  # Page-based pagination
61
73
  cdx.snapshots("example.com", page: 0)
62
74
 
@@ -77,6 +89,7 @@ result = api.near("example.com")
77
89
  result.available? # => true/false
78
90
  result.archive_url # => "https://web.archive.org/web/..."
79
91
  result.timestamp # => Archaeo::Timestamp
92
+ result.archived_status # => HTTP status code of the archived page
80
93
 
81
94
  api.available?("example.com") # => true/false
82
95
  ----
@@ -87,6 +100,7 @@ api.available?("example.com") # => true/false
87
100
  ----
88
101
  save = Archaeo::SaveApi.new
89
102
  result = save.save("https://example.com/")
103
+ result.url # => "https://example.com/"
90
104
  result.archive_url # => "https://web.archive.org/web/..."
91
105
  result.timestamp # => Archaeo::Timestamp
92
106
  result.cached? # => true if already archived
@@ -104,6 +118,10 @@ page.content # => "<html>...</html>"
104
118
  page.content_type # => "text/html"
105
119
  page.status_code # => 200
106
120
  page.archive_url # => full archive URL
121
+ page.title # => "Example Domain"
122
+ page.html? # => true
123
+ page.json? # => false
124
+ page.size # => content length in bytes
107
125
 
108
126
  # Raw (identity) mode -- no Wayback Machine rewriting
109
127
  page = fetcher.fetch("https://example.com/",
@@ -126,6 +144,12 @@ bundle.assets.js # => ["https://example.com/app.js", ...]
126
144
  bundle.assets.images
127
145
  bundle.assets.fonts
128
146
  bundle.assets.media
147
+ bundle.size # => total count (page + assets)
148
+ bundle.asset_count # => number of assets
149
+
150
+ # Serialize asset list
151
+ bundle.assets.to_json
152
+ bundle.assets.counts # => { css: 1, js: 2, image: 3, font: 0, media: 1 }
129
153
  ----
130
154
 
131
155
  === Bulk Download with Resume
@@ -143,6 +167,12 @@ downloader.download("example.com", resume: true)
143
167
  # Filter by date range
144
168
  downloader.download("example.com",
145
169
  from: "20220101", to: "20221231")
170
+
171
+ # Parallel downloads
172
+ downloader = Archaeo::BulkDownloader.new(
173
+ output_dir: "archive", concurrency: 4,
174
+ )
175
+ downloader.download("example.com")
146
176
  ----
147
177
 
148
178
  === URL Normalization
@@ -168,6 +198,33 @@ Archaeo::CdxFilter.by_status(200) # => "statuscode:200"
168
198
  Archaeo::CdxFilter.excluding_status(404) # => "!statuscode:404"
169
199
  Archaeo::CdxFilter.by_mimetype("text/html") # => "mimetype:text/html"
170
200
  Archaeo::CdxFilter.by_url("example.com") # => "original:example.com"
201
+
202
+ # Compose filters
203
+ filters = Archaeo::CdxFilter.only_successful
204
+ error_filters = Archaeo::CdxFilter.excluding_errors
205
+ ----
206
+
207
+ === Snapshot Convenience
208
+
209
+ [source,ruby]
210
+ ----
211
+ snap = cdx.near("example.com", timestamp: "20220101")
212
+
213
+ # Status predicates
214
+ snap.success? # => true (200)
215
+ snap.redirect? # => true for 3xx
216
+ snap.client_error? # => true for 4xx
217
+ snap.server_error? # => true for 5xx
218
+ snap.error? # => true for 4xx/5xx
219
+
220
+ # Fetch content directly from a snapshot
221
+ page = snap.fetch
222
+
223
+ # Fetch with assets
224
+ bundle = snap.fetch_with_assets
225
+
226
+ # JSON-serializable representation
227
+ snap.as_json # => Hash with primitive values only
171
228
  ----
172
229
 
173
230
  === Timestamps
@@ -189,6 +246,15 @@ ts = Archaeo::Timestamp.now
189
246
  # Format as 14-digit string
190
247
  ts.to_s # => "20220615000000"
191
248
 
249
+ # Standard time formats
250
+ ts.to_iso8601 # => "2022-06-15T00:00:00Z"
251
+ ts.to_rfc3339 # => "2022-06-15T00:00:00+00:00"
252
+
253
+ # Arithmetic
254
+ ts + 3600 # => Timestamp one hour later
255
+ ts - 3600 # => Timestamp one hour earlier
256
+ ts1 - ts2 # => seconds between timestamps
257
+
192
258
  # Comparison
193
259
  ts1 < ts2 # => true/false
194
260
  ----
@@ -207,9 +273,15 @@ archaeo snapshots --format csv --from 20220101 --to 20221231 example.com
207
273
 
208
274
  # Find closest snapshot
209
275
  archaeo near example.com 20220101
276
+ archaeo near --format json example.com 20220101
210
277
 
211
- # Check availability
278
+ # Find oldest/newest
279
+ archaeo oldest example.com
280
+ archaeo newest --format json example.com
281
+
282
+ # Check availability (with optional timestamp)
212
283
  archaeo available example.com
284
+ archaeo available --timestamp 20220101 example.com
213
285
 
214
286
  # Save a URL
215
287
  archaeo save https://example.com/
@@ -226,6 +298,9 @@ archaeo fetch --identity https://example.com/ 20220615120000
226
298
  # Download all snapshots
227
299
  archaeo download example.com --output ./archive
228
300
 
301
+ # Parallel downloads
302
+ archaeo download --concurrency 4 example.com --output ./archive
303
+
229
304
  # Resume interrupted download
230
305
  archaeo download example.com --resume
231
306
 
@@ -267,7 +342,7 @@ Archaeo follows a model-driven, OOP design:
267
342
 
268
343
  | *URL Processing*
269
344
  | `UrlNormalizer`, `CdxFilter`, `UrlRewriter`
270
- | URL sanitization, filtering, and rewriting
345
+ | URL sanitization, validated filtering with composition, and rewriting
271
346
 
272
347
  | *Asset Extraction*
273
348
  | `AssetExtractor`, `AssetList`
@@ -283,7 +358,7 @@ Archaeo follows a model-driven, OOP design:
283
358
 
284
359
  | *Infrastructure*
285
360
  | `HttpClient`
286
- | HTTP transport with retries, gzip, connection pooling
361
+ | HTTP transport with retries, gzip, 429/503 handling, connection pooling with eviction
287
362
  |===
288
363
 
289
364
  All API classes accept an `HttpClient` via dependency injection for testability.
@@ -36,6 +36,18 @@ module Archaeo
36
36
  @identity
37
37
  end
38
38
 
39
+ def ==(other)
40
+ other.is_a?(self.class) &&
41
+ original_url == other.original_url &&
42
+ timestamp == other.timestamp &&
43
+ identity? == other.identity?
44
+ end
45
+ alias_method :eql?, :==
46
+
47
+ def hash
48
+ [original_url, timestamp, identity?].hash
49
+ end
50
+
39
51
  def to_s
40
52
  suffix = identity? ? "id_" : ""
41
53
  "#{BASE}/#{@timestamp}#{suffix}/#{@original_url}"
@@ -10,6 +10,20 @@ module Archaeo
10
10
  # and media resources referenced by the page. Optionally resolves
11
11
  # relative URLs against a base URL.
12
12
  class AssetExtractor
13
+ FONT_CDN_PATTERNS = %w[
14
+ fonts.googleapis.com
15
+ fonts.gstatic.com
16
+ use.typekit.net
17
+ fast.fonts.net
18
+ cloud.typography.com
19
+ ].freeze
20
+
21
+ CSS_URL_PATTERN = /url\(\s*['"]?([^'")\s]+)['"]?\s*\)/
22
+ CSS_IMAGE_PROPS = Regexp.new(
23
+ "(?:background-image|background|list-style-image|content|cursor)" \
24
+ "\\s*:[^;]*#{CSS_URL_PATTERN.source}",
25
+ )
26
+
13
27
  def initialize(html, base_url: nil)
14
28
  @doc = Nokogiri::HTML(html.to_s)
15
29
  @base_url = base_url
@@ -23,6 +37,7 @@ module Archaeo
23
37
  extract_fonts(list)
24
38
  extract_media(list)
25
39
  extract_inline_css(list)
40
+ extract_inline_styles(list)
26
41
  list
27
42
  end
28
43
 
@@ -32,9 +47,6 @@ module Archaeo
32
47
  @doc.css('link[rel="stylesheet"]').each do |el|
33
48
  list.add(resolve(el["href"]), type: :css)
34
49
  end
35
- @doc.css('link[rel="icon"], link[rel="shortcut icon"]').each do |el|
36
- list.add(resolve(el["href"]), type: :image)
37
- end
38
50
  end
39
51
 
40
52
  def extract_js(list)
@@ -44,8 +56,42 @@ module Archaeo
44
56
  end
45
57
 
46
58
  def extract_images(list)
59
+ extract_img_tags(list)
60
+ extract_picture_sources(list)
61
+ extract_lazy_images(list)
62
+ extract_icon_links(list)
63
+ end
64
+
65
+ def extract_img_tags(list)
47
66
  @doc.css("img[src]").each do |el|
48
67
  list.add(resolve(el["src"]), type: :image)
68
+ extract_srcset(el["srcset"], list, :image)
69
+ end
70
+ end
71
+
72
+ def extract_picture_sources(list)
73
+ @doc.css("picture source[srcset]").each do |el|
74
+ extract_srcset(el["srcset"], list, :image)
75
+ end
76
+ end
77
+
78
+ def extract_lazy_images(list)
79
+ @doc.css("img[data-src]").each do |el|
80
+ list.add(resolve(el["data-src"]), type: :image)
81
+ end
82
+ end
83
+
84
+ def extract_icon_links(list)
85
+ @doc.css(
86
+ 'link[rel~="icon"], link[rel="apple-touch-icon"], ' \
87
+ 'link[rel="apple-touch-icon-precomposed"], ' \
88
+ 'link[rel="mask-icon"]',
89
+ ).each do |el|
90
+ list.add(resolve(el["href"]), type: :image)
91
+ end
92
+
93
+ @doc.css('link[rel="manifest"]').each do |el|
94
+ list.add(resolve(el["href"]), type: :media)
49
95
  end
50
96
  end
51
97
 
@@ -55,29 +101,92 @@ module Archaeo
55
101
  end
56
102
  @doc.css('link[rel="stylesheet"]').each do |el|
57
103
  if font_stylesheet?(el["href"])
58
- list.add(resolve(el["href"]),
59
- type: :font)
104
+ list.add(resolve(el["href"]), type: :font)
60
105
  end
61
106
  end
62
107
  end
63
108
 
64
109
  def extract_media(list)
110
+ extract_media_sources(list)
111
+ extract_video_posters(list)
112
+ extract_embeds(list)
113
+ end
114
+
115
+ def extract_media_sources(list)
65
116
  @doc.css("source[src], video[src], audio[src]").each do |el|
66
117
  list.add(resolve(el["src"]), type: :media)
67
118
  end
68
119
  end
69
120
 
121
+ def extract_video_posters(list)
122
+ @doc.css("video[poster]").each do |el|
123
+ list.add(resolve(el["poster"]), type: :image)
124
+ end
125
+ end
126
+
127
+ def extract_embeds(list)
128
+ @doc.css("iframe[src], embed[src]").each do |el|
129
+ list.add(resolve(el["src"]), type: :media)
130
+ end
131
+ end
132
+
70
133
  def extract_inline_css(list)
71
134
  @doc.css("style").each do |el|
72
- extract_css_urls(el.text).each do |url|
135
+ text = el.text
136
+ extract_css_at_imports(text, list)
137
+ extract_css_font_urls(text, list)
138
+ extract_css_image_urls(text, list)
139
+ end
140
+ end
141
+
142
+ def extract_inline_styles(list)
143
+ @doc.css("[style]").each do |el|
144
+ style = el["style"]
145
+ next unless style
146
+
147
+ style.scan(/url\(\s*['"]?([^'")\s]+)['"]?\s*\)/).flatten.each do |url|
148
+ list.add(resolve(url), type: :image)
149
+ end
150
+ end
151
+ end
152
+
153
+ def extract_srcset(srcset_value, list, type)
154
+ return if srcset_value.nil?
155
+
156
+ srcset_value.split(",").each do |entry|
157
+ url = entry.strip.split(/\s+/, 2).first
158
+ list.add(resolve(url), type: type) if url && !url.empty?
159
+ end
160
+ end
161
+
162
+ def extract_css_at_imports(text, list)
163
+ text.scan(
164
+ /@import\s+(?:url\(\s*['"]?([^'")\s]+)['"]?\s*\)|['"]([^'"]+)['"])/,
165
+ ).flatten.compact.each do |url|
166
+ next if url.nil? || url.empty?
167
+
168
+ list.add(resolve(url), type: :css)
169
+ end
170
+ end
171
+
172
+ def extract_css_font_urls(text, list)
173
+ text.scan(/@font-face\s*\{[^}]*\}/m).each do |font_block|
174
+ extract_css_urls(font_block).each do |url|
73
175
  list.add(resolve(url), type: :font)
74
176
  end
75
177
  end
76
178
  end
77
179
 
180
+ def extract_css_image_urls(text, list)
181
+ text.scan(CSS_IMAGE_PROPS).flatten.each do |url|
182
+ list.add(resolve(url), type: :image)
183
+ end
184
+ end
185
+
78
186
  def font_stylesheet?(href)
79
- href.to_s.include?("fonts.googleapis.com") ||
80
- href.to_s.include?("font")
187
+ return false if href.nil?
188
+
189
+ FONT_CDN_PATTERNS.any? { |pattern| href.include?(pattern) }
81
190
  end
82
191
 
83
192
  def extract_css_urls(css_text)
@@ -1,11 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "json"
4
+
3
5
  module Archaeo
4
6
  # Categorized collection of asset URLs extracted from an archived page.
5
7
  #
6
8
  # Assets are grouped by type (css, js, image, font, media) for
7
9
  # convenient access during bulk download or local archiving.
8
10
  class AssetList
11
+ include Enumerable
12
+
9
13
  CATEGORIES = %i[css js image font media].freeze
10
14
 
11
15
  def initialize
@@ -14,7 +18,14 @@ module Archaeo
14
18
  end
15
19
 
16
20
  def add(url, type:)
17
- @urls_by_type[type] << url unless url.nil? || url.empty?
21
+ return if url.nil? || url.empty?
22
+ return if @urls_by_type[type].include?(url)
23
+
24
+ @urls_by_type[type] << url
25
+ end
26
+
27
+ def each(&block)
28
+ all.each(&block)
18
29
  end
19
30
 
20
31
  def css
@@ -48,5 +59,17 @@ module Archaeo
48
59
  def empty?
49
60
  all.empty?
50
61
  end
62
+
63
+ def to_h
64
+ @urls_by_type.transform_values(&:dup)
65
+ end
66
+
67
+ def to_json(*args)
68
+ to_h.to_json(*args)
69
+ end
70
+
71
+ def counts
72
+ @urls_by_type.transform_values(&:size)
73
+ end
51
74
  end
52
75
  end
@@ -68,12 +68,14 @@ module Archaeo
68
68
  def build_result(closest, url)
69
69
  archive_url = closest["url"].to_s.sub(%r{^http://}, "https://")
70
70
  ts = Timestamp.parse(closest["timestamp"])
71
+ archived_status = closest["status"].to_i
71
72
 
72
73
  AvailabilityResult.new(
73
74
  url: url,
74
- available: closest["status"].to_s == "200",
75
+ available: true,
75
76
  archive_url: archive_url,
76
77
  timestamp: ts,
78
+ archived_status: archived_status,
77
79
  )
78
80
  end
79
81
  end
@@ -6,17 +6,31 @@ module Archaeo
6
6
  # Indicates whether a URL is archived and, if so, provides
7
7
  # the closest snapshot's archive URL and timestamp.
8
8
  class AvailabilityResult
9
- attr_reader :url, :archive_url, :timestamp
9
+ attr_reader :url, :archive_url, :timestamp, :archived_status
10
10
 
11
- def initialize(url:, available:, archive_url: nil, timestamp: nil)
11
+ def initialize(url:, available:, archive_url: nil,
12
+ timestamp: nil, archived_status: nil)
12
13
  @url = url
13
14
  @available = available
14
15
  @archive_url = archive_url
15
16
  @timestamp = timestamp
17
+ @archived_status = archived_status
16
18
  end
17
19
 
18
20
  def available?
19
21
  @available
20
22
  end
23
+
24
+ def unavailable?
25
+ !@available
26
+ end
27
+
28
+ def to_s
29
+ if available?
30
+ "#{url} -> #{archive_url} (#{timestamp})"
31
+ else
32
+ "#{url} -> not available"
33
+ end
34
+ end
21
35
  end
22
36
  end
@@ -10,27 +10,26 @@ module Archaeo
10
10
  # for interrupted download recovery.
11
11
  class BulkDownloader
12
12
  def initialize(client: HttpClient.new, output_dir: "archive",
13
- cdx_api: nil)
13
+ cdx_api: nil, concurrency: 1)
14
14
  @client = client
15
15
  @output_dir = output_dir
16
16
  @cdx_api = cdx_api
17
+ @concurrency = [1, concurrency.to_i].max
17
18
  end
18
19
 
19
- def download(url, from: nil, to: nil, resume: false)
20
+ def download(url, from: nil, to: nil, resume: false, &block)
20
21
  url = UrlNormalizer.normalize(url)
21
22
  FileUtils.mkdir_p(@output_dir)
22
23
  state = DownloadState.new(@output_dir)
23
24
 
24
25
  snapshots = fetch_snapshots(url, from: from, to: to)
25
26
  total = snapshots.size
27
+ progress = block
26
28
 
27
- snapshots.each_with_index do |snap, index|
28
- next if resume && state.completed?(snap.timestamp)
29
-
30
- fetch_and_save(snap)
31
- state.mark_completed(snap.timestamp)
32
-
33
- yield index + 1, total, snap if block_given?
29
+ if @concurrency == 1
30
+ download_sequential(snapshots, total, state, resume, progress)
31
+ else
32
+ download_concurrent(snapshots, total, state, resume, progress)
34
33
  end
35
34
  end
36
35
 
@@ -45,6 +44,54 @@ module Archaeo
45
44
  .select { |snap| !snap.blocked? && snap.status_code == 200 }
46
45
  end
47
46
 
47
+ def download_sequential(snapshots, total, state, resume, progress)
48
+ snapshots.each_with_index do |snap, index|
49
+ next if resume && state.completed?(snap.timestamp)
50
+
51
+ fetch_and_save(snap)
52
+ state.mark_completed(snap.timestamp)
53
+
54
+ progress&.call(index + 1, total, snap)
55
+ end
56
+ end
57
+
58
+ def download_concurrent(snapshots, total, state, resume, progress)
59
+ queue = snapshots.each_with_index.to_a
60
+ mutex = Mutex.new
61
+ errors = []
62
+
63
+ threads = Array.new(@concurrency) do
64
+ Thread.new do
65
+ process_queue(queue, total, state, resume, progress, mutex, errors)
66
+ end
67
+ end
68
+ threads.each(&:join)
69
+
70
+ return unless errors.any?
71
+
72
+ raise Error,
73
+ "#{errors.size} download(s) failed: " \
74
+ "#{errors.map { |s, _| s.timestamp }.join(', ')}"
75
+ end
76
+
77
+ def process_queue(queue, total, state, resume, progress, mutex, errors)
78
+ loop do
79
+ snap, index = mutex.synchronize { queue.shift }
80
+ break unless snap
81
+
82
+ next if resume && state.completed?(snap.timestamp)
83
+
84
+ begin
85
+ fetch_and_save(snap)
86
+ state.mark_completed(snap.timestamp)
87
+ rescue StandardError => e
88
+ mutex.synchronize { errors << [snap, e] }
89
+ end
90
+
91
+ progress&.call(index + 1, total, snap)
92
+ end
93
+ end
94
+
48
95
  def fetch_and_save(snapshot)
49
96
  fetcher = Fetcher.new(client: @client)
50
97
  page = fetcher.fetch(snapshot.original_url,
@@ -52,40 +99,61 @@ module Archaeo
52
99
 
53
100
  filename = build_filename(snapshot)
54
101
  FileUtils.mkdir_p(File.dirname(filename))
55
- File.binwrite(filename, page.content)
102
+ tmp_path = "#{filename}.tmp"
103
+ File.binwrite(tmp_path, page.content)
104
+ File.rename(tmp_path, filename)
105
+ rescue StandardError
106
+ FileUtils.rm_f(tmp_path) if defined?(tmp_path)
107
+ raise
56
108
  end
57
109
 
58
110
  EXTENSION_MAP = {
59
111
  "text/html" => ".html",
60
112
  "text/css" => ".css",
113
+ "text/plain" => ".txt",
114
+ "text/javascript" => ".js",
61
115
  "application/javascript" => ".js",
116
+ "application/x-javascript" => ".js",
62
117
  "application/json" => ".json",
118
+ "application/xml" => ".xml",
63
119
  "application/pdf" => ".pdf",
120
+ "application/octet-stream" => ".bin",
64
121
  "image/png" => ".png",
65
122
  "image/jpeg" => ".jpg",
66
123
  "image/gif" => ".gif",
67
124
  "image/svg+xml" => ".svg",
68
125
  "image/webp" => ".webp",
126
+ "image/x-icon" => ".ico",
127
+ "image/bmp" => ".bmp",
69
128
  "font/woff2" => ".woff2",
70
129
  "font/woff" => ".woff",
130
+ "font/ttf" => ".ttf",
131
+ "font/eot" => ".eot",
71
132
  "video/mp4" => ".mp4",
72
133
  "audio/mpeg" => ".mp3",
73
134
  }.freeze
74
135
 
75
136
  def extension_for(snapshot)
76
- EXTENSION_MAP[snapshot.mimetype] || ".bin"
137
+ mime = snapshot.mimetype.to_s.split(";").first.strip.downcase
138
+ EXTENSION_MAP[mime] || ".bin"
77
139
  end
78
140
 
79
141
  def build_filename(snapshot)
80
142
  ts = snapshot.timestamp.to_s
81
143
  safe_path = snapshot.original_url
82
144
  .sub(%r{\Ahttps?://}, "")
83
- .gsub(%r{/}, File::SEPARATOR)
145
+ .gsub(%r{[<>:"|?*#]}, "_")
146
+ .gsub(%r{[/\\]}, File::SEPARATOR)
84
147
  .gsub(%r{[?&=]}, "_")
148
+
85
149
  safe_path = safe_path[0..-2] if safe_path.end_with?(File::SEPARATOR)
86
150
  safe_path = "#{safe_path}index" if safe_path.empty?
87
151
 
88
- File.join(@output_dir, safe_path, "#{ts}#{extension_for(snapshot)}")
152
+ segments = safe_path.split(File::SEPARATOR).map do |seg|
153
+ seg.length > 200 ? seg[0..200] : seg
154
+ end
155
+
156
+ File.join(@output_dir, *segments, "#{ts}#{extension_for(snapshot)}")
89
157
  end
90
158
  end
91
159
  end
@@ -99,6 +99,13 @@ module Archaeo
99
99
  "No snapshot found after #{ts} for #{url}"
100
100
  end
101
101
 
102
+ def between(url, from:, to:, **options)
103
+ snapshots(url,
104
+ from: Timestamp.coerce(from).to_s,
105
+ to: Timestamp.coerce(to).to_s,
106
+ **options)
107
+ end
108
+
102
109
  # Returns the number of pages for a paginated query.
103
110
  def num_pages(url, **options)
104
111
  url = UrlNormalizer.normalize(url)
@@ -59,10 +59,30 @@ module Archaeo
59
59
  new("urlkey:#{pattern}")
60
60
  end
61
61
 
62
+ def and(other)
63
+ [self, other]
64
+ end
65
+
66
+ def self.combine(*filters)
67
+ filters.flatten
68
+ end
69
+
70
+ def self.only_successful
71
+ [by_status(200)]
72
+ end
73
+
74
+ def self.excluding_errors
75
+ [excluding_status(404), excluding_status(500),
76
+ excluding_status(502), excluding_status(503)]
77
+ end
78
+
62
79
  private
63
80
 
64
81
  def validate!
65
- return if @expression.empty?
82
+ if @expression.empty?
83
+ raise ArgumentError,
84
+ "CDX filter expression cannot be empty"
85
+ end
66
86
 
67
87
  field_name = field
68
88
  return if VALID_FIELDS.include?(field_name)