archaeo 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: afb71bf4cd68f3d2aa80cd7169fba891b42975dd25ee3c35cc2323663a04dec7
4
- data.tar.gz: 566cfaf6758c96c09e1d718a9fc8a576d2b3800dee57456ff5d107a12eaf692b
3
+ metadata.gz: 721131f1413aaacb26685abe006fdd243e3ef53e1d5f23764b2774717aae93ec
4
+ data.tar.gz: f3c90969cf684e06a6cdd1e0025a605141e1ee543430488fbf960e787ca1ba7d
5
5
  SHA512:
6
- metadata.gz: efd13cd5d167bc2ac72fc278f45520808e79bf64f4bf2253161806efccbe9c8e16991be8e6171757e3c6a3b773e3c82c3bdf4c92562c276e1568d7098f6ade1f
7
- data.tar.gz: 0f8c8e91959c79b5ff2b0d2a868e08e86dee8675965696eba583f408940ffa5fd63caa87bb0567e916baf4a4c120f8b81ddf546f748802cfc9e07635dfa52e3a
6
+ metadata.gz: 494ba22650c56df4a2ae119f0b6965679441bf988f013fe22f54c8c02e285d9df6ade6db4c2574ec23ba3e95f917e541e96dbd79a5b6deae178d7b6eaa5fd1a7
7
+ data.tar.gz: cbaf296d51ecae3ad77eee66100f6ca6aa40d0ddf0abd3b6c73b7c61b5cf92131b991437c28537a4db97de0182b7a9133df54ad07c7017d6f613760bcadf3cca
@@ -32,6 +32,9 @@ module Archaeo
32
32
  @doc.css('link[rel="stylesheet"]').each do |el|
33
33
  list.add(resolve(el["href"]), type: :css)
34
34
  end
35
+ @doc.css('link[rel="icon"], link[rel="shortcut icon"]').each do |el|
36
+ list.add(resolve(el["href"]), type: :image)
37
+ end
35
38
  end
36
39
 
37
40
  def extract_js(list)
@@ -42,6 +42,11 @@ module Archaeo
42
42
 
43
43
  def parse_response(response, url)
44
44
  unless response.status == 200
45
+ if response.status == 503
46
+ raise RateLimitError,
47
+ "Availability API rate limited (HTTP 503)"
48
+ end
49
+
45
50
  raise InvalidResponse,
46
51
  "Availability API returned HTTP #{response.status}"
47
52
  end
@@ -9,9 +9,11 @@ module Archaeo
9
9
  # and saves content to disk. Progress is tracked in a state file
10
10
  # for interrupted download recovery.
11
11
  class BulkDownloader
12
- def initialize(client: HttpClient.new, output_dir: "archive")
12
+ def initialize(client: HttpClient.new, output_dir: "archive",
13
+ cdx_api: nil)
13
14
  @client = client
14
15
  @output_dir = output_dir
16
+ @cdx_api = cdx_api
15
17
  end
16
18
 
17
19
  def download(url, from: nil, to: nil, resume: false)
@@ -35,7 +37,7 @@ module Archaeo
35
37
  private
36
38
 
37
39
  def fetch_snapshots(url, from:, to:)
38
- cdx = CdxApi.new(client: @client)
40
+ cdx = @cdx_api || CdxApi.new(client: @client)
39
41
  options = {}
40
42
  options[:from] = from if from
41
43
  options[:to] = to if to
@@ -53,6 +55,27 @@ module Archaeo
53
55
  File.binwrite(filename, page.content)
54
56
  end
55
57
 
58
+ EXTENSION_MAP = {
59
+ "text/html" => ".html",
60
+ "text/css" => ".css",
61
+ "application/javascript" => ".js",
62
+ "application/json" => ".json",
63
+ "application/pdf" => ".pdf",
64
+ "image/png" => ".png",
65
+ "image/jpeg" => ".jpg",
66
+ "image/gif" => ".gif",
67
+ "image/svg+xml" => ".svg",
68
+ "image/webp" => ".webp",
69
+ "font/woff2" => ".woff2",
70
+ "font/woff" => ".woff",
71
+ "video/mp4" => ".mp4",
72
+ "audio/mpeg" => ".mp3",
73
+ }.freeze
74
+
75
+ def extension_for(snapshot)
76
+ EXTENSION_MAP[snapshot.mimetype] || ".bin"
77
+ end
78
+
56
79
  def build_filename(snapshot)
57
80
  ts = snapshot.timestamp.to_s
58
81
  safe_path = snapshot.original_url
@@ -62,7 +85,7 @@ module Archaeo
62
85
  safe_path = safe_path[0..-2] if safe_path.end_with?(File::SEPARATOR)
63
86
  safe_path = "#{safe_path}index" if safe_path.empty?
64
87
 
65
- File.join(@output_dir, safe_path, "#{ts}.html")
88
+ File.join(@output_dir, safe_path, "#{ts}#{extension_for(snapshot)}")
66
89
  end
67
90
  end
68
91
  end
@@ -55,6 +55,10 @@ module Archaeo
55
55
  new("original:#{pattern}")
56
56
  end
57
57
 
58
+ def self.by_urlkey(pattern)
59
+ new("urlkey:#{pattern}")
60
+ end
61
+
58
62
  private
59
63
 
60
64
  def validate!
data/lib/archaeo/cli.rb CHANGED
@@ -119,6 +119,18 @@ module Archaeo
119
119
  end
120
120
  end
121
121
 
122
+ desc "num_pages URL",
123
+ "Show number of CDX result pages for a URL"
124
+ def num_pages(url)
125
+ puts CdxApi.new.num_pages(url)
126
+ rescue RateLimitError => e
127
+ warn "Error: #{e.message}"
128
+ exit 1
129
+ rescue Error => e
130
+ warn "Error: #{e.message}"
131
+ exit 1
132
+ end
133
+
122
134
  CDX_OPTION_MAP = {
123
135
  from: :from,
124
136
  to: :to,
@@ -70,7 +70,13 @@ module Archaeo
70
70
  def get(url, headers: {})
71
71
  merged = default_headers.merge(headers)
72
72
  uri = URI(url)
73
- attempt_with_retries(uri, merged)
73
+ attempt_with_retries(uri, merged, Net::HTTP::Get)
74
+ end
75
+
76
+ def head(url, headers: {})
77
+ merged = default_headers.merge(headers)
78
+ uri = URI(url)
79
+ attempt_with_retries(uri, merged, Net::HTTP::Head)
74
80
  end
75
81
 
76
82
  def shutdown
@@ -128,10 +134,10 @@ module Archaeo
128
134
  end
129
135
  end
130
136
 
131
- def attempt_with_retries(uri, headers)
137
+ def attempt_with_retries(uri, headers, request_class)
132
138
  retries = 0
133
139
  begin
134
- execute_with_connection(uri, headers)
140
+ execute_with_connection(uri, headers, request_class)
135
141
  rescue *TRANSIENT_ERRORS => e
136
142
  retries += 1
137
143
  raise_if_exhausted(retries, e)
@@ -148,9 +154,9 @@ module Archaeo
148
154
  "Failed after #{retries} retries: #{error.message}"
149
155
  end
150
156
 
151
- def execute_with_connection(uri, headers)
157
+ def execute_with_connection(uri, headers, request_class)
152
158
  http = connection_for(uri)
153
- request = Net::HTTP::Get.new(uri)
159
+ request = request_class.new(uri)
154
160
  headers.each { |k, v| request[k] = v }
155
161
  raw = http.request(request)
156
162
  build_response(raw)
data/lib/archaeo/page.rb CHANGED
@@ -26,6 +26,10 @@ module Archaeo
26
26
  @content ||= transcode(@raw_content)
27
27
  end
28
28
 
29
+ def size
30
+ content.length
31
+ end
32
+
29
33
  def encoding
30
34
  @encoding ||= detect_encoding
31
35
  end
@@ -33,11 +33,27 @@ module Archaeo
33
33
  @status_code == BLOCKED_STATUS
34
34
  end
35
35
 
36
+ def success?
37
+ @status_code == 200
38
+ end
39
+
36
40
  def to_a
37
41
  [@urlkey, @timestamp, @original_url, @mimetype,
38
42
  @status_code, @digest, @length]
39
43
  end
40
44
 
45
+ def to_h
46
+ {
47
+ urlkey: @urlkey,
48
+ timestamp: @timestamp,
49
+ original_url: @original_url,
50
+ mimetype: @mimetype,
51
+ status_code: @status_code,
52
+ digest: @digest,
53
+ length: @length,
54
+ }
55
+ end
56
+
41
57
  def ==(other)
42
58
  other.is_a?(self.class) && to_a == other.to_a
43
59
  end
@@ -62,6 +62,14 @@ module Archaeo
62
62
  @to_time.strftime(FORMAT)
63
63
  end
64
64
 
65
+ def to_date
66
+ Date.new(year, month, day)
67
+ end
68
+
69
+ def to_i
70
+ @to_time.to_i
71
+ end
72
+
65
73
  def <=>(other)
66
74
  return nil unless other.is_a?(self.class)
67
75
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Archaeo
4
- VERSION = "0.2.3"
4
+ VERSION = "0.2.4"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: archaeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-09 00:00:00.000000000 Z
11
+ date: 2026-05-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: csv