archaeo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,62 @@
# frozen_string_literal: true

module Archaeo
  # Downloads archived content from the Wayback Machine.
  #
  # Builds the replay URL for a capture, walks any redirect chain the
  # archive returns, and wraps the final response in a Page model.
  class Fetcher
    MAX_REDIRECTS = 5
    BASE = "https://web.archive.org"

    # @param client [HttpClient] injectable HTTP transport (for tests)
    def initialize(client: HttpClient.new)
      @client = client
    end

    # Fetches the capture of +url+ closest to +timestamp+.
    #
    # @param url [String] original (live-web) URL
    # @param timestamp [Timestamp, String, Time] coerced via Timestamp.coerce
    # @param identity [Boolean] request the raw, unrewritten capture
    # @return [Page]
    def fetch(url, timestamp:, identity: false)
      ts = Timestamp.coerce(timestamp)
      target = ArchiveUrl.new(url, timestamp: ts, identity: identity).to_s
      build_page(follow_redirects(target), target, url, ts)
    end

    private

    # Assembles a Page from the final HTTP response plus provenance data.
    def build_page(response, archive_url, url, timestamp)
      Page.new(content: response.body,
               content_type: response.headers["content-type"],
               status_code: response.status,
               archive_url: archive_url,
               original_url: url,
               timestamp: timestamp)
    end

    # Follows up to +remaining+ redirects iteratively, raising Error once
    # the budget is exhausted (i.e. one request past the limit).
    def follow_redirects(url, remaining = MAX_REDIRECTS)
      current = url
      budget = remaining
      loop do
        raise Error, "Too many redirects for #{current}" if budget.negative?

        response = @client.get(current)
        return response unless redirect?(response)

        current = resolve_redirect(current, response.headers["location"])
        budget -= 1
      end
    end

    # Truthy when the response is a 3xx carrying a Location header.
    def redirect?(response)
      (300..399).cover?(response.status) && response.headers["location"]
    end

    # Resolves a Location header against the current URL:
    # absolute URLs pass through, archive-relative /web/ paths are
    # prefixed with BASE, anything else is joined per RFC 3986.
    # An unparseable location is returned as-is (best effort).
    def resolve_redirect(current_url, location)
      if location.start_with?("http")
        location
      elsif location.start_with?("/web/")
        "#{BASE}#{location}"
      else
        begin
          URI.join(current_url, location).to_s
        rescue URI::InvalidURIError
          location
        end
      end
    end
  end
end
@@ -0,0 +1,137 @@
# frozen_string_literal: true

require "net/http"
require "uri"
require "zlib"
require "stringio"

module Archaeo
  # HTTP client with retry logic, gzip decompression, and
  # rotating realistic User-Agent profiles.
  #
  # Injected via constructor for testability.
  class HttpClient
    DEFAULT_TIMEOUT = 30
    DEFAULT_MAX_RETRIES = 3
    DEFAULT_RETRY_DELAY = 2

    # Network failures considered transient: the request is retried with
    # linear backoff instead of surfacing immediately.
    TRANSIENT_ERRORS = [
      Net::ReadTimeout,
      Net::OpenTimeout,
      IOError,
      Errno::ECONNRESET,
      Errno::ECONNREFUSED,
    ].freeze

    # Realistic desktop-Chrome UA strings; one is picked at random per
    # request unless a fixed :user_agent was supplied.
    USER_AGENT_PROFILES = [
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
      "AppleWebKit/537.36 (KHTML, like Gecko) " \
      "Chrome/131.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
      "AppleWebKit/537.36 (KHTML, like Gecko) " \
      "Chrome/130.0.0.0 Safari/537.36",
      "Mozilla/5.0 (X11; Linux x86_64) " \
      "AppleWebKit/537.36 (KHTML, like Gecko) " \
      "Chrome/131.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
      "AppleWebKit/537.36 (KHTML, like Gecko) " \
      "Chrome/129.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
      "AppleWebKit/537.36 (KHTML, like Gecko) " \
      "Chrome/131.0.0.0 Safari/537.36",
    ].freeze

    # HTTP response with status code, headers (lowercase keys), and body.
    class Response
      attr_reader :status, :headers, :body

      def initialize(status:, headers:, body:)
        @status = status
        @headers = headers
        @body = body
      end
    end

    # @param timeout [Integer] read/open timeout in seconds
    # @param max_retries [Integer] extra attempts after the first failure
    # @param retry_delay [Integer] base backoff in seconds (scaled linearly)
    # @param user_agent [String, nil] fixed UA; nil rotates USER_AGENT_PROFILES
    def initialize(timeout: DEFAULT_TIMEOUT,
                   max_retries: DEFAULT_MAX_RETRIES,
                   retry_delay: DEFAULT_RETRY_DELAY,
                   user_agent: nil)
      @timeout = timeout
      @max_retries = max_retries
      @retry_delay = retry_delay
      @user_agent = user_agent
    end

    # Performs a GET, merging +headers+ over the defaults.
    #
    # @return [Response]
    # @raise [MaximumRetriesExceeded] when transient errors persist
    def get(url, headers: {})
      merged = default_headers.merge(headers)
      attempt_with_retries(url, merged)
    end

    private

    # Fixed UA if configured, otherwise a random realistic profile.
    def select_user_agent
      @user_agent || USER_AGENT_PROFILES.sample
    end

    # Runs the request, retrying transient failures with linear backoff
    # (delay grows with each failed attempt).
    def attempt_with_retries(url, headers)
      retries = 0
      begin
        execute_get(url, headers)
      rescue *TRANSIENT_ERRORS => e
        retries += 1
        raise_if_exhausted(retries, e)
        sleep(@retry_delay * retries)
        retry
      end
    end

    # Raises once the retry budget is spent. `retries` counts every failed
    # attempt including the initial one, so the message reports "attempts"
    # (the previous "N retries" wording overstated the count by one).
    def raise_if_exhausted(retries, error)
      return unless retries > @max_retries

      raise MaximumRetriesExceeded,
            "Failed after #{retries} attempts: #{error.message}"
    end

    # Browser-like default request headers; gzip is requested and
    # transparently decompressed in decompress_body.
    def default_headers
      {
        "User-Agent" => select_user_agent,
        "Accept" => "text/html,application/xhtml+xml," \
                    "application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding" => "gzip",
        "Accept-Language" => "en-US,en;q=0.9",
        "Connection" => "keep-alive",
      }
    end

    # Opens a (possibly TLS) connection and performs one GET request.
    def execute_get(url, headers)
      uri = URI(url)
      Net::HTTP.start(uri.host, uri.port,
                      use_ssl: uri.scheme == "https",
                      read_timeout: @timeout,
                      open_timeout: @timeout) do |http|
        request = Net::HTTP::Get.new(uri)
        headers.each { |k, v| request[k] = v }
        raw = http.request(request)
        build_response(raw)
      end
    end

    # Normalizes a Net::HTTPResponse into our Response value object,
    # lowercasing header names for predictable lookups.
    def build_response(raw)
      headers = raw.each_header.to_h { |k, v| [k.downcase, v] }
      Response.new(
        status: raw.code.to_i,
        headers: headers,
        body: decompress_body(raw),
      )
    end

    # Gunzips the body when the server honored Accept-Encoding: gzip.
    # A corrupt gzip stream falls back to the raw bytes (best effort).
    def decompress_body(raw)
      body = raw.body.to_s
      return body unless raw["content-encoding"] == "gzip" && !body.empty?

      Zlib::GzipReader.new(StringIO.new(body)).read
    rescue Zlib::GzipFile::Error
      body
    end
  end
end
@@ -0,0 +1,22 @@
# frozen_string_literal: true

module Archaeo
  # Model representing a fetched archived page from the Wayback Machine.
  #
  # Carries the page body together with its metadata and provenance:
  # where it was archived, where it originally lived, and when it was
  # captured.
  class Page
    attr_reader :content, :content_type, :status_code,
                :archive_url, :original_url, :timestamp

    # @param content [String] response body
    # @param content_type [String, nil] content-type header value
    # @param status_code [Integer] HTTP status of the archive response
    # @param archive_url [String] Wayback replay URL that was fetched
    # @param original_url [String] live-web URL of the resource
    # @param timestamp [Timestamp, String, Time] capture time; coerced
    def initialize(content:, content_type:, status_code:,
                   archive_url:, original_url:, timestamp:)
      @timestamp = Timestamp.coerce(timestamp)
      @content = content
      @content_type = content_type
      @status_code = status_code
      @archive_url = archive_url
      @original_url = original_url
    end
  end
end
@@ -0,0 +1,112 @@
# frozen_string_literal: true

module Archaeo
  # Client for the Wayback Machine SavePageNow (SPN) API.
  #
  # Request the Wayback Machine to archive a URL and retrieve the
  # resulting archive URL and timestamp as a SaveResult.
  class SaveApi
    ENDPOINT = "https://web.archive.org/save"
    DEFAULT_MAX_TRIES = 8
    # Captures the 14-digit capture timestamp out of a replay URL.
    TIMESTAMP_RE = %r{web\.archive\.org/web/(\d{14})}

    # @param client [HttpClient] injectable HTTP transport
    # @param max_tries [Integer] total save attempts before giving up
    def initialize(client: HttpClient.new,
                   max_tries: DEFAULT_MAX_TRIES)
      @client = client
      @max_tries = max_tries
    end

    # Asks SPN to archive +url+.
    #
    # @return [SaveResult] archive URL, capture timestamp, cached? flag
    # @raise [RateLimitError, SaveFailed, MaximumRetriesExceeded]
    def save(url)
      save_url = "#{ENDPOINT}/#{url}"
      # Recorded before the first request so process_save_response can
      # judge whether the returned capture predates this save call.
      start_time = Time.now.utc
      attempt_save(save_url, start_time, url)
    end

    private

    # Polls the save endpoint up to @max_tries times; SPN often needs a
    # few requests before the capture shows up in the response headers.
    def attempt_save(save_url, start_time, url)
      @max_tries.times do |attempt|
        # No delay before the first request; see retry_delay for pacing.
        sleep(retry_delay(attempt)) if attempt.positive?

        response = @client.get(save_url)
        check_response_errors!(response, url)

        result = process_save_response(response, start_time)
        return result if result
      end

      raise MaximumRetriesExceeded,
            "Failed to save #{url} after #{@max_tries} attempts"
    end

    # Returns a SaveResult when the response headers reveal an archive
    # URL; nil signals the caller to retry.
    def process_save_response(response, start_time)
      archive_url = extract_archive_url(response)
      return nil unless archive_url

      ts = Timestamp.parse(extract_timestamp(archive_url))
      # Heuristic: a capture timestamped more than 2700 s (45 min) before
      # this save started is treated as a pre-existing (cached) capture
      # rather than a fresh one.
      cached = ts.to_time < start_time - 2700
      SaveResult.new(archive_url: archive_url,
                     timestamp: ts, cached: cached)
    end

    # Maps SPN's known failure statuses to typed errors; any other
    # status falls through and is handled by the header heuristics.
    def check_response_errors!(response, url)
      case response.status
      when 429
        raise RateLimitError, "Rate limited while saving #{url}"
      when 509
        raise SaveFailed, "Session limit reached while saving #{url}"
      end
    end

    # Pacing between polls: 5 s normally, 10 s on every third attempt.
    def retry_delay(attempt)
      ((attempt + 1) % 3).zero? ? 10 : 5
    end

    # SPN exposes the resulting capture in different headers depending on
    # response flavor; try each known location in order of reliability.
    def extract_archive_url(response)
      headers = response.headers
      from_content_location(headers) ||
        from_memento_link(headers) ||
        from_cache_key(headers) ||
        from_location(headers)
    end

    # content-location carries an archive-relative path like
    # "/web/20240101000000/..."; prefix the host to make it absolute.
    def from_content_location(headers)
      location = headers["content-location"]
      return unless location&.match?(%r{^/web/\d{14}/})

      "https://web.archive.org#{location}"
    end

    # Pulls the memento target out of the Link header.
    # NOTE(review): the pattern expects href="web.archive.org/..." with no
    # scheme, and assumes href= form at all — real SPN Link headers
    # typically use <url>; rel="memento" with an https:// scheme, so this
    # branch may never match. Verify against live responses.
    def from_memento_link(headers)
      link = headers["link"].to_s
      match = link.match(
        %r{rel="memento".*?href="(web\.archive\.org/web/\d{14}/.*?)"},
      )
      return unless match

      "https://#{match[1]}"
    end

    # x-cache-key appears to embed the archive URL followed by a
    # two-letter datacenter/region suffix; strip that suffix.
    # NOTE(review): the greedy (https.*) relies on the suffix being the
    # final two uppercase letters — confirm against real header values.
    def from_cache_key(headers)
      cache_key = headers["x-cache-key"].to_s
      match = cache_key.match(/(https.*)[A-Z]{2}/)
      match ? match[1] : nil
    end

    # Last resort: a redirect Location pointing into the archive.
    def from_location(headers)
      location = headers["location"].to_s
      match = location.match(%r{(web\.archive\.org/web/\d+/.*)$})
      return unless match

      "https://#{match[1]}"
    end

    # @return [String] the 14-digit timestamp embedded in +archive_url+
    # @raise [InvalidResponse] when no timestamp can be found
    def extract_timestamp(archive_url)
      match = archive_url.match(TIMESTAMP_RE)
      return match[1] if match

      raise InvalidResponse,
            "Cannot parse timestamp from: #{archive_url}"
    end
  end
end
@@ -0,0 +1,21 @@
# frozen_string_literal: true

module Archaeo
  # Model representing the outcome of a SavePageNow request.
  #
  # Holds the resulting archive URL and capture timestamp, plus a flag
  # indicating whether the archive served a pre-existing capture.
  class SaveResult
    attr_reader :archive_url, :timestamp

    # @param archive_url [String] replay URL of the saved capture
    # @param timestamp [Timestamp, String, Time] capture time; coerced
    # @param cached [Boolean] true when an existing capture was returned
    def initialize(archive_url:, timestamp:, cached:)
      @cached = cached
      @archive_url = archive_url
      @timestamp = Timestamp.coerce(timestamp)
    end

    # True when the page was already present in the archive rather than
    # freshly captured by this request.
    def cached?
      @cached
    end
  end
end
@@ -0,0 +1,40 @@
# frozen_string_literal: true

module Archaeo
  # A single CDX Server API record representing an archived document.
  #
  # Maps the seven standard CDX fields and provides the computed
  # archive URL via the ArchiveUrl model. Instances are value objects:
  # two snapshots with the same field values are equal and hash alike.
  class Snapshot
    FIELDS = %i[urlkey timestamp original_url
                mimetype status_code digest length].freeze

    attr_reader(*FIELDS)

    # Field values are normalized on construction: strings via to_s,
    # counters via to_i, and the timestamp via Timestamp.coerce.
    def initialize(urlkey:, timestamp:, original_url:,
                   mimetype: nil, status_code: nil,
                   digest: nil, length: nil)
      @urlkey = urlkey.to_s
      @timestamp = Timestamp.coerce(timestamp)
      @original_url = original_url.to_s
      @mimetype = mimetype.to_s
      @status_code = status_code.to_i
      @digest = digest.to_s
      @length = length.to_i
    end

    # Wayback replay URL for this capture.
    def archive_url
      ArchiveUrl.new(original_url, timestamp: @timestamp).to_s
    end

    # Field-wise equality against another Snapshot.
    def ==(other)
      other.is_a?(self.class) && state == other.state
    end
    alias_method :eql?, :==

    # Hash derived from the same field tuple used by ==, so snapshots
    # behave correctly as Hash keys and in Sets.
    def hash
      state.hash
    end

    protected

    # Tuple of all CDX field values, in FIELDS order.
    def state
      FIELDS.map { |field| public_send(field) }
    end
  end
end
@@ -0,0 +1,103 @@
# frozen_string_literal: true

module Archaeo
  # Value object representing a Wayback Machine timestamp (YYYYMMDDHHmmss).
  #
  # Supports parsing, formatting, comparison, and coercion from various
  # time representations. Truncated timestamps (e.g. "2020", "202006"),
  # which the Wayback APIs accept, default missing components to the
  # earliest value (January 1st, midnight).
  class Timestamp
    include Comparable

    FORMAT = "%Y%m%d%H%M%S"

    # Underlying UTC Time for this timestamp.
    attr_reader :to_time

    def initialize(year:, month: 1, day: 1,
                   hour: 0, minute: 0, second: 0)
      @to_time = Time.utc(year, month, day, hour, minute, second)
    end

    # Parses a (possibly truncated) Wayback timestamp string such as
    # "2020", "202006", "20200615", or "20200615123045".
    #
    # @param string [String] digits in YYYY[MM[DD[HHMMSS]]] form
    # @return [Timestamp]
    def self.parse(string)
      parts = { year: string[0, 4].to_i }
      # Only pass components that are actually present. The previous
      # version always passed month:/day:, so a truncated string (e.g.
      # "2020") passed nil explicitly, overriding the keyword defaults
      # and making Time.utc raise TypeError.
      parts[:month] = string[4, 2].to_i if string.length >= 6
      parts[:day] = string[6, 2].to_i if string.length >= 8

      new(**parts, **parse_time_parts(string))
    end

    # Hour/minute/second components when the string is long enough;
    # empty hash otherwise (constructor defaults then apply).
    def self.parse_time_parts(string)
      return {} if string.length < 10

      {
        hour: string[8, 2].to_i,
        minute: string[10, 2].to_i,
        second: string[12, 2].to_i,
      }
    end
    private_class_method :parse_time_parts

    # Builds a Timestamp from a Time (converted to UTC first).
    def self.from_time(time)
      utc = time.getutc
      new(year: utc.year, month: utc.month, day: utc.day,
          hour: utc.hour, minute: utc.min, second: utc.sec)
    end

    # Current moment as a Timestamp.
    def self.now
      from_time(Time.now)
    end

    # Coerces a Timestamp, String, or Time into a Timestamp.
    #
    # @raise [ArgumentError] for any other type
    def self.coerce(value)
      case value
      when Timestamp then value
      when String then parse(value)
      when Time then from_time(value)
      else
        raise ArgumentError,
              "Cannot coerce #{value.class} to Archaeo::Timestamp"
      end
    end

    # Canonical 14-digit Wayback form.
    def to_s
      @to_time.strftime(FORMAT)
    end

    # Ordering by the canonical string form (lexicographic order of
    # YYYYMMDDHHmmss matches chronological order). Non-Timestamps
    # compare as nil, so Comparable#== returns false for them.
    def <=>(other)
      return nil unless other.is_a?(self.class)

      to_s <=> other.to_s
    end

    # hash/eql? are aligned with <=> so equal timestamps collide as
    # Hash keys.
    def hash
      to_s.hash
    end

    def eql?(other)
      self == other
    end

    def year
      @to_time.year
    end

    def month
      @to_time.month
    end

    def day
      @to_time.day
    end

    def hour
      @to_time.hour
    end

    def minute
      @to_time.min
    end

    def second
      @to_time.sec
    end
  end
end
@@ -0,0 +1,5 @@
# frozen_string_literal: true

module Archaeo
  # Gem version string (semantic versioning: MAJOR.MINOR.PATCH).
  VERSION = "0.1.0"
end
data/lib/archaeo.rb ADDED
@@ -0,0 +1,30 @@
# frozen_string_literal: true

require_relative "archaeo/version"

# Archaeo provides a Ruby interface to the Internet Archive's Wayback Machine
# APIs, including the CDX Server API, Availability API, SavePageNow API,
# and content fetching.
module Archaeo
  # Base error for the gem; rescue this to catch any Archaeo failure.
  class Error < StandardError; end
  # Raised when a lookup finds no archived snapshot for the URL.
  class NoSnapshotFound < Error; end
  # Raised for sites the archive refuses to serve (blocked/excluded).
  class BlockedSiteError < Error; end
  # Raised on HTTP 429 responses (e.g. while saving via SaveApi).
  class RateLimitError < Error; end
  # Raised when retry budgets are exhausted (HttpClient, SaveApi).
  class MaximumRetriesExceeded < Error; end
  # Raised when a requested archive resource is unavailable.
  class ArchiveNotAvailable < Error; end
  # Raised when an API response cannot be parsed (e.g. no timestamp).
  class InvalidResponse < Error; end
  # Raised when a SavePageNow request fails (e.g. HTTP 509 session limit).
  class SaveFailed < Error; end

  # Constants are loaded lazily on first reference via autoload, keeping
  # `require "archaeo"` cheap.
  autoload :Timestamp, "archaeo/timestamp"
  autoload :ArchiveUrl, "archaeo/archive_url"
  autoload :Snapshot, "archaeo/snapshot"
  autoload :Page, "archaeo/page"
  autoload :SaveResult, "archaeo/save_result"
  autoload :AvailabilityResult, "archaeo/availability_result"
  autoload :HttpClient, "archaeo/http_client"
  autoload :CdxApi, "archaeo/cdx_api"
  autoload :AvailabilityApi, "archaeo/availability_api"
  autoload :SaveApi, "archaeo/save_api"
  autoload :Fetcher, "archaeo/fetcher"
  autoload :Cli, "archaeo/cli"
end