archaeo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +10 -0
- data/README.adoc +186 -0
- data/Rakefile +12 -0
- data/archaeo.gemspec +38 -0
- data/bin/console +11 -0
- data/bin/setup +8 -0
- data/exe/archaeo +6 -0
- data/lib/archaeo/archive_url.rb +54 -0
- data/lib/archaeo/availability_api.rb +74 -0
- data/lib/archaeo/availability_result.rb +22 -0
- data/lib/archaeo/cdx_api.rb +162 -0
- data/lib/archaeo/cli.rb +94 -0
- data/lib/archaeo/fetcher.rb +62 -0
- data/lib/archaeo/http_client.rb +137 -0
- data/lib/archaeo/page.rb +22 -0
- data/lib/archaeo/save_api.rb +112 -0
- data/lib/archaeo/save_result.rb +21 -0
- data/lib/archaeo/snapshot.rb +40 -0
- data/lib/archaeo/timestamp.rb +103 -0
- data/lib/archaeo/version.rb +5 -0
- data/lib/archaeo.rb +30 -0
- data/sig/archaeo.rbs +241 -0
- metadata +84 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 65cb8ec1434b72774ed3a1d49ac87920bebb549cc5a4aebb0966b8d110d740ba
|
|
4
|
+
data.tar.gz: a10b0bf2b8555d3a259c8ec02364e3031697189a784411ab54c4a1bfd17ab402
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 74eac73369d611a491152f7018d63b8fe3b46f8154a374d43d62f3ca023e1837dae666b98c795c211add5c6282fffe70576e581165aecd6644ecbccd15efe623
|
|
7
|
+
data.tar.gz: 03dd2e1ea518ef34a2b2c427c91a9a562aa328aced3b8cad4910fb634ba8e723f1cfb50bd57b247403a060c67f2f2f4638ad9e4293abb76e6225a731bb7493fd
|
data/CHANGELOG.md
ADDED
data/CODE_OF_CONDUCT.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Code of Conduct
|
|
2
|
+
|
|
3
|
+
"archaeo" follows [The Ruby Community Conduct Guideline](https://www.ruby-lang.org/en/conduct) in all "collaborative space", which is defined as community communications channels (such as mailing lists, submitted patches, commit comments, etc.):
|
|
4
|
+
|
|
5
|
+
* Participants will be tolerant of opposing views.
|
|
6
|
+
* Participants must ensure that their language and actions are free of personal attacks and disparaging personal remarks.
|
|
7
|
+
* When interpreting the words and actions of others, participants should always assume good intentions.
|
|
8
|
+
* Behaviour which can be reasonably considered harassment will not be tolerated.
|
|
9
|
+
|
|
10
|
+
If you have any concerns about behaviour within this project, please contact us at [ronald.tse@ribose.com](mailto:ronald.tse@ribose.com).
|
data/README.adoc
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
= Archaeo
|
|
2
|
+
|
|
3
|
+
== Purpose
|
|
4
|
+
|
|
5
|
+
Archaeo is a Ruby client for the Internet Archive's https://web.archive.org[Wayback Machine] APIs.
|
|
6
|
+
|
|
7
|
+
It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs, and fetching archived content.
|
|
8
|
+
|
|
9
|
+
== Installation
|
|
10
|
+
|
|
11
|
+
[source,bash]
|
|
12
|
+
----
|
|
13
|
+
gem install archaeo
|
|
14
|
+
----
|
|
15
|
+
|
|
16
|
+
Or add to your Gemfile:
|
|
17
|
+
|
|
18
|
+
[source,ruby]
|
|
19
|
+
----
|
|
20
|
+
gem "archaeo"
|
|
21
|
+
----
|
|
22
|
+
|
|
23
|
+
== Quick Start
|
|
24
|
+
|
|
25
|
+
[source,ruby]
|
|
26
|
+
----
|
|
27
|
+
require "archaeo"
|
|
28
|
+
----
|
|
29
|
+
|
|
30
|
+
=== Query Snapshots (CDX API)
|
|
31
|
+
|
|
32
|
+
[source,ruby]
|
|
33
|
+
----
|
|
34
|
+
cdx = Archaeo::CdxApi.new
|
|
35
|
+
|
|
36
|
+
# Enumerate all snapshots
|
|
37
|
+
cdx.snapshots("example.com").each do |snapshot|
|
|
38
|
+
puts snapshot.timestamp
|
|
39
|
+
puts snapshot.original_url
|
|
40
|
+
puts snapshot.archive_url
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# Find specific snapshots
|
|
44
|
+
oldest = cdx.oldest("example.com")
|
|
45
|
+
newest = cdx.newest("example.com")
|
|
46
|
+
near = cdx.near("example.com", timestamp: "20220101")
|
|
47
|
+
|
|
48
|
+
# Filter by time
|
|
49
|
+
before = cdx.before("example.com", timestamp: "20220101")
|
|
50
|
+
after = cdx.after("example.com", timestamp: "20220101")
|
|
51
|
+
----
|
|
52
|
+
|
|
53
|
+
=== Check Availability
|
|
54
|
+
|
|
55
|
+
[source,ruby]
|
|
56
|
+
----
|
|
57
|
+
api = Archaeo::AvailabilityApi.new
|
|
58
|
+
|
|
59
|
+
result = api.near("example.com")
|
|
60
|
+
result.available? # => true/false
|
|
61
|
+
result.archive_url # => "https://web.archive.org/web/..."
|
|
62
|
+
result.timestamp # => Archaeo::Timestamp
|
|
63
|
+
|
|
64
|
+
api.available?("example.com") # => true/false
|
|
65
|
+
----
|
|
66
|
+
|
|
67
|
+
=== Save a URL (SavePageNow)
|
|
68
|
+
|
|
69
|
+
[source,ruby]
|
|
70
|
+
----
|
|
71
|
+
save = Archaeo::SaveApi.new
|
|
72
|
+
result = save.save("https://example.com/")
|
|
73
|
+
result.archive_url # => "https://web.archive.org/web/..."
|
|
74
|
+
result.timestamp # => Archaeo::Timestamp
|
|
75
|
+
result.cached? # => true if already archived
|
|
76
|
+
----
|
|
77
|
+
|
|
78
|
+
=== Fetch Archived Content
|
|
79
|
+
|
|
80
|
+
[source,ruby]
|
|
81
|
+
----
|
|
82
|
+
fetcher = Archaeo::Fetcher.new
|
|
83
|
+
page = fetcher.fetch("https://example.com/",
|
|
84
|
+
timestamp: "20220615000000")
|
|
85
|
+
|
|
86
|
+
page.content # => "<html>...</html>"
|
|
87
|
+
page.content_type # => "text/html"
|
|
88
|
+
page.status_code # => 200
|
|
89
|
+
page.archive_url # => full archive URL
|
|
90
|
+
|
|
91
|
+
# Raw (identity) mode -- no Wayback Machine rewriting
|
|
92
|
+
page = fetcher.fetch("https://example.com/",
|
|
93
|
+
timestamp: "20220615000000",
|
|
94
|
+
identity: true)
|
|
95
|
+
----
|
|
96
|
+
|
|
97
|
+
=== Timestamps
|
|
98
|
+
|
|
99
|
+
[source,ruby]
|
|
100
|
+
----
|
|
101
|
+
# Create from components
|
|
102
|
+
ts = Archaeo::Timestamp.new(year: 2022, month: 6, day: 15)
|
|
103
|
+
|
|
104
|
+
# Parse from Wayback format
|
|
105
|
+
ts = Archaeo::Timestamp.parse("20220615120000")
|
|
106
|
+
|
|
107
|
+
# From Time object
|
|
108
|
+
ts = Archaeo::Timestamp.from_time(Time.now)
|
|
109
|
+
|
|
110
|
+
# Current time
|
|
111
|
+
ts = Archaeo::Timestamp.now
|
|
112
|
+
|
|
113
|
+
# Format as 14-digit string
|
|
114
|
+
ts.to_s # => "20220615000000"
|
|
115
|
+
|
|
116
|
+
# Comparison
|
|
117
|
+
ts1 < ts2 # => true/false
|
|
118
|
+
----
|
|
119
|
+
|
|
120
|
+
=== Command-Line Interface
|
|
121
|
+
|
|
122
|
+
[source,bash]
|
|
123
|
+
----
|
|
124
|
+
# List snapshots
|
|
125
|
+
archaeo snapshots example.com
|
|
126
|
+
|
|
127
|
+
# Find closest snapshot
|
|
128
|
+
archaeo near example.com 20220101
|
|
129
|
+
|
|
130
|
+
# Check availability
|
|
131
|
+
archaeo available example.com
|
|
132
|
+
|
|
133
|
+
# Save a URL
|
|
134
|
+
archaeo save https://example.com/
|
|
135
|
+
|
|
136
|
+
# Fetch archived content
|
|
137
|
+
archaeo fetch https://example.com/ 20220615120000
|
|
138
|
+
|
|
139
|
+
# Fetch raw (identity) content
|
|
140
|
+
archaeo fetch --identity https://example.com/ 20220615120000
|
|
141
|
+
----
|
|
142
|
+
|
|
143
|
+
== Architecture
|
|
144
|
+
|
|
145
|
+
Archaeo follows a model-driven, OOP design:
|
|
146
|
+
|
|
147
|
+
[cols="1,2,1"]
|
|
148
|
+
|===
|
|
149
|
+
| Layer | Classes | Purpose
|
|
150
|
+
|
|
151
|
+
| *Models*
|
|
152
|
+
| `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `SaveResult`, `AvailabilityResult`
|
|
153
|
+
| Domain value objects
|
|
154
|
+
|
|
155
|
+
| *APIs*
|
|
156
|
+
| `CdxApi`, `AvailabilityApi`, `SaveApi`
|
|
157
|
+
| Query and mutate the archive
|
|
158
|
+
|
|
159
|
+
| *Operations*
|
|
160
|
+
| `Fetcher`
|
|
161
|
+
| Download archived content
|
|
162
|
+
|
|
163
|
+
| *Infrastructure*
|
|
164
|
+
| `HttpClient`
|
|
165
|
+
| HTTP transport with retries and gzip
|
|
166
|
+
|===
|
|
167
|
+
|
|
168
|
+
All API classes accept an `HttpClient` via dependency injection for testability.
|
|
169
|
+
|
|
170
|
+
== Development
|
|
171
|
+
|
|
172
|
+
[source,bash]
|
|
173
|
+
----
|
|
174
|
+
bundle install
|
|
175
|
+
bundle exec rspec
|
|
176
|
+
bundle exec rubocop
|
|
177
|
+
----
|
|
178
|
+
|
|
179
|
+
== Contributing
|
|
180
|
+
|
|
181
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/riboseinc/archaeo[].
|
|
182
|
+
|
|
183
|
+
== License
|
|
184
|
+
|
|
185
|
+
MIT License.
|
|
186
|
+
See link:LICENSE[LICENSE] for details.
|
data/Rakefile
ADDED
data/archaeo.gemspec
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "lib/archaeo/version"
|
|
4
|
+
|
|
5
|
+
Gem::Specification.new do |spec|
|
|
6
|
+
spec.name = "archaeo"
|
|
7
|
+
spec.version = Archaeo::VERSION
|
|
8
|
+
spec.authors = ["Ribose Inc."]
|
|
9
|
+
spec.email = ["open.source@ribose.com"]
|
|
10
|
+
|
|
11
|
+
spec.summary = "Ruby client for the Internet Archive Wayback Machine APIs"
|
|
12
|
+
spec.description = "Archaeo provides a Ruby interface to query, fetch, " \
|
|
13
|
+
"and save archived web content via the Wayback Machine " \
|
|
14
|
+
"CDX Server API, Availability API, SavePageNow API, " \
|
|
15
|
+
"and content fetching."
|
|
16
|
+
spec.homepage = "https://github.com/riboseinc/archaeo"
|
|
17
|
+
spec.required_ruby_version = ">= 3.0.0"
|
|
18
|
+
spec.license = "MIT"
|
|
19
|
+
|
|
20
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
|
21
|
+
spec.metadata["source_code_uri"] = spec.homepage
|
|
22
|
+
spec.metadata["changelog_uri"] =
|
|
23
|
+
"#{spec.homepage}/blob/main/CHANGELOG.md"
|
|
24
|
+
spec.metadata["rubygems_mfa_required"] = "true"
|
|
25
|
+
|
|
26
|
+
spec.files = IO.popen(%w[git ls-files -z], chdir: __dir__,
|
|
27
|
+
err: IO::NULL) do |ls|
|
|
28
|
+
ls.readlines("\x0", chomp: true).reject do |f|
|
|
29
|
+
f == __FILE__ ||
|
|
30
|
+
f.start_with?(*%w[Gemfile .gitignore .rspec spec/ .github/ .rubocop])
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
spec.bindir = "exe"
|
|
34
|
+
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
|
35
|
+
spec.require_paths = ["lib"]
|
|
36
|
+
|
|
37
|
+
spec.add_dependency "thor", "~> 1.3"
|
|
38
|
+
end
|
data/bin/console
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require "bundler/setup"
|
|
5
|
+
require "archaeo"
|
|
6
|
+
|
|
7
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
|
8
|
+
# with your gem easier. You can also use a different console, if you like.
|
|
9
|
+
|
|
10
|
+
require "irb"
|
|
11
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/exe/archaeo
ADDED
data/lib/archaeo/archive_url.rb
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# Model representing a Wayback Machine archive URL.
|
|
5
|
+
#
|
|
6
|
+
# Encapsulates URL construction and parsing for archive.org URLs,
|
|
7
|
+
# supporting both normal and identity (raw) modes.
|
|
8
|
+
class ArchiveUrl
|
|
9
|
+
BASE = "https://web.archive.org/web"
|
|
10
|
+
|
|
11
|
+
TIMESTAMP_RE = %r{web\.archive\.org/web/(\d{14})}
|
|
12
|
+
|
|
13
|
+
attr_reader :original_url, :timestamp
|
|
14
|
+
|
|
15
|
+
def initialize(original_url, timestamp:, identity: false)
|
|
16
|
+
@original_url = original_url.to_s
|
|
17
|
+
@timestamp = Timestamp.coerce(timestamp)
|
|
18
|
+
@identity = identity
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def self.parse(string)
|
|
22
|
+
match = string.match(TIMESTAMP_RE)
|
|
23
|
+
unless match
|
|
24
|
+
raise ArgumentError,
|
|
25
|
+
"Not a valid archive URL: #{string}"
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
ts = Timestamp.parse(match[1])
|
|
29
|
+
identity = string.include?("#{match[1]}id_/")
|
|
30
|
+
rest = extract_original_url(string, match[1], identity)
|
|
31
|
+
|
|
32
|
+
new(rest, timestamp: ts, identity: identity)
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def identity?
|
|
36
|
+
@identity
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def to_s
|
|
40
|
+
suffix = identity? ? "id_" : ""
|
|
41
|
+
"#{BASE}/#{@timestamp}#{suffix}/#{@original_url}"
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def self.extract_original_url(string, ts_str, identity)
|
|
45
|
+
marker = identity ? "#{ts_str}id_/" : "#{ts_str}/"
|
|
46
|
+
idx = string.index(marker)
|
|
47
|
+
return "" unless idx
|
|
48
|
+
|
|
49
|
+
string[(idx + marker.length)..]
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
private_class_method :extract_original_url
|
|
53
|
+
end
|
|
54
|
+
end
|
data/lib/archaeo/availability_api.rb
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "uri"
|
|
5
|
+
|
|
6
|
+
module Archaeo
|
|
7
|
+
# Client for the Wayback Machine Availability API.
|
|
8
|
+
#
|
|
9
|
+
# Check whether a URL has been archived and retrieve the closest
|
|
10
|
+
# available snapshot for a given point in time.
|
|
11
|
+
class AvailabilityApi
|
|
12
|
+
ENDPOINT = "https://archive.org/wayback/available"
|
|
13
|
+
|
|
14
|
+
def initialize(client: HttpClient.new)
|
|
15
|
+
@client = client
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def near(url, timestamp: nil)
|
|
19
|
+
params = { "url" => url }
|
|
20
|
+
params["timestamp"] = timestamp.to_s if timestamp
|
|
21
|
+
|
|
22
|
+
response = @client.get(
|
|
23
|
+
"#{ENDPOINT}?#{URI.encode_www_form(params)}",
|
|
24
|
+
)
|
|
25
|
+
parse_response(response, url)
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def oldest(url)
|
|
29
|
+
near(url, timestamp: Timestamp.new(year: 1994, month: 1, day: 1))
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def newest(url)
|
|
33
|
+
near(url, timestamp: Timestamp.now)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def available?(url)
|
|
37
|
+
near(url).available?
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
private
|
|
41
|
+
|
|
42
|
+
def parse_response(response, url)
|
|
43
|
+
unless response.status == 200
|
|
44
|
+
raise InvalidResponse,
|
|
45
|
+
"Availability API returned HTTP #{response.status}"
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
json = JSON.parse(response.body)
|
|
49
|
+
snapshots = json["archived_snapshots"]
|
|
50
|
+
return unavailable(url) if snapshots.nil? || snapshots.empty?
|
|
51
|
+
|
|
52
|
+
closest = snapshots["closest"]
|
|
53
|
+
return unavailable(url) if closest.nil?
|
|
54
|
+
|
|
55
|
+
build_result(closest, url)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def unavailable(url)
|
|
59
|
+
AvailabilityResult.new(url: url, available: false)
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def build_result(closest, url)
|
|
63
|
+
archive_url = closest["url"].to_s.sub(%r{^http://}, "https://")
|
|
64
|
+
ts = Timestamp.parse(closest["timestamp"])
|
|
65
|
+
|
|
66
|
+
AvailabilityResult.new(
|
|
67
|
+
url: url,
|
|
68
|
+
available: closest["status"].to_s == "200",
|
|
69
|
+
archive_url: archive_url,
|
|
70
|
+
timestamp: ts,
|
|
71
|
+
)
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
end
|
data/lib/archaeo/availability_result.rb
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# Model representing the result of an availability query.
|
|
5
|
+
#
|
|
6
|
+
# Indicates whether a URL is archived and, if so, provides
|
|
7
|
+
# the closest snapshot's archive URL and timestamp.
|
|
8
|
+
class AvailabilityResult
|
|
9
|
+
attr_reader :url, :archive_url, :timestamp
|
|
10
|
+
|
|
11
|
+
def initialize(url:, available:, archive_url: nil, timestamp: nil)
|
|
12
|
+
@url = url
|
|
13
|
+
@available = available
|
|
14
|
+
@archive_url = archive_url
|
|
15
|
+
@timestamp = timestamp
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def available?
|
|
19
|
+
@available
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
data/lib/archaeo/cdx_api.rb
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "uri"
|
|
5
|
+
|
|
6
|
+
module Archaeo
|
|
7
|
+
# Client for the Wayback Machine CDX Server API.
|
|
8
|
+
#
|
|
9
|
+
# Query archived snapshots by URL, timestamp range, filters,
|
|
10
|
+
# and more. Returns Snapshot objects for each matching CDX record.
|
|
11
|
+
class CdxApi
|
|
12
|
+
ENDPOINT = "https://web.archive.org/cdx/search/cdx"
|
|
13
|
+
|
|
14
|
+
ALL_FIELDS = %w[
|
|
15
|
+
urlkey timestamp original
|
|
16
|
+
mimetype statuscode digest length
|
|
17
|
+
].freeze
|
|
18
|
+
|
|
19
|
+
MATCH_TYPES = %w[exact prefix host domain].freeze
|
|
20
|
+
SORT_ORDERS = %w[default closest reverse].freeze
|
|
21
|
+
DEFAULT_LIMIT = 25_000
|
|
22
|
+
|
|
23
|
+
SCALAR_PARAMS = {
|
|
24
|
+
from: "from",
|
|
25
|
+
to: "to",
|
|
26
|
+
match_type: "matchType",
|
|
27
|
+
sort: "sort",
|
|
28
|
+
limit: "limit",
|
|
29
|
+
closest: "closest",
|
|
30
|
+
}.freeze
|
|
31
|
+
|
|
32
|
+
def initialize(client: HttpClient.new)
|
|
33
|
+
@client = client
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def snapshots(url, **options)
|
|
37
|
+
validate_options!(options)
|
|
38
|
+
|
|
39
|
+
Enumerator.new do |yielder|
|
|
40
|
+
fetch_snapshots(url, options, yielder)
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def near(url, timestamp:)
|
|
45
|
+
ts = Timestamp.coerce(timestamp)
|
|
46
|
+
result = snapshots(url, sort: "closest",
|
|
47
|
+
closest: ts.to_s, limit: 1).first
|
|
48
|
+
result || raise(NoSnapshotFound,
|
|
49
|
+
"No snapshot found near #{ts} for #{url}")
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
def oldest(url)
|
|
53
|
+
near(url, timestamp: Timestamp.new(year: 1994, month: 1, day: 1))
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def newest(url)
|
|
57
|
+
near(url, timestamp: Timestamp.now)
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def before(url, timestamp:)
|
|
61
|
+
ts = Timestamp.coerce(timestamp)
|
|
62
|
+
snapshots(url, sort: "closest", closest: ts.to_s).each do |snap|
|
|
63
|
+
return snap if snap.timestamp < ts
|
|
64
|
+
end
|
|
65
|
+
raise NoSnapshotFound,
|
|
66
|
+
"No snapshot found before #{ts} for #{url}"
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
def after(url, timestamp:)
|
|
70
|
+
ts = Timestamp.coerce(timestamp)
|
|
71
|
+
snapshots(url, sort: "closest", closest: ts.to_s).each do |snap|
|
|
72
|
+
return snap if snap.timestamp > ts
|
|
73
|
+
end
|
|
74
|
+
raise NoSnapshotFound,
|
|
75
|
+
"No snapshot found after #{ts} for #{url}"
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
private
|
|
79
|
+
|
|
80
|
+
def fetch_snapshots(url, options, yielder)
|
|
81
|
+
params = build_params(url, options)
|
|
82
|
+
response = @client.get(
|
|
83
|
+
"#{ENDPOINT}?#{URI.encode_www_form(params)}",
|
|
84
|
+
)
|
|
85
|
+
unless response.status == 200
|
|
86
|
+
raise Error, "CDX API returned HTTP #{response.status}"
|
|
87
|
+
end
|
|
88
|
+
return if response.body.nil? || response.body.strip.empty?
|
|
89
|
+
|
|
90
|
+
parse_cdx_json(response.body, yielder)
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
def validate_options!(options)
|
|
94
|
+
validate_match_type!(options[:match_type])
|
|
95
|
+
validate_sort!(options[:sort])
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def validate_match_type!(type)
|
|
99
|
+
return if type.nil? || MATCH_TYPES.include?(type.to_s)
|
|
100
|
+
|
|
101
|
+
raise ArgumentError,
|
|
102
|
+
"Invalid match_type: #{type}. " \
|
|
103
|
+
"Use: #{MATCH_TYPES.join(', ')}"
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
def validate_sort!(sort)
|
|
107
|
+
return if sort.nil? || SORT_ORDERS.include?(sort.to_s)
|
|
108
|
+
|
|
109
|
+
raise ArgumentError,
|
|
110
|
+
"Invalid sort: #{sort}. Use: #{SORT_ORDERS.join(', ')}"
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
def build_params(url, options)
|
|
114
|
+
{
|
|
115
|
+
"url" => url,
|
|
116
|
+
"output" => "json",
|
|
117
|
+
"fl" => ALL_FIELDS.join(","),
|
|
118
|
+
"gzip" => options.fetch(:gzip, true) ? "true" : "false",
|
|
119
|
+
}.tap do |params|
|
|
120
|
+
merge_scalar_params!(params, options)
|
|
121
|
+
merge_array_params!(params, options[:filters], "filter")
|
|
122
|
+
merge_array_params!(params, options[:collapse], "collapse")
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
def merge_scalar_params!(params, options)
|
|
127
|
+
SCALAR_PARAMS.each do |key, api_key|
|
|
128
|
+
value = options[key]
|
|
129
|
+
params[api_key] = value.to_s if value
|
|
130
|
+
end
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
def merge_array_params!(params, values, prefix)
|
|
134
|
+
Array(values).each_with_index do |v, i|
|
|
135
|
+
params["#{prefix}#{i}"] = v
|
|
136
|
+
end
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
def parse_cdx_json(body, yielder)
|
|
140
|
+
json = JSON.parse(body)
|
|
141
|
+
return unless json.is_a?(Array) && json.length > 1
|
|
142
|
+
|
|
143
|
+
header, *rows = json
|
|
144
|
+
field_map = header.each_with_index.to_h
|
|
145
|
+
rows.each { |row| yielder << build_snapshot(field_map, row) }
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
def build_snapshot(field_map, row)
|
|
149
|
+
fetch = ->(f) { row[field_map[f]] if field_map[f] }
|
|
150
|
+
|
|
151
|
+
Snapshot.new(
|
|
152
|
+
urlkey: fetch.call("urlkey"),
|
|
153
|
+
timestamp: fetch.call("timestamp"),
|
|
154
|
+
original_url: fetch.call("original"),
|
|
155
|
+
mimetype: fetch.call("mimetype"),
|
|
156
|
+
status_code: fetch.call("statuscode"),
|
|
157
|
+
digest: fetch.call("digest"),
|
|
158
|
+
length: fetch.call("length"),
|
|
159
|
+
)
|
|
160
|
+
end
|
|
161
|
+
end
|
|
162
|
+
end
|
data/lib/archaeo/cli.rb
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "thor"
|
|
4
|
+
|
|
5
|
+
module Archaeo
|
|
6
|
+
# Command-line interface powered by Thor.
|
|
7
|
+
class Cli < Thor
|
|
8
|
+
desc "snapshots URL", "List archived snapshots for a URL"
|
|
9
|
+
option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
|
|
10
|
+
option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
|
|
11
|
+
option :match_type,
|
|
12
|
+
desc: "Match type (exact, prefix, host, domain)"
|
|
13
|
+
option :filter, type: :array, desc: "CDX filter expressions"
|
|
14
|
+
option :collapse, type: :array, desc: "CDX collapse fields"
|
|
15
|
+
option :sort, desc: "Sort order (default, closest, reverse)"
|
|
16
|
+
option :limit, type: :numeric, desc: "Max snapshots to return"
|
|
17
|
+
def snapshots(url)
|
|
18
|
+
cdx = CdxApi.new
|
|
19
|
+
opts = build_cdx_options(options)
|
|
20
|
+
cdx.snapshots(url, **opts).each do |snap|
|
|
21
|
+
puts "#{snap.timestamp} #{snap.status_code} " \
|
|
22
|
+
"#{snap.original_url}"
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
desc "near URL TIMESTAMP",
|
|
27
|
+
"Find the snapshot closest to a timestamp"
|
|
28
|
+
def near(url, timestamp)
|
|
29
|
+
snap = CdxApi.new.near(url, timestamp: timestamp)
|
|
30
|
+
puts snap.archive_url
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
desc "oldest URL", "Find the oldest snapshot of a URL"
|
|
34
|
+
def oldest(url)
|
|
35
|
+
snap = CdxApi.new.oldest(url)
|
|
36
|
+
puts snap.archive_url
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
desc "newest URL", "Find the newest snapshot of a URL"
|
|
40
|
+
def newest(url)
|
|
41
|
+
snap = CdxApi.new.newest(url)
|
|
42
|
+
puts snap.archive_url
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
desc "available URL", "Check if a URL is archived"
|
|
46
|
+
def available(url)
|
|
47
|
+
result = AvailabilityApi.new.near(url)
|
|
48
|
+
if result.available?
|
|
49
|
+
puts "Available: #{result.archive_url}"
|
|
50
|
+
else
|
|
51
|
+
puts "Not available"
|
|
52
|
+
exit 1
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
desc "save URL", "Save a URL to the Wayback Machine"
|
|
57
|
+
def save(url)
|
|
58
|
+
result = SaveApi.new.save(url)
|
|
59
|
+
label = result.cached? ? "Cached" : "Saved"
|
|
60
|
+
puts "#{label}: #{result.archive_url}"
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
desc "fetch URL TIMESTAMP",
|
|
64
|
+
"Fetch archived content for a URL at a timestamp"
|
|
65
|
+
option :identity, type: :boolean, default: false,
|
|
66
|
+
desc: "Fetch raw (identity) content"
|
|
67
|
+
def fetch(url, timestamp)
|
|
68
|
+
page = Fetcher.new.fetch(
|
|
69
|
+
url, timestamp: timestamp,
|
|
70
|
+
identity: options[:identity]
|
|
71
|
+
)
|
|
72
|
+
$stdout.write(page.content)
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
CDX_OPTION_MAP = {
|
|
76
|
+
from: :from,
|
|
77
|
+
to: :to,
|
|
78
|
+
match_type: :match_type,
|
|
79
|
+
filter: :filters,
|
|
80
|
+
collapse: :collapse,
|
|
81
|
+
sort: :sort,
|
|
82
|
+
limit: :limit,
|
|
83
|
+
}.freeze
|
|
84
|
+
|
|
85
|
+
private
|
|
86
|
+
|
|
87
|
+
def build_cdx_options(opts)
|
|
88
|
+
CDX_OPTION_MAP.each_with_object({}) do |(cli_key, api_key), result|
|
|
89
|
+
value = opts[cli_key]
|
|
90
|
+
result[api_key] = value if value
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
end
|