archaeo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +10 -0
- data/README.adoc +186 -0
- data/Rakefile +12 -0
- data/archaeo.gemspec +38 -0
- data/bin/console +11 -0
- data/bin/setup +8 -0
- data/exe/archaeo +6 -0
- data/lib/archaeo/archive_url.rb +54 -0
- data/lib/archaeo/availability_api.rb +74 -0
- data/lib/archaeo/availability_result.rb +22 -0
- data/lib/archaeo/cdx_api.rb +162 -0
- data/lib/archaeo/cli.rb +94 -0
- data/lib/archaeo/fetcher.rb +62 -0
- data/lib/archaeo/http_client.rb +137 -0
- data/lib/archaeo/page.rb +22 -0
- data/lib/archaeo/save_api.rb +112 -0
- data/lib/archaeo/save_result.rb +21 -0
- data/lib/archaeo/snapshot.rb +40 -0
- data/lib/archaeo/timestamp.rb +103 -0
- data/lib/archaeo/version.rb +5 -0
- data/lib/archaeo.rb +30 -0
- data/sig/archaeo.rbs +241 -0
- metadata +84 -0
data/sig/archaeo.rbs
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
module Archaeo
|
|
2
|
+
VERSION: String
|
|
3
|
+
|
|
4
|
+
class Error < StandardError
|
|
5
|
+
end
|
|
6
|
+
|
|
7
|
+
class NoSnapshotFound < Error
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
class BlockedSiteError < Error
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
class RateLimitError < Error
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
class MaximumRetriesExceeded < Error
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
class ArchiveNotAvailable < Error
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
class InvalidResponse < Error
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
class SaveFailed < Error
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
class Timestamp
|
|
29
|
+
include Comparable
|
|
30
|
+
|
|
31
|
+
FORMAT: String
|
|
32
|
+
|
|
33
|
+
attr_reader to_time: Time
|
|
34
|
+
|
|
35
|
+
def initialize: (
|
|
36
|
+
year: Integer,
|
|
37
|
+
?month: Integer,
|
|
38
|
+
?day: Integer,
|
|
39
|
+
?hour: Integer,
|
|
40
|
+
?minute: Integer,
|
|
41
|
+
?second: Integer
|
|
42
|
+
) -> void
|
|
43
|
+
|
|
44
|
+
def self.parse: (String string) -> Timestamp
|
|
45
|
+
def self.from_time: (Time time) -> Timestamp
|
|
46
|
+
def self.now: () -> Timestamp
|
|
47
|
+
def self.coerce: (Timestamp | String | Time value) -> Timestamp
|
|
48
|
+
|
|
49
|
+
def to_s: () -> String
|
|
50
|
+
# Ordering operator backing Comparable: -1, 0 or 1, or nil when the operands cannot be compared.
def <=>: (untyped other) -> Integer?
|
|
51
|
+
|
|
52
|
+
attr_reader year: Integer
|
|
53
|
+
attr_reader month: Integer
|
|
54
|
+
attr_reader day: Integer
|
|
55
|
+
attr_reader hour: Integer
|
|
56
|
+
attr_reader minute: Integer
|
|
57
|
+
attr_reader second: Integer
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
class ArchiveUrl
|
|
61
|
+
BASE: String
|
|
62
|
+
TIMESTAMP_RE: Regexp
|
|
63
|
+
|
|
64
|
+
attr_reader original_url: String
|
|
65
|
+
attr_reader timestamp: Timestamp
|
|
66
|
+
|
|
67
|
+
def initialize: (
|
|
68
|
+
String original_url,
|
|
69
|
+
timestamp: Timestamp | String,
|
|
70
|
+
?identity: bool
|
|
71
|
+
) -> void
|
|
72
|
+
|
|
73
|
+
def self.parse: (String string) -> ArchiveUrl
|
|
74
|
+
|
|
75
|
+
def identity?: () -> bool
|
|
76
|
+
def to_s: () -> String
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
class Snapshot
|
|
80
|
+
FIELDS: Array[Symbol]
|
|
81
|
+
|
|
82
|
+
attr_reader urlkey: String
|
|
83
|
+
attr_reader timestamp: Timestamp
|
|
84
|
+
attr_reader original_url: String
|
|
85
|
+
attr_reader mimetype: String
|
|
86
|
+
attr_reader status_code: Integer
|
|
87
|
+
attr_reader digest: String
|
|
88
|
+
attr_reader length: Integer
|
|
89
|
+
|
|
90
|
+
def initialize: (
|
|
91
|
+
urlkey: String,
|
|
92
|
+
timestamp: Timestamp | String,
|
|
93
|
+
original_url: String,
|
|
94
|
+
?mimetype: String?,
|
|
95
|
+
?status_code: Integer?,
|
|
96
|
+
?digest: String?,
|
|
97
|
+
?length: Integer?
|
|
98
|
+
) -> void
|
|
99
|
+
|
|
100
|
+
def archive_url: () -> String
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
class Page
|
|
104
|
+
attr_reader content: String
|
|
105
|
+
attr_reader content_type: String
|
|
106
|
+
attr_reader status_code: Integer
|
|
107
|
+
attr_reader archive_url: String
|
|
108
|
+
attr_reader original_url: String
|
|
109
|
+
attr_reader timestamp: Timestamp
|
|
110
|
+
|
|
111
|
+
def initialize: (
|
|
112
|
+
content: String,
|
|
113
|
+
content_type: String,
|
|
114
|
+
status_code: Integer,
|
|
115
|
+
archive_url: String,
|
|
116
|
+
original_url: String,
|
|
117
|
+
timestamp: Timestamp | String
|
|
118
|
+
) -> void
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
class SaveResult
|
|
122
|
+
attr_reader archive_url: String
|
|
123
|
+
attr_reader timestamp: Timestamp
|
|
124
|
+
|
|
125
|
+
def initialize: (
|
|
126
|
+
archive_url: String,
|
|
127
|
+
timestamp: Timestamp,
|
|
128
|
+
cached: bool
|
|
129
|
+
) -> void
|
|
130
|
+
|
|
131
|
+
def cached?: () -> bool
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
class AvailabilityResult
|
|
135
|
+
attr_reader url: String
|
|
136
|
+
attr_reader archive_url: String?
|
|
137
|
+
attr_reader timestamp: Timestamp?
|
|
138
|
+
|
|
139
|
+
def initialize: (
|
|
140
|
+
url: String,
|
|
141
|
+
available: bool,
|
|
142
|
+
?archive_url: String?,
|
|
143
|
+
?timestamp: Timestamp?
|
|
144
|
+
) -> void
|
|
145
|
+
|
|
146
|
+
def available?: () -> bool
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
class HttpClient
|
|
150
|
+
DEFAULT_TIMEOUT: Integer
|
|
151
|
+
DEFAULT_MAX_RETRIES: Integer
|
|
152
|
+
DEFAULT_RETRY_DELAY: Integer
|
|
153
|
+
TRANSIENT_ERRORS: Array[singleton(StandardError)]
|
|
154
|
+
USER_AGENT_PROFILES: Array[String]
|
|
155
|
+
|
|
156
|
+
class Response
|
|
157
|
+
attr_reader status: Integer
|
|
158
|
+
attr_reader headers: Hash[String, String]
|
|
159
|
+
attr_reader body: String
|
|
160
|
+
|
|
161
|
+
def initialize: (
|
|
162
|
+
status: Integer,
|
|
163
|
+
headers: Hash[String, String],
|
|
164
|
+
body: String
|
|
165
|
+
) -> void
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
def initialize: (
|
|
169
|
+
?timeout: Integer,
|
|
170
|
+
?max_retries: Integer,
|
|
171
|
+
?retry_delay: Integer,
|
|
172
|
+
?user_agent: String?
|
|
173
|
+
) -> void
|
|
174
|
+
|
|
175
|
+
def get: (String url, ?headers: Hash[String, String]) -> Response
|
|
176
|
+
end
|
|
177
|
+
|
|
178
|
+
class CdxApi
|
|
179
|
+
ENDPOINT: String
|
|
180
|
+
ALL_FIELDS: Array[String]
|
|
181
|
+
MATCH_TYPES: Array[String]
|
|
182
|
+
SORT_ORDERS: Array[String]
|
|
183
|
+
DEFAULT_LIMIT: Integer
|
|
184
|
+
SCALAR_PARAMS: Hash[Symbol, String]
|
|
185
|
+
|
|
186
|
+
def initialize: (?client: HttpClient) -> void
|
|
187
|
+
|
|
188
|
+
def snapshots: (
|
|
189
|
+
String url,
|
|
190
|
+
**untyped options
|
|
191
|
+
) -> Enumerator[Snapshot, void]
|
|
192
|
+
|
|
193
|
+
def near: (String url, timestamp: Timestamp | String) -> Snapshot
|
|
194
|
+
def oldest: (String url) -> Snapshot
|
|
195
|
+
def newest: (String url) -> Snapshot
|
|
196
|
+
|
|
197
|
+
def before: (String url, timestamp: Timestamp | String) -> Snapshot
|
|
198
|
+
def after: (String url, timestamp: Timestamp | String) -> Snapshot
|
|
199
|
+
end
|
|
200
|
+
|
|
201
|
+
class AvailabilityApi
|
|
202
|
+
ENDPOINT: String
|
|
203
|
+
|
|
204
|
+
def initialize: (?client: HttpClient) -> void
|
|
205
|
+
|
|
206
|
+
def near: (
|
|
207
|
+
String url,
|
|
208
|
+
?timestamp: Timestamp?
|
|
209
|
+
) -> AvailabilityResult
|
|
210
|
+
|
|
211
|
+
def oldest: (String url) -> AvailabilityResult
|
|
212
|
+
def newest: (String url) -> AvailabilityResult
|
|
213
|
+
def available?: (String url) -> bool
|
|
214
|
+
end
|
|
215
|
+
|
|
216
|
+
class SaveApi
|
|
217
|
+
ENDPOINT: String
|
|
218
|
+
DEFAULT_MAX_TRIES: Integer
|
|
219
|
+
TIMESTAMP_RE: Regexp
|
|
220
|
+
|
|
221
|
+
def initialize: (
|
|
222
|
+
?client: HttpClient,
|
|
223
|
+
?max_tries: Integer
|
|
224
|
+
) -> void
|
|
225
|
+
|
|
226
|
+
def save: (String url) -> SaveResult
|
|
227
|
+
end
|
|
228
|
+
|
|
229
|
+
class Fetcher
|
|
230
|
+
MAX_REDIRECTS: Integer
|
|
231
|
+
BASE: String
|
|
232
|
+
|
|
233
|
+
def initialize: (?client: HttpClient) -> void
|
|
234
|
+
|
|
235
|
+
def fetch: (
|
|
236
|
+
String url,
|
|
237
|
+
timestamp: Timestamp | String,
|
|
238
|
+
?identity: bool
|
|
239
|
+
) -> Page
|
|
240
|
+
end
|
|
241
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: archaeo
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Ribose Inc.
|
|
8
|
+
bindir: exe
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: thor
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '1.3'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '1.3'
|
|
26
|
+
description: Archaeo provides a Ruby interface to query, fetch, and save archived
|
|
27
|
+
web content via the Wayback Machine CDX Server API, Availability API, SavePageNow
|
|
28
|
+
API, and content fetching.
|
|
29
|
+
email:
|
|
30
|
+
- open.source@ribose.com
|
|
31
|
+
executables:
|
|
32
|
+
- archaeo
|
|
33
|
+
extensions: []
|
|
34
|
+
extra_rdoc_files: []
|
|
35
|
+
files:
|
|
36
|
+
- CHANGELOG.md
|
|
37
|
+
- CODE_OF_CONDUCT.md
|
|
38
|
+
- README.adoc
|
|
39
|
+
- Rakefile
|
|
40
|
+
- archaeo.gemspec
|
|
41
|
+
- bin/console
|
|
42
|
+
- bin/setup
|
|
43
|
+
- exe/archaeo
|
|
44
|
+
- lib/archaeo.rb
|
|
45
|
+
- lib/archaeo/archive_url.rb
|
|
46
|
+
- lib/archaeo/availability_api.rb
|
|
47
|
+
- lib/archaeo/availability_result.rb
|
|
48
|
+
- lib/archaeo/cdx_api.rb
|
|
49
|
+
- lib/archaeo/cli.rb
|
|
50
|
+
- lib/archaeo/fetcher.rb
|
|
51
|
+
- lib/archaeo/http_client.rb
|
|
52
|
+
- lib/archaeo/page.rb
|
|
53
|
+
- lib/archaeo/save_api.rb
|
|
54
|
+
- lib/archaeo/save_result.rb
|
|
55
|
+
- lib/archaeo/snapshot.rb
|
|
56
|
+
- lib/archaeo/timestamp.rb
|
|
57
|
+
- lib/archaeo/version.rb
|
|
58
|
+
- sig/archaeo.rbs
|
|
59
|
+
homepage: https://github.com/riboseinc/archaeo
|
|
60
|
+
licenses:
|
|
61
|
+
- MIT
|
|
62
|
+
metadata:
|
|
63
|
+
homepage_uri: https://github.com/riboseinc/archaeo
|
|
64
|
+
source_code_uri: https://github.com/riboseinc/archaeo
|
|
65
|
+
changelog_uri: https://github.com/riboseinc/archaeo/blob/main/CHANGELOG.md
|
|
66
|
+
rubygems_mfa_required: 'true'
|
|
67
|
+
rdoc_options: []
|
|
68
|
+
require_paths:
|
|
69
|
+
- lib
|
|
70
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - ">="
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: 3.0.0
|
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
76
|
+
requirements:
|
|
77
|
+
- - ">="
|
|
78
|
+
- !ruby/object:Gem::Version
|
|
79
|
+
version: '0'
|
|
80
|
+
requirements: []
|
|
81
|
+
rubygems_version: 3.6.9
|
|
82
|
+
specification_version: 4
|
|
83
|
+
summary: Ruby client for the Internet Archive Wayback Machine APIs
|
|
84
|
+
test_files: []
|