archaeo 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +78 -3
- data/lib/archaeo/archive_url.rb +12 -0
- data/lib/archaeo/asset_extractor.rb +117 -8
- data/lib/archaeo/asset_list.rb +24 -1
- data/lib/archaeo/availability_api.rb +3 -1
- data/lib/archaeo/availability_result.rb +16 -2
- data/lib/archaeo/bulk_downloader.rb +81 -13
- data/lib/archaeo/cdx_api.rb +7 -0
- data/lib/archaeo/cdx_filter.rb +21 -1
- data/lib/archaeo/cli.rb +134 -58
- data/lib/archaeo/download_state.rb +17 -3
- data/lib/archaeo/http_client.rb +96 -14
- data/lib/archaeo/page.rb +29 -0
- data/lib/archaeo/page_bundle.rb +14 -0
- data/lib/archaeo/save_api.rb +3 -3
- data/lib/archaeo/save_result.rb +3 -2
- data/lib/archaeo/snapshot.rb +40 -0
- data/lib/archaeo/timestamp.rb +22 -0
- data/lib/archaeo/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e318dfb4a6478af2e663418fda9952308323be35ef9fc6582a5fa3a327cdbb6d
|
|
4
|
+
data.tar.gz: 2f745ac2ea371e6b64d4f83ca39d0f247991882d3104bcff90e523b73e421f9b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: dc20f6483c99aba0059a224dba1758cec00d3d5921e7f8296b9826554f8d45780981571df3bdd8c05d0704066e14163c7e1a192339da30b3b98a367b0860a669
|
|
7
|
+
data.tar.gz: f4ca21a9c5d5f68d29bfe24ff5caf598a9e8819c7bfc920e46cba3d3a9980f4a086433c0305897fc7506e8cc002d943b94b5c4ea15e12372d0b92389df30f3c3
|
data/README.adoc
CHANGED
|
@@ -49,6 +49,11 @@ near = cdx.near("example.com", timestamp: "20220101")
|
|
|
49
49
|
before = cdx.before("example.com", timestamp: "20220101")
|
|
50
50
|
after = cdx.after("example.com", timestamp: "20220101")
|
|
51
51
|
|
|
52
|
+
# Time range query
|
|
53
|
+
cdx.between("example.com", from: "20220101", to: "20221231").each do |snap|
|
|
54
|
+
puts snap.timestamp
|
|
55
|
+
end
|
|
56
|
+
|
|
52
57
|
# Filter by status code, mimetype, or URL pattern
|
|
53
58
|
cdx.snapshots("example.com",
|
|
54
59
|
filters: [Archaeo::CdxFilter.by_status(200)],
|
|
@@ -57,6 +62,13 @@ cdx.snapshots("example.com",
|
|
|
57
62
|
sort: "reverse",
|
|
58
63
|
)
|
|
59
64
|
|
|
65
|
+
# Compose multiple filters
|
|
66
|
+
filters = Archaeo::CdxFilter.combine(
|
|
67
|
+
Archaeo::CdxFilter.only_successful,
|
|
68
|
+
Archaeo::CdxFilter.excluding_mimetype("text/css"),
|
|
69
|
+
)
|
|
70
|
+
cdx.snapshots("example.com", filters: filters)
|
|
71
|
+
|
|
60
72
|
# Page-based pagination
|
|
61
73
|
cdx.snapshots("example.com", page: 0)
|
|
62
74
|
|
|
@@ -77,6 +89,7 @@ result = api.near("example.com")
|
|
|
77
89
|
result.available? # => true/false
|
|
78
90
|
result.archive_url # => "https://web.archive.org/web/..."
|
|
79
91
|
result.timestamp # => Archaeo::Timestamp
|
|
92
|
+
result.archived_status # => HTTP status code of the archived page
|
|
80
93
|
|
|
81
94
|
api.available?("example.com") # => true/false
|
|
82
95
|
----
|
|
@@ -87,6 +100,7 @@ api.available?("example.com") # => true/false
|
|
|
87
100
|
----
|
|
88
101
|
save = Archaeo::SaveApi.new
|
|
89
102
|
result = save.save("https://example.com/")
|
|
103
|
+
result.url # => "https://example.com/"
|
|
90
104
|
result.archive_url # => "https://web.archive.org/web/..."
|
|
91
105
|
result.timestamp # => Archaeo::Timestamp
|
|
92
106
|
result.cached? # => true if already archived
|
|
@@ -104,6 +118,10 @@ page.content # => "<html>...</html>"
|
|
|
104
118
|
page.content_type # => "text/html"
|
|
105
119
|
page.status_code # => 200
|
|
106
120
|
page.archive_url # => full archive URL
|
|
121
|
+
page.title # => "Example Domain"
|
|
122
|
+
page.html? # => true
|
|
123
|
+
page.json? # => false
|
|
124
|
+
page.size # => content length in bytes
|
|
107
125
|
|
|
108
126
|
# Raw (identity) mode -- no Wayback Machine rewriting
|
|
109
127
|
page = fetcher.fetch("https://example.com/",
|
|
@@ -126,6 +144,12 @@ bundle.assets.js # => ["https://example.com/app.js", ...]
|
|
|
126
144
|
bundle.assets.images
|
|
127
145
|
bundle.assets.fonts
|
|
128
146
|
bundle.assets.media
|
|
147
|
+
bundle.size # => total count (page + assets)
|
|
148
|
+
bundle.asset_count # => number of assets
|
|
149
|
+
|
|
150
|
+
# Serialize asset list
|
|
151
|
+
bundle.assets.to_json
|
|
152
|
+
bundle.assets.counts # => { css: 1, js: 2, image: 3, font: 0, media: 1 }
|
|
129
153
|
----
|
|
130
154
|
|
|
131
155
|
=== Bulk Download with Resume
|
|
@@ -143,6 +167,12 @@ downloader.download("example.com", resume: true)
|
|
|
143
167
|
# Filter by date range
|
|
144
168
|
downloader.download("example.com",
|
|
145
169
|
from: "20220101", to: "20221231")
|
|
170
|
+
|
|
171
|
+
# Parallel downloads
|
|
172
|
+
downloader = Archaeo::BulkDownloader.new(
|
|
173
|
+
output_dir: "archive", concurrency: 4,
|
|
174
|
+
)
|
|
175
|
+
downloader.download("example.com")
|
|
146
176
|
----
|
|
147
177
|
|
|
148
178
|
=== URL Normalization
|
|
@@ -168,6 +198,33 @@ Archaeo::CdxFilter.by_status(200) # => "statuscode:200"
|
|
|
168
198
|
Archaeo::CdxFilter.excluding_status(404) # => "!statuscode:404"
|
|
169
199
|
Archaeo::CdxFilter.by_mimetype("text/html") # => "mimetype:text/html"
|
|
170
200
|
Archaeo::CdxFilter.by_url("example.com") # => "original:example.com"
|
|
201
|
+
|
|
202
|
+
# Compose filters
|
|
203
|
+
filters = Archaeo::CdxFilter.only_successful
|
|
204
|
+
error_filters = Archaeo::CdxFilter.excluding_errors
|
|
205
|
+
----
|
|
206
|
+
|
|
207
|
+
=== Snapshot Convenience
|
|
208
|
+
|
|
209
|
+
[source,ruby]
|
|
210
|
+
----
|
|
211
|
+
snap = cdx.near("example.com", timestamp: "20220101")
|
|
212
|
+
|
|
213
|
+
# Status predicates
|
|
214
|
+
snap.success? # => true (200)
|
|
215
|
+
snap.redirect? # => true for 3xx
|
|
216
|
+
snap.client_error? # => true for 4xx
|
|
217
|
+
snap.server_error? # => true for 5xx
|
|
218
|
+
snap.error? # => true for 4xx/5xx
|
|
219
|
+
|
|
220
|
+
# Fetch content directly from a snapshot
|
|
221
|
+
page = snap.fetch
|
|
222
|
+
|
|
223
|
+
# Fetch with assets
|
|
224
|
+
bundle = snap.fetch_with_assets
|
|
225
|
+
|
|
226
|
+
# JSON-serializable representation
|
|
227
|
+
snap.as_json # => Hash with primitive values only
|
|
171
228
|
----
|
|
172
229
|
|
|
173
230
|
=== Timestamps
|
|
@@ -189,6 +246,15 @@ ts = Archaeo::Timestamp.now
|
|
|
189
246
|
# Format as 14-digit string
|
|
190
247
|
ts.to_s # => "20220615000000"
|
|
191
248
|
|
|
249
|
+
# Standard time formats
|
|
250
|
+
ts.to_iso8601 # => "2022-06-15T00:00:00Z"
|
|
251
|
+
ts.to_rfc3339 # => "2022-06-15T00:00:00+00:00"
|
|
252
|
+
|
|
253
|
+
# Arithmetic
|
|
254
|
+
ts + 3600 # => Timestamp one hour later
|
|
255
|
+
ts - 3600 # => Timestamp one hour earlier
|
|
256
|
+
ts1 - ts2 # => seconds between timestamps
|
|
257
|
+
|
|
192
258
|
# Comparison
|
|
193
259
|
ts1 < ts2 # => true/false
|
|
194
260
|
----
|
|
@@ -207,9 +273,15 @@ archaeo snapshots --format csv --from 20220101 --to 20221231 example.com
|
|
|
207
273
|
|
|
208
274
|
# Find closest snapshot
|
|
209
275
|
archaeo near example.com 20220101
|
|
276
|
+
archaeo near --format json example.com 20220101
|
|
210
277
|
|
|
211
|
-
#
|
|
278
|
+
# Find oldest/newest
|
|
279
|
+
archaeo oldest example.com
|
|
280
|
+
archaeo newest --format json example.com
|
|
281
|
+
|
|
282
|
+
# Check availability (with optional timestamp)
|
|
212
283
|
archaeo available example.com
|
|
284
|
+
archaeo available --timestamp 20220101 example.com
|
|
213
285
|
|
|
214
286
|
# Save a URL
|
|
215
287
|
archaeo save https://example.com/
|
|
@@ -226,6 +298,9 @@ archaeo fetch --identity https://example.com/ 20220615120000
|
|
|
226
298
|
# Download all snapshots
|
|
227
299
|
archaeo download example.com --output ./archive
|
|
228
300
|
|
|
301
|
+
# Parallel downloads
|
|
302
|
+
archaeo download --concurrency 4 example.com --output ./archive
|
|
303
|
+
|
|
229
304
|
# Resume interrupted download
|
|
230
305
|
archaeo download example.com --resume
|
|
231
306
|
|
|
@@ -267,7 +342,7 @@ Archaeo follows a model-driven, OOP design:
|
|
|
267
342
|
|
|
268
343
|
| *URL Processing*
|
|
269
344
|
| `UrlNormalizer`, `CdxFilter`, `UrlRewriter`
|
|
270
|
-
| URL sanitization, filtering, and rewriting
|
|
345
|
+
| URL sanitization, validated filtering with composition, and rewriting
|
|
271
346
|
|
|
272
347
|
| *Asset Extraction*
|
|
273
348
|
| `AssetExtractor`, `AssetList`
|
|
@@ -283,7 +358,7 @@ Archaeo follows a model-driven, OOP design:
|
|
|
283
358
|
|
|
284
359
|
| *Infrastructure*
|
|
285
360
|
| `HttpClient`
|
|
286
|
-
| HTTP transport with retries, gzip, connection pooling
|
|
361
|
+
| HTTP transport with retries, gzip, 429/503 handling, connection pooling with eviction
|
|
287
362
|
|===
|
|
288
363
|
|
|
289
364
|
All API classes accept an `HttpClient` via dependency injection for testability.
|
data/lib/archaeo/archive_url.rb
CHANGED
|
@@ -36,6 +36,18 @@ module Archaeo
|
|
|
36
36
|
@identity
|
|
37
37
|
end
|
|
38
38
|
|
|
39
|
+
# Value equality: +other+ must be an instance of this class and carry the
# same original URL, timestamp and identity flag.
def ==(other)
  return false unless other.is_a?(self.class)

  own_fields = [original_url, timestamp, identity?]
  other_fields = [other.original_url, other.timestamp, other.identity?]
  own_fields == other_fields
end
|
|
45
|
+
alias_method :eql?, :==
|
|
46
|
+
|
|
47
|
+
# Hash code built from the original URL, the timestamp and the identity flag.
def hash
  identity_fields = [original_url, timestamp, identity?]
  identity_fields.hash
end
|
|
50
|
+
|
|
39
51
|
def to_s
|
|
40
52
|
suffix = identity? ? "id_" : ""
|
|
41
53
|
"#{BASE}/#{@timestamp}#{suffix}/#{@original_url}"
|
|
@@ -10,6 +10,20 @@ module Archaeo
|
|
|
10
10
|
# and media resources referenced by the page. Optionally resolves
|
|
11
11
|
# relative URLs against a base URL.
|
|
12
12
|
class AssetExtractor
|
|
13
|
+
# Host names checked (by substring) against a stylesheet href to decide
# whether the stylesheet is a font resource.
FONT_CDN_PATTERNS = %w[
  fonts.googleapis.com
  fonts.gstatic.com
  use.typekit.net
  fast.fonts.net
  cloud.typography.com
].freeze

# Matches a CSS `url(...)` token and captures the optionally quoted URL.
CSS_URL_PATTERN = /url\(\s*['"]?([^'")\s]+)['"]?\s*\)/

# Matches a `url(...)` inside the value of an image-bearing CSS property and
# captures the URL.
#
# The value scan stops at `;`, `{` and `}`. Stopping only at `;` let a
# declaration without a trailing semicolon (common in minified CSS, e.g.
# `a{background:red}@font-face{src:url(f.woff)}`) run into the next rule and
# report that rule's URL as an image.
CSS_IMAGE_PROPS = Regexp.new(
  "(?:background-image|background|list-style-image|content|cursor)" \
  "\\s*:[^;{}]*#{CSS_URL_PATTERN.source}",
)
|
|
26
|
+
|
|
13
27
|
def initialize(html, base_url: nil)
|
|
14
28
|
@doc = Nokogiri::HTML(html.to_s)
|
|
15
29
|
@base_url = base_url
|
|
@@ -23,6 +37,7 @@ module Archaeo
|
|
|
23
37
|
extract_fonts(list)
|
|
24
38
|
extract_media(list)
|
|
25
39
|
extract_inline_css(list)
|
|
40
|
+
extract_inline_styles(list)
|
|
26
41
|
list
|
|
27
42
|
end
|
|
28
43
|
|
|
@@ -32,9 +47,6 @@ module Archaeo
|
|
|
32
47
|
@doc.css('link[rel="stylesheet"]').each do |el|
|
|
33
48
|
list.add(resolve(el["href"]), type: :css)
|
|
34
49
|
end
|
|
35
|
-
@doc.css('link[rel="icon"], link[rel="shortcut icon"]').each do |el|
|
|
36
|
-
list.add(resolve(el["href"]), type: :image)
|
|
37
|
-
end
|
|
38
50
|
end
|
|
39
51
|
|
|
40
52
|
def extract_js(list)
|
|
@@ -44,8 +56,42 @@ module Archaeo
|
|
|
44
56
|
end
|
|
45
57
|
|
|
46
58
|
# Runs every image-related extraction pass against +list+, in order:
# <img> tags, <picture> sources, data-src images, then icon links.
def extract_images(list)
  %i[
    extract_img_tags
    extract_picture_sources
    extract_lazy_images
    extract_icon_links
  ].each { |pass| send(pass, list) }
end
|
|
64
|
+
|
|
65
|
+
# Adds the resolved `src` of every <img src> as an image, then hands the
# element's `srcset` (possibly nil) to extract_srcset.
def extract_img_tags(list)
  img_nodes = @doc.css("img[src]")
  img_nodes.each do |img|
    primary_url = resolve(img["src"])
    list.add(primary_url, type: :image)
    extract_srcset(img["srcset"], list, :image)
  end
end
|
|
71
|
+
|
|
72
|
+
# Feeds the `srcset` of every <source srcset> nested in a <picture> to
# extract_srcset, typed as image.
def extract_picture_sources(list)
  picture_sources = @doc.css("picture source[srcset]")
  picture_sources.each { |source| extract_srcset(source["srcset"], list, :image) }
end
|
|
77
|
+
|
|
78
|
+
# Adds the resolved `data-src` of every <img data-src> as an image.
def extract_lazy_images(list)
  deferred_imgs = @doc.css("img[data-src]")
  deferred_imgs.each do |img|
    deferred_url = resolve(img["data-src"])
    list.add(deferred_url, type: :image)
  end
end
|
|
83
|
+
|
|
84
|
+
# Adds the href of icon-style <link> elements as images, then the href of
# <link rel="manifest"> elements as media.
def extract_icon_links(list)
  icon_selector = [
    'link[rel~="icon"]',
    'link[rel="apple-touch-icon"]',
    'link[rel="apple-touch-icon-precomposed"]',
    'link[rel="mask-icon"]',
  ].join(", ")

  passes = { icon_selector => :image, 'link[rel="manifest"]' => :media }
  passes.each do |selector, category|
    @doc.css(selector).each do |link|
      list.add(resolve(link["href"]), type: category)
    end
  end
end
|
|
51
97
|
|
|
@@ -55,29 +101,92 @@ module Archaeo
|
|
|
55
101
|
end
|
|
56
102
|
@doc.css('link[rel="stylesheet"]').each do |el|
|
|
57
103
|
if font_stylesheet?(el["href"])
|
|
58
|
-
list.add(resolve(el["href"]),
|
|
59
|
-
type: :font)
|
|
104
|
+
list.add(resolve(el["href"]), type: :font)
|
|
60
105
|
end
|
|
61
106
|
end
|
|
62
107
|
end
|
|
63
108
|
|
|
64
109
|
# Runs the media extraction passes against +list+, in order: media sources,
# video posters, then embeds.
def extract_media(list)
  %i[extract_media_sources extract_video_posters extract_embeds].each do |pass|
    send(pass, list)
  end
end
|
|
114
|
+
|
|
115
|
+
# Adds the resolved `src` of every <source>, <video> and <audio> element
# that has one, typed as media.
def extract_media_sources(list)
  media_nodes = @doc.css("source[src], video[src], audio[src]")
  media_nodes.each do |node|
    media_url = resolve(node["src"])
    list.add(media_url, type: :media)
  end
end
|
|
69
120
|
|
|
121
|
+
# Adds the resolved `poster` of every <video poster> as an image.
def extract_video_posters(list)
  videos = @doc.css("video[poster]")
  videos.each do |video|
    poster_url = resolve(video["poster"])
    list.add(poster_url, type: :image)
  end
end
|
|
126
|
+
|
|
127
|
+
# Adds the resolved `src` of every <iframe src> and <embed src> as media.
def extract_embeds(list)
  embedded_nodes = @doc.css("iframe[src], embed[src]")
  embedded_nodes.each do |node|
    embed_url = resolve(node["src"])
    list.add(embed_url, type: :media)
  end
end
|
|
132
|
+
|
|
70
133
|
# Runs the three CSS scanners (@import targets, @font-face URLs, image
# property URLs) over the text of every <style> element.
def extract_inline_css(list)
  @doc.css("style").map(&:text).each do |stylesheet|
    extract_css_at_imports(stylesheet, list)
    extract_css_font_urls(stylesheet, list)
    extract_css_image_urls(stylesheet, list)
  end
end
|
|
141
|
+
|
|
142
|
+
# Scans every element that carries a `style` attribute and registers each
# `url(...)` reference found in that attribute as an image asset.
def extract_inline_styles(list)
  @doc.css("[style]").each do |el|
    # Use the shared CSS_URL_PATTERN instead of a private copy of the same
    # regex so the two cannot drift apart. `to_s` keeps a missing attribute
    # value harmless.
    el["style"].to_s.scan(CSS_URL_PATTERN) do |(url)|
      list.add(resolve(url), type: :image)
    end
  end
end
|
|
152
|
+
|
|
153
|
+
# Splits a `srcset` value on commas and adds the URL part (the text before
# the first whitespace) of each candidate to +list+ under +type+.
# Does nothing when +srcset_value+ is nil.
def extract_srcset(srcset_value, list, type)
  return if srcset_value.nil?

  srcset_value.split(",").each do |candidate|
    url, _descriptor = candidate.strip.split(/\s+/, 2)
    next if url.nil? || url.empty?

    list.add(resolve(url), type: type)
  end
end
|
|
161
|
+
|
|
162
|
+
# Registers the target of every `@import` rule in +text+ as a CSS asset.
# Both the `@import url(...)` form and the quoted `@import "..."` form match.
def extract_css_at_imports(text, list)
  import_rule = /@import\s+(?:url\(\s*['"]?([^'")\s]+)['"]?\s*\)|['"]([^'"]+)['"])/

  text.scan(import_rule) do |url_form, quoted_form|
    # Exactly one of the two capture groups is set per match.
    target = url_form || quoted_form
    next if target.nil? || target.empty?

    list.add(resolve(target), type: :css)
  end
end
|
|
171
|
+
|
|
172
|
+
# Finds each `@font-face { ... }` rule in +text+ and registers every URL
# that extract_css_urls returns for it as a font asset.
def extract_css_font_urls(text, list)
  text.scan(/@font-face\s*\{[^}]*\}/m) do |font_rule|
    extract_css_urls(font_rule).each do |font_url|
      list.add(resolve(font_url), type: :font)
    end
  end
end
|
|
77
179
|
|
|
180
|
+
# Registers every URL captured by CSS_IMAGE_PROPS in +text+ as an image.
def extract_css_image_urls(text, list)
  text.scan(CSS_IMAGE_PROPS) do |(image_url)|
    list.add(resolve(image_url), type: :image)
  end
end
|
|
185
|
+
|
|
78
186
|
# True when +href+ is present and contains one of FONT_CDN_PATTERNS.
def font_stylesheet?(href)
  !href.nil? && FONT_CDN_PATTERNS.any? { |cdn_host| href.include?(cdn_host) }
end
|
|
82
191
|
|
|
83
192
|
def extract_css_urls(css_text)
|
data/lib/archaeo/asset_list.rb
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
3
5
|
module Archaeo
|
|
4
6
|
# Categorized collection of asset URLs extracted from an archived page.
|
|
5
7
|
#
|
|
6
8
|
# Assets are grouped by type (css, js, image, font, media) for
|
|
7
9
|
# convenient access during bulk download or local archiving.
|
|
8
10
|
class AssetList
|
|
11
|
+
include Enumerable
|
|
12
|
+
|
|
9
13
|
CATEGORIES = %i[css js image font media].freeze
|
|
10
14
|
|
|
11
15
|
def initialize
|
|
@@ -14,7 +18,14 @@ module Archaeo
|
|
|
14
18
|
end
|
|
15
19
|
|
|
16
20
|
# Appends +url+ to the bucket for +type+. Nil or empty URLs and URLs already
# present in that bucket are ignored.
def add(url, type:)
  return if url.nil? || url.empty?

  bucket = @urls_by_type[type]
  bucket << url unless bucket.include?(url)
end
|
|
26
|
+
|
|
27
|
+
# Iterates over the collection returned by +all+, forwarding the block.
def each(&block)
  urls = all
  urls.each(&block)
end
|
|
19
30
|
|
|
20
31
|
def css
|
|
@@ -48,5 +59,17 @@ module Archaeo
|
|
|
48
59
|
# True when +all+ contains no URLs.
def empty?
  all.length.zero?
end
|
|
62
|
+
|
|
63
|
+
# Returns a Hash of category => copy of that category's URL list.
def to_h
  @urls_by_type.each_with_object({}) do |(category, urls), copy|
    copy[category] = urls.dup
  end
end
|
|
66
|
+
|
|
67
|
+
# Serializes the +to_h+ representation as JSON, forwarding generator args.
def to_json(*args)
  as_hash = to_h
  as_hash.to_json(*args)
end
|
|
70
|
+
|
|
71
|
+
# Returns a Hash of category => number of URLs stored for that category.
def counts
  @urls_by_type.each_with_object({}) do |(category, urls), tally|
    tally[category] = urls.size
  end
end
|
|
51
74
|
end
|
|
52
75
|
end
|
|
@@ -68,12 +68,14 @@ module Archaeo
|
|
|
68
68
|
# Builds an AvailabilityResult (available: true) from the "closest" snapshot
# hash of an availability response.
#
# closest - Hash read for the "url", "timestamp" and "status" keys.
# url     - the URL that was looked up; passed through unchanged.
def build_result(closest, url)
  # Upgrade the scheme only when it starts the string: `\A` anchors to the
  # start of the string, whereas `^` would also match after any newline.
  archive_url = closest["url"].to_s.sub(%r{\Ahttp://}, "https://")
  ts = Timestamp.parse(closest["timestamp"])
  # `to_i` yields 0 when the status is missing or non-numeric.
  archived_status = closest["status"].to_i

  AvailabilityResult.new(
    url: url,
    available: true,
    archive_url: archive_url,
    timestamp: ts,
    archived_status: archived_status,
  )
end
|
|
79
81
|
end
|
|
@@ -6,17 +6,31 @@ module Archaeo
|
|
|
6
6
|
# Indicates whether a URL is archived and, if so, provides
|
|
7
7
|
# the closest snapshot's archive URL and timestamp.
|
|
8
8
|
class AvailabilityResult
  attr_reader :url, :archive_url, :timestamp, :archived_status

  # url and available are required; the snapshot details default to nil.
  def initialize(url:, available:, archive_url: nil,
                 timestamp: nil, archived_status: nil)
    @url, @available = url, available
    @archive_url = archive_url
    @timestamp = timestamp
    @archived_status = archived_status
  end

  # The +available+ value given at construction.
  def available?
    @available
  end

  # Negation of +available?+.
  def unavailable?
    !available?
  end

  # "<url> -> <archive_url> (<timestamp>)" when available,
  # "<url> -> not available" otherwise.
  def to_s
    return "#{url} -> not available" unless available?

    "#{url} -> #{archive_url} (#{timestamp})"
  end
end
|
|
22
36
|
end
|
|
@@ -10,27 +10,26 @@ module Archaeo
|
|
|
10
10
|
# for interrupted download recovery.
|
|
11
11
|
class BulkDownloader
|
|
12
12
|
# client      - HTTP client (a new HttpClient by default)
# output_dir  - directory downloads are written under
# cdx_api     - optional CDX API object
# concurrency - coerced with to_i and raised to at least 1
def initialize(client: HttpClient.new, output_dir: "archive",
               cdx_api: nil, concurrency: 1)
  @client, @output_dir, @cdx_api = client, output_dir, cdx_api
  requested = concurrency.to_i
  @concurrency = requested < 1 ? 1 : requested
end
|
|
18
19
|
|
|
19
|
-
def download(url, from: nil, to: nil, resume: false)
|
|
20
|
+
# Normalizes +url+, creates the output directory, fetches the snapshot list
# and hands it to the sequential path (concurrency of 1) or the concurrent
# path. The optional block is passed on as the progress callback.
def download(url, from: nil, to: nil, resume: false, &block)
  normalized = UrlNormalizer.normalize(url)
  FileUtils.mkdir_p(@output_dir)
  state = DownloadState.new(@output_dir)
  snapshots = fetch_snapshots(normalized, from: from, to: to)

  strategy = @concurrency == 1 ? :download_sequential : :download_concurrent
  send(strategy, snapshots, snapshots.size, state, resume, block)
end
|
|
36
35
|
|
|
@@ -45,6 +44,54 @@ module Archaeo
|
|
|
45
44
|
.select { |snap| !snap.blocked? && snap.status_code == 200 }
|
|
46
45
|
end
|
|
47
46
|
|
|
47
|
+
# Downloads the snapshots one after another. When +resume+ is set, snapshots
# the state reports as completed are skipped. After each successful save the
# snapshot is marked completed and +progress+ (if given) is called with the
# 1-based position, the total and the snapshot.
def download_sequential(snapshots, total, state, resume, progress)
  snapshots.each.with_index(1) do |snapshot, position|
    stamp = snapshot.timestamp
    next if resume && state.completed?(stamp)

    fetch_and_save(snapshot)
    state.mark_completed(stamp)
    progress&.call(position, total, snapshot)
  end
end
|
|
57
|
+
|
|
58
|
+
# Starts @concurrency worker threads that drain a shared list of
# [snapshot, index] pairs through process_queue, then waits for all of them.
# Raises Error naming the failed timestamps when any worker recorded an error.
def download_concurrent(snapshots, total, state, resume, progress)
  queue = snapshots.each_with_index.to_a
  mutex = Mutex.new
  errors = []

  workers = @concurrency.times.map do
    Thread.new { process_queue(queue, total, state, resume, progress, mutex, errors) }
  end
  workers.each(&:join)
  return if errors.empty?

  failed_stamps = errors.map { |failed_snapshot, _error| failed_snapshot.timestamp }
  raise Error, "#{errors.size} download(s) failed: #{failed_stamps.join(', ')}"
end
|
|
76
|
+
|
|
77
|
+
# Worker loop: takes [snapshot, index] pairs off +queue+ under +mutex+ until
# it is empty. Completed snapshots are skipped when +resume+ is set. A
# StandardError raised while fetching or marking is recorded in +errors+
# (under +mutex+) instead of propagating; +progress+, if given, is then
# called for the snapshot either way.
def process_queue(queue, total, state, resume, progress, mutex, errors)
  loop do
    snapshot, position = mutex.synchronize { queue.shift }
    break unless snapshot

    already_done = resume && state.completed?(snapshot.timestamp)
    next if already_done

    begin
      fetch_and_save(snapshot)
      state.mark_completed(snapshot.timestamp)
    rescue StandardError => failure
      mutex.synchronize { errors << [snapshot, failure] }
    end

    progress&.call(position + 1, total, snapshot)
  end
end
|
|
94
|
+
|
|
48
95
|
def fetch_and_save(snapshot)
|
|
49
96
|
fetcher = Fetcher.new(client: @client)
|
|
50
97
|
page = fetcher.fetch(snapshot.original_url,
|
|
@@ -52,40 +99,61 @@ module Archaeo
|
|
|
52
99
|
|
|
53
100
|
filename = build_filename(snapshot)
|
|
54
101
|
FileUtils.mkdir_p(File.dirname(filename))
|
|
55
|
-
|
|
102
|
+
tmp_path = "#{filename}.tmp"
|
|
103
|
+
File.binwrite(tmp_path, page.content)
|
|
104
|
+
File.rename(tmp_path, filename)
|
|
105
|
+
rescue StandardError
|
|
106
|
+
FileUtils.rm_f(tmp_path) if defined?(tmp_path)
|
|
107
|
+
raise
|
|
56
108
|
end
|
|
57
109
|
|
|
58
110
|
# File extension (including the leading dot) for each recognized MIME type.
EXTENSION_MAP = {
  "text/html" => ".html",
  "text/css" => ".css",
  "text/plain" => ".txt",
  "text/javascript" => ".js",
  "application/javascript" => ".js",
  "application/x-javascript" => ".js",
  "application/json" => ".json",
  "application/xml" => ".xml",
  "application/pdf" => ".pdf",
  "application/octet-stream" => ".bin",
  "image/png" => ".png",
  "image/jpeg" => ".jpg",
  "image/gif" => ".gif",
  "image/svg+xml" => ".svg",
  "image/webp" => ".webp",
  "image/x-icon" => ".ico",
  "image/bmp" => ".bmp",
  "font/woff2" => ".woff2",
  "font/woff" => ".woff",
  "font/ttf" => ".ttf",
  "font/eot" => ".eot",
  "video/mp4" => ".mp4",
  "audio/mpeg" => ".mp3",
}.freeze

# Returns the file extension for the snapshot's MIME type, ignoring any
# `;parameter` suffix, surrounding whitespace and letter case. Unknown,
# empty or nil MIME types fall back to ".bin".
def extension_for(snapshot)
  # "".split(";") is [], so `.first` can be nil; the second `to_s` keeps an
  # empty or nil mimetype from raising NoMethodError on `strip`.
  mime = snapshot.mimetype.to_s.split(";").first.to_s.strip.downcase
  EXTENSION_MAP.fetch(mime, ".bin")
end
|
|
78
140
|
|
|
79
141
|
# Builds the on-disk path for a snapshot:
#   <output_dir>/<sanitized url path segments>/<timestamp><extension>
#
# The URL scheme is dropped, characters in the set <>:"|?*#&= become "_",
# and both "/" and "\" act as segment separators. Each segment is capped at
# 200 characters.
def build_filename(snapshot)
  max_segment_length = 200
  ts = snapshot.timestamp.to_s
  safe_path = snapshot.original_url
                      .sub(%r{\Ahttps?://}, "")
                      .gsub(/[<>:"|?*&=#]/, "_")
                      .gsub(%r{[/\\]}, File::SEPARATOR)

  # Drop a single trailing separator; a URL with no path left maps to "index".
  safe_path = safe_path.chomp(File::SEPARATOR)
  safe_path = "index" if safe_path.empty?

  segments = safe_path.split(File::SEPARATOR).map do |seg|
    # The URL comes from archived data: neutralize "." and ".." so a crafted
    # path cannot climb out of @output_dir.
    seg = seg.tr(".", "_") if seg == "." || seg == ".."
    # `seg[0, n]` keeps at most n characters (`seg[0..n]` would keep n + 1).
    seg[0, max_segment_length]
  end

  File.join(@output_dir, *segments, "#{ts}#{extension_for(snapshot)}")
end
|
|
90
158
|
end
|
|
91
159
|
end
|
data/lib/archaeo/cdx_api.rb
CHANGED
|
@@ -99,6 +99,13 @@ module Archaeo
|
|
|
99
99
|
"No snapshot found after #{ts} for #{url}"
|
|
100
100
|
end
|
|
101
101
|
|
|
102
|
+
# Queries +snapshots+ for +url+ with the range bounds +from+ and +to+,
# each coerced through Timestamp.coerce and rendered with to_s. Any other
# keyword options are forwarded unchanged.
def between(url, from:, to:, **options)
  range = {
    from: Timestamp.coerce(from).to_s,
    to: Timestamp.coerce(to).to_s,
  }
  snapshots(url, **range, **options)
end
|
|
108
|
+
|
|
102
109
|
# Returns the number of pages for a paginated query.
|
|
103
110
|
def num_pages(url, **options)
|
|
104
111
|
url = UrlNormalizer.normalize(url)
|
data/lib/archaeo/cdx_filter.rb
CHANGED
|
@@ -59,10 +59,30 @@ module Archaeo
|
|
|
59
59
|
new("urlkey:#{pattern}")
|
|
60
60
|
end
|
|
61
61
|
|
|
62
|
+
# Pairs this filter with +other+ and returns a flat Array of filters.
#
# +other+ may be a single filter or an Array of filters, such as the lists
# returned by `only_successful` and `excluding_errors`. The result is
# flattened, as `combine` does, so passing a list does not produce a nested
# Array.
def and(other)
  [self, other].flatten
end
|
|
65
|
+
|
|
66
|
+
# Merges any mix of single filters and (nested) filter Arrays into one flat
# Array.
def self.combine(*filters)
  flat_filters = filters.flatten
  flat_filters
end
|
|
69
|
+
|
|
70
|
+
# One-element filter list containing the status-200 filter.
def self.only_successful
  success_filter = by_status(200)
  [success_filter]
end
|
|
73
|
+
|
|
74
|
+
# Filter list that excludes status codes 404, 500, 502 and 503.
def self.excluding_errors
  [404, 500, 502, 503].map { |status| excluding_status(status) }
end
|
|
78
|
+
|
|
62
79
|
private
|
|
63
80
|
|
|
64
81
|
def validate!
|
|
65
|
-
|
|
82
|
+
if @expression.empty?
|
|
83
|
+
raise ArgumentError,
|
|
84
|
+
"CDX filter expression cannot be empty"
|
|
85
|
+
end
|
|
66
86
|
|
|
67
87
|
field_name = field
|
|
68
88
|
return if VALID_FIELDS.include?(field_name)
|