archaeo 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/archaeo.gemspec +5 -2
- data/lib/archaeo/asset_extractor.rb +94 -0
- data/lib/archaeo/asset_list.rb +52 -0
- data/lib/archaeo/bulk_downloader.rb +67 -0
- data/lib/archaeo/cdx_api.rb +111 -13
- data/lib/archaeo/cdx_filter.rb +71 -0
- data/lib/archaeo/cli.rb +86 -4
- data/lib/archaeo/download_state.rb +49 -0
- data/lib/archaeo/http_client.rb +75 -19
- data/lib/archaeo/page.rb +66 -3
- data/lib/archaeo/snapshot.rb +13 -3
- data/lib/archaeo/url_normalizer.rb +56 -0
- data/lib/archaeo/url_rewriter.rb +22 -0
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo.rb +7 -0
- metadata +42 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 24c6b37575e8f673a8e6acb7aba38264ac811236cb91e905663914d08a283289
|
|
4
|
+
data.tar.gz: 632f36d31ee83b23f727dd8eb8217929d3dad26c94c6948fb0621aff2b937701
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 33cf0ea6c5317be5aafba988e8652555a8e7a77620e93571a51c1537ae4cdd455e76d170fe78549a5e3068ec82db12138159cb87a31c1aafda5c036a6ca2511e
|
|
7
|
+
data.tar.gz: 2c0ae7a6a461913ed475dd3084e495f057db995ebf9ea8a1e6c9595aae74cd6bc65a0cdc6f2b2a3a831d9cc0e45e92e06a13a4f7653e89c0a8bbcd7226207677
|
data/archaeo.gemspec
CHANGED
|
@@ -20,19 +20,22 @@ Gem::Specification.new do |spec|
|
|
|
20
20
|
spec.metadata["homepage_uri"] = spec.homepage
|
|
21
21
|
spec.metadata["source_code_uri"] = spec.homepage
|
|
22
22
|
spec.metadata["changelog_uri"] =
|
|
23
|
-
"#{spec.homepage}/blob/main/CHANGELOG.
|
|
23
|
+
"#{spec.homepage}/blob/main/CHANGELOG.adoc"
|
|
24
24
|
spec.metadata["rubygems_mfa_required"] = "true"
|
|
25
25
|
|
|
26
26
|
spec.files = IO.popen(%w[git ls-files -z], chdir: __dir__,
|
|
27
27
|
err: IO::NULL) do |ls|
|
|
28
28
|
ls.readlines("\x0", chomp: true).reject do |f|
|
|
29
29
|
f == __FILE__ ||
|
|
30
|
-
f.start_with?(*%w[Gemfile .gitignore .rspec spec/ .github/
|
|
30
|
+
f.start_with?(*%w[Gemfile .gitignore .rspec spec/ .github/
|
|
31
|
+
.rubocop TODO])
|
|
31
32
|
end
|
|
32
33
|
end
|
|
33
34
|
spec.bindir = "exe"
|
|
34
35
|
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
|
35
36
|
spec.require_paths = ["lib"]
|
|
36
37
|
|
|
38
|
+
spec.add_dependency "csv", "~> 3.3"
|
|
39
|
+
spec.add_dependency "nokogiri", "~> 1.14"
|
|
37
40
|
spec.add_dependency "thor", "~> 1.3"
|
|
38
41
|
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
require "uri"
|
|
5
|
+
|
|
6
|
+
module Archaeo
|
|
7
|
+
# Extracts resource URLs from archived HTML content using Nokogiri.
|
|
8
|
+
#
|
|
9
|
+
# Parses the HTML DOM to find CSS, JavaScript, images, fonts,
|
|
10
|
+
# and media resources referenced by the page. Optionally resolves
|
|
11
|
+
# relative URLs against a base URL.
|
|
12
|
+
class AssetExtractor
|
|
13
|
+
def initialize(html, base_url: nil)
|
|
14
|
+
@doc = Nokogiri::HTML(html.to_s)
|
|
15
|
+
@base_url = base_url
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def extract
|
|
19
|
+
list = AssetList.new
|
|
20
|
+
extract_css(list)
|
|
21
|
+
extract_js(list)
|
|
22
|
+
extract_images(list)
|
|
23
|
+
extract_fonts(list)
|
|
24
|
+
extract_media(list)
|
|
25
|
+
extract_inline_css(list)
|
|
26
|
+
list
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
private
|
|
30
|
+
|
|
31
|
+
def extract_css(list)
|
|
32
|
+
@doc.css('link[rel="stylesheet"]').each do |el|
|
|
33
|
+
list.add(resolve(el["href"]), type: :css)
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def extract_js(list)
|
|
38
|
+
@doc.css("script[src]").each do |el|
|
|
39
|
+
list.add(resolve(el["src"]), type: :js)
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def extract_images(list)
|
|
44
|
+
@doc.css("img[src]").each do |el|
|
|
45
|
+
list.add(resolve(el["src"]), type: :image)
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
def extract_fonts(list)
|
|
50
|
+
@doc.css('link[rel="preload"][as="font"]').each do |el|
|
|
51
|
+
list.add(resolve(el["href"]), type: :font)
|
|
52
|
+
end
|
|
53
|
+
@doc.css('link[rel="stylesheet"]').each do |el|
|
|
54
|
+
if font_stylesheet?(el["href"])
|
|
55
|
+
list.add(resolve(el["href"]),
|
|
56
|
+
type: :font)
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def extract_media(list)
|
|
62
|
+
@doc.css("source[src], video[src], audio[src]").each do |el|
|
|
63
|
+
list.add(resolve(el["src"]), type: :media)
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def extract_inline_css(list)
|
|
68
|
+
@doc.css("style").each do |el|
|
|
69
|
+
extract_css_urls(el.text).each do |url|
|
|
70
|
+
list.add(resolve(url), type: :font)
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
def font_stylesheet?(href)
|
|
76
|
+
href.to_s.include?("fonts.googleapis.com") ||
|
|
77
|
+
href.to_s.include?("font")
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
def extract_css_urls(css_text)
|
|
81
|
+
css_text.scan(/url\(\s*['"]?([^'")\s]+)['"]?\s*\)/).flatten
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
def resolve(url)
|
|
85
|
+
return url if url.nil? || url.empty?
|
|
86
|
+
return url if url.start_with?("http", "//", "data:", "#")
|
|
87
|
+
return url unless @base_url
|
|
88
|
+
|
|
89
|
+
URI.join(@base_url, url).to_s
|
|
90
|
+
rescue URI::InvalidURIError
|
|
91
|
+
url
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# Categorized collection of asset URLs extracted from an archived page.
|
|
5
|
+
#
|
|
6
|
+
# Assets are grouped by type (css, js, image, font, media) for
|
|
7
|
+
# convenient access during bulk download or local archiving.
|
|
8
|
+
class AssetList
|
|
9
|
+
CATEGORIES = %i[css js image font media].freeze
|
|
10
|
+
|
|
11
|
+
def initialize
|
|
12
|
+
@urls_by_type = {}
|
|
13
|
+
CATEGORIES.each { |c| @urls_by_type[c] = [] }
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def add(url, type:)
|
|
17
|
+
@urls_by_type[type] << url unless url.nil? || url.empty?
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def css
|
|
21
|
+
@urls_by_type[:css]
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def js
|
|
25
|
+
@urls_by_type[:js]
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def images
|
|
29
|
+
@urls_by_type[:image]
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def fonts
|
|
33
|
+
@urls_by_type[:font]
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def media
|
|
37
|
+
@urls_by_type[:media]
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def all
|
|
41
|
+
@urls_by_type.values.flatten.uniq
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def size
|
|
45
|
+
all.size
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def empty?
|
|
49
|
+
all.empty?
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
|
|
5
|
+
module Archaeo
|
|
6
|
+
# Downloads all archived snapshots of a URL with resume support.
|
|
7
|
+
#
|
|
8
|
+
# Queries the CDX API for matching snapshots, fetches each page,
|
|
9
|
+
# and saves content to disk. Progress is tracked in a state file
|
|
10
|
+
# for interrupted download recovery.
|
|
11
|
+
class BulkDownloader
|
|
12
|
+
def initialize(client: HttpClient.new, output_dir: "archive")
|
|
13
|
+
@client = client
|
|
14
|
+
@output_dir = output_dir
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def download(url, from: nil, to: nil, resume: false)
|
|
18
|
+
FileUtils.mkdir_p(@output_dir)
|
|
19
|
+
state = DownloadState.new(@output_dir)
|
|
20
|
+
|
|
21
|
+
snapshots = fetch_snapshots(url, from: from, to: to)
|
|
22
|
+
total = snapshots.size
|
|
23
|
+
|
|
24
|
+
snapshots.each_with_index do |snap, index|
|
|
25
|
+
next if resume && state.completed?(snap.timestamp)
|
|
26
|
+
|
|
27
|
+
fetch_and_save(snap)
|
|
28
|
+
state.mark_completed(snap.timestamp)
|
|
29
|
+
|
|
30
|
+
yield index + 1, total, snap if block_given?
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
private
|
|
35
|
+
|
|
36
|
+
def fetch_snapshots(url, from:, to:)
|
|
37
|
+
cdx = CdxApi.new(client: @client)
|
|
38
|
+
options = {}
|
|
39
|
+
options[:from] = from if from
|
|
40
|
+
options[:to] = to if to
|
|
41
|
+
cdx.snapshots(url, **options)
|
|
42
|
+
.select { |snap| !snap.blocked? && snap.status_code == 200 }
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def fetch_and_save(snapshot)
|
|
46
|
+
fetcher = Fetcher.new(client: @client)
|
|
47
|
+
page = fetcher.fetch(snapshot.original_url,
|
|
48
|
+
timestamp: snapshot.timestamp)
|
|
49
|
+
|
|
50
|
+
filename = build_filename(snapshot)
|
|
51
|
+
FileUtils.mkdir_p(File.dirname(filename))
|
|
52
|
+
File.binwrite(filename, page.content)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def build_filename(snapshot)
|
|
56
|
+
ts = snapshot.timestamp.to_s
|
|
57
|
+
safe_path = snapshot.original_url
|
|
58
|
+
.sub(%r{\Ahttps?://}, "")
|
|
59
|
+
.gsub(%r{/}, File::SEPARATOR)
|
|
60
|
+
.gsub(%r{[?&=]}, "_")
|
|
61
|
+
safe_path = safe_path[0..-2] if safe_path.end_with?(File::SEPARATOR)
|
|
62
|
+
safe_path = "#{safe_path}index" if safe_path.empty?
|
|
63
|
+
|
|
64
|
+
File.join(@output_dir, safe_path, "#{ts}.html")
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
data/lib/archaeo/cdx_api.rb
CHANGED
|
@@ -6,8 +6,11 @@ require "uri"
|
|
|
6
6
|
module Archaeo
|
|
7
7
|
# Client for the Wayback Machine CDX Server API.
|
|
8
8
|
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
9
|
+
# Supports all CDX features: field selection, filtering with regex,
|
|
10
|
+
# collapsing, resume-key pagination, page-based pagination,
|
|
11
|
+
# closest timestamp match, resolve revisits, and counters.
|
|
12
|
+
#
|
|
13
|
+
# @see https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
|
|
11
14
|
class CdxApi
|
|
12
15
|
ENDPOINT = "https://web.archive.org/cdx/search/cdx"
|
|
13
16
|
|
|
@@ -27,17 +30,31 @@ module Archaeo
|
|
|
27
30
|
sort: "sort",
|
|
28
31
|
limit: "limit",
|
|
29
32
|
closest: "closest",
|
|
33
|
+
offset: "offset",
|
|
34
|
+
page: "page",
|
|
35
|
+
page_size: "pageSize",
|
|
36
|
+
fast_latest: "fastLatest",
|
|
37
|
+
resolve_revisits: "resolveRevisits",
|
|
38
|
+
show_dupe_count: "showDupeCount",
|
|
39
|
+
show_skip_count: "showSkipCount",
|
|
40
|
+
last_skip_timestamp: "lastSkipTimestamp",
|
|
30
41
|
}.freeze
|
|
31
42
|
|
|
32
43
|
def initialize(client: HttpClient.new)
|
|
33
44
|
@client = client
|
|
34
45
|
end
|
|
35
46
|
|
|
47
|
+
# Returns an Enumerator of Snapshot objects, auto-paginating
|
|
48
|
+
# via resume key unless an explicit page is requested.
|
|
36
49
|
def snapshots(url, **options)
|
|
37
50
|
validate_options!(options)
|
|
38
51
|
|
|
39
52
|
Enumerator.new do |yielder|
|
|
40
|
-
|
|
53
|
+
if options.key?(:page)
|
|
54
|
+
fetch_page(url, options, yielder)
|
|
55
|
+
else
|
|
56
|
+
fetch_with_resume_key(url, options, yielder)
|
|
57
|
+
end
|
|
41
58
|
end
|
|
42
59
|
end
|
|
43
60
|
|
|
@@ -75,24 +92,64 @@ module Archaeo
|
|
|
75
92
|
"No snapshot found after #{ts} for #{url}"
|
|
76
93
|
end
|
|
77
94
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
params
|
|
95
|
+
# Returns the number of pages for a paginated query.
|
|
96
|
+
def num_pages(url, **options)
|
|
97
|
+
params = { "url" => url, "showNumPages" => "true" }
|
|
98
|
+
merge_scalar_params!(params, options)
|
|
82
99
|
response = @client.get(
|
|
83
100
|
"#{ENDPOINT}?#{URI.encode_www_form(params)}",
|
|
84
101
|
)
|
|
85
102
|
unless response.status == 200
|
|
86
|
-
raise Error,
|
|
103
|
+
raise Error,
|
|
104
|
+
"CDX API returned HTTP #{response.status}"
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
response.body.strip.to_i
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
# Returns all unique original URLs under a domain.
|
|
111
|
+
def known_urls(domain, match_type: "domain")
|
|
112
|
+
snapshots(domain, match_type: match_type,
|
|
113
|
+
collapse: ["urlkey"]).map(&:original_url).uniq
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
private
|
|
117
|
+
|
|
118
|
+
def fetch_with_resume_key(url, options, yielder)
|
|
119
|
+
params = build_params(url, options)
|
|
120
|
+
loop do
|
|
121
|
+
response = cdx_get(params)
|
|
122
|
+
return if response.body.nil? || response.body.strip.empty?
|
|
123
|
+
|
|
124
|
+
resume_key = parse_cdx_json(response.body, yielder)
|
|
125
|
+
break if resume_key.nil? || resume_key.empty?
|
|
126
|
+
|
|
127
|
+
params = params.merge("resumeKey" => resume_key)
|
|
87
128
|
end
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def fetch_page(url, options, yielder)
|
|
132
|
+
params = build_params(url, options)
|
|
133
|
+
response = cdx_get(params)
|
|
88
134
|
return if response.body.nil? || response.body.strip.empty?
|
|
89
135
|
|
|
90
136
|
parse_cdx_json(response.body, yielder)
|
|
91
137
|
end
|
|
92
138
|
|
|
139
|
+
def cdx_get(params)
|
|
140
|
+
response = @client.get(
|
|
141
|
+
"#{ENDPOINT}?#{URI.encode_www_form(params)}",
|
|
142
|
+
)
|
|
143
|
+
return response if response.status == 200
|
|
144
|
+
|
|
145
|
+
raise Error, "CDX API returned HTTP #{response.status}"
|
|
146
|
+
end
|
|
147
|
+
|
|
93
148
|
def validate_options!(options)
|
|
94
149
|
validate_match_type!(options[:match_type])
|
|
95
150
|
validate_sort!(options[:sort])
|
|
151
|
+
validate_filters!(options[:filters])
|
|
152
|
+
validate_collapses!(options[:collapse])
|
|
96
153
|
end
|
|
97
154
|
|
|
98
155
|
def validate_match_type!(type)
|
|
@@ -110,11 +167,27 @@ module Archaeo
|
|
|
110
167
|
"Invalid sort: #{sort}. Use: #{SORT_ORDERS.join(', ')}"
|
|
111
168
|
end
|
|
112
169
|
|
|
170
|
+
def validate_filters!(filters)
|
|
171
|
+
Array(filters).each { |f| CdxFilter.new(f) }
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
def validate_collapses!(collapses)
|
|
175
|
+
Array(collapses).each do |c|
|
|
176
|
+
field = c.to_s.split(":").first
|
|
177
|
+
next if CdxFilter::VALID_FIELDS.include?(field)
|
|
178
|
+
|
|
179
|
+
raise ArgumentError,
|
|
180
|
+
"Invalid collapse field: #{field}. " \
|
|
181
|
+
"Valid fields: #{CdxFilter::VALID_FIELDS.join(', ')}"
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
|
|
113
185
|
def build_params(url, options)
|
|
114
186
|
{
|
|
115
187
|
"url" => url,
|
|
116
188
|
"output" => "json",
|
|
117
189
|
"fl" => ALL_FIELDS.join(","),
|
|
190
|
+
"showResumeKey" => "true",
|
|
118
191
|
"gzip" => options.fetch(:gzip, true) ? "true" : "false",
|
|
119
192
|
}.tap do |params|
|
|
120
193
|
merge_scalar_params!(params, options)
|
|
@@ -126,23 +199,48 @@ module Archaeo
|
|
|
126
199
|
def merge_scalar_params!(params, options)
|
|
127
200
|
SCALAR_PARAMS.each do |key, api_key|
|
|
128
201
|
value = options[key]
|
|
129
|
-
|
|
202
|
+
next if value.nil?
|
|
203
|
+
|
|
204
|
+
params[api_key] = value.to_s
|
|
130
205
|
end
|
|
131
206
|
end
|
|
132
207
|
|
|
133
208
|
def merge_array_params!(params, values, prefix)
|
|
134
209
|
Array(values).each_with_index do |v, i|
|
|
135
|
-
params["#{prefix}#{i}"] = v
|
|
210
|
+
params["#{prefix}#{i}"] = v.to_s
|
|
136
211
|
end
|
|
137
212
|
end
|
|
138
213
|
|
|
214
|
+
# Parses CDX JSON response, handling the resume key trailer.
|
|
215
|
+
#
|
|
216
|
+
# JSON resume key format:
|
|
217
|
+
# [header, row1, row2, ..., [], ["resume_key_value"]]
|
|
139
218
|
def parse_cdx_json(body, yielder)
|
|
140
219
|
json = JSON.parse(body)
|
|
141
|
-
return unless json.is_a?(Array) && json.length > 1
|
|
220
|
+
return nil unless json.is_a?(Array) && json.length > 1
|
|
221
|
+
|
|
222
|
+
json, resume_key = extract_resume_key(json)
|
|
142
223
|
|
|
143
|
-
header
|
|
224
|
+
header = json[0]
|
|
144
225
|
field_map = header.each_with_index.to_h
|
|
145
|
-
|
|
226
|
+
json[1..].each do |row|
|
|
227
|
+
next unless row.is_a?(Array) && !row.empty?
|
|
228
|
+
|
|
229
|
+
yielder << build_snapshot(field_map, row)
|
|
230
|
+
end
|
|
231
|
+
|
|
232
|
+
resume_key
|
|
233
|
+
end
|
|
234
|
+
|
|
235
|
+
def extract_resume_key(json)
|
|
236
|
+
last = json.last
|
|
237
|
+
return [json, nil] unless last.is_a?(Array) && last.length == 1
|
|
238
|
+
|
|
239
|
+
remaining = json[0..-2]
|
|
240
|
+
if remaining.last.is_a?(Array) && remaining.last.empty?
|
|
241
|
+
remaining = remaining[0..-2]
|
|
242
|
+
end
|
|
243
|
+
[remaining, last[0].to_s]
|
|
146
244
|
end
|
|
147
245
|
|
|
148
246
|
def build_snapshot(field_map, row)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# Builds and validates CDX Server API filter expressions.
|
|
5
|
+
#
|
|
6
|
+
# CDX filter format: [!]field:regex
|
|
7
|
+
# The optional ! prefix inverts the match. The field must be a
|
|
8
|
+
# recognized CDX field name. The regex is a Java-compatible
|
|
9
|
+
# regex pattern matched against the field value.
|
|
10
|
+
class CdxFilter
|
|
11
|
+
VALID_FIELDS = %w[
|
|
12
|
+
urlkey timestamp original mimetype statuscode
|
|
13
|
+
digest length
|
|
14
|
+
].freeze
|
|
15
|
+
|
|
16
|
+
def initialize(expression)
|
|
17
|
+
@expression = expression.to_s
|
|
18
|
+
validate!
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def to_s
|
|
22
|
+
@expression
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def negated?
|
|
26
|
+
@expression.start_with?("!")
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def field
|
|
30
|
+
stripped = @expression.delete_prefix("!")
|
|
31
|
+
stripped.split(":", 2).first.to_s
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def self.by_status(code)
|
|
35
|
+
new("statuscode:#{code}")
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def self.excluding_status(code)
|
|
39
|
+
new("!statuscode:#{code}")
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def self.by_mimetype(type)
|
|
43
|
+
new("mimetype:#{type}")
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def self.excluding_mimetype(type)
|
|
47
|
+
new("!mimetype:#{type}")
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def self.by_digest(digest)
|
|
51
|
+
new("digest:#{digest}")
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def self.by_url(pattern)
|
|
55
|
+
new("original:#{pattern}")
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
private
|
|
59
|
+
|
|
60
|
+
def validate!
|
|
61
|
+
return if @expression.empty?
|
|
62
|
+
|
|
63
|
+
field_name = field
|
|
64
|
+
return if VALID_FIELDS.include?(field_name)
|
|
65
|
+
|
|
66
|
+
raise ArgumentError,
|
|
67
|
+
"Invalid CDX filter field: #{field_name}. " \
|
|
68
|
+
"Valid fields: #{VALID_FIELDS.join(', ')}"
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
data/lib/archaeo/cli.rb
CHANGED
|
@@ -1,10 +1,19 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "csv"
|
|
4
|
+
require "json"
|
|
3
5
|
require "thor"
|
|
4
6
|
|
|
5
7
|
module Archaeo
|
|
6
8
|
# Command-line interface powered by Thor.
|
|
7
9
|
class Cli < Thor
|
|
10
|
+
map %w[--version -v] => :version
|
|
11
|
+
|
|
12
|
+
desc "version", "Show archaeo version"
|
|
13
|
+
def version
|
|
14
|
+
puts "archaeo #{VERSION}"
|
|
15
|
+
end
|
|
16
|
+
|
|
8
17
|
desc "snapshots URL", "List archived snapshots for a URL"
|
|
9
18
|
option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
|
|
10
19
|
option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
|
|
@@ -14,12 +23,16 @@ module Archaeo
|
|
|
14
23
|
option :collapse, type: :array, desc: "CDX collapse fields"
|
|
15
24
|
option :sort, desc: "Sort order (default, closest, reverse)"
|
|
16
25
|
option :limit, type: :numeric, desc: "Max snapshots to return"
|
|
26
|
+
option :format, desc: "Output format (table, json, csv)",
|
|
27
|
+
default: "table"
|
|
17
28
|
def snapshots(url)
|
|
18
29
|
cdx = CdxApi.new
|
|
19
30
|
opts = build_cdx_options(options)
|
|
20
|
-
cdx.snapshots(url, **opts).
|
|
21
|
-
|
|
22
|
-
|
|
31
|
+
snaps = cdx.snapshots(url, **opts).to_a
|
|
32
|
+
case options[:format]
|
|
33
|
+
when "json" then output_json(snaps)
|
|
34
|
+
when "csv" then output_csv(snaps)
|
|
35
|
+
else output_table(snaps)
|
|
23
36
|
end
|
|
24
37
|
end
|
|
25
38
|
|
|
@@ -64,12 +77,46 @@ module Archaeo
|
|
|
64
77
|
"Fetch archived content for a URL at a timestamp"
|
|
65
78
|
option :identity, type: :boolean, default: false,
|
|
66
79
|
desc: "Fetch raw (identity) content"
|
|
80
|
+
option :output, desc: "Write content to file"
|
|
67
81
|
def fetch(url, timestamp)
|
|
68
82
|
page = Fetcher.new.fetch(
|
|
69
83
|
url, timestamp: timestamp,
|
|
70
84
|
identity: options[:identity]
|
|
71
85
|
)
|
|
72
|
-
|
|
86
|
+
|
|
87
|
+
if options[:output]
|
|
88
|
+
write_output(options[:output], page.content)
|
|
89
|
+
else
|
|
90
|
+
$stdout.write(page.content)
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
desc "download URL", "Download all archived snapshots of a URL"
|
|
95
|
+
option :output, desc: "Output directory", default: "archive"
|
|
96
|
+
option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
|
|
97
|
+
option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
|
|
98
|
+
option :resume, type: :boolean, default: false,
|
|
99
|
+
desc: "Resume interrupted download"
|
|
100
|
+
def download(url)
|
|
101
|
+
downloader = BulkDownloader.new(output_dir: options[:output])
|
|
102
|
+
|
|
103
|
+
downloader.download(
|
|
104
|
+
url,
|
|
105
|
+
from: options[:from],
|
|
106
|
+
to: options[:to],
|
|
107
|
+
resume: options[:resume],
|
|
108
|
+
) do |current, total, snap|
|
|
109
|
+
warn "[#{current}/#{total}] " \
|
|
110
|
+
"#{snap.timestamp} #{snap.original_url}"
|
|
111
|
+
end
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
desc "known_urls DOMAIN",
|
|
115
|
+
"List all known URLs for a domain"
|
|
116
|
+
def known_urls(domain)
|
|
117
|
+
CdxApi.new.known_urls(domain).each do |u|
|
|
118
|
+
puts u
|
|
119
|
+
end
|
|
73
120
|
end
|
|
74
121
|
|
|
75
122
|
CDX_OPTION_MAP = {
|
|
@@ -90,5 +137,40 @@ module Archaeo
|
|
|
90
137
|
result[api_key] = value if value
|
|
91
138
|
end
|
|
92
139
|
end
|
|
140
|
+
|
|
141
|
+
def output_table(snaps)
|
|
142
|
+
snaps.each do |snap|
|
|
143
|
+
puts "#{snap.timestamp} #{snap.status_code} " \
|
|
144
|
+
"#{snap.original_url}"
|
|
145
|
+
end
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
def output_json(snaps)
|
|
149
|
+
data = snaps.map do |snap|
|
|
150
|
+
{
|
|
151
|
+
timestamp: snap.timestamp.to_s,
|
|
152
|
+
status_code: snap.status_code,
|
|
153
|
+
url: snap.original_url,
|
|
154
|
+
archive_url: snap.archive_url,
|
|
155
|
+
}
|
|
156
|
+
end
|
|
157
|
+
puts JSON.generate(data)
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
def output_csv(snaps)
|
|
161
|
+
puts CSV.generate do |csv|
|
|
162
|
+
csv << %w[timestamp status_code url archive_url]
|
|
163
|
+
snaps.each do |snap|
|
|
164
|
+
csv << [snap.timestamp.to_s, snap.status_code,
|
|
165
|
+
snap.original_url, snap.archive_url]
|
|
166
|
+
end
|
|
167
|
+
end
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
def write_output(path, content)
|
|
171
|
+
FileUtils.mkdir_p(File.dirname(path))
|
|
172
|
+
File.binwrite(path, content)
|
|
173
|
+
warn "Written to #{path}"
|
|
174
|
+
end
|
|
93
175
|
end
|
|
94
176
|
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# Tracks download progress for resume support.
|
|
5
|
+
#
|
|
6
|
+
# Persists completed snapshot timestamps to a state file within
|
|
7
|
+
# the output directory, allowing interrupted downloads to resume
|
|
8
|
+
# without re-fetching already downloaded snapshots.
|
|
9
|
+
class DownloadState
|
|
10
|
+
STATE_FILE = ".archaeo-state"
|
|
11
|
+
|
|
12
|
+
attr_reader :output_dir
|
|
13
|
+
|
|
14
|
+
def initialize(output_dir)
|
|
15
|
+
@output_dir = output_dir
|
|
16
|
+
@path = File.join(output_dir, STATE_FILE)
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def completed?(timestamp)
|
|
20
|
+
timestamps.include?(timestamp.to_s)
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def mark_completed(timestamp)
|
|
24
|
+
timestamps << timestamp.to_s
|
|
25
|
+
save
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def clear
|
|
29
|
+
@timestamps = []
|
|
30
|
+
FileUtils.rm_f(@path)
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
private
|
|
34
|
+
|
|
35
|
+
def timestamps
|
|
36
|
+
@timestamps ||= load_timestamps
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def load_timestamps
|
|
40
|
+
return [] unless File.exist?(@path)
|
|
41
|
+
|
|
42
|
+
File.readlines(@path, chomp: true).reject(&:empty?)
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def save
|
|
46
|
+
File.write(@path, "#{timestamps.uniq.sort.join("\n")}\n")
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
data/lib/archaeo/http_client.rb
CHANGED
|
@@ -6,10 +6,11 @@ require "zlib"
|
|
|
6
6
|
require "stringio"
|
|
7
7
|
|
|
8
8
|
module Archaeo
|
|
9
|
-
# HTTP client with retry logic, gzip decompression,
|
|
10
|
-
# rotating realistic User-Agent profiles.
|
|
9
|
+
# HTTP client with retry logic, gzip decompression,
|
|
10
|
+
# rotating realistic User-Agent profiles, and connection pooling.
|
|
11
11
|
#
|
|
12
|
-
# Injected via constructor for testability.
|
|
12
|
+
# Injected via constructor for testability. Connections are reused
|
|
13
|
+
# across requests to the same host for improved performance.
|
|
13
14
|
class HttpClient
|
|
14
15
|
DEFAULT_TIMEOUT = 30
|
|
15
16
|
DEFAULT_MAX_RETRIES = 3
|
|
@@ -21,6 +22,8 @@ module Archaeo
|
|
|
21
22
|
IOError,
|
|
22
23
|
Errno::ECONNRESET,
|
|
23
24
|
Errno::ECONNREFUSED,
|
|
25
|
+
EOFError,
|
|
26
|
+
Errno::EPIPE,
|
|
24
27
|
].freeze
|
|
25
28
|
|
|
26
29
|
USER_AGENT_PROFILES = [
|
|
@@ -60,11 +63,25 @@ module Archaeo
|
|
|
60
63
|
@max_retries = max_retries
|
|
61
64
|
@retry_delay = retry_delay
|
|
62
65
|
@user_agent = user_agent
|
|
66
|
+
@connections = {}
|
|
67
|
+
@mutex = Mutex.new
|
|
63
68
|
end
|
|
64
69
|
|
|
65
70
|
def get(url, headers: {})
|
|
66
71
|
merged = default_headers.merge(headers)
|
|
67
|
-
|
|
72
|
+
uri = URI(url)
|
|
73
|
+
attempt_with_retries(uri, merged)
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
def shutdown
|
|
77
|
+
@mutex.synchronize do
|
|
78
|
+
@connections.each_value do |http|
|
|
79
|
+
http.finish
|
|
80
|
+
rescue StandardError
|
|
81
|
+
nil
|
|
82
|
+
end
|
|
83
|
+
@connections.clear
|
|
84
|
+
end
|
|
68
85
|
end
|
|
69
86
|
|
|
70
87
|
private
|
|
@@ -73,13 +90,52 @@ module Archaeo
|
|
|
73
90
|
@user_agent || USER_AGENT_PROFILES.sample
|
|
74
91
|
end
|
|
75
92
|
|
|
76
|
-
def
|
|
93
|
+
def connection_key(uri)
|
|
94
|
+
"#{uri.scheme}://#{uri.host}:#{uri.port}"
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def connection_for(uri)
|
|
98
|
+
key = connection_key(uri)
|
|
99
|
+
@mutex.synchronize do
|
|
100
|
+
http = @connections[key]
|
|
101
|
+
if http && !http.active?
|
|
102
|
+
@connections.delete(key)
|
|
103
|
+
http = nil
|
|
104
|
+
end
|
|
105
|
+
@connections[key] = build_connection(uri) unless http
|
|
106
|
+
@connections[key]
|
|
107
|
+
end
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
def build_connection(uri)
|
|
111
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
|
112
|
+
http.use_ssl = uri.scheme == "https"
|
|
113
|
+
http.read_timeout = @timeout
|
|
114
|
+
http.open_timeout = @timeout
|
|
115
|
+
http.start
|
|
116
|
+
http
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
def invalidate_connection(uri)
|
|
120
|
+
key = connection_key(uri)
|
|
121
|
+
@mutex.synchronize do
|
|
122
|
+
http = @connections.delete(key)
|
|
123
|
+
begin
|
|
124
|
+
http&.finish
|
|
125
|
+
rescue StandardError
|
|
126
|
+
nil
|
|
127
|
+
end
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def attempt_with_retries(uri, headers)
|
|
77
132
|
retries = 0
|
|
78
133
|
begin
|
|
79
|
-
|
|
134
|
+
execute_with_connection(uri, headers)
|
|
80
135
|
rescue *TRANSIENT_ERRORS => e
|
|
81
136
|
retries += 1
|
|
82
137
|
raise_if_exhausted(retries, e)
|
|
138
|
+
invalidate_connection(uri)
|
|
83
139
|
sleep(@retry_delay * retries)
|
|
84
140
|
retry
|
|
85
141
|
end
|
|
@@ -92,6 +148,19 @@ module Archaeo
|
|
|
92
148
|
"Failed after #{retries} retries: #{error.message}"
|
|
93
149
|
end
|
|
94
150
|
|
|
151
|
+
def execute_with_connection(uri, headers)
|
|
152
|
+
http = connection_for(uri)
|
|
153
|
+
request = Net::HTTP::Get.new(uri)
|
|
154
|
+
headers.each { |k, v| request[k] = v }
|
|
155
|
+
raw = http.request(request)
|
|
156
|
+
build_response(raw)
|
|
157
|
+
rescue *TRANSIENT_ERRORS
|
|
158
|
+
raise
|
|
159
|
+
rescue StandardError
|
|
160
|
+
invalidate_connection(uri)
|
|
161
|
+
raise
|
|
162
|
+
end
|
|
163
|
+
|
|
95
164
|
def default_headers
|
|
96
165
|
{
|
|
97
166
|
"User-Agent" => select_user_agent,
|
|
@@ -103,19 +172,6 @@ module Archaeo
|
|
|
103
172
|
}
|
|
104
173
|
end
|
|
105
174
|
|
|
106
|
-
def execute_get(url, headers)
|
|
107
|
-
uri = URI(url)
|
|
108
|
-
Net::HTTP.start(uri.host, uri.port,
|
|
109
|
-
use_ssl: uri.scheme == "https",
|
|
110
|
-
read_timeout: @timeout,
|
|
111
|
-
open_timeout: @timeout) do |http|
|
|
112
|
-
request = Net::HTTP::Get.new(uri)
|
|
113
|
-
headers.each { |k, v| request[k] = v }
|
|
114
|
-
raw = http.request(request)
|
|
115
|
-
build_response(raw)
|
|
116
|
-
end
|
|
117
|
-
end
|
|
118
|
-
|
|
119
175
|
def build_response(raw)
|
|
120
176
|
headers = raw.each_header.to_h { |k, v| [k.downcase, v] }
|
|
121
177
|
Response.new(
|
data/lib/archaeo/page.rb
CHANGED
|
@@ -1,22 +1,85 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
|
|
3
5
|
module Archaeo
|
|
4
6
|
# Model representing a fetched archived page from the Wayback Machine.
|
|
5
7
|
#
|
|
6
8
|
# Contains the page content, metadata, and provenance information
|
|
7
|
-
# for a single archived resource.
|
|
9
|
+
# for a single archived resource. Content is automatically transcoded
|
|
10
|
+
# to UTF-8 from the detected source encoding.
|
|
8
11
|
class Page
|
|
9
|
-
attr_reader :
|
|
12
|
+
attr_reader :content_type, :status_code,
|
|
10
13
|
:archive_url, :original_url, :timestamp
|
|
11
14
|
|
|
12
15
|
def initialize(content:, content_type:, status_code:,
|
|
13
16
|
archive_url:, original_url:, timestamp:)
|
|
14
|
-
@
|
|
17
|
+
@raw_content = content
|
|
15
18
|
@content_type = content_type
|
|
16
19
|
@status_code = status_code
|
|
17
20
|
@archive_url = archive_url
|
|
18
21
|
@original_url = original_url
|
|
19
22
|
@timestamp = Timestamp.coerce(timestamp)
|
|
20
23
|
end
|
|
24
|
+
|
|
25
|
+
def content
|
|
26
|
+
@content ||= transcode(@raw_content)
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def encoding
|
|
30
|
+
@encoding ||= detect_encoding
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
private
|
|
34
|
+
|
|
35
|
+
def detect_encoding
|
|
36
|
+
charset = extract_charset(@content_type)
|
|
37
|
+
return Encoding.find(charset) if charset
|
|
38
|
+
|
|
39
|
+
html_charset = detect_html_charset
|
|
40
|
+
return Encoding.find(html_charset) if html_charset
|
|
41
|
+
|
|
42
|
+
Encoding::UTF_8
|
|
43
|
+
rescue ArgumentError
|
|
44
|
+
Encoding::UTF_8
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def extract_charset(content_type)
|
|
48
|
+
return nil unless content_type
|
|
49
|
+
|
|
50
|
+
match = content_type.match(/charset=([^\s;]+)/i)
|
|
51
|
+
match ? match[1] : nil
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def detect_html_charset
|
|
55
|
+
doc = Nokogiri::HTML(@raw_content)
|
|
56
|
+
node = doc.at_css("meta[charset]")
|
|
57
|
+
return node["charset"] if node
|
|
58
|
+
|
|
59
|
+
content = doc.at_css('meta[http-equiv="Content-Type"]')&.[]("content")
|
|
60
|
+
return nil unless content
|
|
61
|
+
|
|
62
|
+
match = content.match(/charset=([^\s;]+)/i)
|
|
63
|
+
match ? match[1] : nil
|
|
64
|
+
rescue StandardError
|
|
65
|
+
nil
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def transcode(raw)
|
|
69
|
+
return raw if raw.encoding == Encoding::UTF_8 && raw.valid_encoding?
|
|
70
|
+
return raw if raw.empty?
|
|
71
|
+
|
|
72
|
+
encode_to_utf8(raw, encoding)
|
|
73
|
+
rescue Encoding::InvalidByteSequenceError,
|
|
74
|
+
Encoding::UndefinedConversionError
|
|
75
|
+
encode_to_utf8(raw, Encoding::UTF_8)
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
# Reinterprets +raw+'s bytes as +source_encoding+ and converts the
# result to UTF-8, substituting "?" for invalid or unmappable bytes.
#
# Works on a copy: String#force_encoding mutates its receiver, so
# the original code retagged the caller's string (e.g. @raw_content)
# as a side effect and raised FrozenError on frozen input.
def encode_to_utf8(raw, source_encoding)
  raw.dup.force_encoding(source_encoding)
     .encode("UTF-8",
             invalid: :replace, undef: :replace,
             replace: "?")
end
|
|
21
84
|
end
|
|
22
85
|
end
|
data/lib/archaeo/snapshot.rb
CHANGED
|
@@ -9,6 +9,8 @@ module Archaeo
|
|
|
9
9
|
FIELDS = %i[urlkey timestamp original_url
|
|
10
10
|
mimetype status_code digest length].freeze
|
|
11
11
|
|
|
12
|
+
BLOCKED_STATUS = -1
|
|
13
|
+
|
|
12
14
|
attr_reader(*FIELDS)
|
|
13
15
|
|
|
14
16
|
def initialize(urlkey:, timestamp:, original_url:,
|
|
@@ -27,14 +29,22 @@ module Archaeo
|
|
|
27
29
|
ArchiveUrl.new(original_url, timestamp: @timestamp).to_s
|
|
28
30
|
end
|
|
29
31
|
|
|
32
|
+
# Whether access to this capture is restricted.
# BLOCKED_STATUS (-1) appears to be the sentinel status the CDX API
# reports for blocked snapshots — NOTE(review): confirm against the
# Wayback CDX server docs.
def blocked?
  @status_code == BLOCKED_STATUS
end
|
|
35
|
+
|
|
36
|
+
# The snapshot's CDX fields as an array, in FIELDS order.
# Derived from FIELDS (rather than a hand-written list) so the
# tuple used by #== and #hash cannot drift out of sync with the
# declared attribute set.
def to_a
  FIELDS.map { |field| public_send(field) }
end
|
|
40
|
+
|
|
30
41
|
# Value equality: two snapshots are equal when they are the same
# class and every CDX field matches.
def ==(other)
  return false unless other.is_a?(self.class)

  to_a == other.to_a
end
|
|
34
44
|
alias_method :eql?, :==
|
|
35
45
|
|
|
36
46
|
# Hash code built from the same field tuple as #== / #eql?,
# keeping the contract required for use as a Hash key.
def hash
  to_a.hash
end
|
|
39
49
|
end
|
|
40
50
|
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Archaeo
  # Sanitizes and normalizes URLs for Wayback Machine API queries.
  #
  # Handles common URL issues: whitespace, surrounding quotes,
  # double percent-encoding, and inconsistent percent-encoding case.
  class UrlNormalizer
    attr_reader :original, :normalized

    def initialize(url)
      @original = url.to_s
      @normalized = normalize(@original)
    end

    # Convenience: normalize +url+ without keeping the instance.
    def self.normalize(url)
      new(url).normalized
    end

    # Normalizes +url+ and prepends "https://" when it has no scheme.
    # The scheme check is case-insensitive: URI schemes are
    # case-insensitive (RFC 3986 §3.1), so "HTTP://host" must not be
    # treated as scheme-less (the old case-sensitive pattern produced
    # "https://HTTP://host").
    def self.with_scheme(url)
      cleaned = normalize(url)
      return cleaned if cleaned.match?(%r{\A[a-z][a-z0-9+\-.]*://}i)

      "https://#{cleaned}"
    end

    def to_s
      @normalized
    end

    private

    # Applies the cleanup passes in order; later passes assume the
    # earlier ones have already run.
    def normalize(url)
      url = strip_whitespace(url)
      url = strip_surrounding_quotes(url)
      url = fix_double_percent_encoding(url)
      normalize_percent_encoding(url)
    end

    def strip_whitespace(url)
      url.strip
    end

    # Drops one matching pair of double quotes, then one matching
    # pair of single quotes (so '"…"' nesting unwraps fully).
    def strip_surrounding_quotes(url)
      url = url[1..-2] if url.start_with?('"') && url.end_with?('"')
      url = url[1..-2] if url.start_with?("'") && url.end_with?("'")
      url
    end

    # Collapses one level of double encoding: "%2520" => "%20".
    def fix_double_percent_encoding(url)
      url.gsub(/%25([0-9A-Fa-f]{2})/i, '%\1')
    end

    # Upcases percent-escape hex digits ("%2f" => "%2F") for
    # consistent comparison.
    def normalize_percent_encoding(url)
      url.gsub(/%[0-9a-f]{2}/i, &:upcase)
    end
  end
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Archaeo
  # Rewrites Wayback Machine archive URLs to local file paths.
  #
  # Used for saving archived pages and their assets for offline
  # browsing. Converts absolute archive URLs into relative paths
  # rooted at a configurable local directory.
  class UrlRewriter
    def initialize(archive_prefix, local_prefix)
      @archive_prefix = archive_prefix.to_s
      @local_prefix = local_prefix.to_s
    end

    # Maps a URL under the archive prefix onto the local prefix.
    # URLs outside the archive prefix pass through unchanged.
    def rewrite(url)
      return url unless url.start_with?(@archive_prefix)

      File.join(@local_prefix, url.delete_prefix(@archive_prefix))
    end
  end
end
|
data/lib/archaeo/version.rb
CHANGED
data/lib/archaeo.rb
CHANGED
|
@@ -21,10 +21,17 @@ module Archaeo
|
|
|
21
21
|
autoload :Page, "archaeo/page"
|
|
22
22
|
autoload :SaveResult, "archaeo/save_result"
|
|
23
23
|
autoload :AvailabilityResult, "archaeo/availability_result"
|
|
24
|
+
autoload :UrlNormalizer, "archaeo/url_normalizer"
|
|
25
|
+
autoload :CdxFilter, "archaeo/cdx_filter"
|
|
26
|
+
autoload :AssetList, "archaeo/asset_list"
|
|
27
|
+
autoload :AssetExtractor, "archaeo/asset_extractor"
|
|
28
|
+
autoload :UrlRewriter, "archaeo/url_rewriter"
|
|
29
|
+
autoload :DownloadState, "archaeo/download_state"
|
|
24
30
|
autoload :HttpClient, "archaeo/http_client"
|
|
25
31
|
autoload :CdxApi, "archaeo/cdx_api"
|
|
26
32
|
autoload :AvailabilityApi, "archaeo/availability_api"
|
|
27
33
|
autoload :SaveApi, "archaeo/save_api"
|
|
28
34
|
autoload :Fetcher, "archaeo/fetcher"
|
|
35
|
+
autoload :BulkDownloader, "archaeo/bulk_downloader"
|
|
29
36
|
autoload :Cli, "archaeo/cli"
|
|
30
37
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,43 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: archaeo
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
|
+
autorequire:
|
|
8
9
|
bindir: exe
|
|
9
10
|
cert_chain: []
|
|
10
|
-
date:
|
|
11
|
+
date: 2026-05-09 00:00:00.000000000 Z
|
|
11
12
|
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: csv
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '3.3'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '3.3'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: nokogiri
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - "~>"
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '1.14'
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - "~>"
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '1.14'
|
|
12
41
|
- !ruby/object:Gem::Dependency
|
|
13
42
|
name: thor
|
|
14
43
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -43,10 +72,15 @@ files:
|
|
|
43
72
|
- exe/archaeo
|
|
44
73
|
- lib/archaeo.rb
|
|
45
74
|
- lib/archaeo/archive_url.rb
|
|
75
|
+
- lib/archaeo/asset_extractor.rb
|
|
76
|
+
- lib/archaeo/asset_list.rb
|
|
46
77
|
- lib/archaeo/availability_api.rb
|
|
47
78
|
- lib/archaeo/availability_result.rb
|
|
79
|
+
- lib/archaeo/bulk_downloader.rb
|
|
48
80
|
- lib/archaeo/cdx_api.rb
|
|
81
|
+
- lib/archaeo/cdx_filter.rb
|
|
49
82
|
- lib/archaeo/cli.rb
|
|
83
|
+
- lib/archaeo/download_state.rb
|
|
50
84
|
- lib/archaeo/fetcher.rb
|
|
51
85
|
- lib/archaeo/http_client.rb
|
|
52
86
|
- lib/archaeo/page.rb
|
|
@@ -54,6 +88,8 @@ files:
|
|
|
54
88
|
- lib/archaeo/save_result.rb
|
|
55
89
|
- lib/archaeo/snapshot.rb
|
|
56
90
|
- lib/archaeo/timestamp.rb
|
|
91
|
+
- lib/archaeo/url_normalizer.rb
|
|
92
|
+
- lib/archaeo/url_rewriter.rb
|
|
57
93
|
- lib/archaeo/version.rb
|
|
58
94
|
- sig/archaeo.rbs
|
|
59
95
|
homepage: https://github.com/riboseinc/archaeo
|
|
@@ -62,8 +98,9 @@ licenses:
|
|
|
62
98
|
metadata:
|
|
63
99
|
homepage_uri: https://github.com/riboseinc/archaeo
|
|
64
100
|
source_code_uri: https://github.com/riboseinc/archaeo
|
|
65
|
-
changelog_uri: https://github.com/riboseinc/archaeo/blob/main/CHANGELOG.
|
|
101
|
+
changelog_uri: https://github.com/riboseinc/archaeo/blob/main/CHANGELOG.adoc
|
|
66
102
|
rubygems_mfa_required: 'true'
|
|
103
|
+
post_install_message:
|
|
67
104
|
rdoc_options: []
|
|
68
105
|
require_paths:
|
|
69
106
|
- lib
|
|
@@ -78,7 +115,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
78
115
|
- !ruby/object:Gem::Version
|
|
79
116
|
version: '0'
|
|
80
117
|
requirements: []
|
|
81
|
-
rubygems_version: 3.
|
|
118
|
+
rubygems_version: 3.5.22
|
|
119
|
+
signing_key:
|
|
82
120
|
specification_version: 4
|
|
83
121
|
summary: Ruby client for the Internet Archive Wayback Machine APIs
|
|
84
122
|
test_files: []
|