archaeo 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 65cb8ec1434b72774ed3a1d49ac87920bebb549cc5a4aebb0966b8d110d740ba
4
- data.tar.gz: a10b0bf2b8555d3a259c8ec02364e3031697189a784411ab54c4a1bfd17ab402
3
+ metadata.gz: 24c6b37575e8f673a8e6acb7aba38264ac811236cb91e905663914d08a283289
4
+ data.tar.gz: 632f36d31ee83b23f727dd8eb8217929d3dad26c94c6948fb0621aff2b937701
5
5
  SHA512:
6
- metadata.gz: 74eac73369d611a491152f7018d63b8fe3b46f8154a374d43d62f3ca023e1837dae666b98c795c211add5c6282fffe70576e581165aecd6644ecbccd15efe623
7
- data.tar.gz: 03dd2e1ea518ef34a2b2c427c91a9a562aa328aced3b8cad4910fb634ba8e723f1cfb50bd57b247403a060c67f2f2f4638ad9e4293abb76e6225a731bb7493fd
6
+ metadata.gz: 33cf0ea6c5317be5aafba988e8652555a8e7a77620e93571a51c1537ae4cdd455e76d170fe78549a5e3068ec82db12138159cb87a31c1aafda5c036a6ca2511e
7
+ data.tar.gz: 2c0ae7a6a461913ed475dd3084e495f057db995ebf9ea8a1e6c9595aae74cd6bc65a0cdc6f2b2a3a831d9cc0e45e92e06a13a4f7653e89c0a8bbcd7226207677
data/archaeo.gemspec CHANGED
@@ -20,19 +20,22 @@ Gem::Specification.new do |spec|
20
20
  spec.metadata["homepage_uri"] = spec.homepage
21
21
  spec.metadata["source_code_uri"] = spec.homepage
22
22
  spec.metadata["changelog_uri"] =
23
- "#{spec.homepage}/blob/main/CHANGELOG.md"
23
+ "#{spec.homepage}/blob/main/CHANGELOG.adoc"
24
24
  spec.metadata["rubygems_mfa_required"] = "true"
25
25
 
26
26
  spec.files = IO.popen(%w[git ls-files -z], chdir: __dir__,
27
27
  err: IO::NULL) do |ls|
28
28
  ls.readlines("\x0", chomp: true).reject do |f|
29
29
  f == __FILE__ ||
30
- f.start_with?(*%w[Gemfile .gitignore .rspec spec/ .github/ .rubocop])
30
+ f.start_with?(*%w[Gemfile .gitignore .rspec spec/ .github/
31
+ .rubocop TODO])
31
32
  end
32
33
  end
33
34
  spec.bindir = "exe"
34
35
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
35
36
  spec.require_paths = ["lib"]
36
37
 
38
+ spec.add_dependency "csv", "~> 3.3"
39
+ spec.add_dependency "nokogiri", "~> 1.14"
37
40
  spec.add_dependency "thor", "~> 1.3"
38
41
  end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require "uri"
5
+
6
module Archaeo
  # Extracts resource URLs from archived HTML content using Nokogiri.
  #
  # Parses the HTML DOM to find CSS, JavaScript, images, fonts,
  # and media resources referenced by the page. Optionally resolves
  # relative URLs against a base URL.
  class AssetExtractor
    # References that #resolve returns untouched: absolute http(s) URLs,
    # protocol-relative URLs, data URIs and same-document fragments.
    # The scheme must be followed by "://" so that relative paths which
    # merely begin with "http" (e.g. "httpd/logo.png") are still resolved.
    PASSTHROUGH_REFERENCE = %r{\A(?:https?://|//|data:|#)}i

    # @param html [#to_s] markup to scan; converted with #to_s before parsing
    # @param base_url [String, nil] base used to resolve relative references;
    #   when nil, relative references are reported as written
    def initialize(html, base_url: nil)
      @doc = Nokogiri::HTML(html.to_s)
      @base_url = base_url
    end

    # Runs every extractor and returns the populated list.
    #
    # @return [AssetList]
    def extract
      list = AssetList.new
      extract_css(list)
      extract_js(list)
      extract_images(list)
      extract_fonts(list)
      extract_media(list)
      extract_inline_css(list)
      list
    end

    private

    def extract_css(list)
      @doc.css('link[rel="stylesheet"]').each do |el|
        list.add(resolve(el["href"]), type: :css)
      end
    end

    def extract_js(list)
      @doc.css("script[src]").each do |el|
        list.add(resolve(el["src"]), type: :js)
      end
    end

    def extract_images(list)
      @doc.css("img[src]").each do |el|
        list.add(resolve(el["src"]), type: :image)
      end
    end

    # Fonts come from two places: explicit font preloads, and stylesheets
    # whose href looks font-related (those are also listed under :css by
    # #extract_css).
    def extract_fonts(list)
      @doc.css('link[rel="preload"][as="font"]').each do |el|
        list.add(resolve(el["href"]), type: :font)
      end
      @doc.css('link[rel="stylesheet"]').each do |el|
        href = el["href"]
        list.add(resolve(href), type: :font) if font_stylesheet?(href)
      end
    end

    def extract_media(list)
      @doc.css("source[src], video[src], audio[src]").each do |el|
        list.add(resolve(el["src"]), type: :media)
      end
    end

    # NOTE(review): every url(...) found in an inline <style> element is
    # filed under :font, including background images - confirm whether
    # that categorisation is intended.
    def extract_inline_css(list)
      @doc.css("style").each do |el|
        extract_css_urls(el.text).each do |url|
          list.add(resolve(url), type: :font)
        end
      end
    end

    # True when the href contains "font" anywhere; this also covers
    # "fonts.googleapis.com", so no separate host check is needed.
    def font_stylesheet?(href)
      href.to_s.include?("font")
    end

    # Returns the targets of all CSS url(...) expressions, with optional
    # surrounding quotes removed.
    def extract_css_urls(css_text)
      css_text.scan(/url\(\s*['"]?([^'")\s]+)['"]?\s*\)/).flatten
    end

    # Resolves a reference against the base URL.
    #
    # nil/empty input, pass-through references (see PASSTHROUGH_REFERENCE)
    # and any reference seen while no base URL is configured are returned
    # unchanged. If URI cannot join the two (malformed reference, or a
    # base that is itself relative) the reference is also returned as-is.
    def resolve(url)
      return url if url.nil? || url.empty?
      return url if url.match?(PASSTHROUGH_REFERENCE)
      return url unless @base_url

      URI.join(@base_url, url).to_s
    rescue URI::Error
      # URI::Error is the parent of both InvalidURIError and BadURIError.
      url
    end
  end
end
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
+
3
module Archaeo
  # Categorized collection of asset URLs extracted from an archived page.
  #
  # Assets are grouped by type (css, js, image, font, media) for
  # convenient access during bulk download or local archiving.
  class AssetList
    CATEGORIES = %i[css js image font media].freeze

    def initialize
      @urls_by_type = CATEGORIES.to_h { |category| [category, []] }
    end

    # Records a URL under the given category.
    #
    # nil and empty URLs are ignored, so callers can pass attribute
    # values straight through without checking them first.
    #
    # @param url [String, nil]
    # @param type [Symbol] one of CATEGORIES
    # @raise [ArgumentError] when a non-blank URL is added under a type
    #   that is not in CATEGORIES
    def add(url, type:)
      return if url.nil? || url.empty?

      bucket = @urls_by_type.fetch(type) do
        raise ArgumentError,
              "Unknown asset type: #{type.inspect}. " \
              "Valid types: #{CATEGORIES.join(', ')}"
      end
      bucket << url
    end

    def css
      @urls_by_type[:css]
    end

    def js
      @urls_by_type[:js]
    end

    def images
      @urls_by_type[:image]
    end

    def fonts
      @urls_by_type[:font]
    end

    def media
      @urls_by_type[:media]
    end

    # Every recorded URL across all categories, de-duplicated.
    def all
      @urls_by_type.values.flatten.uniq
    end

    # Number of distinct URLs (a URL filed under two categories counts once).
    def size
      all.size
    end

    def empty?
      all.empty?
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+
5
module Archaeo
  # Downloads all archived snapshots of a URL with resume support.
  #
  # Queries the CDX API for matching snapshots, fetches each page,
  # and saves content to disk. Progress is tracked in a state file
  # for interrupted download recovery.
  class BulkDownloader
    # Path segments that must never reach the filesystem: they would let
    # an archived URL such as "example.com/../../x" escape the output dir.
    TRAVERSAL_SEGMENTS = %w[. ..].freeze

    # @param client [#get] HTTP client shared by the CDX query and fetches
    # @param output_dir [String] directory that receives the saved pages
    def initialize(client: HttpClient.new, output_dir: "archive")
      @client = client
      @output_dir = output_dir
    end

    # Fetches and stores every matching snapshot.
    #
    # @param url [String] URL whose snapshots are downloaded
    # @param from [String, nil] lower timestamp bound passed to the CDX query
    # @param to [String, nil] upper timestamp bound passed to the CDX query
    # @param resume [Boolean] skip snapshots already recorded as completed
    # @yield [position, total, snapshot] after each snapshot is saved;
    #   position is the snapshot's 1-based index in the full result list,
    #   so skipped snapshots leave gaps when resuming
    def download(url, from: nil, to: nil, resume: false)
      FileUtils.mkdir_p(@output_dir)
      state = DownloadState.new(@output_dir)

      snapshots = fetch_snapshots(url, from: from, to: to)
      total = snapshots.size

      snapshots.each_with_index do |snap, index|
        next if resume && state.completed?(snap.timestamp)

        fetch_and_save(snap)
        state.mark_completed(snap.timestamp)

        yield index + 1, total, snap if block_given?
      end
    end

    private

    # Returns the snapshots worth downloading: not blocked, HTTP 200.
    def fetch_snapshots(url, from:, to:)
      cdx = CdxApi.new(client: @client)
      options = {}
      options[:from] = from if from
      options[:to] = to if to
      cdx.snapshots(url, **options)
         .select { |snap| !snap.blocked? && snap.status_code == 200 }
    end

    def fetch_and_save(snapshot)
      fetcher = Fetcher.new(client: @client)
      page = fetcher.fetch(snapshot.original_url,
                           timestamp: snapshot.timestamp)

      filename = build_filename(snapshot)
      FileUtils.mkdir_p(File.dirname(filename))
      File.binwrite(filename, page.content)
    end

    # Maps a snapshot to "<output_dir>/<host>/<path...>/<timestamp>.html".
    #
    # The scheme is dropped, query delimiters (?, &, =) become "_", and
    # empty, "." and ".." segments are discarded so the result always
    # stays inside the output directory. A URL with nothing left after
    # that is stored under "index".
    def build_filename(snapshot)
      segments = snapshot.original_url
                         .sub(%r{\Ahttps?://}, "")
                         .gsub(/[?&=]/, "_")
                         .split("/")
                         .reject { |part| part.empty? || TRAVERSAL_SEGMENTS.include?(part) }
      segments = ["index"] if segments.empty?

      File.join(@output_dir, *segments, "#{snapshot.timestamp}.html")
    end
  end
end
@@ -6,8 +6,11 @@ require "uri"
6
6
  module Archaeo
7
7
  # Client for the Wayback Machine CDX Server API.
8
8
  #
9
- # Query archived snapshots by URL, timestamp range, filters,
10
- # and more. Returns Snapshot objects for each matching CDX record.
9
+ # Supports all CDX features: field selection, filtering with regex,
10
+ # collapsing, resume-key pagination, page-based pagination,
11
+ # closest timestamp match, resolve revisits, and counters.
12
+ #
13
+ # @see https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
11
14
  class CdxApi
12
15
  ENDPOINT = "https://web.archive.org/cdx/search/cdx"
13
16
 
@@ -27,17 +30,31 @@ module Archaeo
27
30
  sort: "sort",
28
31
  limit: "limit",
29
32
  closest: "closest",
33
+ offset: "offset",
34
+ page: "page",
35
+ page_size: "pageSize",
36
+ fast_latest: "fastLatest",
37
+ resolve_revisits: "resolveRevisits",
38
+ show_dupe_count: "showDupeCount",
39
+ show_skip_count: "showSkipCount",
40
+ last_skip_timestamp: "lastSkipTimestamp",
30
41
  }.freeze
31
42
 
32
43
  def initialize(client: HttpClient.new)
33
44
  @client = client
34
45
  end
35
46
 
47
+ # Returns an Enumerator of Snapshot objects, auto-paginating
48
+ # via resume key unless an explicit page is requested.
36
49
  def snapshots(url, **options)
37
50
  validate_options!(options)
38
51
 
39
52
  Enumerator.new do |yielder|
40
- fetch_snapshots(url, options, yielder)
53
+ if options.key?(:page)
54
+ fetch_page(url, options, yielder)
55
+ else
56
+ fetch_with_resume_key(url, options, yielder)
57
+ end
41
58
  end
42
59
  end
43
60
 
@@ -75,24 +92,64 @@ module Archaeo
75
92
  "No snapshot found after #{ts} for #{url}"
76
93
  end
77
94
 
78
- private
79
-
80
- def fetch_snapshots(url, options, yielder)
81
- params = build_params(url, options)
95
+ # Returns the number of pages for a paginated query.
96
+ def num_pages(url, **options)
97
+ params = { "url" => url, "showNumPages" => "true" }
98
+ merge_scalar_params!(params, options)
82
99
  response = @client.get(
83
100
  "#{ENDPOINT}?#{URI.encode_www_form(params)}",
84
101
  )
85
102
  unless response.status == 200
86
- raise Error, "CDX API returned HTTP #{response.status}"
103
+ raise Error,
104
+ "CDX API returned HTTP #{response.status}"
105
+ end
106
+
107
+ response.body.strip.to_i
108
+ end
109
+
110
+ # Returns all unique original URLs under a domain.
111
+ def known_urls(domain, match_type: "domain")
112
+ snapshots(domain, match_type: match_type,
113
+ collapse: ["urlkey"]).map(&:original_url).uniq
114
+ end
115
+
116
+ private
117
+
118
+ def fetch_with_resume_key(url, options, yielder)
119
+ params = build_params(url, options)
120
+ loop do
121
+ response = cdx_get(params)
122
+ return if response.body.nil? || response.body.strip.empty?
123
+
124
+ resume_key = parse_cdx_json(response.body, yielder)
125
+ break if resume_key.nil? || resume_key.empty?
126
+
127
+ params = params.merge("resumeKey" => resume_key)
87
128
  end
129
+ end
130
+
131
+ def fetch_page(url, options, yielder)
132
+ params = build_params(url, options)
133
+ response = cdx_get(params)
88
134
  return if response.body.nil? || response.body.strip.empty?
89
135
 
90
136
  parse_cdx_json(response.body, yielder)
91
137
  end
92
138
 
139
+ def cdx_get(params)
140
+ response = @client.get(
141
+ "#{ENDPOINT}?#{URI.encode_www_form(params)}",
142
+ )
143
+ return response if response.status == 200
144
+
145
+ raise Error, "CDX API returned HTTP #{response.status}"
146
+ end
147
+
93
148
  def validate_options!(options)
94
149
  validate_match_type!(options[:match_type])
95
150
  validate_sort!(options[:sort])
151
+ validate_filters!(options[:filters])
152
+ validate_collapses!(options[:collapse])
96
153
  end
97
154
 
98
155
  def validate_match_type!(type)
@@ -110,11 +167,27 @@ module Archaeo
110
167
  "Invalid sort: #{sort}. Use: #{SORT_ORDERS.join(', ')}"
111
168
  end
112
169
 
170
+ def validate_filters!(filters)
171
+ Array(filters).each { |f| CdxFilter.new(f) }
172
+ end
173
+
174
+ def validate_collapses!(collapses)
175
+ Array(collapses).each do |c|
176
+ field = c.to_s.split(":").first
177
+ next if CdxFilter::VALID_FIELDS.include?(field)
178
+
179
+ raise ArgumentError,
180
+ "Invalid collapse field: #{field}. " \
181
+ "Valid fields: #{CdxFilter::VALID_FIELDS.join(', ')}"
182
+ end
183
+ end
184
+
113
185
  def build_params(url, options)
114
186
  {
115
187
  "url" => url,
116
188
  "output" => "json",
117
189
  "fl" => ALL_FIELDS.join(","),
190
+ "showResumeKey" => "true",
118
191
  "gzip" => options.fetch(:gzip, true) ? "true" : "false",
119
192
  }.tap do |params|
120
193
  merge_scalar_params!(params, options)
@@ -126,23 +199,48 @@ module Archaeo
126
199
  def merge_scalar_params!(params, options)
127
200
  SCALAR_PARAMS.each do |key, api_key|
128
201
  value = options[key]
129
- params[api_key] = value.to_s if value
202
+ next if value.nil?
203
+
204
+ params[api_key] = value.to_s
130
205
  end
131
206
  end
132
207
 
133
208
  def merge_array_params!(params, values, prefix)
134
209
  Array(values).each_with_index do |v, i|
135
- params["#{prefix}#{i}"] = v
210
+ params["#{prefix}#{i}"] = v.to_s
136
211
  end
137
212
  end
138
213
 
214
+ # Parses CDX JSON response, handling the resume key trailer.
215
+ #
216
+ # JSON resume key format:
217
+ # [header, row1, row2, ..., [], ["resume_key_value"]]
139
218
  def parse_cdx_json(body, yielder)
140
219
  json = JSON.parse(body)
141
- return unless json.is_a?(Array) && json.length > 1
220
+ return nil unless json.is_a?(Array) && json.length > 1
221
+
222
+ json, resume_key = extract_resume_key(json)
142
223
 
143
- header, *rows = json
224
+ header = json[0]
144
225
  field_map = header.each_with_index.to_h
145
- rows.each { |row| yielder << build_snapshot(field_map, row) }
226
+ json[1..].each do |row|
227
+ next unless row.is_a?(Array) && !row.empty?
228
+
229
+ yielder << build_snapshot(field_map, row)
230
+ end
231
+
232
+ resume_key
233
+ end
234
+
235
+ def extract_resume_key(json)
236
+ last = json.last
237
+ return [json, nil] unless last.is_a?(Array) && last.length == 1
238
+
239
+ remaining = json[0..-2]
240
+ if remaining.last.is_a?(Array) && remaining.last.empty?
241
+ remaining = remaining[0..-2]
242
+ end
243
+ [remaining, last[0].to_s]
146
244
  end
147
245
 
148
246
  def build_snapshot(field_map, row)
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
module Archaeo
  # Value object for a single CDX Server API filter expression.
  #
  # Expressions look like "[!]field:regex": a leading "!" negates the
  # match, the part before the first ":" names the CDX field, and the
  # remainder is the pattern applied to that field. Construction fails
  # for a non-empty expression whose field is not in VALID_FIELDS.
  class CdxFilter
    VALID_FIELDS = %w[
      urlkey timestamp original mimetype statuscode
      digest length
    ].freeze

    class << self
      # Matches snapshots with the given status code.
      def by_status(code)
        new("statuscode:#{code}")
      end

      # Matches snapshots whose status code differs from +code+.
      def excluding_status(code)
        new("!statuscode:#{code}")
      end

      # Matches snapshots with the given MIME type.
      def by_mimetype(type)
        new("mimetype:#{type}")
      end

      # Matches snapshots whose MIME type differs from +type+.
      def excluding_mimetype(type)
        new("!mimetype:#{type}")
      end

      # Matches snapshots with the given content digest.
      def by_digest(digest)
        new("digest:#{digest}")
      end

      # Matches snapshots whose original URL matches +pattern+.
      def by_url(pattern)
        new("original:#{pattern}")
      end
    end

    # @param expression [#to_s] raw filter expression
    # @raise [ArgumentError] when the field name is not recognised
    def initialize(expression)
      @expression = expression.to_s
      validate!
    end

    # The expression exactly as it was given.
    def to_s
      @expression
    end

    # True when the expression begins with the "!" inversion marker.
    def negated?
      @expression[0] == "!"
    end

    # The field name: everything before the first ":", ignoring one
    # leading "!".
    def field
      @expression.delete_prefix("!").partition(":").first
    end

    private

    # An empty expression is accepted; anything else must name a
    # known field.
    def validate!
      return if @expression.empty? || VALID_FIELDS.include?(field)

      known = VALID_FIELDS.join(", ")
      raise ArgumentError,
            "Invalid CDX filter field: #{field}. Valid fields: #{known}"
    end
  end
end
data/lib/archaeo/cli.rb CHANGED
@@ -1,10 +1,19 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "csv"
4
+ require "json"
3
5
  require "thor"
4
6
 
5
7
  module Archaeo
6
8
  # Command-line interface powered by Thor.
7
9
  class Cli < Thor
10
+ map %w[--version -v] => :version
11
+
12
+ desc "version", "Show archaeo version"
13
+ def version
14
+ puts "archaeo #{VERSION}"
15
+ end
16
+
8
17
  desc "snapshots URL", "List archived snapshots for a URL"
9
18
  option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
10
19
  option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
@@ -14,12 +23,16 @@ module Archaeo
14
23
  option :collapse, type: :array, desc: "CDX collapse fields"
15
24
  option :sort, desc: "Sort order (default, closest, reverse)"
16
25
  option :limit, type: :numeric, desc: "Max snapshots to return"
26
+ option :format, desc: "Output format (table, json, csv)",
27
+ default: "table"
17
28
  def snapshots(url)
18
29
  cdx = CdxApi.new
19
30
  opts = build_cdx_options(options)
20
- cdx.snapshots(url, **opts).each do |snap|
21
- puts "#{snap.timestamp} #{snap.status_code} " \
22
- "#{snap.original_url}"
31
+ snaps = cdx.snapshots(url, **opts).to_a
32
+ case options[:format]
33
+ when "json" then output_json(snaps)
34
+ when "csv" then output_csv(snaps)
35
+ else output_table(snaps)
23
36
  end
24
37
  end
25
38
 
@@ -64,12 +77,46 @@ module Archaeo
64
77
  "Fetch archived content for a URL at a timestamp"
65
78
  option :identity, type: :boolean, default: false,
66
79
  desc: "Fetch raw (identity) content"
80
+ option :output, desc: "Write content to file"
67
81
  def fetch(url, timestamp)
68
82
  page = Fetcher.new.fetch(
69
83
  url, timestamp: timestamp,
70
84
  identity: options[:identity]
71
85
  )
72
- $stdout.write(page.content)
86
+
87
+ if options[:output]
88
+ write_output(options[:output], page.content)
89
+ else
90
+ $stdout.write(page.content)
91
+ end
92
+ end
93
+
94
+ desc "download URL", "Download all archived snapshots of a URL"
95
+ option :output, desc: "Output directory", default: "archive"
96
+ option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
97
+ option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
98
+ option :resume, type: :boolean, default: false,
99
+ desc: "Resume interrupted download"
100
+ def download(url)
101
+ downloader = BulkDownloader.new(output_dir: options[:output])
102
+
103
+ downloader.download(
104
+ url,
105
+ from: options[:from],
106
+ to: options[:to],
107
+ resume: options[:resume],
108
+ ) do |current, total, snap|
109
+ warn "[#{current}/#{total}] " \
110
+ "#{snap.timestamp} #{snap.original_url}"
111
+ end
112
+ end
113
+
114
+ desc "known_urls DOMAIN",
115
+ "List all known URLs for a domain"
116
+ def known_urls(domain)
117
+ CdxApi.new.known_urls(domain).each do |u|
118
+ puts u
119
+ end
73
120
  end
74
121
 
75
122
  CDX_OPTION_MAP = {
@@ -90,5 +137,40 @@ module Archaeo
90
137
  result[api_key] = value if value
91
138
  end
92
139
  end
140
+
141
+ def output_table(snaps)
142
+ snaps.each do |snap|
143
+ puts "#{snap.timestamp} #{snap.status_code} " \
144
+ "#{snap.original_url}"
145
+ end
146
+ end
147
+
148
+ def output_json(snaps)
149
+ data = snaps.map do |snap|
150
+ {
151
+ timestamp: snap.timestamp.to_s,
152
+ status_code: snap.status_code,
153
+ url: snap.original_url,
154
+ archive_url: snap.archive_url,
155
+ }
156
+ end
157
+ puts JSON.generate(data)
158
+ end
159
+
160
+ def output_csv(snaps)
161
+ puts CSV.generate do |csv|
162
+ csv << %w[timestamp status_code url archive_url]
163
+ snaps.each do |snap|
164
+ csv << [snap.timestamp.to_s, snap.status_code,
165
+ snap.original_url, snap.archive_url]
166
+ end
167
+ end
168
+ end
169
+
170
+ def write_output(path, content)
171
+ FileUtils.mkdir_p(File.dirname(path))
172
+ File.binwrite(path, content)
173
+ warn "Written to #{path}"
174
+ end
93
175
  end
94
176
  end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
module Archaeo
  # Tracks download progress for resume support.
  #
  # Persists completed snapshot timestamps to a state file within
  # the output directory, allowing interrupted downloads to resume
  # without re-fetching already downloaded snapshots.
  class DownloadState
    STATE_FILE = ".archaeo-state"

    attr_reader :output_dir

    # @param output_dir [String] directory that holds (or will hold)
    #   the state file; it is not created here
    def initialize(output_dir)
      @output_dir = output_dir
      @path = File.join(output_dir, STATE_FILE)
    end

    # @param timestamp [#to_s]
    # @return [Boolean] whether the timestamp has been recorded
    def completed?(timestamp)
      timestamps.include?(timestamp.to_s)
    end

    # Records the timestamp and rewrites the state file (sorted, one
    # timestamp per line).
    def mark_completed(timestamp)
      key = timestamp.to_s
      timestamps << key unless timestamps.include?(key)
      save
    end

    # Forgets all progress and removes the state file if it exists.
    #
    # Uses File.delete instead of FileUtils so the class works without
    # anything else having loaded fileutils. Like a forced removal,
    # filesystem errors (including a missing file) are ignored.
    def clear
      @timestamps = []
      File.delete(@path)
    rescue SystemCallError
      nil
    end

    private

    # Recorded timestamps, loaded from disk on first use.
    def timestamps
      @timestamps ||= load_timestamps
    end

    def load_timestamps
      return [] unless File.exist?(@path)

      File.readlines(@path, chomp: true).reject(&:empty?)
    end

    def save
      File.write(@path, "#{timestamps.uniq.sort.join("\n")}\n")
    end
  end
end
@@ -6,10 +6,11 @@ require "zlib"
6
6
  require "stringio"
7
7
 
8
8
  module Archaeo
9
- # HTTP client with retry logic, gzip decompression, and
10
- # rotating realistic User-Agent profiles.
9
+ # HTTP client with retry logic, gzip decompression,
10
+ # rotating realistic User-Agent profiles, and connection pooling.
11
11
  #
12
- # Injected via constructor for testability.
12
+ # Injected via constructor for testability. Connections are reused
13
+ # across requests to the same host for improved performance.
13
14
  class HttpClient
14
15
  DEFAULT_TIMEOUT = 30
15
16
  DEFAULT_MAX_RETRIES = 3
@@ -21,6 +22,8 @@ module Archaeo
21
22
  IOError,
22
23
  Errno::ECONNRESET,
23
24
  Errno::ECONNREFUSED,
25
+ EOFError,
26
+ Errno::EPIPE,
24
27
  ].freeze
25
28
 
26
29
  USER_AGENT_PROFILES = [
@@ -60,11 +63,25 @@ module Archaeo
60
63
  @max_retries = max_retries
61
64
  @retry_delay = retry_delay
62
65
  @user_agent = user_agent
66
+ @connections = {}
67
+ @mutex = Mutex.new
63
68
  end
64
69
 
65
70
  def get(url, headers: {})
66
71
  merged = default_headers.merge(headers)
67
- attempt_with_retries(url, merged)
72
+ uri = URI(url)
73
+ attempt_with_retries(uri, merged)
74
+ end
75
+
76
+ def shutdown
77
+ @mutex.synchronize do
78
+ @connections.each_value do |http|
79
+ http.finish
80
+ rescue StandardError
81
+ nil
82
+ end
83
+ @connections.clear
84
+ end
68
85
  end
69
86
 
70
87
  private
@@ -73,13 +90,52 @@ module Archaeo
73
90
  @user_agent || USER_AGENT_PROFILES.sample
74
91
  end
75
92
 
76
- def attempt_with_retries(url, headers)
93
+ def connection_key(uri)
94
+ "#{uri.scheme}://#{uri.host}:#{uri.port}"
95
+ end
96
+
97
+ def connection_for(uri)
98
+ key = connection_key(uri)
99
+ @mutex.synchronize do
100
+ http = @connections[key]
101
+ if http && !http.active?
102
+ @connections.delete(key)
103
+ http = nil
104
+ end
105
+ @connections[key] = build_connection(uri) unless http
106
+ @connections[key]
107
+ end
108
+ end
109
+
110
+ def build_connection(uri)
111
+ http = Net::HTTP.new(uri.host, uri.port)
112
+ http.use_ssl = uri.scheme == "https"
113
+ http.read_timeout = @timeout
114
+ http.open_timeout = @timeout
115
+ http.start
116
+ http
117
+ end
118
+
119
+ def invalidate_connection(uri)
120
+ key = connection_key(uri)
121
+ @mutex.synchronize do
122
+ http = @connections.delete(key)
123
+ begin
124
+ http&.finish
125
+ rescue StandardError
126
+ nil
127
+ end
128
+ end
129
+ end
130
+
131
+ def attempt_with_retries(uri, headers)
77
132
  retries = 0
78
133
  begin
79
- execute_get(url, headers)
134
+ execute_with_connection(uri, headers)
80
135
  rescue *TRANSIENT_ERRORS => e
81
136
  retries += 1
82
137
  raise_if_exhausted(retries, e)
138
+ invalidate_connection(uri)
83
139
  sleep(@retry_delay * retries)
84
140
  retry
85
141
  end
@@ -92,6 +148,19 @@ module Archaeo
92
148
  "Failed after #{retries} retries: #{error.message}"
93
149
  end
94
150
 
151
+ def execute_with_connection(uri, headers)
152
+ http = connection_for(uri)
153
+ request = Net::HTTP::Get.new(uri)
154
+ headers.each { |k, v| request[k] = v }
155
+ raw = http.request(request)
156
+ build_response(raw)
157
+ rescue *TRANSIENT_ERRORS
158
+ raise
159
+ rescue StandardError
160
+ invalidate_connection(uri)
161
+ raise
162
+ end
163
+
95
164
  def default_headers
96
165
  {
97
166
  "User-Agent" => select_user_agent,
@@ -103,19 +172,6 @@ module Archaeo
103
172
  }
104
173
  end
105
174
 
106
- def execute_get(url, headers)
107
- uri = URI(url)
108
- Net::HTTP.start(uri.host, uri.port,
109
- use_ssl: uri.scheme == "https",
110
- read_timeout: @timeout,
111
- open_timeout: @timeout) do |http|
112
- request = Net::HTTP::Get.new(uri)
113
- headers.each { |k, v| request[k] = v }
114
- raw = http.request(request)
115
- build_response(raw)
116
- end
117
- end
118
-
119
175
  def build_response(raw)
120
176
  headers = raw.each_header.to_h { |k, v| [k.downcase, v] }
121
177
  Response.new(
data/lib/archaeo/page.rb CHANGED
@@ -1,22 +1,85 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "nokogiri"
4
+
3
5
module Archaeo
  # Model representing a fetched archived page from the Wayback Machine.
  #
  # Contains the page content, metadata, and provenance information
  # for a single archived resource. Content is automatically transcoded
  # to UTF-8 from the detected source encoding.
  class Page
    # Captures a charset parameter value, with or without quotes.
    CHARSET_PATTERN = /charset\s*=\s*["']?([^\s;"']+)/i

    attr_reader :content_type, :status_code,
                :archive_url, :original_url, :timestamp

    def initialize(content:, content_type:, status_code:,
                   archive_url:, original_url:, timestamp:)
      @raw_content = content
      @content_type = content_type
      @status_code = status_code
      @archive_url = archive_url
      @original_url = original_url
      @timestamp = Timestamp.coerce(timestamp)
    end

    # The page body as UTF-8 (computed once, then cached). The string
    # passed to the constructor is never modified.
    def content
      @content ||= transcode(@raw_content)
    end

    # The source encoding: taken from the Content-Type header charset,
    # else from the document's <meta> charset, else UTF-8.
    #
    # @return [Encoding]
    def encoding
      @encoding ||= detect_encoding
    end

    private

    # A charset name Ruby does not recognise is skipped, so an unusable
    # header value does not prevent the <meta> charset from being tried.
    def detect_encoding
      find_encoding(extract_charset(@content_type)) ||
        find_encoding(detect_html_charset) ||
        Encoding::UTF_8
    end

    # Looks up an encoding by name; nil for a missing or unknown name.
    def find_encoding(name)
      name && Encoding.find(name)
    rescue ArgumentError
      nil
    end

    def extract_charset(content_type)
      return nil unless content_type

      match = content_type.match(CHARSET_PATTERN)
      match ? match[1] : nil
    end

    # Reads the charset from <meta charset> or from a
    # <meta http-equiv="Content-Type"> element; nil when neither is
    # present or the document cannot be parsed.
    def detect_html_charset
      doc = Nokogiri::HTML(@raw_content)
      node = doc.at_css("meta[charset]")
      return node["charset"] if node

      extract_charset(
        doc.at_css('meta[http-equiv="Content-Type"]')&.[]("content"),
      )
    rescue StandardError
      nil
    end

    # Converts the raw body to UTF-8. If conversion from the detected
    # encoding is impossible (for example no converter exists for it),
    # the bytes are reinterpreted as UTF-8 with invalid sequences
    # replaced by "?".
    def transcode(raw)
      return raw if raw.nil? || raw.empty?
      return raw if raw.encoding == Encoding::UTF_8 && raw.valid_encoding?

      encode_to_utf8(raw, encoding)
    rescue EncodingError
      encode_to_utf8(raw, Encoding::UTF_8)
    end

    def encode_to_utf8(raw, source_encoding)
      # dup first: force_encoding changes its receiver in place, which
      # would alter the caller's string and fail on a frozen one.
      raw.dup
         .force_encoding(source_encoding)
         .encode("UTF-8",
                 invalid: :replace, undef: :replace,
                 replace: "?")
    end
  end
end
@@ -9,6 +9,8 @@ module Archaeo
9
9
  FIELDS = %i[urlkey timestamp original_url
10
10
  mimetype status_code digest length].freeze
11
11
 
12
+ BLOCKED_STATUS = -1
13
+
12
14
  attr_reader(*FIELDS)
13
15
 
14
16
  def initialize(urlkey:, timestamp:, original_url:,
@@ -27,14 +29,22 @@ module Archaeo
27
29
  ArchiveUrl.new(original_url, timestamp: @timestamp).to_s
28
30
  end
29
31
 
32
+ def blocked?
33
+ @status_code == BLOCKED_STATUS
34
+ end
35
+
36
+ def to_a
37
+ [@urlkey, @timestamp, @original_url, @mimetype,
38
+ @status_code, @digest, @length]
39
+ end
40
+
30
41
  def ==(other)
31
- other.is_a?(self.class) &&
32
- FIELDS.all? { |f| send(f) == other.send(f) }
42
+ other.is_a?(self.class) && to_a == other.to_a
33
43
  end
34
44
  alias_method :eql?, :==
35
45
 
36
46
  def hash
37
- FIELDS.map { |f| send(f) }.hash
47
+ to_a.hash
38
48
  end
39
49
  end
40
50
  end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
module Archaeo
  # Sanitizes and normalizes URLs for Wayback Machine API queries.
  #
  # Handles common URL issues: whitespace, surrounding quotes,
  # double percent-encoding, and inconsistent percent-encoding case.
  class UrlNormalizer
    # A leading "scheme://". Matched case-insensitively, so an
    # upper-case scheme such as "HTTP://" is recognised too.
    SCHEME_PREFIX = %r{\A[a-z][a-z0-9+\-.]*://}i

    attr_reader :original, :normalized

    # @param url [#to_s] raw URL as supplied by the user
    def initialize(url)
      @original = url.to_s
      @normalized = normalize(@original)
    end

    # @return [String] the normalized form of +url+
    def self.normalize(url)
      new(url).normalized
    end

    # Normalizes +url+ and guarantees it carries a scheme.
    #
    # A URL that already has a scheme is returned as normalized; a
    # protocol-relative URL ("//host/path") gets "https:" prepended;
    # anything else gets "https://" prepended.
    #
    # @return [String]
    def self.with_scheme(url)
      normalized = normalize(url)
      return normalized if normalized.match?(SCHEME_PREFIX)
      return "https:#{normalized}" if normalized.start_with?("//")

      "https://#{normalized}"
    end

    def to_s
      @normalized
    end

    private

    # Applies the clean-up steps in order; each step sees the output
    # of the previous one.
    def normalize(url)
      url = strip_whitespace(url)
      url = strip_surrounding_quotes(url)
      url = fix_double_percent_encoding(url)
      normalize_percent_encoding(url)
    end

    def strip_whitespace(url)
      url.strip
    end

    # Removes one matching pair of double quotes, then one matching
    # pair of single quotes, from the ends of the string.
    def strip_surrounding_quotes(url)
      url = url[1..-2] if url.start_with?('"') && url.end_with?('"')
      url = url[1..-2] if url.start_with?("'") && url.end_with?("'")
      url
    end

    # Collapses "%25XX" (an escaped percent sign followed by two hex
    # digits) back to "%XX".
    def fix_double_percent_encoding(url)
      url.gsub(/%25([0-9A-Fa-f]{2})/i, '%\1')
    end

    # Upper-cases the hex digits of every percent-escape.
    def normalize_percent_encoding(url)
      url.gsub(/%[0-9a-f]{2}/i, &:upcase)
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module Archaeo
  # Rewrites Wayback Machine archive URLs to local file paths.
  #
  # Used for saving archived pages and their assets for offline
  # browsing. Converts absolute archive URLs into relative paths
  # rooted at a configurable local directory.
  class UrlRewriter
    # @param archive_prefix [#to_s] leading part of archive URLs to strip
    # @param local_prefix [#to_s] directory the remainder is joined onto
    def initialize(archive_prefix, local_prefix)
      @archive_prefix = archive_prefix.to_s
      @local_prefix = local_prefix.to_s
    end

    # Maps an archive URL to its local path.
    #
    # Anything that does not start with the archive prefix is returned
    # unchanged; that includes nil, matching the constructor's
    # tolerance of nil arguments.
    #
    # @param url [String, nil]
    # @return [String, nil]
    def rewrite(url)
      return url if url.nil?
      return url unless url.start_with?(@archive_prefix)

      # Only the leading occurrence is removed; the prefix text may
      # legitimately appear again later in the URL.
      File.join(@local_prefix, url.delete_prefix(@archive_prefix))
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Archaeo
4
- VERSION = "0.1.0"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/archaeo.rb CHANGED
@@ -21,10 +21,17 @@ module Archaeo
21
21
  autoload :Page, "archaeo/page"
22
22
  autoload :SaveResult, "archaeo/save_result"
23
23
  autoload :AvailabilityResult, "archaeo/availability_result"
24
+ autoload :UrlNormalizer, "archaeo/url_normalizer"
25
+ autoload :CdxFilter, "archaeo/cdx_filter"
26
+ autoload :AssetList, "archaeo/asset_list"
27
+ autoload :AssetExtractor, "archaeo/asset_extractor"
28
+ autoload :UrlRewriter, "archaeo/url_rewriter"
29
+ autoload :DownloadState, "archaeo/download_state"
24
30
  autoload :HttpClient, "archaeo/http_client"
25
31
  autoload :CdxApi, "archaeo/cdx_api"
26
32
  autoload :AvailabilityApi, "archaeo/availability_api"
27
33
  autoload :SaveApi, "archaeo/save_api"
28
34
  autoload :Fetcher, "archaeo/fetcher"
35
+ autoload :BulkDownloader, "archaeo/bulk_downloader"
29
36
  autoload :Cli, "archaeo/cli"
30
37
  end
metadata CHANGED
@@ -1,14 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: archaeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
+ autorequire:
8
9
  bindir: exe
9
10
  cert_chain: []
10
- date: 1980-01-02 00:00:00.000000000 Z
11
+ date: 2026-05-09 00:00:00.000000000 Z
11
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: csv
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '3.3'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '3.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.14'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.14'
12
41
  - !ruby/object:Gem::Dependency
13
42
  name: thor
14
43
  requirement: !ruby/object:Gem::Requirement
@@ -43,10 +72,15 @@ files:
43
72
  - exe/archaeo
44
73
  - lib/archaeo.rb
45
74
  - lib/archaeo/archive_url.rb
75
+ - lib/archaeo/asset_extractor.rb
76
+ - lib/archaeo/asset_list.rb
46
77
  - lib/archaeo/availability_api.rb
47
78
  - lib/archaeo/availability_result.rb
79
+ - lib/archaeo/bulk_downloader.rb
48
80
  - lib/archaeo/cdx_api.rb
81
+ - lib/archaeo/cdx_filter.rb
49
82
  - lib/archaeo/cli.rb
83
+ - lib/archaeo/download_state.rb
50
84
  - lib/archaeo/fetcher.rb
51
85
  - lib/archaeo/http_client.rb
52
86
  - lib/archaeo/page.rb
@@ -54,6 +88,8 @@ files:
54
88
  - lib/archaeo/save_result.rb
55
89
  - lib/archaeo/snapshot.rb
56
90
  - lib/archaeo/timestamp.rb
91
+ - lib/archaeo/url_normalizer.rb
92
+ - lib/archaeo/url_rewriter.rb
57
93
  - lib/archaeo/version.rb
58
94
  - sig/archaeo.rbs
59
95
  homepage: https://github.com/riboseinc/archaeo
@@ -62,8 +98,9 @@ licenses:
62
98
  metadata:
63
99
  homepage_uri: https://github.com/riboseinc/archaeo
64
100
  source_code_uri: https://github.com/riboseinc/archaeo
65
- changelog_uri: https://github.com/riboseinc/archaeo/blob/main/CHANGELOG.md
101
+ changelog_uri: https://github.com/riboseinc/archaeo/blob/main/CHANGELOG.adoc
66
102
  rubygems_mfa_required: 'true'
103
+ post_install_message:
67
104
  rdoc_options: []
68
105
  require_paths:
69
106
  - lib
@@ -78,7 +115,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
115
  - !ruby/object:Gem::Version
79
116
  version: '0'
80
117
  requirements: []
81
- rubygems_version: 3.6.9
118
+ rubygems_version: 3.5.22
119
+ signing_key:
82
120
  specification_version: 4
83
121
  summary: Ruby client for the Internet Archive Wayback Machine APIs
84
122
  test_files: []