archaeo 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/archaeo/cli.rb CHANGED
@@ -9,6 +9,9 @@ module Archaeo
9
9
  class Cli < Thor
10
10
  map %w[--version -v] => :version
11
11
 
12
+ class_option :quiet, type: :boolean, default: false,
13
+ desc: "Suppress progress messages"
14
+
12
15
  def self.exit_on_failure?
13
16
  true
14
17
  end
@@ -65,6 +68,39 @@ module Archaeo
65
68
  end
66
69
  end
67
70
 
71
+ desc "before URL TIMESTAMP",
72
+ "Find the nearest snapshot before a timestamp"
73
+ option :format, desc: "Output format (url, json)", default: "url"
74
+ def before(url, timestamp)
75
+ handle_errors do
76
+ snap = CdxApi.new.before(url, timestamp: timestamp)
77
+ output_snapshot(snap)
78
+ end
79
+ end
80
+
81
+ desc "after URL TIMESTAMP",
82
+ "Find the nearest snapshot after a timestamp"
83
+ option :format, desc: "Output format (url, json)", default: "url"
84
+ def after(url, timestamp)
85
+ handle_errors do
86
+ snap = CdxApi.new.after(url, timestamp: timestamp)
87
+ output_snapshot(snap)
88
+ end
89
+ end
90
+
91
+ desc "between URL FROM TO",
92
+ "List snapshots in a date range"
93
+ option :format, desc: "Output format (table, json, csv)",
94
+ default: "table"
95
+ def between(url, from, to)
96
+ fmt = validate_output_format
97
+ handle_errors do
98
+ cdx = CdxApi.new
99
+ snaps = cdx.between(url, from: from, to: to).to_a
100
+ output_formatted(snaps, fmt)
101
+ end
102
+ end
103
+
68
104
  desc "available URL", "Check if a URL is archived"
69
105
  option :timestamp, desc: "Check near this timestamp (YYYYMMDDHHmmss)"
70
106
  def available(url)
@@ -105,6 +141,18 @@ module Archaeo
105
141
  end
106
142
  end
107
143
 
144
+ desc "fetch-assets URL TIMESTAMP",
145
+ "Fetch a page and list its extracted assets"
146
+ option :format, desc: "Output format (json, table)", default: "table"
147
+ def fetch_assets(url, timestamp)
148
+ handle_errors do
149
+ bundle = Fetcher.new.fetch_page_with_assets(
150
+ url, timestamp: timestamp
151
+ )
152
+ output_assets(bundle)
153
+ end
154
+ end
155
+
108
156
  desc "download URL", "Download all archived snapshots of a URL"
109
157
  option :output, desc: "Output directory", default: "archive"
110
158
  option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
@@ -113,6 +161,8 @@ module Archaeo
113
161
  desc: "Resume interrupted download"
114
162
  option :concurrency, type: :numeric, default: 1,
115
163
  desc: "Number of parallel downloads"
164
+ option :dry_run, type: :boolean, default: false,
165
+ desc: "Preview downloads without fetching"
116
166
  def download(url)
117
167
  handle_errors do
118
168
  downloader = BulkDownloader.new(
@@ -141,6 +191,14 @@ module Archaeo
141
191
  end
142
192
  end
143
193
 
194
+ desc "count URL",
195
+ "Count snapshots for a URL"
196
+ def count(url)
197
+ handle_errors do
198
+ puts CdxApi.new.count(url)
199
+ end
200
+ end
201
+
144
202
  CDX_OPTION_MAP = {
145
203
  from: :from,
146
204
  to: :to,
@@ -153,6 +211,10 @@ module Archaeo
153
211
 
154
212
  private
155
213
 
214
+ def quiet?
215
+ options[:quiet]
216
+ end
217
+
156
218
  def handle_errors
157
219
  yield
158
220
  rescue RateLimitError => e
@@ -214,18 +276,42 @@ module Archaeo
214
276
  end
215
277
  end
216
278
 
217
- def download_with_progress(downloader, url)
218
- downloader.download(
219
- url,
220
- from: options[:from],
221
- to: options[:to],
222
- resume: options[:resume],
223
- ) do |current, total, snap|
224
- warn "[#{current}/#{total}] " \
225
- "#{snap.timestamp} #{snap.original_url}"
279
+ def output_assets(bundle)
280
+ case options[:format]
281
+ when "json"
282
+ puts bundle.assets.to_json
283
+ else
284
+ bundle.assets.to_h.each do |type, urls|
285
+ next if urls.empty?
286
+
287
+ puts "#{type}:"
288
+ urls.each { |url| puts " #{url}" }
289
+ end
226
290
  end
227
291
  end
228
292
 
293
+ def download_with_progress(downloader, url)
294
+ summary = downloader.download(
295
+ url, from: options[:from], to: options[:to],
296
+ resume: options[:resume], dry_run: options[:dry_run]
297
+ ) { |c, t, s| print_progress(c, t, s) }
298
+ print_summary(summary)
299
+ end
300
+
301
+ def print_progress(current, total, snap)
302
+ return if quiet?
303
+
304
+ warn "[#{current}/#{total}] #{snap.timestamp} #{snap.original_url}"
305
+ end
306
+
307
+ def print_summary(summary)
308
+ return if quiet?
309
+
310
+ warn "Downloaded #{summary.downloaded}/#{summary.total} " \
311
+ "(#{summary.bytes_written} bytes) in " \
312
+ "#{summary.elapsed.round(1)}s"
313
+ end
314
+
229
315
  def build_cdx_options(opts)
230
316
  CDX_OPTION_MAP.each_with_object({}) do |(cli_key, api_key), result|
231
317
  value = opts[cli_key]
@@ -259,7 +345,7 @@ module Archaeo
259
345
  def write_output(path, content)
260
346
  FileUtils.mkdir_p(File.dirname(path))
261
347
  File.binwrite(path, content)
262
- warn "Written to #{path}"
348
+ warn "Written to #{path}" unless quiet?
263
349
  end
264
350
  end
265
351
  end
@@ -1,11 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "json"
3
4
  require "set"
4
5
 
5
6
  module Archaeo
6
7
  # Tracks download progress for resume support.
7
8
  #
8
- # Persists completed snapshot timestamps to a state file within
9
+ # Persists completed snapshot metadata to a JSONL state file within
9
10
  # the output directory, allowing interrupted downloads to resume
10
11
  # without re-fetching already downloaded snapshots.
11
12
  class DownloadState
@@ -19,42 +20,72 @@ module Archaeo
19
20
  end
20
21
 
21
22
  def completed?(timestamp)
22
- timestamps_set.include?(timestamp.to_s)
23
+ entries_key.include?(timestamp.to_s)
23
24
  end
24
25
 
25
- def mark_completed(timestamp)
26
+ def mark_completed(timestamp, url: nil, bytes: nil)
26
27
  ts = timestamp.to_s
27
- return if timestamps_set.include?(ts)
28
+ return if entries_key.include?(ts)
28
29
 
29
- timestamps << ts
30
- @timestamps_set = nil
30
+ entry = { "ts" => ts, "at" => Time.now.utc.iso8601 }
31
+ entry["url"] = url if url
32
+ entry["bytes"] = bytes if bytes
33
+ entries << entry
34
+ @entries_key = nil
31
35
  save
32
36
  end
33
37
 
38
+ def entry_for(timestamp)
39
+ entries.find { |e| e["ts"] == timestamp.to_s }
40
+ end
41
+
42
+ def total_bytes
43
+ entries.sum { |e| e["bytes"].to_i }
44
+ end
45
+
34
46
  def clear
35
- @timestamps = []
36
- @timestamps_set = nil
47
+ @entries = []
48
+ @entries_key = nil
37
49
  FileUtils.rm_f(@path)
38
50
  end
39
51
 
40
52
  private
41
53
 
42
- def timestamps
43
- @timestamps ||= load_timestamps
54
+ def entries
55
+ @entries ||= load_entries
44
56
  end
45
57
 
46
- def timestamps_set
47
- @timestamps_set ||= timestamps.to_set
58
+ def entries_key
59
+ @entries_key ||= entries.each_with_object(Set.new) { |e, s| s << e["ts"] }
48
60
  end
49
61
 
50
- def load_timestamps
62
+ def load_entries
51
63
  return [] unless File.exist?(@path)
52
64
 
53
- File.readlines(@path, chomp: true).reject(&:empty?)
65
+ first_line = File.open(@path, &:readline).strip
66
+ if first_line.start_with?("{")
67
+ parse_jsonl
68
+ else
69
+ migrate_legacy(first_line)
70
+ end
71
+ rescue EOFError
72
+ []
73
+ end
74
+
75
+ def parse_jsonl
76
+ File.readlines(@path, chomp: true).reject(&:empty?).map do |line|
77
+ JSON.parse(line)
78
+ end
79
+ end
80
+
81
+ def migrate_legacy(_first_line)
82
+ File.readlines(@path, chomp: true).reject(&:empty?).map do |ts|
83
+ { "ts" => ts }
84
+ end
54
85
  end
55
86
 
56
87
  def save
57
- content = "#{timestamps.sort.join("\n")}\n"
88
+ content = "#{entries.map { |e| JSON.generate(e) }.join("\n")}\n"
58
89
  tmp_path = "#{@path}.tmp"
59
90
  File.write(tmp_path, content)
60
91
  File.rename(tmp_path, @path)
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "digest"
4
+
3
5
  module Archaeo
4
6
  # Downloads archived content from the Wayback Machine.
5
7
  #
@@ -13,12 +15,13 @@ module Archaeo
13
15
  @client = client
14
16
  end
15
17
 
16
- def fetch(url, timestamp:, identity: false)
18
+ def fetch(url, timestamp:, identity: false, snapshot: nil)
17
19
  url = UrlNormalizer.normalize(url)
18
20
  ts = Timestamp.coerce(timestamp)
19
21
  archive_url = ArchiveUrl.new(url, timestamp: ts,
20
22
  identity: identity)
21
23
  response = follow_redirects(archive_url.to_s)
24
+ verify_integrity!(response, snapshot) if snapshot
22
25
  build_page(response, archive_url.to_s, url, ts)
23
26
  end
24
27
 
@@ -31,6 +34,18 @@ module Archaeo
31
34
 
32
35
  private
33
36
 
37
+ def verify_integrity!(response, snapshot)
38
+ return unless snapshot.digest && !snapshot.digest.empty?
39
+
40
+ expected = snapshot.digest.delete_prefix("SHA1-")
41
+ actual = Digest::SHA1.hexdigest(response.body)
42
+ return if expected == actual
43
+
44
+ raise IntegrityError,
45
+ "Digest mismatch for #{snapshot.original_url}: " \
46
+ "expected #{expected}, got #{actual}"
47
+ end
48
+
34
49
  def build_page(response, archive_url, url, timestamp)
35
50
  Page.new(
36
51
  content: response.body,
@@ -62,11 +62,13 @@ module Archaeo
62
62
  def initialize(timeout: DEFAULT_TIMEOUT,
63
63
  max_retries: DEFAULT_MAX_RETRIES,
64
64
  retry_delay: DEFAULT_RETRY_DELAY,
65
- user_agent: nil)
65
+ user_agent: nil,
66
+ on_request: nil)
66
67
  @timeout = timeout
67
68
  @max_retries = max_retries
68
69
  @retry_delay = retry_delay
69
70
  @user_agent = user_agent
71
+ @on_request = on_request
70
72
  @connections = {}
71
73
  @last_used = {}
72
74
  @mutex = Mutex.new
@@ -106,6 +108,18 @@ module Archaeo
106
108
  end
107
109
  end
108
110
 
111
+ def pool_stats
112
+ now = Time.now
113
+ @mutex.synchronize do
114
+ {
115
+ active_connections: @connections.size,
116
+ max_pool_size: MAX_POOL_SIZE,
117
+ hosts: @connections.keys,
118
+ idle_times: @last_used.transform_values { |t| (now - t).round },
119
+ }.freeze
120
+ end
121
+ end
122
+
109
123
  private
110
124
 
111
125
  def select_user_agent
@@ -222,11 +236,16 @@ module Archaeo
222
236
  value = response.headers["retry-after"]
223
237
  return nil unless value
224
238
 
225
- begin
226
- Integer(value)
227
- rescue StandardError
228
- nil
229
- end
239
+ Integer(value)
240
+ rescue ArgumentError
241
+ parse_retry_after_date(value)
242
+ end
243
+
244
+ def parse_retry_after_date(value)
245
+ remaining = (Time.httpdate(value) - Time.now).to_i
246
+ [remaining, 0].max
247
+ rescue ArgumentError
248
+ nil
230
249
  end
231
250
 
232
251
  def raise_if_exhausted(retries, error)
@@ -237,11 +256,8 @@ module Archaeo
237
256
  end
238
257
 
239
258
  def execute_with_connection(uri, headers, request_class)
240
- http = connection_for(uri)
241
- request = request_class.new(uri)
242
- headers.each { |k, v| request[k] = v }
243
- raw = http.request(request)
244
- build_response(raw)
259
+ request = build_request(uri, headers, request_class)
260
+ execute_tracked_request(uri, request)
245
261
  rescue *TRANSIENT_ERRORS
246
262
  raise
247
263
  rescue StandardError
@@ -249,6 +265,22 @@ module Archaeo
249
265
  raise
250
266
  end
251
267
 
268
+ def build_request(uri, headers, request_class)
269
+ request = request_class.new(uri)
270
+ headers.each { |k, v| request[k] = v }
271
+ request
272
+ end
273
+
274
+ def execute_tracked_request(uri, request)
275
+ http = connection_for(uri)
276
+ start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
277
+ raw = http.request(request)
278
+ elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start
279
+ response = build_response(raw)
280
+ @on_request&.call(uri, elapsed, response.status, 0)
281
+ response
282
+ end
283
+
252
284
  def default_headers
253
285
  {
254
286
  "User-Agent" => select_user_agent,
data/lib/archaeo/page.rb CHANGED
@@ -50,6 +50,10 @@ module Archaeo
50
50
  @content_type&.start_with?("text/")
51
51
  end
52
52
 
53
+ def css?
54
+ @content_type&.include?("text/css")
55
+ end
56
+
53
57
  def binary?
54
58
  !(text? || json? || html?)
55
59
  end
@@ -63,6 +67,34 @@ module Archaeo
63
67
  end
64
68
  end
65
69
 
70
+ def to_h
71
+ {
72
+ content_type: @content_type,
73
+ status_code: @status_code,
74
+ archive_url: @archive_url,
75
+ original_url: @original_url,
76
+ timestamp: @timestamp,
77
+ size: size,
78
+ encoding: encoding.to_s,
79
+ }
80
+ end
81
+
82
+ def as_json(*)
83
+ {
84
+ content_type: @content_type,
85
+ status_code: @status_code,
86
+ archive_url: @archive_url,
87
+ original_url: @original_url,
88
+ timestamp: @timestamp.to_s,
89
+ size: size,
90
+ encoding: encoding.to_s,
91
+ }
92
+ end
93
+
94
+ def inspect
95
+ "#<#{self.class.name} #{@content_type} #{size} bytes>"
96
+ end
97
+
66
98
  private
67
99
 
68
100
  def detect_encoding
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "json"
4
+
3
5
  module Archaeo
4
6
  # A fetched page together with all its extracted asset URLs.
5
7
  #
@@ -26,5 +28,31 @@ module Archaeo
26
28
  def asset_count
27
29
  assets.size
28
30
  end
31
+
32
+ def to_h
33
+ { page: @page.to_h, assets: @assets.to_h }
34
+ end
35
+
36
+ def as_json(*)
37
+ { page: @page.as_json, assets: @assets.to_h }
38
+ end
39
+
40
+ def to_json(*args)
41
+ JSON.generate(as_json, *args)
42
+ end
43
+
44
+ def download_assets(output_dir:, client: HttpClient.new)
45
+ FileUtils.mkdir_p(output_dir)
46
+ @assets.all.each do |url|
47
+ filename = File.join(output_dir,
48
+ File.basename(URI.parse(url).path))
49
+ tmp_path = "#{filename}.tmp"
50
+ response = client.get(url)
51
+ File.binwrite(tmp_path, response.body)
52
+ File.rename(tmp_path, filename)
53
+ rescue StandardError
54
+ FileUtils.rm_f(tmp_path) if defined?(tmp_path)
55
+ end
56
+ end
29
57
  end
30
58
  end
@@ -18,5 +18,24 @@ module Archaeo
18
18
  def cached?
19
19
  @cached
20
20
  end
21
+
22
+ def to_h
23
+ { url: @url, archive_url: @archive_url,
24
+ timestamp: @timestamp, cached: @cached }
25
+ end
26
+
27
+ def as_json(*)
28
+ { url: @url, archive_url: @archive_url,
29
+ timestamp: @timestamp.to_s, cached: @cached }
30
+ end
31
+
32
+ def to_s
33
+ label = @cached ? "Cached" : "Saved"
34
+ "#{label}: #{@archive_url}"
35
+ end
36
+
37
+ def inspect
38
+ "#<#{self.class.name} #{@url} cached=#{@cached}>"
39
+ end
21
40
  end
22
41
  end
@@ -29,6 +29,11 @@ module Archaeo
29
29
  ArchiveUrl.new(original_url, timestamp: @timestamp).to_s
30
30
  end
31
31
 
32
+ def identity_url
33
+ ArchiveUrl.new(original_url, timestamp: @timestamp,
34
+ identity: true).to_s
35
+ end
36
+
32
37
  def blocked?
33
38
  @status_code == BLOCKED_STATUS
34
39
  end
@@ -53,6 +58,18 @@ module Archaeo
53
58
  client_error? || server_error?
54
59
  end
55
60
 
61
+ def age
62
+ Time.now - @timestamp.to_time
63
+ end
64
+
65
+ def older_than?(seconds)
66
+ age > seconds
67
+ end
68
+
69
+ def newer_than?(seconds)
70
+ age <= seconds
71
+ end
72
+
56
73
  def fetch(client: HttpClient.new, identity: false)
57
74
  Fetcher.new(client: client).fetch(
58
75
  original_url, timestamp: @timestamp, identity: identity
@@ -102,5 +119,10 @@ module Archaeo
102
119
  def hash
103
120
  to_a.hash
104
121
  end
122
+
123
+ def inspect
124
+ "#<#{self.class.name} #{timestamp} " \
125
+ "#{original_url} status=#{status_code}>"
126
+ end
105
127
  end
106
128
  end
@@ -17,6 +17,7 @@ module Archaeo
17
17
  def initialize(year:, month: 1, day: 1,
18
18
  hour: 0, minute: 0, second: 0)
19
19
  @to_time = Time.utc(year, month, day, hour, minute, second)
20
+ freeze
20
21
  end
21
22
 
22
23
  def self.parse(string)
@@ -129,5 +130,18 @@ module Archaeo
129
130
  def second
130
131
  @to_time.sec
131
132
  end
133
+
134
+ def to_h
135
+ { year: year, month: month, day: day,
136
+ hour: hour, minute: minute, second: second }
137
+ end
138
+
139
+ def to_a
140
+ [year, month, day, hour, minute, second]
141
+ end
142
+
143
+ def inspect
144
+ "#<#{self.class.name} #{self}>"
145
+ end
132
146
  end
133
147
  end
@@ -49,7 +49,8 @@ module Archaeo
49
49
  url = strip_whitespace(url)
50
50
  url = strip_surrounding_quotes(url)
51
51
  url = fix_double_percent_encoding(url)
52
- normalize_percent_encoding(url)
52
+ url = normalize_percent_encoding(url)
53
+ remove_default_port(url)
53
54
  end
54
55
 
55
56
  def strip_whitespace(url)
@@ -69,5 +70,10 @@ module Archaeo
69
70
  def normalize_percent_encoding(url)
70
71
  url.gsub(/%[0-9a-f]{2}/i, &:upcase)
71
72
  end
73
+
74
+ def remove_default_port(url)
75
+ url.sub(%r{(https://[^/:]+):443(?=/|$)}, '\1')
76
+ .sub(%r{(http://[^/:]+):80(?=/|$)}, '\1')
77
+ end
72
78
  end
73
79
  end
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "nokogiri"
4
+
3
5
  module Archaeo
4
6
  # Rewrites Wayback Machine archive URLs to local file paths.
5
7
  #
@@ -7,6 +9,8 @@ module Archaeo
7
9
  # browsing. Converts absolute archive URLs into relative paths
8
10
  # rooted at a configurable local directory.
9
11
  class UrlRewriter
12
+ URL_ATTRS = %w[src href data-src poster].freeze
13
+
10
14
  def initialize(archive_prefix, local_prefix)
11
15
  @archive_prefix = archive_prefix.to_s
12
16
  @local_prefix = local_prefix.to_s
@@ -18,5 +22,47 @@ module Archaeo
18
22
  relative = url.sub(@archive_prefix, "")
19
23
  File.join(@local_prefix, relative)
20
24
  end
25
+
26
+ def rewrite_batch(urls)
27
+ urls.map { |url| rewrite(url) }
28
+ end
29
+
30
+ def rewrite_html(html_content)
31
+ doc = Nokogiri::HTML(html_content)
32
+ rewrite_url_attrs(doc)
33
+ rewrite_srcset_attrs(doc)
34
+ doc.to_html
35
+ end
36
+
37
+ def rewrite_url_attrs(doc)
38
+ URL_ATTRS.each do |attr|
39
+ doc.css("[#{attr}]").each do |el|
40
+ original = el[attr]
41
+ next unless original&.start_with?(@archive_prefix)
42
+
43
+ el[attr] = rewrite(original)
44
+ end
45
+ end
46
+ end
47
+
48
+ def rewrite_srcset_attrs(doc)
49
+ doc.css("[srcset]").each do |el|
50
+ el["srcset"] = rewrite_srcset(el["srcset"])
51
+ end
52
+ end
53
+
54
+ private
55
+
56
+ def rewrite_srcset(srcset)
57
+ return srcset unless srcset
58
+
59
+ srcset.split(",").map do |entry|
60
+ parts = entry.strip.split(/\s+/, 2)
61
+ url = parts[0]
62
+ descriptor = parts[1]
63
+ rewritten = url.start_with?(@archive_prefix) ? rewrite(url) : url
64
+ descriptor ? "#{rewritten} #{descriptor}" : rewritten
65
+ end.join(", ")
66
+ end
21
67
  end
22
68
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Archaeo
4
- VERSION = "0.2.5"
4
+ VERSION = "0.2.6"
5
5
  end
data/lib/archaeo.rb CHANGED
@@ -14,6 +14,7 @@ module Archaeo
14
14
  class ArchiveNotAvailable < Error; end
15
15
  class InvalidResponse < Error; end
16
16
  class SaveFailed < Error; end
17
+ class IntegrityError < Error; end
17
18
 
18
19
  autoload :Timestamp, "archaeo/timestamp"
19
20
  autoload :ArchiveUrl, "archaeo/archive_url"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: archaeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.2.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.