archaeo 0.2.8 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 67239af7cc927c495c67a849ecefb1cdc886ce8d95ddd6e27a2decdde6a93cd3
-  data.tar.gz: 8ce4a0f786c2e7db3268b6660a1aa9e2f3b913ff99c22c85c3c2190457defc90
+  metadata.gz: fb2b99e313bf2a3ac807cebf0052d369d83c8514ad89a1b9ca18deed421a0c4d
+  data.tar.gz: fa1f9536f838d8246706d5eca3350ba4eae97546ae88c48e28a27e5df952d987
 SHA512:
-  metadata.gz: ca0a9cc2bf0ad33a0d3dfd88e3228fd79fc3291a42fd3d13bbfbe4e37e744b0e3a5dadcec1cab48c0e13b6af872a8e3f4e80ce3e6593b18f024416b9cf7370fa
-  data.tar.gz: bb4b1d9e720dfdcc18c7c4ccb73cc55e29a3e31fb6ffb5bf3b8c0fce1548a63a06da4a710ab5fc5020f142ede69dbdfbef5451944183e280e86e018379a792eb
+  metadata.gz: 148875e2dae2319e4c96d892c2233bd889e3c193a9ddad8995faded2d637ce398a3e971d27ca05c62c93674c3c6db05322814d823d8493c8d0318f052e1278d4
+  data.tar.gz: ee1d6df5dc3623d6aee2e7803b82306c69099a72b52475e6969ef5a2a4bdff73ea4544faf3aa6fdcd54e4594a1020dc7b5dd05ffe5d437327eda18a5c23d35ec
@@ -29,7 +29,8 @@ module Archaeo
     def download(url, from: nil, to: nil, resume: false,
                  dry_run: false, all_timestamps: false,
                  filter: nil, page_requisites: false,
-                 snapshot_at: nil, &block)
+                 snapshot_at: nil, max_snapshots: nil,
+                 strategy: nil, &block)
       start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
       url = UrlNormalizer.normalize(url)
       FileUtils.mkdir_p(@output_dir) unless dry_run
@@ -38,6 +39,8 @@ module Archaeo
                               all_timestamps: all_timestamps,
                               snapshot_at: snapshot_at)
       snapshots = apply_filter(snapshots, filter)
+      snapshots = schedule_snapshots(snapshots, strategy)
+      snapshots = snapshots.first(max_snapshots) if max_snapshots
       downloaded, skipped, bytes, failed =
         run_download(snapshots, resume, dry_run, page_requisites, block)

@@ -70,6 +73,13 @@ module Archaeo
       snapshots.select { |snap| filter.match?(snap.original_url) }
     end

+    def schedule_snapshots(snapshots, strategy)
+      return snapshots unless strategy
+
+      scheduler = DownloadScheduler.new(strategy: strategy)
+      scheduler.schedule(snapshots)
+    end
+
     def run_download(snapshots, resume, dry_run, page_requisites, progress)
       state = DownloadState.new(@output_dir)
       total = snapshots.size
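
Note: the two new keywords wire the scheduler into the download pipeline. strategy: reorders the snapshot list through DownloadScheduler, and max_snapshots: truncates the result afterwards, so the cap always applies to the reordered list. A minimal calling sketch in Ruby; the Downloader class name and its constructor are assumptions, since this hunk does not show the file or class name, but the #download keywords match the diff:

require "archaeo"

# Hypothetical construction: only the #download keyword arguments
# below are confirmed by this diff.
downloader = Archaeo::Downloader.new(output_dir: "archive")
summary = downloader.download(
  "https://example.com",
  strategy: :oldest_first, # reordered by DownloadScheduler
  max_snapshots: 50        # truncated after scheduling
) { |current, total, snap| warn "#{current}/#{total} #{snap}" }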
data/lib/archaeo/cli.rb CHANGED
@@ -27,6 +27,8 @@ module Archaeo
     option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
     option :match_type,
            desc: "Match type (exact, prefix, host, domain)"
+    option :exact_url, type: :boolean, default: false,
+           desc: "Match exact URL only"
     option :filter, type: :array, desc: "CDX filter expressions"
     option :filter_status, type: :array,
            desc: "Only include these status codes"
@@ -39,6 +41,8 @@ module Archaeo
            default: "table"
     option :fields, type: :array,
            desc: "Specific fields to print (timestamp,original,etc)"
+    option :list_only, type: :boolean, default: false,
+           desc: "List files that would be downloaded"
     def snapshots(url)
       fmt = validate_output_format
       handle_errors do
@@ -228,6 +232,8 @@ module Archaeo
     option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
     option :resume, type: :boolean, default: false,
            desc: "Resume interrupted download"
+    option :reset, type: :boolean, default: false,
+           desc: "Clear download state and cache for fresh start"
     option :concurrency, type: :numeric, default: 1,
            desc: "Number of parallel downloads"
     option :dry_run, type: :boolean, default: false,
@@ -241,6 +247,15 @@ module Archaeo
     option :snapshot_at, desc: "Download composite snapshot at timestamp"
     option :rate_limit, type: :numeric, default: 0,
            desc: "Min seconds between requests"
+    option :max_snapshots, type: :numeric,
+           desc: "Limit to N most recent snapshots"
+    option :recursive_subdomains, type: :boolean, default: false,
+           desc: "Discover and download subdomains"
+    option :subdomain_depth, type: :numeric, default: 1,
+           desc: "Max subdomain recursion depth"
+    option :strategy, desc: "Download strategy (newest_first, oldest_first, " \
+                            "breadth_first, depth_first)",
+           default: "newest_first"
     def download(url)
       handle_errors do
         rate_limiter = RateLimiter.new(
@@ -307,6 +322,35 @@ module Archaeo
       end
     end

+    desc "coverage URL",
+         "Analyze archive coverage for a URL"
+    option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
+    option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
+    option :format, desc: "Output format (table, json)", default: "table"
+    def coverage(url)
+      handle_errors do
+        analyzer = CoverageAnalyzer.new
+        report = analyzer.analyze(url, from: options[:from], to: options[:to])
+        output_coverage(report)
+      end
+    end
+
+    desc "snapshot-diff URL TIMESTAMP_A TIMESTAMP_B",
+         "Compare two snapshots of a URL"
+    option :format, desc: "Output format (table, json)", default: "table"
+    def snapshot_diff(url, timestamp_a, timestamp_b)
+      handle_errors do
+        fetcher = Fetcher.new
+        page_a = fetcher.fetch(url, timestamp: timestamp_a)
+        page_b = fetcher.fetch(url, timestamp: timestamp_b)
+        diff = SnapshotDiff.new(
+          url: url, page_a: page_a, page_b: page_b,
+          timestamp_a: timestamp_a, timestamp_b: timestamp_b
+        )
+        output_snapshot_diff(diff)
+      end
+    end
+
     CDX_OPTION_MAP = {
       from: :from,
       to: :to,
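
Note: the option/desc DSL above is Thor-style, so the new commands should be invocable programmatically as well as from the shell. A sketch, assuming the CLI class is Thor-based (inferred from the DSL, not stated in this diff):

require "archaeo"

# Equivalent to: archaeo coverage https://example.com --format json
Archaeo::CLI.start(%w[coverage https://example.com --format json])

# Equivalent to: archaeo snapshot-diff URL TS_A TS_B
Archaeo::CLI.start(
  %w[snapshot-diff https://example.com 20200101000000 20230101000000]
)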
@@ -352,6 +396,9 @@ module Archaeo
     def fetch_snapshots(url)
       cdx = CdxApi.new
       opts = build_cdx_options(options)
+      if options[:exact_url]
+        opts[:match_type] = "exact"
+      end
       cdx.snapshots(url, **opts).to_a
     end

@@ -435,6 +482,11 @@ module Archaeo
     end

     def download_with_progress(downloader, url, filter)
+      if options[:reset]
+        state = DownloadState.new(options[:output])
+        state.clear
+      end
+
       summary = downloader.download(
         url,
         from: options[:from], to: options[:to],
@@ -442,9 +494,32 @@ module Archaeo
         all_timestamps: options[:all_timestamps],
         filter: filter,
         page_requisites: options[:page_requisites],
-        snapshot_at: options[:snapshot_at]
+        snapshot_at: options[:snapshot_at],
+        max_snapshots: options[:max_snapshots],
+        strategy: options[:strategy]&.to_sym
       ) { |c, t, s| print_progress(c, t, s) }
       print_summary(summary)
+
+      return unless options[:recursive_subdomains]
+
+      discover_and_download_subdomains(url, downloader, filter)
+    end
+
+    def discover_and_download_subdomains(url, downloader, filter)
+      discovery = SubdomainDiscovery.new(
+        URI.parse(UrlNormalizer.normalize(url)).host,
+        max_depth: options[:subdomain_depth],
+      )
+      subdomains = discovery.scan_files(options[:output])
+      subdomains.each do |subdomain|
+        warn "Downloading subdomain: #{subdomain}" unless quiet?
+        downloader.download(
+          subdomain,
+          from: options[:from], to: options[:to],
+          resume: options[:resume],
+          filter: filter
+        ) { |c, t, s| print_progress(c, t, s) }
+      end
     end

     def output_health(report)
@@ -625,5 +700,51 @@ module Archaeo
       end
       dupes
     end
+
+    def output_coverage(report)
+      case options[:format]
+      when "json"
+        puts JSON.generate(report.as_json)
+      else
+        puts "URL: #{report.url}"
+        puts "Total URLs: #{report.total_urls}"
+        puts "Archived URLs: #{report.archived_urls}"
+        puts "Coverage: #{report.coverage_percent}%"
+        puts "Missing: #{report.missing_count}"
+        if report.has_gaps?
+          puts "Temporal gaps:"
+          report.temporal_gaps.each do |gap|
+            puts "  #{gap[:from]} → #{gap[:to]} (#{gap[:gap_days]} days)"
+          end
+        end
+        puts "Status distribution:"
+        report.status_distribution.sort_by { |_, v| -v }.each do |code, count|
+          puts "  #{code}: #{count}"
+        end
+      end
+    end
+
+    def output_snapshot_diff(diff)
+      case options[:format]
+      when "json"
+        puts JSON.generate(diff.as_json)
+      else
+        puts "Comparing #{diff.to_h[:timestamp_a]} vs #{diff.to_h[:timestamp_b]}"
+        puts "Content changed: #{diff.content_changed? ? 'Yes' : 'No'}"
+        link_changes = diff.link_changes
+        puts "Links added: #{link_changes[:added].size}"
+        puts "Links removed: #{link_changes[:removed].size}"
+        asset_changes = diff.asset_changes
+        puts "Assets added: #{asset_changes[:added].size}"
+        puts "Assets removed: #{asset_changes[:removed].size}"
+        structural = diff.structural_changes
+        unless structural.empty?
+          puts "Structural changes:"
+          structural.each do |tag, change|
+            puts "  <#{tag}>: #{change[:from]} → #{change[:to]}"
+          end
+        end
+      end
+    end
   end
 end
data/lib/archaeo/configuration.rb ADDED
@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+
+require "yaml"
+
+module Archaeo
+  # Manages persistent configuration across sessions.
+  #
+  # Loads settings from .archaeo.yml files, supports named profiles,
+  # and falls back to sensible defaults. Settings cascade: defaults
+  # < global config < profile overrides.
+  class Configuration
+    DEFAULTS = {
+      "output_dir" => "archive",
+      "format" => "table",
+      "rate_limit" => 0,
+      "concurrency" => 1,
+      "max_retries" => 3,
+    }.freeze
+
+    def initialize(path: ".archaeo.yml")
+      @path = path
+      @data = load_config
+    end
+
+    def get(key, profile: nil)
+      keys = key.to_s.split(".")
+      value = dig_nested(@data, keys, profile)
+      value.nil? ? DEFAULTS[keys.last] : value
+    end
+
+    def profile(name)
+      profiles = @data["profiles"] || {}
+      profiles[name.to_s] || {}
+    end
+
+    def profiles
+      (@data["profiles"] || {}).keys
+    end
+
+    def set(key, value, profile: nil)
+      if profile
+        @data["profiles"] ||= {}
+        @data["profiles"][profile.to_s] ||= {}
+        @data["profiles"][profile.to_s][key.to_s] = value
+      else
+        @data["defaults"] ||= {}
+        @data["defaults"][key.to_s] = value
+      end
+      save_config
+    end
+
+    def to_h
+      {
+        defaults: @data.fetch("defaults", {}),
+        profiles: @data.fetch("profiles", {}),
+      }
+    end
+
+    def save(path: nil)
+      target = path || @path
+      File.write(target, YAML.dump(@data))
+    end
+
+    private
+
+    def load_config
+      return {} unless File.exist?(@path)
+
+      content = File.read(@path)
+      YAML.safe_load(content, permitted_classes: [Symbol]) || {}
+    rescue StandardError
+      {}
+    end
+
+    def save_config
+      FileUtils.mkdir_p(File.dirname(@path)) unless File.dirname(@path) == "."
+      File.write(@path, YAML.dump(@data))
+    end
+
+    def dig_nested(data, keys, profile_name)
+      if profile_name
+        profile_data = data.dig("profiles", profile_name.to_s) || {}
+        return dig_value(profile_data, keys)
+      end
+
+      defaults = data["defaults"] || {}
+      dig_value(defaults, keys)
+    end
+
+    def dig_value(hash, keys)
+      keys.reduce(hash) { |h, k| h.is_a?(Hash) ? h[k] : nil }
+    end
+  end
+end
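
Note: a usage sketch for the new Configuration class, based only on the methods shown above; the key names come from DEFAULTS, and #set persists to .archaeo.yml as a side effect:

require "archaeo"

config = Archaeo::Configuration.new(path: ".archaeo.yml")
config.set("rate_limit", 2)                   # stored under "defaults"
config.set("concurrency", 4, profile: "fast") # stored under "profiles"

config.get("rate_limit")                   # => 2
config.get("concurrency", profile: "fast") # => 4
config.get("max_retries")                  # => 3, falls back to DEFAULTS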
data/lib/archaeo/coverage_report.rb ADDED
@@ -0,0 +1,101 @@
+# frozen_string_literal: true
+
+module Archaeo
+  # Analyzes how thoroughly a site was archived by the Wayback Machine.
+  #
+  # Produces coverage statistics including total URLs, archived URLs,
+  # coverage percentage, temporal gaps, and status distribution.
+  class CoverageReport
+    attr_reader :url, :total_urls, :archived_urls, :status_distribution,
+                :temporal_gaps, :missing_assets
+
+    def initialize(url:, total_urls:, archived_urls:,
+                   status_distribution: {}, temporal_gaps: [],
+                   missing_assets: [])
+      @url = url
+      @total_urls = total_urls
+      @archived_urls = archived_urls
+      @status_distribution = status_distribution
+      @temporal_gaps = temporal_gaps
+      @missing_assets = missing_assets
+    end
+
+    def coverage_percent
+      return 0.0 if total_urls.zero?
+
+      (archived_urls.to_f / total_urls * 100).round(1)
+    end
+
+    def missing_count
+      total_urls - archived_urls
+    end
+
+    def has_gaps?
+      !temporal_gaps.empty?
+    end
+
+    def to_h
+      {
+        url: @url,
+        total_urls: @total_urls,
+        archived_urls: @archived_urls,
+        coverage_percent: coverage_percent,
+        missing_count: missing_count,
+        status_distribution: @status_distribution,
+        temporal_gaps: @temporal_gaps,
+        missing_assets: @missing_assets,
+      }
+    end
+
+    def as_json(*)
+      to_h
+    end
+  end
+
+  # Builds a CoverageReport from CDX snapshot data.
+  class CoverageAnalyzer
+    def initialize(cdx_api: nil)
+      @cdx_api = cdx_api
+    end
+
+    def analyze(url, from: nil, to: nil)
+      cdx = @cdx_api || CdxApi.new
+      snapshots = cdx.snapshots(url, from: from, to: to).to_a
+
+      unique_urls = snapshots.map(&:original_url).uniq
+      status_dist = compute_status_distribution(snapshots)
+      gaps = compute_temporal_gaps(snapshots)
+
+      CoverageReport.new(
+        url: url,
+        total_urls: unique_urls.size,
+        archived_urls: snapshots.count(&:success?),
+        status_distribution: status_dist,
+        temporal_gaps: gaps,
+      )
+    end
+
+    private
+
+    def compute_status_distribution(snapshots)
+      snapshots.each_with_object(Hash.new(0)) do |snap, counts|
+        counts[snap.status_code] += 1
+      end
+    end
+
+    def compute_temporal_gaps(snapshots)
+      return [] if snapshots.size < 2
+
+      sorted = snapshots.sort_by(&:timestamp)
+      gaps = []
+      sorted.each_cons(2) do |a, b|
+        diff_days = (b.timestamp.to_time - a.timestamp.to_time) / 86400
+        next unless diff_days > 30
+
+        gaps << { from: a.timestamp.to_s, to: b.timestamp.to_s,
+                  gap_days: diff_days.round }
+      end
+      gaps
+    end
+  end
+end
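
Note: the analyzer flags any gap longer than 30 days between consecutive snapshots as a temporal gap. Programmatic usage mirrors the CLI's coverage command shown earlier:

require "archaeo"

report = Archaeo::CoverageAnalyzer.new.analyze(
  "https://example.com", from: "20200101000000", to: "20231231235959"
)
report.coverage_percent # archived snapshots as a share of unique URLs
report.has_gaps?        # true when any >30-day gap exists
report.temporal_gaps.each do |g|
  puts "#{g[:from]} -> #{g[:to]} (#{g[:gap_days]} days)"
end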
data/lib/archaeo/download_scheduler.rb ADDED
@@ -0,0 +1,102 @@
+# frozen_string_literal: true
+
+module Archaeo
+  # Schedules and orders snapshot downloads by strategy and priority.
+  #
+  # Supports multiple ordering strategies (newest_first, oldest_first,
+  # breadth_first, depth_first) and priority rules (html_first,
+  # smallest_first, largest_first) for intelligent download ordering.
+  class DownloadScheduler
+    STRATEGIES = %i[newest_first oldest_first breadth_first depth_first].freeze
+    PRIORITIES = %i[html_first smallest_first largest_first].freeze
+
+    def initialize(strategy: :newest_first, priority: nil,
+                   max_file_size: nil, min_file_size: nil)
+      validate_strategy(strategy)
+      validate_priority(priority) if priority
+
+      @strategy = strategy
+      @priority = priority
+      @max_file_size = max_file_size
+      @min_file_size = min_file_size
+    end
+
+    def schedule(snapshots)
+      filtered = apply_size_filters(snapshots)
+      ordered = apply_strategy(filtered)
+      apply_priority(ordered)
+    end
+
+    private
+
+    def validate_strategy(strategy)
+      return if STRATEGIES.include?(strategy.to_sym)
+
+      raise ArgumentError,
+            "Invalid strategy: #{strategy}. Use: #{STRATEGIES.join(', ')}"
+    end
+
+    def validate_priority(priority)
+      return if PRIORITIES.include?(priority.to_sym)
+
+      raise ArgumentError,
+            "Invalid priority: #{priority}. Use: #{PRIORITIES.join(', ')}"
+    end
+
+    def apply_size_filters(snapshots)
+      result = snapshots
+      if @max_file_size
+        result = result.reject { |s| s.length && s.length > @max_file_size }
+      end
+      if @min_file_size
+        result = result.reject { |s| s.length && s.length < @min_file_size }
+      end
+      result
+    end
+
+    def apply_strategy(snapshots)
+      case @strategy.to_sym
+      when :newest_first
+        snapshots.sort_by { |s| -s.timestamp.to_i }
+      when :oldest_first
+        snapshots.sort_by(&:timestamp)
+      when :breadth_first
+        sort_by_depth(snapshots, depth: :shallow)
+      when :depth_first
+        sort_by_depth(snapshots, depth: :deep)
+      end
+    end
+
+    def apply_priority(snapshots)
+      return snapshots unless @priority
+
+      case @priority.to_sym
+      when :html_first
+        html, rest = snapshots.partition { |s| html?(s) }
+        html + rest
+      when :smallest_first
+        snapshots.sort_by { |s| s.length || 0 }
+      when :largest_first
+        snapshots.sort_by { |s| -(s.length || 0) }
+      end
+    end
+
+    def sort_by_depth(snapshots, depth:)
+      segments = snapshots.map do |snap|
+        path = snap.original_url.to_s
+        depth_count = path.count("/")
+        [snap, depth_count]
+      end
+
+      if depth == :shallow
+        segments.sort_by { |_, d| d }.map(&:first)
+      else
+        segments.sort_by { |_, d| -d }.map(&:first)
+      end
+    end
+
+    def html?(snapshot)
+      snapshot.mimetype.to_s.include?("text/html")
+    end
+  end
+end
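
Note: the scheduler filters by size first, then orders by strategy, then applies the priority rule as a final reordering. A self-contained sketch with Struct stand-ins; real snapshot objects need only respond to timestamp, length, original_url, and mimetype, as the code above requires:

require "archaeo"

Snap = Struct.new(:timestamp, :length, :original_url, :mimetype)
snaps = [
  Snap.new("20240101000000", 9_000, "https://example.com/a/b", "text/html"),
  Snap.new("20220101000000", 120, "https://example.com/", "text/css"),
]

scheduler = Archaeo::DownloadScheduler.new(
  strategy: :breadth_first, # shallowest URL paths first
  priority: :html_first,    # then HTML snapshots ahead of the rest
  max_file_size: 1_000_000  # drop anything reporting more than ~1 MB
)
scheduler.schedule(snaps).map(&:original_url)
# => ["https://example.com/a/b", "https://example.com/"]  (HTML promoted)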
@@ -64,13 +64,15 @@ module Archaeo
                    retry_delay: DEFAULT_RETRY_DELAY,
                    user_agent: nil,
                    on_request: nil,
-                   before_request: nil)
+                   before_request: nil,
+                   rate_limiter: nil)
       @timeout = timeout
       @max_retries = max_retries
       @retry_delay = retry_delay
       @user_agent = user_agent
       @on_request = on_request
       @before_request = before_request
+      @rate_limiter = rate_limiter
       @connections = {}
       @last_used = {}
       @mutex = Mutex.new
@@ -276,6 +278,7 @@ module Archaeo
     end

     def execute_tracked_request(uri, request, retry_count)
+      @rate_limiter&.wait(host: uri.host)
       http = connection_for(uri)
       start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
       raw = http.request(request)
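
Note: the HTTP client now takes an injected rate limiter and calls wait(host:) before every tracked request, so throttling is per host. Any object answering that message works; below is a minimal illustrative limiter (a sketch, not the gem's own RateLimiter, whose constructor is not shown in this diff):

# Simple per-host throttle satisfying the wait(host:) contract.
class SimpleLimiter
  def initialize(min_interval)
    @min_interval = min_interval
    @last_request = Hash.new(0.0)
    @mutex = Mutex.new
  end

  def wait(host:)
    @mutex.synchronize do
      now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      gap = now - @last_request[host]
      sleep(@min_interval - gap) if gap < @min_interval
      @last_request[host] = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end
  end
end

# Injection point confirmed by the hunk above:
# client = HttpClient.new(rate_limiter: SimpleLimiter.new(1.0))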
data/lib/archaeo/page.rb CHANGED
@@ -93,6 +93,52 @@ module Archaeo
       end
     end

+    def headings
+      return [] unless html?
+
+      @headings ||= begin
+        doc = Nokogiri::HTML(@raw_content)
+        doc.css("h1, h2, h3, h4, h5, h6").map do |el|
+          { level: el.name[1].to_i, text: el.text.strip }
+        end
+      end
+    end
+
+    def images
+      return [] unless html?
+
+      @images ||= begin
+        doc = Nokogiri::HTML(@raw_content)
+        doc.css("img[src]").map do |el|
+          { src: el["src"], alt: el["alt"].to_s,
+            width: el["width"]&.to_i, height: el["height"]&.to_i }
+        end
+      end
+    end
+
+    def forms
+      return [] unless html?
+
+      @forms ||= begin
+        doc = Nokogiri::HTML(@raw_content)
+        doc.css("form").map do |form|
+          { action: form["action"].to_s, method: (form["method"] || "GET").upcase,
+            fields: extract_form_fields(form) }
+        end
+      end
+    end
+
+    def scripts
+      return [] unless html?
+
+      @scripts ||= begin
+        doc = Nokogiri::HTML(@raw_content)
+        doc.css("script").map do |el|
+          { src: el["src"].to_s, type: el["type"].to_s }
+        end
+      end
+    end
+
     def to_h
       {
         content_type: @content_type,
@@ -202,5 +248,12 @@ module Archaeo
     rescue URI::InvalidURIError
       nil
     end
+
+    def extract_form_fields(form)
+      inputs = form.css("input, select, textarea").map do |el|
+        { name: el["name"].to_s, type: (el["type"] || el.name).to_s }
+      end
+      inputs.reject { |f| f[:name].empty? }
+    end
   end
 end
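
Note: the four new Page extractors each return plain hashes and memoize their result, though each performs its own Nokogiri parse of @raw_content. A usage sketch; obtaining a Page via Fetcher#fetch follows the CLI's snapshot_diff command above:

require "archaeo"

page = Archaeo::Fetcher.new.fetch("https://example.com",
                                  timestamp: "20230101000000")
page.headings.each { |h| puts "#{"#" * h[:level]} #{h[:text]}" }
page.images.count { |img| img[:alt].empty? } # images lacking alt text
page.forms.map { |f| [f[:method], f[:action]] }
page.scripts.reject { |s| s[:src].empty? }   # external scripts only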
data/lib/archaeo/progress_report.rb ADDED
@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+
+module Archaeo
+  # Value object representing download progress at a point in time.
+  #
+  # Provides computed metrics like percentage, speed, and ETA
+  # based on current counters and elapsed time.
+  ProgressReport = Struct.new(
+    :current, :total, :downloaded_bytes, :elapsed, :current_url,
+    keyword_init: true
+  ) do
+    def percent_complete
+      return 0.0 if total.nil? || total.zero?
+
+      (current.to_f / total * 100).round(1)
+    end
+
+    def speed
+      return 0.0 if elapsed.nil? || elapsed.zero?
+
+      downloaded_bytes.to_f / elapsed
+    end
+
+    def eta
+      return nil if elapsed.nil? || elapsed.zero?
+      return nil if total.nil? || current.nil? || current.zero?
+
+      rate = current.to_f / elapsed
+      remaining = total - current
+      remaining / rate
+    end
+
+    def to_h
+      {
+        current: current,
+        total: total,
+        percent_complete: percent_complete,
+        downloaded_bytes: downloaded_bytes,
+        speed: speed,
+        eta: eta,
+        current_url: current_url,
+        elapsed: elapsed,
+      }
+    end
+
+    def as_json(*)
+      to_h.transform_values { |v| v.is_a?(Float) ? v.round(2) : v }
+    end
+  end
+end
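
Note: ProgressReport is a plain Struct, so its math is easy to check in isolation:

require "archaeo"

report = Archaeo::ProgressReport.new(
  current: 25, total: 100, downloaded_bytes: 5_242_880,
  elapsed: 12.5, current_url: "https://example.com/page"
)
report.percent_complete # => 25.0
report.speed            # => 419430.4 (bytes per second)
report.eta              # => 37.5 (75 items remaining at 2.0 items/second)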
data/lib/archaeo/save_api.rb CHANGED
@@ -11,9 +11,11 @@ module Archaeo
     TIMESTAMP_RE = %r{web\.archive\.org/web/(\d{14})}

     def initialize(client: HttpClient.new,
-                   max_tries: DEFAULT_MAX_TRIES)
+                   max_tries: DEFAULT_MAX_TRIES,
+                   rate_limiter: nil)
       @client = client
       @max_tries = max_tries
+      @rate_limiter = rate_limiter
     end

     def save(url)
@@ -44,6 +46,7 @@ module Archaeo
     def attempt_save(save_url, start_time, url)
       @max_tries.times do |attempt|
         sleep(retry_delay(attempt)) if attempt.positive?
+        @rate_limiter&.wait(host: "web.archive.org")

         response = @client.get(save_url)
         check_response_errors!(response, url)
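
Note: the save path now waits on the limiter before each Save Page Now attempt, always against the web.archive.org host. A hedged construction sketch; the class name SaveApi is inferred from save_api.rb in the file list below, and SimpleLimiter is the illustrative throttle sketched earlier:

# Assumption: save_api.rb defines Archaeo::SaveApi; only the
# rate_limiter: keyword is confirmed by the hunk above.
saver = Archaeo::SaveApi.new(rate_limiter: SimpleLimiter.new(5.0))
result = saver.save("https://example.com/new-page")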
data/lib/archaeo/snapshot_diff.rb ADDED
@@ -0,0 +1,135 @@
+# frozen_string_literal: true
+
+require "digest"
+
+module Archaeo
+  # Compares two archived snapshots of the same URL.
+  #
+  # Produces text diffs, structural change analysis, link and
+  # asset change tracking between snapshots at different timestamps.
+  class SnapshotDiff
+    attr_reader :url, :snapshot_a, :snapshot_b
+
+    def initialize(url:, page_a:, page_b:, timestamp_a:, timestamp_b:)
+      @url = url
+      @page_a = page_a
+      @page_b = page_b
+      @timestamp_a = Timestamp.coerce(timestamp_a)
+      @timestamp_b = Timestamp.coerce(timestamp_b)
+    end
+
+    def content_changed?
+      content_digest(@page_a.content) != content_digest(@page_b.content)
+    end
+
+    def text_diff
+      lines_a = @page_a.content.to_s.lines
+      lines_b = @page_b.content.to_s.lines
+      build_unified_diff(lines_a, lines_b)
+    end
+
+    def link_changes
+      links_a = extract_links(@page_a)
+      links_b = extract_links(@page_b)
+      compute_set_diff(links_a, links_b)
+    end
+
+    def asset_changes
+      assets_a = extract_assets(@page_a)
+      assets_b = extract_assets(@page_b)
+      compute_set_diff(assets_a, assets_b)
+    end
+
+    def structural_changes
+      return {} unless @page_a.html? && @page_b.html?
+
+      elements_a = count_elements(@page_a)
+      elements_b = count_elements(@page_b)
+      build_element_diff(elements_a, elements_b)
+    end
+
+    def to_h
+      {
+        url: @url,
+        timestamp_a: @timestamp_a.to_s,
+        timestamp_b: @timestamp_b.to_s,
+        content_changed: content_changed?,
+        links_added: link_changes[:added],
+        links_removed: link_changes[:removed],
+        assets_added: asset_changes[:added],
+        assets_removed: asset_changes[:removed],
+        structural_changes: structural_changes,
+      }
+    end
+
+    def as_json(*)
+      to_h
+    end
+
+    private
+
+    def content_digest(content)
+      Digest::SHA256.hexdigest(content.to_s)
+    end
+
+    def build_unified_diff(lines_a, lines_b)
+      diff = []
+      max_len = [lines_a.size, lines_b.size].max
+      max_len.times do |i|
+        la = lines_a[i]
+        lb = lines_b[i]
+        if la == lb
+          diff << " #{la}"
+        else
+          diff << "- #{la}" if la
+          diff << "+ #{lb}" if lb
+        end
+      end
+      diff.join
+    end
+
+    def extract_links(page)
+      return Set.new unless page.html?
+
+      page.links.filter_map { |l| l[:href] }.to_set
+    end
+
+    def extract_assets(page)
+      return Set.new unless page.html?
+
+      extractor = AssetExtractor.new(page.content, base_url: page.archive_url)
+      extractor.extract.all.to_set
+    rescue StandardError
+      Set.new
+    end
+
+    def count_elements(page)
+      require "nokogiri"
+      doc = Nokogiri::HTML(page.content)
+      counts = Hash.new(0)
+      doc.css("*").each { |el| counts[el.name] += 1 }
+      counts
+    end
+
+    def compute_set_diff(set_a, set_b)
+      {
+        added: (set_b - set_a).to_a.sort,
+        removed: (set_a - set_b).to_a.sort,
+        unchanged: (set_a & set_b).size,
+      }
+    end
+
+    def build_element_diff(counts_a, counts_b)
+      all_tags = (counts_a.keys + counts_b.keys).uniq.sort
+      changes = {}
+      all_tags.each do |tag|
+        ca = counts_a[tag]
+        cb = counts_b[tag]
+        next if ca == cb
+
+        changes[tag] = { from: ca, to: cb }
+      end
+      changes
+    end
+  end
+end
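
Note: programmatic use of SnapshotDiff mirrors the CLI's snapshot_diff command shown earlier:

require "archaeo"

fetcher = Archaeo::Fetcher.new
diff = Archaeo::SnapshotDiff.new(
  url: "https://example.com",
  page_a: fetcher.fetch("https://example.com", timestamp: "20200101000000"),
  page_b: fetcher.fetch("https://example.com", timestamp: "20230101000000"),
  timestamp_a: "20200101000000", timestamp_b: "20230101000000"
)
diff.content_changed?     # SHA256 comparison of the two bodies
diff.link_changes[:added] # hrefs present only in the newer snapshot
diff.structural_changes   # per-tag element count deltas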
@@ -1,5 +1,5 @@
 # frozen_string_literal: true

 module Archaeo
-  VERSION = "0.2.8"
+  VERSION = "0.2.9"
 end
data/lib/archaeo.rb CHANGED
@@ -55,4 +55,9 @@ module Archaeo
   autoload :CdxCache, "archaeo/cdx_cache"
   autoload :SubdomainDiscovery, "archaeo/subdomain_discovery"
   autoload :ArchiveHealthCheck, "archaeo/archive_health_check"
+  autoload :DownloadScheduler, "archaeo/download_scheduler"
+  autoload :SnapshotDiff, "archaeo/snapshot_diff"
+  autoload :Configuration, "archaeo/configuration"
+  autoload :CoverageReport, "archaeo/coverage_report"
+  autoload :ProgressReport, "archaeo/progress_report"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: archaeo
 version: !ruby/object:Gem::Version
-  version: 0.2.8
+  version: 0.2.9
 platform: ruby
 authors:
 - Ribose Inc.
@@ -83,6 +83,9 @@ files:
 - lib/archaeo/cdx_filter.rb
 - lib/archaeo/cdx_timeline.rb
 - lib/archaeo/cli.rb
+- lib/archaeo/configuration.rb
+- lib/archaeo/coverage_report.rb
+- lib/archaeo/download_scheduler.rb
 - lib/archaeo/download_state.rb
 - lib/archaeo/encoding_detector.rb
 - lib/archaeo/fetcher.rb
@@ -91,10 +94,12 @@ files:
 - lib/archaeo/page_bundle.rb
 - lib/archaeo/path_sanitizer.rb
 - lib/archaeo/pattern_filter.rb
+- lib/archaeo/progress_report.rb
 - lib/archaeo/rate_limiter.rb
 - lib/archaeo/save_api.rb
 - lib/archaeo/save_result.rb
 - lib/archaeo/snapshot.rb
+- lib/archaeo/snapshot_diff.rb
 - lib/archaeo/subdomain_discovery.rb
 - lib/archaeo/timestamp.rb
 - lib/archaeo/url_normalizer.rb