archaeo 0.2.8 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/archaeo/bulk_downloader.rb +11 -1
- data/lib/archaeo/cli.rb +122 -1
- data/lib/archaeo/configuration.rb +94 -0
- data/lib/archaeo/coverage_report.rb +101 -0
- data/lib/archaeo/download_scheduler.rb +102 -0
- data/lib/archaeo/http_client.rb +4 -1
- data/lib/archaeo/page.rb +53 -0
- data/lib/archaeo/progress_report.rb +50 -0
- data/lib/archaeo/save_api.rb +4 -1
- data/lib/archaeo/snapshot_diff.rb +135 -0
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo.rb +5 -0
- metadata +6 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fb2b99e313bf2a3ac807cebf0052d369d83c8514ad89a1b9ca18deed421a0c4d
|
|
4
|
+
data.tar.gz: fa1f9536f838d8246706d5eca3350ba4eae97546ae88c48e28a27e5df952d987
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 148875e2dae2319e4c96d892c2233bd889e3c193a9ddad8995faded2d637ce398a3e971d27ca05c62c93674c3c6db05322814d823d8493c8d0318f052e1278d4
|
|
7
|
+
data.tar.gz: ee1d6df5dc3623d6aee2e7803b82306c69099a72b52475e6969ef5a2a4bdff73ea4544faf3aa6fdcd54e4594a1020dc7b5dd05ffe5d437327eda18a5c23d35ec
|
data/lib/archaeo/bulk_downloader.rb
CHANGED
|
@@ -29,7 +29,8 @@ module Archaeo
|
|
|
29
29
|
def download(url, from: nil, to: nil, resume: false,
|
|
30
30
|
dry_run: false, all_timestamps: false,
|
|
31
31
|
filter: nil, page_requisites: false,
|
|
32
|
-
snapshot_at: nil, &block)
|
|
32
|
+
snapshot_at: nil, max_snapshots: nil,
|
|
33
|
+
strategy: nil, &block)
|
|
33
34
|
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
34
35
|
url = UrlNormalizer.normalize(url)
|
|
35
36
|
FileUtils.mkdir_p(@output_dir) unless dry_run
|
|
@@ -38,6 +39,8 @@ module Archaeo
|
|
|
38
39
|
all_timestamps: all_timestamps,
|
|
39
40
|
snapshot_at: snapshot_at)
|
|
40
41
|
snapshots = apply_filter(snapshots, filter)
|
|
42
|
+
snapshots = schedule_snapshots(snapshots, strategy)
|
|
43
|
+
snapshots = snapshots.first(max_snapshots) if max_snapshots
|
|
41
44
|
downloaded, skipped, bytes, failed =
|
|
42
45
|
run_download(snapshots, resume, dry_run, page_requisites, block)
|
|
43
46
|
|
|
@@ -70,6 +73,13 @@ module Archaeo
|
|
|
70
73
|
snapshots.select { |snap| filter.match?(snap.original_url) }
|
|
71
74
|
end
|
|
72
75
|
|
|
76
|
+
def schedule_snapshots(snapshots, strategy)
|
|
77
|
+
return snapshots unless strategy
|
|
78
|
+
|
|
79
|
+
scheduler = DownloadScheduler.new(strategy: strategy)
|
|
80
|
+
scheduler.schedule(snapshots)
|
|
81
|
+
end
|
|
82
|
+
|
|
73
83
|
def run_download(snapshots, resume, dry_run, page_requisites, progress)
|
|
74
84
|
state = DownloadState.new(@output_dir)
|
|
75
85
|
total = snapshots.size
|
data/lib/archaeo/cli.rb
CHANGED
|
@@ -27,6 +27,8 @@ module Archaeo
|
|
|
27
27
|
option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
|
|
28
28
|
option :match_type,
|
|
29
29
|
desc: "Match type (exact, prefix, host, domain)"
|
|
30
|
+
option :exact_url, type: :boolean, default: false,
|
|
31
|
+
desc: "Match exact URL only"
|
|
30
32
|
option :filter, type: :array, desc: "CDX filter expressions"
|
|
31
33
|
option :filter_status, type: :array,
|
|
32
34
|
desc: "Only include these status codes"
|
|
@@ -39,6 +41,8 @@ module Archaeo
|
|
|
39
41
|
default: "table"
|
|
40
42
|
option :fields, type: :array,
|
|
41
43
|
desc: "Specific fields to print (timestamp,original,etc)"
|
|
44
|
+
option :list_only, type: :boolean, default: false,
|
|
45
|
+
desc: "List files that would be downloaded"
|
|
42
46
|
def snapshots(url)
|
|
43
47
|
fmt = validate_output_format
|
|
44
48
|
handle_errors do
|
|
@@ -228,6 +232,8 @@ module Archaeo
|
|
|
228
232
|
option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
|
|
229
233
|
option :resume, type: :boolean, default: false,
|
|
230
234
|
desc: "Resume interrupted download"
|
|
235
|
+
option :reset, type: :boolean, default: false,
|
|
236
|
+
desc: "Clear download state and cache for fresh start"
|
|
231
237
|
option :concurrency, type: :numeric, default: 1,
|
|
232
238
|
desc: "Number of parallel downloads"
|
|
233
239
|
option :dry_run, type: :boolean, default: false,
|
|
@@ -241,6 +247,15 @@ module Archaeo
|
|
|
241
247
|
option :snapshot_at, desc: "Download composite snapshot at timestamp"
|
|
242
248
|
option :rate_limit, type: :numeric, default: 0,
|
|
243
249
|
desc: "Min seconds between requests"
|
|
250
|
+
option :max_snapshots, type: :numeric,
|
|
251
|
+
desc: "Limit to N most recent snapshots"
|
|
252
|
+
option :recursive_subdomains, type: :boolean, default: false,
|
|
253
|
+
desc: "Discover and download subdomains"
|
|
254
|
+
option :subdomain_depth, type: :numeric, default: 1,
|
|
255
|
+
desc: "Max subdomain recursion depth"
|
|
256
|
+
option :strategy, desc: "Download strategy (newest_first, oldest_first, " \
|
|
257
|
+
"breadth_first, depth_first)",
|
|
258
|
+
default: "newest_first"
|
|
244
259
|
def download(url)
|
|
245
260
|
handle_errors do
|
|
246
261
|
rate_limiter = RateLimiter.new(
|
|
@@ -307,6 +322,35 @@ module Archaeo
|
|
|
307
322
|
end
|
|
308
323
|
end
|
|
309
324
|
|
|
325
|
+
desc "coverage URL",
|
|
326
|
+
"Analyze archive coverage for a URL"
|
|
327
|
+
option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
|
|
328
|
+
option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
|
|
329
|
+
option :format, desc: "Output format (table, json)", default: "table"
|
|
330
|
+
def coverage(url)
|
|
331
|
+
handle_errors do
|
|
332
|
+
analyzer = CoverageAnalyzer.new
|
|
333
|
+
report = analyzer.analyze(url, from: options[:from], to: options[:to])
|
|
334
|
+
output_coverage(report)
|
|
335
|
+
end
|
|
336
|
+
end
|
|
337
|
+
|
|
338
|
+
desc "snapshot-diff URL TIMESTAMP_A TIMESTAMP_B",
|
|
339
|
+
"Compare two snapshots of a URL"
|
|
340
|
+
option :format, desc: "Output format (table, json)", default: "table"
|
|
341
|
+
def snapshot_diff(url, timestamp_a, timestamp_b)
|
|
342
|
+
handle_errors do
|
|
343
|
+
fetcher = Fetcher.new
|
|
344
|
+
page_a = fetcher.fetch(url, timestamp: timestamp_a)
|
|
345
|
+
page_b = fetcher.fetch(url, timestamp: timestamp_b)
|
|
346
|
+
diff = SnapshotDiff.new(
|
|
347
|
+
url: url, page_a: page_a, page_b: page_b,
|
|
348
|
+
timestamp_a: timestamp_a, timestamp_b: timestamp_b
|
|
349
|
+
)
|
|
350
|
+
output_snapshot_diff(diff)
|
|
351
|
+
end
|
|
352
|
+
end
|
|
353
|
+
|
|
310
354
|
CDX_OPTION_MAP = {
|
|
311
355
|
from: :from,
|
|
312
356
|
to: :to,
|
|
@@ -352,6 +396,9 @@ module Archaeo
|
|
|
352
396
|
def fetch_snapshots(url)
|
|
353
397
|
cdx = CdxApi.new
|
|
354
398
|
opts = build_cdx_options(options)
|
|
399
|
+
if options[:exact_url]
|
|
400
|
+
opts[:match_type] = "exact"
|
|
401
|
+
end
|
|
355
402
|
cdx.snapshots(url, **opts).to_a
|
|
356
403
|
end
|
|
357
404
|
|
|
@@ -435,6 +482,11 @@ module Archaeo
|
|
|
435
482
|
end
|
|
436
483
|
|
|
437
484
|
def download_with_progress(downloader, url, filter)
|
|
485
|
+
if options[:reset]
|
|
486
|
+
state = DownloadState.new(options[:output])
|
|
487
|
+
state.clear
|
|
488
|
+
end
|
|
489
|
+
|
|
438
490
|
summary = downloader.download(
|
|
439
491
|
url,
|
|
440
492
|
from: options[:from], to: options[:to],
|
|
@@ -442,9 +494,32 @@ module Archaeo
|
|
|
442
494
|
all_timestamps: options[:all_timestamps],
|
|
443
495
|
filter: filter,
|
|
444
496
|
page_requisites: options[:page_requisites],
|
|
445
|
-
snapshot_at: options[:snapshot_at]
|
|
497
|
+
snapshot_at: options[:snapshot_at],
|
|
498
|
+
max_snapshots: options[:max_snapshots],
|
|
499
|
+
strategy: options[:strategy]&.to_sym
|
|
446
500
|
) { |c, t, s| print_progress(c, t, s) }
|
|
447
501
|
print_summary(summary)
|
|
502
|
+
|
|
503
|
+
return unless options[:recursive_subdomains]
|
|
504
|
+
|
|
505
|
+
discover_and_download_subdomains(url, downloader, filter)
|
|
506
|
+
end
|
|
507
|
+
|
|
508
|
+
def discover_and_download_subdomains(url, downloader, filter)
|
|
509
|
+
discovery = SubdomainDiscovery.new(
|
|
510
|
+
URI.parse(UrlNormalizer.normalize(url)).host,
|
|
511
|
+
max_depth: options[:subdomain_depth],
|
|
512
|
+
)
|
|
513
|
+
subdomains = discovery.scan_files(options[:output])
|
|
514
|
+
subdomains.each do |subdomain|
|
|
515
|
+
warn "Downloading subdomain: #{subdomain}" unless quiet?
|
|
516
|
+
downloader.download(
|
|
517
|
+
subdomain,
|
|
518
|
+
from: options[:from], to: options[:to],
|
|
519
|
+
resume: options[:resume],
|
|
520
|
+
filter: filter
|
|
521
|
+
) { |c, t, s| print_progress(c, t, s) }
|
|
522
|
+
end
|
|
448
523
|
end
|
|
449
524
|
|
|
450
525
|
def output_health(report)
|
|
@@ -625,5 +700,51 @@ module Archaeo
|
|
|
625
700
|
end
|
|
626
701
|
dupes
|
|
627
702
|
end
|
|
703
|
+
|
|
704
|
+
def output_coverage(report)
|
|
705
|
+
case options[:format]
|
|
706
|
+
when "json"
|
|
707
|
+
puts JSON.generate(report.as_json)
|
|
708
|
+
else
|
|
709
|
+
puts "URL: #{report.url}"
|
|
710
|
+
puts "Total URLs: #{report.total_urls}"
|
|
711
|
+
puts "Archived URLs: #{report.archived_urls}"
|
|
712
|
+
puts "Coverage: #{report.coverage_percent}%"
|
|
713
|
+
puts "Missing: #{report.missing_count}"
|
|
714
|
+
if report.has_gaps?
|
|
715
|
+
puts "Temporal gaps:"
|
|
716
|
+
report.temporal_gaps.each do |gap|
|
|
717
|
+
puts " #{gap[:from]} → #{gap[:to]} (#{gap[:gap_days]} days)"
|
|
718
|
+
end
|
|
719
|
+
end
|
|
720
|
+
puts "Status distribution:"
|
|
721
|
+
report.status_distribution.sort_by { |_, v| -v }.each do |code, count|
|
|
722
|
+
puts " #{code}: #{count}"
|
|
723
|
+
end
|
|
724
|
+
end
|
|
725
|
+
end
|
|
726
|
+
|
|
727
|
+
def output_snapshot_diff(diff)
|
|
728
|
+
case options[:format]
|
|
729
|
+
when "json"
|
|
730
|
+
puts JSON.generate(diff.as_json)
|
|
731
|
+
else
|
|
732
|
+
puts "Comparing #{diff.to_h[:timestamp_a]} vs #{diff.to_h[:timestamp_b]}"
|
|
733
|
+
puts "Content changed: #{diff.content_changed? ? 'Yes' : 'No'}"
|
|
734
|
+
link_changes = diff.link_changes
|
|
735
|
+
puts "Links added: #{link_changes[:added].size}"
|
|
736
|
+
puts "Links removed: #{link_changes[:removed].size}"
|
|
737
|
+
asset_changes = diff.asset_changes
|
|
738
|
+
puts "Assets added: #{asset_changes[:added].size}"
|
|
739
|
+
puts "Assets removed: #{asset_changes[:removed].size}"
|
|
740
|
+
structural = diff.structural_changes
|
|
741
|
+
unless structural.empty?
|
|
742
|
+
puts "Structural changes:"
|
|
743
|
+
structural.each do |tag, change|
|
|
744
|
+
puts " <#{tag}>: #{change[:from]} → #{change[:to]}"
|
|
745
|
+
end
|
|
746
|
+
end
|
|
747
|
+
end
|
|
748
|
+
end
|
|
628
749
|
end
|
|
629
750
|
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "yaml"
|
|
4
|
+
|
|
5
|
+
module Archaeo
|
|
6
|
+
# Manages persistent configuration across sessions.
|
|
7
|
+
#
|
|
8
|
+
# Loads settings from .archaeo.yml files, supports named profiles,
|
|
9
|
+
# and falls back to sensible defaults. Settings cascade: defaults
|
|
10
|
+
# < global config < profile overrides.
|
|
11
|
+
class Configuration
|
|
12
|
+
DEFAULTS = {
|
|
13
|
+
"output_dir" => "archive",
|
|
14
|
+
"format" => "table",
|
|
15
|
+
"rate_limit" => 0,
|
|
16
|
+
"concurrency" => 1,
|
|
17
|
+
"max_retries" => 3,
|
|
18
|
+
}.freeze
|
|
19
|
+
|
|
20
|
+
def initialize(path: ".archaeo.yml")
|
|
21
|
+
@path = path
|
|
22
|
+
@data = load_config
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def get(key, profile: nil)
|
|
26
|
+
keys = key.to_s.split(".")
|
|
27
|
+
value = dig_nested(@data, keys, profile)
|
|
28
|
+
value.nil? ? DEFAULTS[keys.last] : value
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def profile(name)
|
|
32
|
+
profiles = @data["profiles"] || {}
|
|
33
|
+
profiles[name.to_s] || {}
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def profiles
|
|
37
|
+
(@data["profiles"] || {}).keys
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def set(key, value, profile: nil)
|
|
41
|
+
if profile
|
|
42
|
+
@data["profiles"] ||= {}
|
|
43
|
+
@data["profiles"][profile.to_s] ||= {}
|
|
44
|
+
@data["profiles"][profile.to_s][key.to_s] = value
|
|
45
|
+
else
|
|
46
|
+
@data["defaults"] ||= {}
|
|
47
|
+
@data["defaults"][key.to_s] = value
|
|
48
|
+
end
|
|
49
|
+
save_config
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
def to_h
|
|
53
|
+
{
|
|
54
|
+
defaults: @data.fetch("defaults", {}),
|
|
55
|
+
profiles: @data.fetch("profiles", {}),
|
|
56
|
+
}
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
def save(path: nil)
|
|
60
|
+
target = path || @path
|
|
61
|
+
File.write(target, YAML.dump(@data))
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
private
|
|
65
|
+
|
|
66
|
+
def load_config
|
|
67
|
+
return {} unless File.exist?(@path)
|
|
68
|
+
|
|
69
|
+
content = File.read(@path)
|
|
70
|
+
YAML.safe_load(content, permitted_classes: [Symbol]) || {}
|
|
71
|
+
rescue StandardError
|
|
72
|
+
{}
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
def save_config
|
|
76
|
+
FileUtils.mkdir_p(File.dirname(@path)) unless File.dirname(@path) == "."
|
|
77
|
+
File.write(@path, YAML.dump(@data))
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
def dig_nested(data, keys, profile_name)
|
|
81
|
+
if profile_name
|
|
82
|
+
profile_data = data.dig("profiles", profile_name.to_s) || {}
|
|
83
|
+
return dig_value(profile_data, keys)
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
defaults = data["defaults"] || {}
|
|
87
|
+
dig_value(defaults, keys)
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
def dig_value(hash, keys)
|
|
91
|
+
keys.reduce(hash) { |h, k| h.is_a?(Hash) ? h[k] : nil }
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# Analyzes how thoroughly a site was archived by the Wayback Machine.
|
|
5
|
+
#
|
|
6
|
+
# Produces coverage statistics including total URLs, archived URLs,
|
|
7
|
+
# coverage percentage, temporal gaps, and status distribution.
|
|
8
|
+
class CoverageReport
|
|
9
|
+
attr_reader :url, :total_urls, :archived_urls, :status_distribution,
|
|
10
|
+
:temporal_gaps, :missing_assets
|
|
11
|
+
|
|
12
|
+
def initialize(url:, total_urls:, archived_urls:,
|
|
13
|
+
status_distribution: {}, temporal_gaps: [],
|
|
14
|
+
missing_assets: [])
|
|
15
|
+
@url = url
|
|
16
|
+
@total_urls = total_urls
|
|
17
|
+
@archived_urls = archived_urls
|
|
18
|
+
@status_distribution = status_distribution
|
|
19
|
+
@temporal_gaps = temporal_gaps
|
|
20
|
+
@missing_assets = missing_assets
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def coverage_percent
|
|
24
|
+
return 0.0 if total_urls.zero?
|
|
25
|
+
|
|
26
|
+
(archived_urls.to_f / total_urls * 100).round(1)
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def missing_count
|
|
30
|
+
total_urls - archived_urls
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def has_gaps?
|
|
34
|
+
!temporal_gaps.empty?
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def to_h
|
|
38
|
+
{
|
|
39
|
+
url: @url,
|
|
40
|
+
total_urls: @total_urls,
|
|
41
|
+
archived_urls: @archived_urls,
|
|
42
|
+
coverage_percent: coverage_percent,
|
|
43
|
+
missing_count: missing_count,
|
|
44
|
+
status_distribution: @status_distribution,
|
|
45
|
+
temporal_gaps: @temporal_gaps,
|
|
46
|
+
missing_assets: @missing_assets,
|
|
47
|
+
}
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def as_json(*)
|
|
51
|
+
to_h
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
# Builds a CoverageReport from CDX snapshot data.
|
|
56
|
+
class CoverageAnalyzer
|
|
57
|
+
def initialize(cdx_api: nil)
|
|
58
|
+
@cdx_api = cdx_api
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def analyze(url, from: nil, to: nil)
|
|
62
|
+
cdx = @cdx_api || CdxApi.new
|
|
63
|
+
snapshots = cdx.snapshots(url, from: from, to: to).to_a
|
|
64
|
+
|
|
65
|
+
unique_urls = snapshots.map(&:original_url).uniq
|
|
66
|
+
status_dist = compute_status_distribution(snapshots)
|
|
67
|
+
gaps = compute_temporal_gaps(snapshots)
|
|
68
|
+
|
|
69
|
+
CoverageReport.new(
|
|
70
|
+
url: url,
|
|
71
|
+
total_urls: unique_urls.size,
|
|
72
|
+
archived_urls: snapshots.count(&:success?),
|
|
73
|
+
status_distribution: status_dist,
|
|
74
|
+
temporal_gaps: gaps,
|
|
75
|
+
)
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
private
|
|
79
|
+
|
|
80
|
+
def compute_status_distribution(snapshots)
|
|
81
|
+
snapshots.each_with_object(Hash.new(0)) do |snap, counts|
|
|
82
|
+
counts[snap.status_code] += 1
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def compute_temporal_gaps(snapshots)
|
|
87
|
+
return [] if snapshots.size < 2
|
|
88
|
+
|
|
89
|
+
sorted = snapshots.sort_by(&:timestamp)
|
|
90
|
+
gaps = []
|
|
91
|
+
sorted.each_cons(2) do |a, b|
|
|
92
|
+
diff_days = (b.timestamp.to_time - a.timestamp.to_time) / 86400
|
|
93
|
+
next unless diff_days > 30
|
|
94
|
+
|
|
95
|
+
gaps << { from: a.timestamp.to_s, to: b.timestamp.to_s,
|
|
96
|
+
gap_days: diff_days.round }
|
|
97
|
+
end
|
|
98
|
+
gaps
|
|
99
|
+
end
|
|
100
|
+
end
|
|
101
|
+
end
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# Schedules and orders snapshot downloads by strategy and priority.
|
|
5
|
+
#
|
|
6
|
+
# Supports multiple ordering strategies (newest_first, oldest_first,
|
|
7
|
+
# breadth_first, depth_first) and priority rules (html_first,
|
|
8
|
+
# smallest_first, largest_first) for intelligent download ordering.
|
|
9
|
+
class DownloadScheduler
|
|
10
|
+
STRATEGIES = %i[newest_first oldest_first breadth_first depth_first].freeze
|
|
11
|
+
PRIORITIES = %i[html_first smallest_first largest_first].freeze
|
|
12
|
+
|
|
13
|
+
def initialize(strategy: :newest_first, priority: nil,
|
|
14
|
+
max_file_size: nil, min_file_size: nil)
|
|
15
|
+
validate_strategy(strategy)
|
|
16
|
+
validate_priority(priority) if priority
|
|
17
|
+
|
|
18
|
+
@strategy = strategy
|
|
19
|
+
@priority = priority
|
|
20
|
+
@max_file_size = max_file_size
|
|
21
|
+
@min_file_size = min_file_size
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def schedule(snapshots)
|
|
25
|
+
filtered = apply_size_filters(snapshots)
|
|
26
|
+
ordered = apply_strategy(filtered)
|
|
27
|
+
apply_priority(ordered)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
private
|
|
31
|
+
|
|
32
|
+
def validate_strategy(strategy)
|
|
33
|
+
return if STRATEGIES.include?(strategy.to_sym)
|
|
34
|
+
|
|
35
|
+
raise ArgumentError,
|
|
36
|
+
"Invalid strategy: #{strategy}. Use: #{STRATEGIES.join(', ')}"
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def validate_priority(priority)
|
|
40
|
+
return if PRIORITIES.include?(priority.to_sym)
|
|
41
|
+
|
|
42
|
+
raise ArgumentError,
|
|
43
|
+
"Invalid priority: #{priority}. Use: #{PRIORITIES.join(', ')}"
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def apply_size_filters(snapshots)
|
|
47
|
+
result = snapshots
|
|
48
|
+
if @max_file_size
|
|
49
|
+
result = result.reject { |s| s.length && s.length > @max_file_size }
|
|
50
|
+
end
|
|
51
|
+
if @min_file_size
|
|
52
|
+
result = result.reject { |s| s.length && s.length < @min_file_size }
|
|
53
|
+
end
|
|
54
|
+
result
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def apply_strategy(snapshots)
|
|
58
|
+
case @strategy.to_sym
|
|
59
|
+
when :newest_first
|
|
60
|
+
snapshots.sort_by { |s| -s.timestamp.to_i }
|
|
61
|
+
when :oldest_first
|
|
62
|
+
snapshots.sort_by(&:timestamp)
|
|
63
|
+
when :breadth_first
|
|
64
|
+
sort_by_depth(snapshots, depth: :shallow)
|
|
65
|
+
when :depth_first
|
|
66
|
+
sort_by_depth(snapshots, depth: :deep)
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def apply_priority(snapshots)
|
|
71
|
+
return snapshots unless @priority
|
|
72
|
+
|
|
73
|
+
case @priority.to_sym
|
|
74
|
+
when :html_first
|
|
75
|
+
html, rest = snapshots.partition { |s| html?(s) }
|
|
76
|
+
html + rest
|
|
77
|
+
when :smallest_first
|
|
78
|
+
snapshots.sort_by { |s| s.length || 0 }
|
|
79
|
+
when :largest_first
|
|
80
|
+
snapshots.sort_by { |s| -(s.length || 0) }
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
def sort_by_depth(snapshots, depth:)
|
|
85
|
+
segments = snapshots.map do |snap|
|
|
86
|
+
path = snap.original_url.to_s
|
|
87
|
+
depth_count = path.count("/")
|
|
88
|
+
[snap, depth_count]
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
if depth == :shallow
|
|
92
|
+
segments.sort_by { |_, d| d }.map(&:first)
|
|
93
|
+
else
|
|
94
|
+
segments.sort_by { |_, d| -d }.map(&:first)
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def html?(snapshot)
|
|
99
|
+
snapshot.mimetype.to_s.include?("text/html")
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
end
|
data/lib/archaeo/http_client.rb
CHANGED
|
@@ -64,13 +64,15 @@ module Archaeo
|
|
|
64
64
|
retry_delay: DEFAULT_RETRY_DELAY,
|
|
65
65
|
user_agent: nil,
|
|
66
66
|
on_request: nil,
|
|
67
|
-
before_request: nil)
|
|
67
|
+
before_request: nil,
|
|
68
|
+
rate_limiter: nil)
|
|
68
69
|
@timeout = timeout
|
|
69
70
|
@max_retries = max_retries
|
|
70
71
|
@retry_delay = retry_delay
|
|
71
72
|
@user_agent = user_agent
|
|
72
73
|
@on_request = on_request
|
|
73
74
|
@before_request = before_request
|
|
75
|
+
@rate_limiter = rate_limiter
|
|
74
76
|
@connections = {}
|
|
75
77
|
@last_used = {}
|
|
76
78
|
@mutex = Mutex.new
|
|
@@ -276,6 +278,7 @@ module Archaeo
|
|
|
276
278
|
end
|
|
277
279
|
|
|
278
280
|
def execute_tracked_request(uri, request, retry_count)
|
|
281
|
+
@rate_limiter&.wait(host: uri.host)
|
|
279
282
|
http = connection_for(uri)
|
|
280
283
|
start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
281
284
|
raw = http.request(request)
|
data/lib/archaeo/page.rb
CHANGED
|
@@ -93,6 +93,52 @@ module Archaeo
|
|
|
93
93
|
end
|
|
94
94
|
end
|
|
95
95
|
|
|
96
|
+
def headings
|
|
97
|
+
return [] unless html?
|
|
98
|
+
|
|
99
|
+
@headings ||= begin
|
|
100
|
+
doc = Nokogiri::HTML(@raw_content)
|
|
101
|
+
doc.css("h1, h2, h3, h4, h5, h6").map do |el|
|
|
102
|
+
{ level: el.name[1].to_i, text: el.text.strip }
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
def images
|
|
108
|
+
return [] unless html?
|
|
109
|
+
|
|
110
|
+
@images ||= begin
|
|
111
|
+
doc = Nokogiri::HTML(@raw_content)
|
|
112
|
+
doc.css("img[src]").map do |el|
|
|
113
|
+
{ src: el["src"], alt: el["alt"].to_s,
|
|
114
|
+
width: el["width"]&.to_i, height: el["height"]&.to_i }
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
def forms
|
|
120
|
+
return [] unless html?
|
|
121
|
+
|
|
122
|
+
@forms ||= begin
|
|
123
|
+
doc = Nokogiri::HTML(@raw_content)
|
|
124
|
+
doc.css("form").map do |form|
|
|
125
|
+
{ action: form["action"].to_s, method: (form["method"] || "GET").upcase,
|
|
126
|
+
fields: extract_form_fields(form) }
|
|
127
|
+
end
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def scripts
|
|
132
|
+
return [] unless html?
|
|
133
|
+
|
|
134
|
+
@scripts ||= begin
|
|
135
|
+
doc = Nokogiri::HTML(@raw_content)
|
|
136
|
+
doc.css("script").map do |el|
|
|
137
|
+
{ src: el["src"].to_s, type: el["type"].to_s }
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
end
|
|
141
|
+
|
|
96
142
|
def to_h
|
|
97
143
|
{
|
|
98
144
|
content_type: @content_type,
|
|
@@ -202,5 +248,12 @@ module Archaeo
|
|
|
202
248
|
rescue URI::InvalidURIError
|
|
203
249
|
nil
|
|
204
250
|
end
|
|
251
|
+
|
|
252
|
+
def extract_form_fields(form)
|
|
253
|
+
inputs = form.css("input, select, textarea").map do |el|
|
|
254
|
+
{ name: el["name"].to_s, type: (el["type"] || el.name).to_s }
|
|
255
|
+
end
|
|
256
|
+
inputs.reject { |f| f[:name].empty? }
|
|
257
|
+
end
|
|
205
258
|
end
|
|
206
259
|
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# Value object representing download progress at a point in time.
|
|
5
|
+
#
|
|
6
|
+
# Provides computed metrics like percentage, speed, and ETA
|
|
7
|
+
# based on current counters and elapsed time.
|
|
8
|
+
ProgressReport = Struct.new(
|
|
9
|
+
:current, :total, :downloaded_bytes, :elapsed, :current_url,
|
|
10
|
+
keyword_init: true
|
|
11
|
+
) do
|
|
12
|
+
def percent_complete
|
|
13
|
+
return 0.0 if total.nil? || total.zero?
|
|
14
|
+
|
|
15
|
+
(current.to_f / total * 100).round(1)
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def speed
|
|
19
|
+
return 0.0 if elapsed.nil? || elapsed.zero?
|
|
20
|
+
|
|
21
|
+
downloaded_bytes.to_f / elapsed
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def eta
|
|
25
|
+
return nil if elapsed.nil? || elapsed.zero?
|
|
26
|
+
return nil if total.nil? || current.nil? || current.zero?
|
|
27
|
+
|
|
28
|
+
rate = current.to_f / elapsed
|
|
29
|
+
remaining = total - current
|
|
30
|
+
remaining / rate
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def to_h
|
|
34
|
+
{
|
|
35
|
+
current: current,
|
|
36
|
+
total: total,
|
|
37
|
+
percent_complete: percent_complete,
|
|
38
|
+
downloaded_bytes: downloaded_bytes,
|
|
39
|
+
speed: speed,
|
|
40
|
+
eta: eta,
|
|
41
|
+
current_url: current_url,
|
|
42
|
+
elapsed: elapsed,
|
|
43
|
+
}
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def as_json(*)
|
|
47
|
+
to_h.transform_values { |v| v.is_a?(Float) ? v.round(2) : v }
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
data/lib/archaeo/save_api.rb
CHANGED
|
@@ -11,9 +11,11 @@ module Archaeo
|
|
|
11
11
|
TIMESTAMP_RE = %r{web\.archive\.org/web/(\d{14})}
|
|
12
12
|
|
|
13
13
|
def initialize(client: HttpClient.new,
|
|
14
|
-
max_tries: DEFAULT_MAX_TRIES)
|
|
14
|
+
max_tries: DEFAULT_MAX_TRIES,
|
|
15
|
+
rate_limiter: nil)
|
|
15
16
|
@client = client
|
|
16
17
|
@max_tries = max_tries
|
|
18
|
+
@rate_limiter = rate_limiter
|
|
17
19
|
end
|
|
18
20
|
|
|
19
21
|
def save(url)
|
|
@@ -44,6 +46,7 @@ module Archaeo
|
|
|
44
46
|
def attempt_save(save_url, start_time, url)
|
|
45
47
|
@max_tries.times do |attempt|
|
|
46
48
|
sleep(retry_delay(attempt)) if attempt.positive?
|
|
49
|
+
@rate_limiter&.wait(host: "web.archive.org")
|
|
47
50
|
|
|
48
51
|
response = @client.get(save_url)
|
|
49
52
|
check_response_errors!(response, url)
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "digest"
|
|
4
|
+
|
|
5
|
+
module Archaeo
|
|
6
|
+
# Compares two archived snapshots of the same URL.
|
|
7
|
+
#
|
|
8
|
+
# Produces text diffs, structural change analysis, link and
|
|
9
|
+
# asset change tracking between snapshots at different timestamps.
|
|
10
|
+
class SnapshotDiff
|
|
11
|
+
attr_reader :url, :snapshot_a, :snapshot_b
|
|
12
|
+
|
|
13
|
+
def initialize(url:, page_a:, page_b:, timestamp_a:, timestamp_b:)
|
|
14
|
+
@url = url
|
|
15
|
+
@page_a = page_a
|
|
16
|
+
@page_b = page_b
|
|
17
|
+
@timestamp_a = Timestamp.coerce(timestamp_a)
|
|
18
|
+
@timestamp_b = Timestamp.coerce(timestamp_b)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def content_changed?
|
|
22
|
+
content_digest(@page_a.content) != content_digest(@page_b.content)
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def text_diff
|
|
26
|
+
lines_a = @page_a.content.to_s.lines
|
|
27
|
+
lines_b = @page_b.content.to_s.lines
|
|
28
|
+
build_unified_diff(lines_a, lines_b)
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def link_changes
|
|
32
|
+
links_a = extract_links(@page_a)
|
|
33
|
+
links_b = extract_links(@page_b)
|
|
34
|
+
compute_set_diff(links_a, links_b)
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def asset_changes
|
|
38
|
+
assets_a = extract_assets(@page_a)
|
|
39
|
+
assets_b = extract_assets(@page_b)
|
|
40
|
+
compute_set_diff(assets_a, assets_b)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def structural_changes
|
|
44
|
+
return {} unless @page_a.html? && @page_b.html?
|
|
45
|
+
|
|
46
|
+
elements_a = count_elements(@page_a)
|
|
47
|
+
elements_b = count_elements(@page_b)
|
|
48
|
+
build_element_diff(elements_a, elements_b)
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def to_h
|
|
52
|
+
{
|
|
53
|
+
url: @url,
|
|
54
|
+
timestamp_a: @timestamp_a.to_s,
|
|
55
|
+
timestamp_b: @timestamp_b.to_s,
|
|
56
|
+
content_changed: content_changed?,
|
|
57
|
+
links_added: link_changes[:added],
|
|
58
|
+
links_removed: link_changes[:removed],
|
|
59
|
+
assets_added: asset_changes[:added],
|
|
60
|
+
assets_removed: asset_changes[:removed],
|
|
61
|
+
structural_changes: structural_changes,
|
|
62
|
+
}
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
def as_json(*)
|
|
66
|
+
to_h
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
private
|
|
70
|
+
|
|
71
|
+
def content_digest(content)
|
|
72
|
+
Digest::SHA256.hexdigest(content.to_s)
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
def build_unified_diff(lines_a, lines_b)
|
|
76
|
+
diff = []
|
|
77
|
+
max_len = [lines_a.size, lines_b.size].max
|
|
78
|
+
max_len.times do |i|
|
|
79
|
+
la = lines_a[i]
|
|
80
|
+
lb = lines_b[i]
|
|
81
|
+
if la == lb
|
|
82
|
+
diff << " #{la}"
|
|
83
|
+
else
|
|
84
|
+
diff << "- #{la}" if la
|
|
85
|
+
diff << "+ #{lb}" if lb
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
diff.join
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
def extract_links(page)
|
|
92
|
+
return Set.new unless page.html?
|
|
93
|
+
|
|
94
|
+
page.links.filter_map { |l| l[:href] }.to_set
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def extract_assets(page)
|
|
98
|
+
return Set.new unless page.html?
|
|
99
|
+
|
|
100
|
+
extractor = AssetExtractor.new(page.content, base_url: page.archive_url)
|
|
101
|
+
extractor.extract.all.to_set
|
|
102
|
+
rescue StandardError
|
|
103
|
+
Set.new
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
def count_elements(page)
|
|
107
|
+
require "nokogiri"
|
|
108
|
+
doc = Nokogiri::HTML(page.content)
|
|
109
|
+
counts = Hash.new(0)
|
|
110
|
+
doc.css("*").each { |el| counts[el.name] += 1 }
|
|
111
|
+
counts
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
def compute_set_diff(set_a, set_b)
|
|
115
|
+
{
|
|
116
|
+
added: (set_b - set_a).to_a.sort,
|
|
117
|
+
removed: (set_a - set_b).to_a.sort,
|
|
118
|
+
unchanged: (set_a & set_b).size,
|
|
119
|
+
}
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
def build_element_diff(counts_a, counts_b)
|
|
123
|
+
all_tags = (counts_a.keys + counts_b.keys).uniq.sort
|
|
124
|
+
changes = {}
|
|
125
|
+
all_tags.each do |tag|
|
|
126
|
+
ca = counts_a[tag]
|
|
127
|
+
cb = counts_b[tag]
|
|
128
|
+
next if ca == cb
|
|
129
|
+
|
|
130
|
+
changes[tag] = { from: ca, to: cb }
|
|
131
|
+
end
|
|
132
|
+
changes
|
|
133
|
+
end
|
|
134
|
+
end
|
|
135
|
+
end
|
data/lib/archaeo/version.rb
CHANGED
data/lib/archaeo.rb
CHANGED
|
@@ -55,4 +55,9 @@ module Archaeo
|
|
|
55
55
|
autoload :CdxCache, "archaeo/cdx_cache"
|
|
56
56
|
autoload :SubdomainDiscovery, "archaeo/subdomain_discovery"
|
|
57
57
|
autoload :ArchiveHealthCheck, "archaeo/archive_health_check"
|
|
58
|
+
autoload :DownloadScheduler, "archaeo/download_scheduler"
|
|
59
|
+
autoload :SnapshotDiff, "archaeo/snapshot_diff"
|
|
60
|
+
autoload :Configuration, "archaeo/configuration"
|
|
61
|
+
autoload :CoverageReport, "archaeo/coverage_report"
|
|
62
|
+
autoload :ProgressReport, "archaeo/progress_report"
|
|
58
63
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: archaeo
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.8
|
|
4
|
+
version: 0.2.9
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
@@ -83,6 +83,9 @@ files:
|
|
|
83
83
|
- lib/archaeo/cdx_filter.rb
|
|
84
84
|
- lib/archaeo/cdx_timeline.rb
|
|
85
85
|
- lib/archaeo/cli.rb
|
|
86
|
+
- lib/archaeo/configuration.rb
|
|
87
|
+
- lib/archaeo/coverage_report.rb
|
|
88
|
+
- lib/archaeo/download_scheduler.rb
|
|
86
89
|
- lib/archaeo/download_state.rb
|
|
87
90
|
- lib/archaeo/encoding_detector.rb
|
|
88
91
|
- lib/archaeo/fetcher.rb
|
|
@@ -91,10 +94,12 @@ files:
|
|
|
91
94
|
- lib/archaeo/page_bundle.rb
|
|
92
95
|
- lib/archaeo/path_sanitizer.rb
|
|
93
96
|
- lib/archaeo/pattern_filter.rb
|
|
97
|
+
- lib/archaeo/progress_report.rb
|
|
94
98
|
- lib/archaeo/rate_limiter.rb
|
|
95
99
|
- lib/archaeo/save_api.rb
|
|
96
100
|
- lib/archaeo/save_result.rb
|
|
97
101
|
- lib/archaeo/snapshot.rb
|
|
102
|
+
- lib/archaeo/snapshot_diff.rb
|
|
98
103
|
- lib/archaeo/subdomain_discovery.rb
|
|
99
104
|
- lib/archaeo/timestamp.rb
|
|
100
105
|
- lib/archaeo/url_normalizer.rb
|