archaeo 0.2.9 → 0.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/archaeo/archive_search.rb +120 -0
- data/lib/archaeo/cli.rb +116 -4
- data/lib/archaeo/color_output.rb +73 -0
- data/lib/archaeo/content_tracker.rb +117 -0
- data/lib/archaeo/parallel_cdx.rb +47 -0
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo/warc_support.rb +249 -0
- data/lib/archaeo.rb +9 -0
- metadata +7 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 235d2cba1b1e071156a873d7a63cf0fdb6ba8079eb6083e21755e723727db6d9
|
|
4
|
+
data.tar.gz: 65c040c3a5984fdc1a68ca106d9ae10eab64b212ce6a72b37bec39ec57d383e2
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e6eb3cdb88abb87332bbba762bf566643da717ce17557e31ed90a012bd7c164939b5eb719420f74dfa908215e0f604e71b5fb2bb8bcc7de2940e36b80524e963
|
|
7
|
+
data.tar.gz: f52bc54fe3c425eeae28093810f1d90c4200391696ffe9af0e3f91366d619e4e57a425ec1e6a6a8b9aa2465d337bb2b960782fac1c3ce068d7d4b673b8306641
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# One hit produced by a full-text search: the URL that matched, the
# snapshot it was found in, a text excerpt around the hit and the
# offset of the hit inside the snapshot content.
SearchResult = Struct.new(:url, :snapshot, :context, :match_offset, keyword_init: true) do
  # Hash form of the result. The snapshot is serialized through its
  # own #as_json; the remaining fields are passed through unchanged.
  def to_h
    serialized = { url: url }
    serialized[:snapshot] = snapshot.as_json
    serialized[:context] = context
    serialized[:match_offset] = match_offset
    serialized
  end

  # JSON-ready representation; any arguments are accepted and ignored.
  def as_json(*_ignored)
    to_h
  end
end
|
|
22
|
+
|
|
23
|
+
# Full-text search over archived snapshots of a URL.
#
# Lists snapshots through the CDX API, keeps the successful ones whose
# mimetype mentions "text", downloads each one and scans its content
# for a literal query string. Every hit is returned as a SearchResult
# carrying an excerpt of the surrounding text.
class ArchiveSearch
  # Number of characters of context kept on each side of a hit.
  CONTEXT_RADIUS = 80

  # cdx_api: object answering #snapshots(url, **opts)
  # fetcher: object answering #fetch(url, timestamp:)
  def initialize(cdx_api: CdxApi.new, fetcher: Fetcher.new)
    @cdx = cdx_api
    @fetcher = fetcher
  end

  # Searches the snapshots of +url+ for +query+ (matched literally).
  #
  # from/to        - optional time bounds, coerced through Timestamp
  # max_results    - stop once this many hits were collected (nil = no cap)
  # case_sensitive - match case exactly when true
  #
  # Returns an Array of SearchResult. Raises ArgumentError when the
  # query is nil or empty.
  def search(url, query:, from: nil, to: nil,
             max_results: nil, case_sensitive: false)
    raise ArgumentError, "query must not be empty" if query.nil? || query.empty?

    normalized = UrlNormalizer.normalize(url)
    cdx_options = build_options(from, to)
    candidates = @cdx.snapshots(normalized, **cdx_options).select do |snap|
      snap.success? && snap.mimetype.to_s.include?("text")
    end

    find_matches(candidates.to_a, query, case_sensitive, max_results)
  end

  private

  # CDX query options: always collapse on digest, plus the optional
  # from/to bounds rendered as strings.
  def build_options(from, to)
    { collapse: ["digest"] }.tap do |cdx_options|
      cdx_options[:from] = Timestamp.coerce(from).to_s if from
      cdx_options[:to] = Timestamp.coerce(to).to_s if to
    end
  end

  # Walks the snapshots in order and collects one SearchResult per hit,
  # stopping as soon as max_results (when given) has been reached.
  def find_matches(snapshots, query, case_sensitive, max_results)
    pattern = build_pattern(query, case_sensitive)
    capped = ->(found) { max_results && found.size >= max_results }

    snapshots.each_with_object([]) do |snap, found|
      break found if capped.call(found)

      text = fetch_content(snap)
      next unless text

      scan_content(text, pattern).each do |offset|
        found << SearchResult.new(
          url: snap.original_url,
          snapshot: snap,
          context: extract_context(text, offset, query.length),
          match_offset: offset,
        )
        break if capped.call(found)
      end
    end
  end

  # Literal (escaped) multiline pattern; case-insensitive unless asked
  # otherwise.
  def build_pattern(query, case_sensitive)
    flags = Regexp::MULTILINE
    flags |= Regexp::IGNORECASE unless case_sensitive
    Regexp.new(Regexp.escape(query), flags)
  end

  # Content of the snapshot when the fetched page reports text; nil for
  # non-text pages and when the fetch raises Error.
  def fetch_content(snapshot)
    page = @fetcher.fetch(snapshot.original_url, timestamp: snapshot.timestamp)
    page.text? ? page.content : nil
  rescue Error
    nil
  end

  # Start offsets of every match of pattern inside content.
  def scan_content(content, pattern)
    content.enum_for(:scan, pattern).map { Regexp.last_match.begin(0) }
  end

  # Excerpt of up to CONTEXT_RADIUS characters on each side of the hit,
  # marked with "..." where it was cut, with line breaks replaced by
  # spaces and outer whitespace stripped.
  def extract_context(content, offset, length)
    first = [offset - CONTEXT_RADIUS, 0].max
    last = [offset + length + CONTEXT_RADIUS, content.length].min

    lead = first.positive? ? "..." : ""
    tail = last < content.length ? "..." : ""
    "#{lead}#{content[first...last]}#{tail}".tr("\n\r", " ").strip
  end
end
|
|
120
|
+
end
|
data/lib/archaeo/cli.rb
CHANGED
|
@@ -12,6 +12,8 @@ module Archaeo
|
|
|
12
12
|
|
|
13
13
|
class_option :quiet, type: :boolean, default: false,
|
|
14
14
|
desc: "Suppress progress messages"
|
|
15
|
+
class_option :no_color, type: :boolean, default: false,
|
|
16
|
+
desc: "Disable colored output"
|
|
15
17
|
|
|
16
18
|
def self.exit_on_failure?
|
|
17
19
|
true
|
|
@@ -351,6 +353,68 @@ module Archaeo
|
|
|
351
353
|
end
|
|
352
354
|
end
|
|
353
355
|
|
|
356
|
+
desc "search URL QUERY", "Search archived snapshots for text"
|
|
357
|
+
option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
|
|
358
|
+
option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
|
|
359
|
+
option :max_results, type: :numeric, desc: "Maximum results to return"
|
|
360
|
+
option :case_sensitive, type: :boolean, default: false,
|
|
361
|
+
desc: "Case-sensitive search"
|
|
362
|
+
option :format, desc: "Output format (table, json)", default: "table"
|
|
363
|
+
# Command body: searches the archived snapshots of +url+ for +query+.
# Time bounds, result cap and case sensitivity come from the parsed
# command-line options; the hits are handed to output_search_results.
def search(url, query)
  handle_errors do
    finder = ArchiveSearch.new
    criteria = {
      query: query,
      from: options[:from],
      to: options[:to],
      max_results: options[:max_results],
      case_sensitive: options[:case_sensitive],
    }
    output_search_results(finder.search(url, **criteria))
  end
end
|
|
375
|
+
|
|
376
|
+
desc "track-changes URL",
|
|
377
|
+
"Track content changes over time"
|
|
378
|
+
option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
|
|
379
|
+
option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
|
|
380
|
+
option :format, desc: "Output format (table, json)", default: "table"
|
|
381
|
+
# Command body: builds a content-change report for +url+, bounded by
# the optional --from/--to options, and prints it through
# output_content_changes.
def track_changes(url)
  handle_errors do
    window = { from: options[:from], to: options[:to] }
    output_content_changes(ContentTracker.new.track(url, **window))
  end
end
|
|
388
|
+
|
|
389
|
+
desc "warc-export URL", "Export snapshots to WARC format"
|
|
390
|
+
option :output, desc: "Output WARC file path", required: true
|
|
391
|
+
option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
|
|
392
|
+
option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
|
|
393
|
+
option :gzip, type: :boolean, default: false,
|
|
394
|
+
desc: "Write gzip-compressed WARC (.warc.gz)"
|
|
395
|
+
# Command body: downloads every successful snapshot of +url+ (within
# the optional --from/--to bounds) and writes them to the WARC file
# named by --output. Snapshots whose fetch raises Error are skipped.
# A one-line summary is printed to stderr when done.
def warc_export(url)
  handle_errors do
    fetcher = Fetcher.new
    cdx = CdxApi.new
    opts = {}
    opts[:from] = options[:from] if options[:from]
    opts[:to] = options[:to] if options[:to]
    snapshots = cdx.snapshots(url, **opts).select(&:success?).to_a

    pages = snapshots.filter_map do |snap|
      fetcher.fetch(snap.original_url, timestamp: snap.timestamp)
    rescue Error
      nil
    end

    # --gzip defaults to false, and passing false would switch off
    # WarcWriter's detection of a ".gz" output path (it only applies
    # when compress is nil). Pass nil unless the flag was actually set
    # so "--output x.warc.gz" produces a real gzip file.
    compress = options[:gzip] ? true : nil
    WarcWriter.new.write(options[:output], pages, compress: compress)
    color = build_color
    warn color.success("Exported #{pages.size} snapshots to #{options[:output]}")
  end
end
|
|
417
|
+
|
|
354
418
|
CDX_OPTION_MAP = {
|
|
355
419
|
from: :from,
|
|
356
420
|
to: :to,
|
|
@@ -370,16 +434,16 @@ module Archaeo
|
|
|
370
434
|
# Runs the given block and turns the library's error classes into a
# colored one-line message on stderr followed by exit status 1.
# Rate limiting is reported as a warning, everything else as an error.
# The more specific classes are listed before the generic Error.
def handle_errors
  yield
rescue RateLimitError => rate_limited
  warn(build_color.warning("Rate limited: #{rate_limited.message}"))
  exit(1)
rescue NoSnapshotFound => missing
  warn(build_color.error("Not found: #{missing.message}"))
  exit(1)
rescue BlockedSiteError => blocked
  warn(build_color.error("Blocked: #{blocked.message}"))
  exit(1)
rescue Error => failure
  warn(build_color.error("Error: #{failure.message}"))
  exit(1)
end
|
|
385
449
|
|
|
@@ -746,5 +810,53 @@ module Archaeo
|
|
|
746
810
|
end
|
|
747
811
|
end
|
|
748
812
|
end
|
|
813
|
+
|
|
814
|
+
# Prints search hits. With --format json the hits are emitted as one
# JSON array on stdout. Otherwise each hit is printed as a
# "timestamp url" line followed by its context excerpt and a blank
# line, and a result count (or a "no results" notice) goes to stderr.
def output_search_results(results)
  if options[:format] == "json"
    puts JSON.generate(results.map(&:as_json))
    return
  end

  if results.empty?
    warn "No results found."
    return
  end

  results.each do |hit|
    puts "#{hit.snapshot.timestamp} #{hit.url}"
    puts " #{hit.context}"
    puts
  end
  warn "#{results.size} result(s) found."
end
|
|
831
|
+
|
|
832
|
+
# Prints a content-change report. With --format json the report's
# #as_json is emitted as JSON; otherwise a summary block is printed,
# followed by one section per non-empty list of changed, new and
# removed URLs.
def output_content_changes(report)
  if options[:format] == "json"
    puts JSON.generate(report.as_json)
    return
  end

  puts "URL: #{report.url}"
  puts "Total snapshots: #{report.total_snapshots}"
  puts "Unique digests: #{report.unique_digests}"
  puts "URLs changed: #{report.changed_urls.size}"
  puts "URLs added: #{report.new_urls.size}"
  puts "URLs removed: #{report.removed_urls.size}"

  # [section heading, per-line prefix, report reader]
  sections = [
    ["Changed URLs:", " ", :changed_urls],
    ["New URLs:", " + ", :new_urls],
    ["Removed URLs:", " - ", :removed_urls],
  ]
  sections.each do |heading, bullet, reader|
    urls = report.public_send(reader)
    next if urls.empty?

    puts heading
    urls.each { |entry| puts "#{bullet}#{entry}" }
  end
end
|
|
857
|
+
|
|
858
|
+
# Builds the color helper used for CLI messages.
#
# --no-color forces colors off. Otherwise +enabled+ is left nil so
# that ColorOutput runs its own detection (NO_COLOR, TERM=dumb, TTY
# check). Passing a plain boolean here would bypass that detection
# and emit escape codes even when stderr is redirected to a file.
def build_color
  ColorOutput.new(enabled: options[:no_color] ? false : nil)
end
|
|
749
861
|
end
|
|
750
862
|
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# Minimal ANSI color helper for CLI output.
#
# Wraps strings in SGR escape sequences when coloring is enabled and
# returns them untouched otherwise. When +enabled+ is not given, it is
# detected from the environment: colors are off if NO_COLOR is set to
# a non-empty value, if TERM is "dumb", or if the stream is not a TTY.
class ColorOutput
  # Foreground color name => SGR code.
  COLORS = {
    red: 31,
    green: 32,
    yellow: 33,
    blue: 34,
    magenta: 35,
    cyan: 36,
    white: 37,
  }.freeze

  # Text style name => SGR code.
  STYLES = {
    bold: 1,
    dim: 2,
  }.freeze

  # enabled: true/false to force coloring on or off, nil to auto-detect.
  # stream:  the IO consulted by auto-detection (defaults to $stderr).
  def initialize(enabled: nil, stream: $stderr)
    @enabled = enabled.nil? ? detect_color_support(stream) : enabled
  end

  # Defines one public method per color and style (red, bold, ...),
  # each taking the text to wrap.
  COLORS.merge(STYLES).each do |name, code|
    define_method(name) do |text|
      colorize(text, code)
    end
  end

  # Bold green, for success messages.
  def success(text)
    green(bold(text))
  end

  # Bold yellow, for warnings.
  def warning(text)
    yellow(bold(text))
  end

  # Bold red, for errors.
  def error(text)
    red(bold(text))
  end

  # Cyan, for informational messages.
  def info(text)
    cyan(text)
  end

  private

  # Wraps text in the SGR sequence for code followed by a reset, or
  # returns text unchanged when coloring is disabled.
  def colorize(text, code)
    return text unless @enabled

    "\e[#{code}m#{text}\e[0m"
  end

  # Decides whether colors should be used for stream.
  def detect_color_support(stream)
    return false if stream.nil?
    # An empty string is truthy in Ruby, so test for emptiness
    # explicitly: NO_COLOR only counts when set to a non-empty value.
    return false unless ENV.fetch("NO_COLOR", "").empty?
    return false if ENV["TERM"] == "dumb"

    stream.respond_to?(:tty?) && stream.tty?
  end
end
|
|
73
|
+
end
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "digest"
|
|
4
|
+
require "set"
|
|
5
|
+
|
|
6
|
+
module Archaeo
|
|
7
|
+
# Summary of how the content archived for a URL changed over a time
# range: which URLs changed, appeared or disappeared, how many distinct
# contents each URL had, and overall snapshot/digest counts.
ContentChangeReport = Struct.new(
  :url, :from, :to,
  :changed_urls, :new_urls, :removed_urls,
  :content_frequency, :total_snapshots, :unique_digests,
  keyword_init: true
) do
  # True when at least one URL changed, was added or was removed.
  def any_changes?
    [changed_urls, new_urls, removed_urls].any? { |urls| !urls.empty? }
  end

  # Hash form with every member in declaration order; the from/to
  # bounds are rendered with #to_s.
  def to_h
    members.each_with_object({}) do |field, out|
      value = self[field]
      out[field] = %i[from to].include?(field) ? value.to_s : value
    end
  end

  # JSON-ready representation; any arguments are accepted and ignored.
  def as_json(*_ignored)
    to_h
  end
end
|
|
36
|
+
|
|
37
|
+
# Tracks content changes for a URL across archived snapshots.
#
# Lists the successful snapshots for the URL through the CDX API,
# groups them by original URL and compares their digests to decide
# which URLs changed. URLs that appear only in the later (or only in
# the earlier) part of the time range are reported as new (or removed).
class ContentTracker
  # cdx_api: object answering #snapshots(url, **opts)
  # fetcher: accepted for interface compatibility; #track relies on
  #          CDX digests only and never downloads content.
  def initialize(cdx_api: CdxApi.new, fetcher: Fetcher.new)
    @cdx = cdx_api
    @fetcher = fetcher
  end

  # Builds a ContentChangeReport for +url+.
  #
  # from/to are optional bounds; when given they are coerced through
  # Timestamp and passed to the CDX query as strings.
  def track(url, from: nil, to: nil)
    url = UrlNormalizer.normalize(url)
    ts_from = from ? Timestamp.coerce(from) : nil
    ts_to = to ? Timestamp.coerce(to) : nil

    opts = {}
    opts[:from] = ts_from.to_s if ts_from
    opts[:to] = ts_to.to_s if ts_to

    snapshots = @cdx.snapshots(url, **opts).select(&:success?).to_a
    analyze(url, ts_from, ts_to, snapshots, group_by_url(snapshots))
  end

  private

  # Snapshots keyed by their original URL.
  def group_by_url(snapshots)
    snapshots.group_by(&:original_url)
  end

  # Assembles the report from all snapshots and their per-URL grouping.
  def analyze(url, ts_from, ts_to, all_snapshots, grouped)
    # Number of distinct contents seen per URL; more than one means
    # the URL changed at some point.
    frequency = grouped.transform_values { |snaps| distinct_digests(snaps).size }
    changed = frequency.select { |_original_url, count| count > 1 }.keys

    sorted = all_snapshots.sort_by(&:timestamp)
    timestamps = sorted.map(&:timestamp).uniq

    new_urls = []
    removed = []
    # Comparing an earlier and a later part needs at least two
    # distinct capture times.
    if timestamps.size >= 2
      first_half, second_half = split_by_time(sorted, timestamps)
      first_urls = Set.new(first_half.map(&:original_url))
      second_urls = Set.new(second_half.map(&:original_url))

      new_urls = (second_urls - first_urls).to_a.sort
      removed = (first_urls - second_urls).to_a.sort
    end

    ContentChangeReport.new(
      url: url,
      from: ts_from,
      to: ts_to,
      changed_urls: changed.sort,
      new_urls: new_urls,
      removed_urls: removed,
      content_frequency: frequency,
      total_snapshots: all_snapshots.size,
      unique_digests: distinct_digests(all_snapshots).size,
    )
  end

  # Distinct non-blank digests of the given snapshots. A missing (nil)
  # digest is treated like an empty one instead of raising.
  def distinct_digests(snapshots)
    snapshots.map { |snap| snap.digest.to_s }.reject(&:empty?).uniq
  end

  # Splits time-sorted snapshots at the middle distinct timestamp:
  # everything strictly before it, and everything from it onwards.
  def split_by_time(snapshots, timestamps)
    mid = timestamps[timestamps.size / 2]
    snapshots.partition { |snap| snap.timestamp < mid }
  end
end
|
|
117
|
+
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Archaeo
|
|
4
|
+
# Fetches CDX result pages concurrently for faster bulk queries.
#
# Asks the wrapped CDX API how many result pages a query has. Single
# page queries are delegated unchanged; multi-page queries are fetched
# by a small pool of threads and merged back in page order.
class ParallelCdx
  DEFAULT_CONCURRENCY = 4

  # cdx_api:     object answering #num_pages and #snapshots
  # concurrency: maximum number of worker threads (at least 1)
  def initialize(cdx_api: CdxApi.new, concurrency: DEFAULT_CONCURRENCY)
    @cdx = cdx_api
    @concurrency = [concurrency.to_i, 1].max
  end

  # Returns the snapshots for url. For multi-page results this is an
  # Array holding the rows of page 0, then page 1, and so on.
  def snapshots(url, **options)
    # to_i so that a nil page count falls back to the plain query
    # instead of failing the comparison.
    pages = @cdx.num_pages(url, **options).to_i
    return @cdx.snapshots(url, **options) if pages <= 1

    fetch_parallel(url, options, pages)
  end

  private

  # Fetches pages 0...total_pages with a worker pool and concatenates
  # the per-page results in page order. The first error raised by a
  # page fetch is re-raised after all workers have stopped.
  def fetch_parallel(url, options, total_pages)
    pending = Queue.new
    total_pages.times { |page_num| pending << page_num }
    pending.close # pop returns nil once the queue is drained

    results = Array.new(total_pages)
    lock = Mutex.new
    failure = nil

    # Never start more workers than there are pages to fetch.
    workers = Array.new([@concurrency, total_pages].min) do
      Thread.new do
        while (page_num = pending.pop)
          begin
            rows = @cdx.snapshots(url, **options.merge(page: page_num)).to_a
            lock.synchronize { results[page_num] = rows }
          rescue StandardError => e
            lock.synchronize { failure ||= e }
            pending.clear # let the other workers wind down
          end
        end
      end
    end

    workers.each(&:join)
    raise failure if failure

    # One level only: rows that are themselves arrays stay intact.
    results.compact.flatten(1)
  end
end
|
|
47
|
+
end
|
data/lib/archaeo/version.rb
CHANGED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "digest"
|
|
4
|
+
require "time"
|
|
5
|
+
require "zlib"
|
|
6
|
+
|
|
7
|
+
module Archaeo
|
|
8
|
+
# Reads WARC (Web ARChive) format files (.warc, .warc.gz).
#
# Parses records made of a CRLF-separated header block, a blank line
# and a body of Content-Length bytes, and yields each one as a
# WarcRecord holding the version, the parsed headers and the body.
#
# NOTE(review): a ".gz" path is opened with Zlib::GzipReader.open; if
# per-record (multi-member) gzip files must be supported, confirm that
# this reads past the first member.
class WarcReader
  WARC_VERSION = "WARC/1.0"
  CRLF = "\r\n"
  HEADER_END = "\r\n\r\n"
  # Number of bytes requested from the IO per read.
  CHUNK_SIZE = 8192

  def initialize
    @record_count = 0
  end

  # Yields every record found in the file at path. Without a block an
  # Enumerator over the records is returned instead.
  def read(path, &block)
    return enum_for(:read, path) unless block

    io = open_warc(path)
    read_records_from_io(io, &block)
  ensure
    io&.close
  end

  # All records of the file at path, as an Array.
  def read_records(path)
    records = []
    read(path) { |record| records << record }
    records
  end

  private

  # Gzip reader for paths ending in ".gz", binary file otherwise.
  def open_warc(path)
    if path.end_with?(".gz")
      Zlib::GzipReader.open(path)
    else
      File.open(path, "rb")
    end
  end

  # Reads the IO chunk by chunk, yielding every record that is complete
  # in the buffer so far. Whatever is left at end of input is parsed
  # once more in "final" mode, which accepts a truncated body.
  def read_records_from_io(io)
    # Binary buffer: String#index then returns byte offsets, matching
    # the byteslice arithmetic in try_parse_record.
    buffer = String.new(encoding: Encoding::BINARY)
    loop do
      chunk = io.read(CHUNK_SIZE)
      buffer << chunk.b if chunk

      while (record = try_parse_record(buffer))
        yield record
      end

      break unless chunk
    end

    return if buffer.strip.empty?

    record = try_parse_record(buffer, final: true)
    yield record if record
  end

  # Parses one record from the front of buffer and removes it (plus
  # the CRLF separators around it) from the buffer. Returns nil when
  # the buffer does not yet hold a complete record, or when the header
  # block carries no WARC-Type.
  def try_parse_record(buffer, final: false)
    # Separator CRLFs of the previous record may only arrive with a
    # later chunk. Left in place they would look like an empty header
    # block at offset 0 and stall parsing for the rest of the file.
    drop_leading_separators(buffer)

    header_end = buffer.index(HEADER_END)
    return nil unless header_end

    header_block = buffer.byteslice(0, header_end)
    headers = parse_warc_headers(header_block.split(CRLF))
    return nil unless headers[:warc_type]

    content_length = headers[:content_length].to_i
    body_start = header_end + HEADER_END.bytesize
    body_end = body_start + content_length

    return nil unless final || buffer.bytesize >= body_end

    body = buffer.byteslice(body_start, content_length).to_s
    record = WarcRecord.new(
      version: headers.delete(:version),
      headers: headers,
      body: body,
    )

    total_consumed = body_end
    total_consumed += CRLF.bytesize while buffer.byteslice(total_consumed, CRLF.bytesize) == CRLF

    remaining = buffer.byteslice(total_consumed, buffer.bytesize - total_consumed)
    buffer.replace(remaining.to_s)
    record
  end

  # Removes any run of CRLF pairs from the start of buffer, in place.
  def drop_leading_separators(buffer)
    skipped = 0
    skipped += CRLF.bytesize while buffer.byteslice(skipped, CRLF.bytesize) == CRLF
    return unless skipped.positive?

    buffer.replace(buffer.byteslice(skipped, buffer.bytesize - skipped).to_s)
  end

  # Turns header lines into a Hash. The "WARC/x.y" line is stored under
  # :version; "Name: value" lines are stored under the lower-cased,
  # underscored name as a Symbol. Parsing stops at a blank line; other
  # unrecognized lines are ignored.
  def parse_warc_headers(lines)
    headers = {}
    lines.each do |line|
      case line
      when /\AWARC\/(\d+\.\d+)\z/
        headers[:version] = Regexp.last_match(1)
      when /\A([^:]+):\s*(.*)\z/
        key = Regexp.last_match(1).downcase.tr("-", "_").to_sym
        headers[key] = Regexp.last_match(2)
      else
        break if line.strip.empty?
      end
    end
    headers
  end
end
|
|
109
|
+
|
|
110
|
+
# Writes snapshots to WARC format files (.warc, .warc.gz).
#
# The file starts with a "warcinfo" record describing the writer,
# followed by one "response" record per page. Each response record
# carries a small synthetic HTTP header block and the page content.
class WarcWriter
  WARC_VERSION = "WARC/1.0"
  RECORD_SEP = "\r\n\r\n"
  CRLF = "\r\n"

  # software: value of the "software" field in the warcinfo record.
  def initialize(software: "archaeo/#{VERSION}")
    @software = software
    @record_count = 0
  end

  # Writes the warcinfo record and one response record per page to
  # path. When compress is nil, gzip is used if path ends in ".gz".
  def write(path, pages, compress: nil)
    compress = path.end_with?(".gz") if compress.nil?
    io = open_warc(path, compress)
    write_warcinfo(io, path)
    pages.each { |page| write_page(io, page) }
  ensure
    io&.close
  end

  private

  # Gzip writer when compress is truthy, binary file otherwise.
  def open_warc(path, compress)
    if compress
      Zlib::GzipWriter.open(path)
    else
      File.open(path, "wb")
    end
  end

  # Emits the leading warcinfo record (software, format, file name).
  def write_warcinfo(io, filename)
    fields = {
      software: @software,
      format: "WARC File Format 1.0",
      filename: File.basename(filename),
    }
    body = fields.map { |k, v| "#{k}: #{v}" }.join(CRLF) + CRLF
    headers = warc_headers(
      type: "warcinfo",
      record_id: generate_record_id,
      date: Time.now.utc.iso8601,
      content_type: "application/warc-fields",
      content_length: body.bytesize,
    )
    io.write(headers.b + body.b + RECORD_SEP)
  end

  # Emits one response record for page: WARC headers, then the HTTP
  # header block plus the page content as the record body.
  def write_page(io, page)
    date = page.timestamp.to_time.utc.iso8601

    # Work on bytes so that text headers and binary content can be
    # joined regardless of their string encodings.
    full_body = build_http_headers(page).b + page.content.to_s.b

    headers = warc_headers(
      type: "response",
      record_id: generate_record_id,
      date: date,
      target_uri: page.original_url.to_s,
      content_type: "application/http;msgtype=response",
      content_length: full_body.bytesize,
    )

    io.write(headers.b + full_body + RECORD_SEP)
  end

  # Synthetic HTTP header block for page, ending with the blank line
  # that separates HTTP headers from the payload. Content-Length is
  # the byte size of the content actually written after it.
  def build_http_headers(page)
    parts = ["HTTP/1.1 #{page.status_code}"]
    parts << "Content-Type: #{page.content_type}"
    parts << "Content-Length: #{page.content.to_s.bytesize}"
    parts.join(CRLF) + CRLF + CRLF
  end

  # WARC header block for one record, including the blank line that
  # ends it. Target URI and content type lines are optional.
  def warc_headers(type:, record_id:, date:, target_uri: nil,
                   content_type: nil, content_length: 0)
    lines = [
      WARC_VERSION.to_s,
      "WARC-Type: #{type}",
      "WARC-Record-ID: #{record_id}",
      "WARC-Date: #{date}",
    ]
    lines << "WARC-Target-URI: #{target_uri}" if target_uri
    lines << "Content-Type: #{content_type}" if content_type
    lines << "Content-Length: #{content_length}"
    lines.join(CRLF) + RECORD_SEP
  end

  # Record identifier in "<urn:uuid:...>" form. The hex groups come
  # from a SHA-256 over the current time, a per-writer counter and a
  # random number.
  def generate_record_id
    @record_count += 1
    uuid = Digest::SHA256.hexdigest(
      "#{Time.now.utc.to_f}-#{@record_count}-#{rand(1 << 32)}",
    )
    "<urn:uuid:#{uuid[0, 8]}-#{uuid[8, 4]}-#{uuid[12, 4]}-" \
      "#{uuid[16, 4]}-#{uuid[20, 12]}>"
  end
end
|
|
211
|
+
|
|
212
|
+
# A single WARC record: the protocol version, the parsed header Hash
# (keyed by underscored Symbols such as :warc_type) and the raw body.
WarcRecord = Struct.new(:version, :headers, :body, keyword_init: true) do
  # Reader name => header key it exposes.
  {
    warc_type: :warc_type,
    target_uri: :warc_target_uri,
    date: :warc_date,
    content_type: :content_type,
  }.each do |reader, header_key|
    define_method(reader) { headers[header_key] }
  end

  # Declared Content-Length as an Integer (0 when absent).
  def content_length
    headers[:content_length].to_i
  end

  # True for "response" records.
  def response?
    warc_type == "response"
  end

  # True for "warcinfo" records.
  def warcinfo?
    warc_type == "warcinfo"
  end

  # Compact Hash form: version, headers and the body size in bytes
  # (the body itself is left out).
  def to_h
    { version: version, headers: headers, body_length: body.to_s.bytesize }
  end
end
|
|
249
|
+
end
|
data/lib/archaeo.rb
CHANGED
|
@@ -60,4 +60,13 @@ module Archaeo
|
|
|
60
60
|
autoload :Configuration, "archaeo/configuration"
|
|
61
61
|
autoload :CoverageReport, "archaeo/coverage_report"
|
|
62
62
|
autoload :ProgressReport, "archaeo/progress_report"
|
|
63
|
+
autoload :ColorOutput, "archaeo/color_output"
|
|
64
|
+
autoload :WarcReader, "archaeo/warc_support"
|
|
65
|
+
autoload :WarcWriter, "archaeo/warc_support"
|
|
66
|
+
autoload :WarcRecord, "archaeo/warc_support"
|
|
67
|
+
autoload :ParallelCdx, "archaeo/parallel_cdx"
|
|
68
|
+
autoload :ContentTracker, "archaeo/content_tracker"
|
|
69
|
+
autoload :ContentChangeReport, "archaeo/content_tracker"
|
|
70
|
+
autoload :ArchiveSearch, "archaeo/archive_search"
|
|
71
|
+
autoload :SearchResult, "archaeo/archive_search"
|
|
63
72
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: archaeo
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.
|
|
4
|
+
version: 0.2.10
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-13 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: csv
|
|
@@ -72,6 +72,7 @@ files:
|
|
|
72
72
|
- exe/archaeo
|
|
73
73
|
- lib/archaeo.rb
|
|
74
74
|
- lib/archaeo/archive_health_check.rb
|
|
75
|
+
- lib/archaeo/archive_search.rb
|
|
75
76
|
- lib/archaeo/archive_url.rb
|
|
76
77
|
- lib/archaeo/asset_extractor.rb
|
|
77
78
|
- lib/archaeo/asset_list.rb
|
|
@@ -83,7 +84,9 @@ files:
|
|
|
83
84
|
- lib/archaeo/cdx_filter.rb
|
|
84
85
|
- lib/archaeo/cdx_timeline.rb
|
|
85
86
|
- lib/archaeo/cli.rb
|
|
87
|
+
- lib/archaeo/color_output.rb
|
|
86
88
|
- lib/archaeo/configuration.rb
|
|
89
|
+
- lib/archaeo/content_tracker.rb
|
|
87
90
|
- lib/archaeo/coverage_report.rb
|
|
88
91
|
- lib/archaeo/download_scheduler.rb
|
|
89
92
|
- lib/archaeo/download_state.rb
|
|
@@ -92,6 +95,7 @@ files:
|
|
|
92
95
|
- lib/archaeo/http_client.rb
|
|
93
96
|
- lib/archaeo/page.rb
|
|
94
97
|
- lib/archaeo/page_bundle.rb
|
|
98
|
+
- lib/archaeo/parallel_cdx.rb
|
|
95
99
|
- lib/archaeo/path_sanitizer.rb
|
|
96
100
|
- lib/archaeo/pattern_filter.rb
|
|
97
101
|
- lib/archaeo/progress_report.rb
|
|
@@ -105,6 +109,7 @@ files:
|
|
|
105
109
|
- lib/archaeo/url_normalizer.rb
|
|
106
110
|
- lib/archaeo/url_rewriter.rb
|
|
107
111
|
- lib/archaeo/version.rb
|
|
112
|
+
- lib/archaeo/warc_support.rb
|
|
108
113
|
- sig/archaeo.rbs
|
|
109
114
|
homepage: https://github.com/riboseinc/archaeo
|
|
110
115
|
licenses:
|