archaeo 0.2.9 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fb2b99e313bf2a3ac807cebf0052d369d83c8514ad89a1b9ca18deed421a0c4d
4
- data.tar.gz: fa1f9536f838d8246706d5eca3350ba4eae97546ae88c48e28a27e5df952d987
3
+ metadata.gz: 235d2cba1b1e071156a873d7a63cf0fdb6ba8079eb6083e21755e723727db6d9
4
+ data.tar.gz: 65c040c3a5984fdc1a68ca106d9ae10eab64b212ce6a72b37bec39ec57d383e2
5
5
  SHA512:
6
- metadata.gz: 148875e2dae2319e4c96d892c2233bd889e3c193a9ddad8995faded2d637ce398a3e971d27ca05c62c93674c3c6db05322814d823d8493c8d0318f052e1278d4
7
- data.tar.gz: ee1d6df5dc3623d6aee2e7803b82306c69099a72b52475e6969ef5a2a4bdff73ea4544faf3aa6fdcd54e4594a1020dc7b5dd05ffe5d437327eda18a5c23d35ec
6
+ metadata.gz: e6eb3cdb88abb87332bbba762bf566643da717ce17557e31ed90a012bd7c164939b5eb719420f74dfa908215e0f604e71b5fb2bb8bcc7de2940e36b80524e963
7
+ data.tar.gz: f52bc54fe3c425eeae28093810f1d90c4200391696ffe9af0e3f91366d619e4e57a425ec1e6a6a8b9aa2465d337bb2b960782fac1c3ce068d7d4b673b8306641
@@ -0,0 +1,120 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Archaeo
4
# Value object describing one query hit inside an archived snapshot.
#
# Members: +url+, +snapshot+ (the snapshot object the hit belongs to),
# +context+ (excerpt around the hit) and +match_offset+ (position of the
# hit inside the searched content).
SearchResult = Struct.new(
  :url, :snapshot, :context, :match_offset,
  keyword_init: true
) do
  # Hash form of the result. All members are copied as-is except the
  # snapshot, which is replaced by its own +as_json+ representation.
  def to_h
    super().merge(snapshot: snapshot.as_json)
  end

  # JSON-ready representation; any arguments are accepted and ignored.
  def as_json(*)
    to_h
  end
end
22
+
23
# Full-text search across archived snapshots.
#
# Lists snapshots through the CDX API, downloads the textual ones and
# scans their content for a literal query string. Every hit is returned
# as a SearchResult carrying an excerpt of the surrounding text.
class ArchiveSearch
  # Number of characters kept on each side of a match in the excerpt.
  CONTEXT_RADIUS = 80

  # @param cdx_api [#snapshots] source of snapshot listings
  # @param fetcher [#fetch] downloads the content of a single snapshot
  def initialize(cdx_api: CdxApi.new, fetcher: Fetcher.new)
    @cdx = cdx_api
    @fetcher = fetcher
  end

  # Searches the archived versions of +url+ for +query+.
  #
  # Only snapshots that report success and whose mimetype contains
  # "text" are downloaded and scanned.
  #
  # @param url [String] URL whose snapshots are searched
  # @param query [String] literal text to look for (never treated as a regex)
  # @param from [Object, nil] lower time bound, coerced through Timestamp
  # @param to [Object, nil] upper time bound, coerced through Timestamp
  # @param max_results [Numeric, nil] stop after this many hits; nil means no limit
  # @param case_sensitive [Boolean] match case exactly when true
  # @return [Array<SearchResult>] hits in snapshot order, then match order
  # @raise [ArgumentError] when +query+ is nil or empty
  def search(url, query:, from: nil, to: nil,
             max_results: nil, case_sensitive: false)
    raise ArgumentError, "query must not be empty" if query.nil? || query.empty?

    url = UrlNormalizer.normalize(url)
    opts = build_options(from, to)

    snapshots = @cdx.snapshots(url, **opts)
                    .select { |s| s.success? && s.mimetype.to_s.include?("text") }
                    .to_a

    find_matches(snapshots, query, case_sensitive, max_results)
  end

  private

  # Builds the CDX query options for the requested time window.
  def build_options(from, to)
    # Collapse on digest; presumably this makes CDX list identical
    # captures only once (confirm against the CdxApi implementation).
    opts = { collapse: ["digest"] }
    opts[:from] = Timestamp.coerce(from).to_s if from
    opts[:to] = Timestamp.coerce(to).to_s if to
    opts
  end

  # Downloads each snapshot in turn and collects SearchResult objects
  # until the snapshots are exhausted or +max_results+ is reached.
  def find_matches(snapshots, query, case_sensitive, max_results)
    results = []
    pattern = build_pattern(query, case_sensitive)

    snapshots.each do |snap|
      break if max_results && results.size >= max_results

      content = fetch_content(snap)
      next unless content

      remaining = max_results && (max_results - results.size)
      match_spans(content, pattern, remaining).each do |offset, length|
        results << SearchResult.new(
          url: snap.original_url,
          snapshot: snap,
          context: extract_context(content, offset, length),
          match_offset: offset,
        )
      end
    end

    results
  end

  # Compiles the query into a regex that matches it literally.
  def build_pattern(query, case_sensitive)
    escaped = Regexp.escape(query)
    case_sensitive ? /#{escaped}/m : /#{escaped}/im
  end

  # Returns the text of a snapshot, or nil when the page is not text or
  # the download fails with a library Error (best effort: such snapshots
  # are simply skipped by the caller).
  def fetch_content(snapshot)
    page = @fetcher.fetch(
      snapshot.original_url, timestamp: snapshot.timestamp
    )
    return unless page.text?

    text = page.content
    # Regex matching raises ArgumentError on strings with invalid byte
    # sequences, which would abort the whole search; replace the bad
    # bytes so archived pages with broken encodings stay searchable.
    text && !text.valid_encoding? ? text.scrub : text
  rescue Error
    nil
  end

  # Finds every occurrence of +pattern+ in +content+.
  #
  # @param limit [Numeric, nil] stop scanning after this many matches
  # @return [Array<Array(Integer, Integer)>] [offset, length] per match
  def match_spans(content, pattern, limit)
    spans = []
    content.scan(pattern) do
      match = Regexp.last_match
      spans << [match.begin(0), match.end(0) - match.begin(0)]
      break if limit && spans.size >= limit
    end
    spans
  end

  # Cuts the excerpt around a match: up to CONTEXT_RADIUS characters on
  # each side, "..." markers where text was cut off, line breaks turned
  # into spaces and surrounding whitespace removed.
  def extract_context(content, offset, length)
    start_pos = [0, offset - CONTEXT_RADIUS].max
    end_pos = [content.length, offset + length + CONTEXT_RADIUS].min

    ctx = content[start_pos...end_pos]
    ctx = "...#{ctx}" if start_pos.positive?
    ctx = "#{ctx}..." if end_pos < content.length
    ctx.tr("\n\r", " ").strip
  end
end
120
+ end
data/lib/archaeo/cli.rb CHANGED
@@ -12,6 +12,8 @@ module Archaeo
12
12
 
13
13
  class_option :quiet, type: :boolean, default: false,
14
14
  desc: "Suppress progress messages"
15
+ class_option :no_color, type: :boolean, default: false,
16
+ desc: "Disable colored output"
15
17
 
16
18
  def self.exit_on_failure?
17
19
  true
@@ -351,6 +353,68 @@ module Archaeo
351
353
  end
352
354
  end
353
355
 
356
+ desc "search URL QUERY", "Search archived snapshots for text"
357
+ option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
358
+ option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
359
+ option :max_results, type: :numeric, desc: "Maximum results to return"
360
+ option :case_sensitive, type: :boolean, default: false,
361
+ desc: "Case-sensitive search"
362
+ option :format, desc: "Output format (table, json)", default: "table"
363
# CLI command: searches the archived snapshots of +url+ for +query+ and
# prints the hits. The time window, result limit and case sensitivity
# come from the command-line options.
def search(url, query)
  handle_errors do
    searcher = ArchiveSearch.new
    criteria = {
      query: query,
      from: options[:from],
      to: options[:to],
      max_results: options[:max_results],
      case_sensitive: options[:case_sensitive],
    }
    output_search_results(searcher.search(url, **criteria))
  end
end
375
+
376
+ desc "track-changes URL",
377
+ "Track content changes over time"
378
+ option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
379
+ option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
380
+ option :format, desc: "Output format (table, json)", default: "table"
381
# CLI command: builds a content-change report for +url+ over the
# optional --from/--to window and prints it.
def track_changes(url)
  handle_errors do
    window = { from: options[:from], to: options[:to] }
    output_content_changes(ContentTracker.new.track(url, **window))
  end
end
388
+
389
+ desc "warc-export URL", "Export snapshots to WARC format"
390
+ option :output, desc: "Output WARC file path", required: true
391
+ option :from, desc: "Start timestamp (YYYYMMDDHHmmss)"
392
+ option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
393
+ option :gzip, type: :boolean, default: false,
394
+ desc: "Write gzip-compressed WARC (.warc.gz)"
395
+ def warc_export(url)
396
+ handle_errors do
397
+ fetcher = Fetcher.new
398
+ cdx = CdxApi.new
399
+ opts = {}
400
+ opts[:from] = options[:from] if options[:from]
401
+ opts[:to] = options[:to] if options[:to]
402
+ snapshots = cdx.snapshots(url, **opts)
403
+ .select(&:success?).to_a
404
+
405
+ pages = snapshots.filter_map do |snap|
406
+ fetcher.fetch(snap.original_url, timestamp: snap.timestamp)
407
+ rescue Error
408
+ nil
409
+ end
410
+
411
+ WarcWriter.new.write(options[:output], pages,
412
+ compress: options[:gzip])
413
+ color = build_color
414
+ warn color.success("Exported #{pages.size} snapshots to #{options[:output]}")
415
+ end
416
+ end
417
+
354
418
  CDX_OPTION_MAP = {
355
419
  from: :from,
356
420
  to: :to,
@@ -370,16 +434,16 @@ module Archaeo
370
434
  def handle_errors
371
435
  yield
372
436
  rescue RateLimitError => e
373
- warn "Rate limited: #{e.message}"
437
+ warn build_color.warning("Rate limited: #{e.message}")
374
438
  exit 1
375
439
  rescue NoSnapshotFound => e
376
- warn "Not found: #{e.message}"
440
+ warn build_color.error("Not found: #{e.message}")
377
441
  exit 1
378
442
  rescue BlockedSiteError => e
379
- warn "Blocked: #{e.message}"
443
+ warn build_color.error("Blocked: #{e.message}")
380
444
  exit 1
381
445
  rescue Error => e
382
- warn "Error: #{e.message}"
446
+ warn build_color.error("Error: #{e.message}")
383
447
  exit 1
384
448
  end
385
449
 
@@ -746,5 +810,53 @@ module Archaeo
746
810
  end
747
811
  end
748
812
  end
813
+
814
# Prints search results: a JSON array on stdout when --format is "json",
# otherwise one timestamp/URL line plus an excerpt line per hit on
# stdout, followed by a summary (or "No results found.") on stderr.
def output_search_results(results)
  if options[:format] == "json"
    puts JSON.generate(results.map(&:as_json))
  elsif results.empty?
    warn "No results found."
  else
    results.each do |result|
      puts "#{result.snapshot.timestamp} #{result.url}",
           " #{result.context}",
           ""
    end
    warn "#{results.size} result(s) found."
  end
end
831
+
832
# Prints a content-change report: JSON when --format is "json",
# otherwise a summary of counts followed by the changed, new and removed
# URL lists (each list is omitted when empty).
def output_content_changes(report)
  if options[:format] == "json"
    puts JSON.generate(report.as_json)
    return
  end

  puts "URL: #{report.url}",
       "Total snapshots: #{report.total_snapshots}",
       "Unique digests: #{report.unique_digests}",
       "URLs changed: #{report.changed_urls.size}",
       "URLs added: #{report.new_urls.size}",
       "URLs removed: #{report.removed_urls.size}"

  # Prints one heading plus its URLs, or nothing for an empty list.
  print_section = lambda do |heading, marker, urls|
    next if urls.empty?

    puts heading
    urls.each { |u| puts "#{marker}#{u}" }
  end

  print_section.call("Changed URLs:", " ", report.changed_urls)
  print_section.call("New URLs:", " + ", report.new_urls)
  print_section.call("Removed URLs:", " - ", report.removed_urls)
end
857
+
858
# Builds the color helper used for CLI messages.
#
# --no-color switches coloring off outright. Otherwise +enabled+ is left
# nil so ColorOutput decides from the stream and environment, instead of
# forcing escape codes into pipes and dumb terminals.
def build_color
  ColorOutput.new(enabled: options[:no_color] ? false : nil)
end
749
861
  end
750
862
  end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Archaeo
4
# Minimal ANSI color helper for CLI output.
#
# Wraps strings in escape codes when coloring is enabled. Unless the
# caller decides explicitly, coloring is enabled only when the stream is
# a TTY, NO_COLOR is unset or empty, and TERM is not "dumb".
class ColorOutput
  # ANSI foreground color codes, one public method per key.
  COLORS = {
    red: 31,
    green: 32,
    yellow: 33,
    blue: 34,
    magenta: 35,
    cyan: 36,
    white: 37,
  }.freeze

  # ANSI text style codes, one public method per key.
  STYLES = {
    bold: 1,
    dim: 2,
  }.freeze

  # @param enabled [Boolean, nil] force coloring on/off; nil auto-detects
  # @param stream [#tty?, nil] stream inspected during auto-detection
  def initialize(enabled: nil, stream: $stderr)
    @enabled = enabled.nil? ? detect_color_support(stream) : enabled
  end

  # Defines red, green, ..., bold, dim: each wraps its argument in the
  # matching escape code (or returns it untouched when disabled).
  COLORS.merge(STYLES).each do |name, code|
    define_method(name) do |text|
      colorize(text, code)
    end
  end

  # Bold green, used for success messages.
  def success(text)
    green(bold(text))
  end

  # Bold yellow, used for warnings.
  def warning(text)
    yellow(bold(text))
  end

  # Bold red, used for errors.
  def error(text)
    red(bold(text))
  end

  # Cyan, used for informational messages.
  def info(text)
    cyan(text)
  end

  private

  # Wraps +text+ in the escape sequence for +code+ followed by a reset.
  def colorize(text, code)
    return text unless @enabled

    "\e[#{code}m#{text}\e[0m"
  end

  # Decides whether escape codes should be emitted for +stream+.
  def detect_color_support(stream)
    return false if stream.nil?
    # The NO_COLOR convention only applies when the variable is present
    # and not an empty string.
    return false unless ENV["NO_COLOR"].to_s.empty?
    return false if ENV["TERM"] == "dumb"

    # Streams that cannot report TTY status are treated as non-terminals.
    stream.respond_to?(:tty?) && stream.tty?
  end
end
73
+ end
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "digest"
4
+ require "set"
5
+
6
+ module Archaeo
7
# Value object holding the outcome of a content-tracking run: the URL
# and time window that were analyzed, the lists of changed / new /
# removed URLs, per-URL content frequency and overall counters.
ContentChangeReport = Struct.new(
  :url, :from, :to,
  :changed_urls, :new_urls, :removed_urls,
  :content_frequency, :total_snapshots, :unique_digests,
  keyword_init: true
) do
  # True when at least one of the three URL lists is non-empty.
  def any_changes?
    [changed_urls, new_urls, removed_urls].any? { |urls| !urls.empty? }
  end

  # Hash form of the report with the window bounds rendered as strings.
  def to_h
    super().merge(from: from.to_s, to: to.to_s)
  end

  # JSON-ready representation; any arguments are accepted and ignored.
  def as_json(*)
    to_h
  end
end
36
+
37
# Tracks content changes for a URL across archived snapshots.
#
# Groups the successful snapshots by their original URL and uses the
# snapshot digests to tell how often the content behind each URL
# changed within the requested time range.
class ContentTracker
  # @param cdx_api [#snapshots] source of snapshot listings
  # @param fetcher [Object] stored but not used by the methods of this class
  def initialize(cdx_api: CdxApi.new, fetcher: Fetcher.new)
    @cdx = cdx_api
    @fetcher = fetcher
  end

  # Builds the change report for +url+.
  #
  # @param url [String] URL to analyze (normalized before querying)
  # @param from [Object, nil] lower time bound, coerced through Timestamp
  # @param to [Object, nil] upper time bound, coerced through Timestamp
  # @return [ContentChangeReport]
  def track(url, from: nil, to: nil)
    url = UrlNormalizer.normalize(url)
    ts_from = from ? Timestamp.coerce(from) : nil
    ts_to = to ? Timestamp.coerce(to) : nil

    opts = {}
    opts[:from] = ts_from.to_s if ts_from
    opts[:to] = ts_to.to_s if ts_to

    snapshots = @cdx.snapshots(url, **opts)
                    .select(&:success?).to_a

    grouped = group_by_url(snapshots)
    analyze(url, ts_from, ts_to, snapshots, grouped)
  end

  private

  # Buckets snapshots by the URL they were captured from.
  def group_by_url(snapshots)
    snapshots.group_by(&:original_url)
  end

  # Assembles the report from all snapshots and their per-URL grouping.
  def analyze(url, ts_from, ts_to, all_snapshots, grouped)
    # Number of distinct digests seen per URL.
    frequency = grouped.transform_values { |snaps| distinct_digests(snaps).size }
    # A URL counts as changed once it has been seen with two digests.
    changed = frequency.select { |_url, count| count > 1 }.keys.sort
    new_urls, removed = appearance_changes(all_snapshots)

    ContentChangeReport.new(
      url: url,
      from: ts_from,
      to: ts_to,
      changed_urls: changed,
      new_urls: new_urls,
      removed_urls: removed,
      content_frequency: frequency,
      total_snapshots: all_snapshots.size,
      unique_digests: distinct_digests(all_snapshots).size,
    )
  end

  # Distinct non-empty digests among +snapshots+. A missing (nil) digest
  # is treated like an empty one instead of raising.
  def distinct_digests(snapshots)
    snapshots.map { |snap| snap.digest.to_s }.reject(&:empty?).uniq
  end

  # Compares the URLs seen in the earlier and later part of the range.
  #
  # @return [Array(Array, Array)] sorted URLs that appear only in the
  #   later part, and sorted URLs that appear only in the earlier part;
  #   both empty when fewer than two distinct timestamps exist
  def appearance_changes(snapshots)
    sorted = snapshots.sort_by(&:timestamp)
    timestamps = sorted.map(&:timestamp).uniq
    return [[], []] if timestamps.size < 2

    first_half, second_half = split_by_time(sorted, timestamps)
    first_urls = Set.new(first_half.map(&:original_url))
    second_urls = Set.new(second_half.map(&:original_url))

    [(second_urls - first_urls).to_a.sort, (first_urls - second_urls).to_a.sort]
  end

  # Splits snapshots at the middle distinct timestamp: strictly earlier
  # snapshots first, the rest second.
  def split_by_time(snapshots, timestamps)
    mid = timestamps[timestamps.size / 2]
    snapshots.partition { |s| s.timestamp < mid }
  end
end
117
+ end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Archaeo
4
# Fetches CDX pages in parallel for faster bulk queries.
#
# Wraps a CDX API object and downloads several result pages at once on
# worker threads, then merges the pages back in page order.
class ParallelCdx
  # Worker threads used when the caller does not specify a count.
  DEFAULT_CONCURRENCY = 4

  # @param cdx_api [#num_pages, #snapshots] API used for the actual requests
  # @param concurrency [#to_i] maximum number of worker threads (at least 1)
  def initialize(cdx_api: CdxApi.new, concurrency: DEFAULT_CONCURRENCY)
    @cdx = cdx_api
    @concurrency = [concurrency.to_i, 1].max
  end

  # Returns the snapshots for +url+.
  #
  # With at most one result page the call is passed straight to the
  # wrapped API and its return value is handed back unchanged; otherwise
  # all pages are fetched concurrently and returned as one Array.
  #
  # @raise whatever the wrapped API raises; with several failing pages
  #   the first recorded error is raised
  def snapshots(url, **options)
    pages = @cdx.num_pages(url, **options)
    return @cdx.snapshots(url, **options) if pages <= 1

    fetch_parallel(url, options, pages)
  end

  private

  # Downloads pages 0...total_pages on worker threads.
  def fetch_parallel(url, options, total_pages)
    pending = Queue.new
    total_pages.times { |page_num| pending << page_num }
    pending.close # a closed, drained queue makes #pop return nil
    results = Array.new(total_pages)
    lock = Mutex.new
    failure = nil

    # Never start more workers than there are pages to fetch.
    workers = Array.new([@concurrency, total_pages].min) do
      Thread.new do
        while (page_num = pending.pop)
          begin
            rows = @cdx.snapshots(url, **options.merge(page: page_num)).to_a
            lock.synchronize { results[page_num] = rows }
          rescue StandardError => e
            lock.synchronize { failure ||= e }
            # Stop handing out pages: the overall call is going to fail.
            pending.clear
          end
        end
      end
    end

    workers.each(&:join)
    raise failure if failure

    # Merge one level only so rows that are arrays themselves survive.
    results.compact.flatten(1)
  end
end
47
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Archaeo
4
- VERSION = "0.2.9"
4
+ VERSION = "0.2.10"
5
5
  end
@@ -0,0 +1,249 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "digest"
4
+ require "time"
5
+ require "zlib"
6
+
7
+ module Archaeo
8
# Reads WARC (Web ARChive) format files (.warc, .warc.gz).
#
# Parses the records of a file one after another and hands each one to
# the caller as a WarcRecord holding the version, headers and body.
class WarcReader
  WARC_VERSION = "WARC/1.0"
  CRLF = "\r\n"
  # Blank line that separates a record's headers from its body.
  HEADER_END = "\r\n\r\n"
  # Number of bytes requested from the file per read.
  CHUNK_SIZE = 8192

  def initialize
    @record_count = 0
  end

  # Reads +path+ and yields every record found in it.
  #
  # @param path [String, Pathname] file to read; a name ending in ".gz"
  #   is opened through Zlib::GzipReader
  # @yieldparam record [WarcRecord]
  # @return [Enumerator] over the records when no block is given
  def read(path, &block)
    return enum_for(:read, path) unless block

    io = open_warc(path)
    read_records_from_io(io, &block)
  ensure
    io&.close
  end

  # Reads +path+ completely and returns its records as an Array.
  def read_records(path)
    records = []
    read(path) { |record| records << record }
    records
  end

  private

  # Opens the file, transparently decompressing ".gz" files.
  def open_warc(path)
    if path.to_s.end_with?(".gz")
      Zlib::GzipReader.open(path)
    else
      File.open(path, "rb")
    end
  end

  # Feeds the stream into a buffer chunk by chunk and yields each record
  # as soon as it is complete. Whatever is left at end of input gets one
  # final parse that tolerates a truncated body.
  def read_records_from_io(io)
    # Binary buffer: every offset below is a byte offset, so character
    # and byte positions must coincide.
    buffer = String.new(encoding: Encoding::BINARY)
    loop do
      chunk = io.read(CHUNK_SIZE)
      buffer << chunk.b if chunk

      while (record = try_parse_record(buffer))
        yield record
      end

      break unless chunk
    end

    return if buffer.strip.empty?

    record = try_parse_record(buffer, final: true)
    yield record if record
  end

  # Tries to cut one complete record off the front of +buffer+.
  #
  # @param final [Boolean] accept a body shorter than Content-Length
  # @return [WarcRecord, nil] nil when the buffer does not yet hold a
  #   full record (or its headers carry no WARC-Type); on success the
  #   consumed bytes are removed from +buffer+
  def try_parse_record(buffer, final: false)
    header_end = buffer.index(HEADER_END)
    return nil unless header_end

    header_block = buffer.byteslice(0, header_end)
    headers = parse_warc_headers(header_block.split(CRLF))
    return nil unless headers[:warc_type]

    content_length = headers[:content_length].to_i
    body_start = header_end + HEADER_END.length
    body_end = body_start + content_length

    return nil unless final || buffer.bytesize >= body_end

    body = buffer.byteslice(body_start, content_length).to_s
    record = WarcRecord.new(
      version: headers.delete(:version),
      headers: headers,
      body: body,
    )

    # Skip the CRLF pairs that separate this record from the next one.
    total_consumed = body_end
    total_consumed += 2 while buffer.byteslice(total_consumed, 2) == CRLF

    remaining = buffer.byteslice(total_consumed,
                                 buffer.bytesize - total_consumed)
    buffer.replace(remaining.to_s)
    record
  end

  # Turns header lines into a Hash. The "WARC/x.y" line is stored under
  # :version; "Some-Name: value" lines are stored under :some_name.
  # Parsing stops at the first blank line that is neither of those.
  def parse_warc_headers(lines)
    headers = {}
    lines.each do |line|
      case line
      when %r{\AWARC/(\d+\.\d+)\z}
        headers[:version] = Regexp.last_match(1)
      when /\A([^:]+):\s*(.*)\z/
        key = Regexp.last_match(1).downcase.tr("-", "_").to_sym
        headers[key] = Regexp.last_match(2)
      else
        break if line.strip.empty?
      end
    end
    headers
  end
end
109
+
110
# Writes snapshots to WARC format files (.warc, .warc.gz).
#
# Each file starts with a "warcinfo" record describing the writer and
# is followed by one "response" record per page.
class WarcWriter
  WARC_VERSION = "WARC/1.0"
  # Blank line ending a header block; also written after every record.
  RECORD_SEP = "\r\n\r\n"
  CRLF = "\r\n"

  # @param software [String] value of the "software" field in the
  #   warcinfo record
  def initialize(software: "archaeo/#{VERSION}")
    @software = software
    @record_count = 0
  end

  # Writes +pages+ to the WARC file at +path+.
  #
  # @param path [String, Pathname] output file
  # @param pages [Enumerable] pages responding to timestamp, content,
  #   original_url, status_code, content_type and size
  # @param compress [Boolean, nil] gzip the output; nil decides from a
  #   ".gz" file extension
  def write(path, pages, compress: nil)
    compress = path.to_s.end_with?(".gz") if compress.nil?
    io = open_warc(path, compress)
    write_warcinfo(io, path)
    pages.each { |page| write_page(io, page) }
  ensure
    io&.close
  end

  private

  # Opens the output, wrapped in a gzip writer when +compress+ is set.
  def open_warc(path, compress)
    if compress
      Zlib::GzipWriter.open(path)
    else
      File.open(path, "wb")
    end
  end

  # Writes the leading warcinfo record (software, format, file name).
  def write_warcinfo(io, filename)
    fields = {
      software: @software,
      format: "WARC File Format 1.0",
      filename: File.basename(filename),
    }
    body = fields.map { |k, v| "#{k}: #{v}" }.join(CRLF) + CRLF
    headers = warc_headers(
      type: "warcinfo",
      record_id: generate_record_id,
      date: Time.now.utc.iso8601,
      content_type: "application/warc-fields",
      content_length: body.bytesize,
    )
    io.write(headers.b + body.b + RECORD_SEP)
  end

  # Writes one response record: WARC headers, then the HTTP header block
  # and the page content as the record payload.
  def write_page(io, page)
    record_id = generate_record_id
    date = page.timestamp.to_time.utc.iso8601

    # Work on bytes: a UTF-8 header (e.g. a non-ASCII URL) cannot be
    # concatenated with binary page content otherwise.
    payload = build_http_headers(page).b + page.content.to_s.b

    headers = warc_headers(
      type: "response",
      record_id: record_id,
      date: date,
      target_uri: page.original_url.to_s,
      content_type: "application/http;msgtype=response",
      content_length: payload.bytesize,
    )

    io.write(headers.b + payload + RECORD_SEP)
  end

  # Builds the HTTP status line and headers that precede the page
  # content, terminated by the blank line HTTP requires before a body.
  def build_http_headers(page)
    parts = [
      "HTTP/1.1 #{page.status_code}",
      "Content-Type: #{page.content_type}",
      "Content-Length: #{page.size}",
    ]
    parts.join(CRLF) + CRLF + CRLF
  end

  # Builds a WARC header block, including its terminating blank line.
  def warc_headers(type:, record_id:, date:, target_uri: nil,
                   content_type: nil, content_length: 0)
    lines = [
      WARC_VERSION,
      "WARC-Type: #{type}",
      "WARC-Record-ID: #{record_id}",
      "WARC-Date: #{date}",
    ]
    lines << "WARC-Target-URI: #{target_uri}" if target_uri
    lines << "Content-Type: #{content_type}" if content_type
    lines << "Content-Length: #{content_length}"
    lines.join(CRLF) + RECORD_SEP
  end

  # Returns a fresh "<urn:uuid:...>" record id. The UUID-shaped value is
  # cut from a SHA-256 digest of the current time, a per-writer counter
  # and a random number.
  def generate_record_id
    @record_count += 1
    uuid = Digest::SHA256.hexdigest(
      "#{Time.now.utc.to_f}-#{@record_count}-#{rand(1 << 32)}",
    )
    "<urn:uuid:#{uuid[0, 8]}-#{uuid[8, 4]}-#{uuid[12, 4]}-" \
      "#{uuid[16, 4]}-#{uuid[20, 12]}>"
  end
end
211
+
212
# Value object for one WARC record: the format version, the parsed
# headers (a Hash keyed by symbols such as :warc_type) and the raw body.
WarcRecord = Struct.new(
  :version, :headers, :body,
  keyword_init: true
) do
  # Plain header readers: method name => key looked up in +headers+.
  {
    warc_type: :warc_type,
    target_uri: :warc_target_uri,
    date: :warc_date,
    content_type: :content_type,
  }.each do |reader, key|
    define_method(reader) { headers[key] }
  end

  # Content-Length header as an Integer (0 when missing).
  def content_length
    headers[:content_length].to_i
  end

  # True for records of type "response".
  def response?
    warc_type == "response"
  end

  # True for records of type "warcinfo".
  def warcinfo?
    warc_type == "warcinfo"
  end

  # Summary hash: the body is represented by its size in bytes only.
  def to_h
    { version: version, headers: headers, body_length: body.to_s.bytesize }
  end
end
249
+ end
data/lib/archaeo.rb CHANGED
@@ -60,4 +60,13 @@ module Archaeo
60
60
  autoload :Configuration, "archaeo/configuration"
61
61
  autoload :CoverageReport, "archaeo/coverage_report"
62
62
  autoload :ProgressReport, "archaeo/progress_report"
63
+ autoload :ColorOutput, "archaeo/color_output"
64
+ autoload :WarcReader, "archaeo/warc_support"
65
+ autoload :WarcWriter, "archaeo/warc_support"
66
+ autoload :WarcRecord, "archaeo/warc_support"
67
+ autoload :ParallelCdx, "archaeo/parallel_cdx"
68
+ autoload :ContentTracker, "archaeo/content_tracker"
69
+ autoload :ContentChangeReport, "archaeo/content_tracker"
70
+ autoload :ArchiveSearch, "archaeo/archive_search"
71
+ autoload :SearchResult, "archaeo/archive_search"
63
72
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: archaeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.9
4
+ version: 0.2.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-12 00:00:00.000000000 Z
11
+ date: 2026-05-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: csv
@@ -72,6 +72,7 @@ files:
72
72
  - exe/archaeo
73
73
  - lib/archaeo.rb
74
74
  - lib/archaeo/archive_health_check.rb
75
+ - lib/archaeo/archive_search.rb
75
76
  - lib/archaeo/archive_url.rb
76
77
  - lib/archaeo/asset_extractor.rb
77
78
  - lib/archaeo/asset_list.rb
@@ -83,7 +84,9 @@ files:
83
84
  - lib/archaeo/cdx_filter.rb
84
85
  - lib/archaeo/cdx_timeline.rb
85
86
  - lib/archaeo/cli.rb
87
+ - lib/archaeo/color_output.rb
86
88
  - lib/archaeo/configuration.rb
89
+ - lib/archaeo/content_tracker.rb
87
90
  - lib/archaeo/coverage_report.rb
88
91
  - lib/archaeo/download_scheduler.rb
89
92
  - lib/archaeo/download_state.rb
@@ -92,6 +95,7 @@ files:
92
95
  - lib/archaeo/http_client.rb
93
96
  - lib/archaeo/page.rb
94
97
  - lib/archaeo/page_bundle.rb
98
+ - lib/archaeo/parallel_cdx.rb
95
99
  - lib/archaeo/path_sanitizer.rb
96
100
  - lib/archaeo/pattern_filter.rb
97
101
  - lib/archaeo/progress_report.rb
@@ -105,6 +109,7 @@ files:
105
109
  - lib/archaeo/url_normalizer.rb
106
110
  - lib/archaeo/url_rewriter.rb
107
111
  - lib/archaeo/version.rb
112
+ - lib/archaeo/warc_support.rb
108
113
  - sig/archaeo.rbs
109
114
  homepage: https://github.com/riboseinc/archaeo
110
115
  licenses: