archaeo 0.2.10 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +361 -11
- data/lib/archaeo/cli.rb +26 -0
- data/lib/archaeo/local_rewriter.rb +106 -0
- data/lib/archaeo/page.rb +63 -0
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo.rb +2 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 76a36571f0747712c2abda1a4aef93c7ade9a83b42590e23f0148b89138451b0
|
|
4
|
+
data.tar.gz: a9eed4768d084756fbb10eda17b1f2098246fd56a93cbe91b55f693850e5008a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fa8e01a6aa31aa678a17ce2fc4f59e324c4e8779716b7c41d876dbd366af06dda30296af446919eedc3136efe5bc2527abef60d5aa4274745e94ef7415a775fa
|
|
7
|
+
data.tar.gz: b3fd25ec4d3b10c759992226dd2d699276dbd9def9318ef343f632b69faa5c4fb0017f78ae7aa87b1b85c4fb48a642d4d667c3f43e0e768162486d63f1bf7be1
|
data/README.adoc
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
Archaeo is a Ruby client for the Internet Archive's https://web.archive.org[Wayback Machine] APIs.
|
|
6
6
|
|
|
7
|
-
It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs, fetching archived content,
|
|
7
|
+
It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs, fetching archived content, bulk downloading with resume support, snapshot comparison, coverage analysis, content tracking, full-text search, WARC format I/O, and more.
|
|
8
8
|
|
|
9
9
|
== Installation
|
|
10
10
|
|
|
@@ -100,6 +100,18 @@ cdx.num_pages("example.com")
|
|
|
100
100
|
|
|
101
101
|
# Discover all known URLs for a domain
|
|
102
102
|
cdx.known_urls("example.com")
|
|
103
|
+
|
|
104
|
+
# Composite snapshot (point-in-time site reconstruction)
|
|
105
|
+
cdx.composite_snapshot("example.com", timestamp: "20220615",
|
|
106
|
+
collapse: ["digest"])
|
|
107
|
+
# => picks newest snapshot per URL at or before the given timestamp
|
|
108
|
+
|
|
109
|
+
# CDX caching (speeds up repeated queries)
|
|
110
|
+
cdx = Archaeo::CdxApi.new(cache_dir: ".cache")
|
|
111
|
+
|
|
112
|
+
# Parallel CDX fetching (thread pool for multi-page queries)
|
|
113
|
+
parallel = Archaeo::ParallelCdx.new(concurrency: 4)
|
|
114
|
+
snapshots = parallel.snapshots("example.com")
|
|
103
115
|
----
|
|
104
116
|
|
|
105
117
|
=== Check Availability
|
|
@@ -141,6 +153,14 @@ result.as_json # => JSON-serializable Hash
|
|
|
141
153
|
results = save.batch_save(%w[https://a.com https://b.com],
|
|
142
154
|
delay: 2, stop_on_error: false)
|
|
143
155
|
results.each { |r| puts "#{r.url}: #{r.success?}" }
|
|
156
|
+
|
|
157
|
+
# Inspect response details
|
|
158
|
+
result.status_code # => HTTP status from Save API
|
|
159
|
+
result.response_url # => redirect URL if any
|
|
160
|
+
result.response_headers # => Hash of response headers
|
|
161
|
+
|
|
162
|
+
# With rate limiter
|
|
163
|
+
save = Archaeo::SaveApi.new(rate_limiter: Archaeo::RateLimiter.new(min_interval: 1.0))
|
|
144
164
|
----
|
|
145
165
|
|
|
146
166
|
=== Fetch Archived Content
|
|
@@ -182,6 +202,12 @@ page = fetcher.fetch!("https://example.com/",
|
|
|
182
202
|
# Page links and meta extraction
|
|
183
203
|
page.links # => [{ href: "...", text: "...", external: true/false }]
|
|
184
204
|
page.meta_tags # => { "description" => "...", "og:title" => "...", "canonical" => "..." }
|
|
205
|
+
|
|
206
|
+
# Structured content extraction (HTML pages only)
|
|
207
|
+
page.headings # => [{ level: 1, text: "Title" }, { level: 2, text: "Subtitle" }]
|
|
208
|
+
page.images # => [{ src: "photo.jpg", alt: "...", width: 800, height: 600 }]
|
|
209
|
+
page.forms # => [{ action: "/submit", method: "POST", fields: [{ name: "q", type: "text" }] }]
|
|
210
|
+
page.scripts # => [{ src: "app.js", type: "text/javascript" }]
|
|
185
211
|
----
|
|
186
212
|
|
|
187
213
|
=== Fetch Page with Assets
|
|
@@ -260,6 +286,46 @@ downloader = Archaeo::BulkDownloader.new(
|
|
|
260
286
|
output_dir: "archive", concurrency: 4,
|
|
261
287
|
)
|
|
262
288
|
downloader.download("example.com")
|
|
289
|
+
|
|
290
|
+
# Download with page requisites (CSS/JS/images)
|
|
291
|
+
downloader.download("example.com", page_requisites: true)
|
|
292
|
+
|
|
293
|
+
# Point-in-time composite snapshot
|
|
294
|
+
downloader.download("example.com", snapshot_at: "20220615")
|
|
295
|
+
|
|
296
|
+
# All timestamps (not just latest per URL)
|
|
297
|
+
downloader.download("example.com", all_timestamps: true)
|
|
298
|
+
|
|
299
|
+
# URL pattern filtering
|
|
300
|
+
filter = Archaeo::PatternFilter.new(only: ".*\\.html$", exclude: nil)
|
|
301
|
+
downloader.download("example.com", filter: filter)
|
|
302
|
+
|
|
303
|
+
# Download scheduling strategies
|
|
304
|
+
scheduler = Archaeo::DownloadScheduler.new(
|
|
305
|
+
strategy: :breadth_first, # or :depth_first, :newest_first, :oldest_first
|
|
306
|
+
priority: :html_first,
|
|
307
|
+
max_file_size: 50 * 1024 * 1024,
|
|
308
|
+
)
|
|
309
|
+
# Integrates with BulkDownloader via strategy: option
|
|
310
|
+
|
|
311
|
+
# Rate limiting
|
|
312
|
+
limiter = Archaeo::RateLimiter.new(min_interval: 0.5)
|
|
313
|
+
downloader = Archaeo::BulkDownloader.new(
|
|
314
|
+
output_dir: "archive", rate_limiter: limiter,
|
|
315
|
+
)
|
|
316
|
+
|
|
317
|
+
# Limit snapshots
|
|
318
|
+
downloader.download("example.com", max_snapshots: 10, strategy: :newest_first)
|
|
319
|
+
|
|
320
|
+
# Progress reporting
|
|
321
|
+
downloader.download("example.com") do |current, total, snap|
|
|
322
|
+
report = Archaeo::ProgressReport.new(
|
|
323
|
+
current: current, total: total,
|
|
324
|
+
downloaded_bytes: current * 1024, elapsed: 10.0,
|
|
325
|
+
current_url: snap.original_url,
|
|
326
|
+
)
|
|
327
|
+
puts "#{report.percent_complete}% — ETA #{report.eta}s"
|
|
328
|
+
end
|
|
263
329
|
----
|
|
264
330
|
|
|
265
331
|
=== Download State (Resume Tracking)
|
|
@@ -351,9 +417,21 @@ rewriter.rewrite("https://web.archive.org/web/20220615000000/style.css")
|
|
|
351
417
|
# Rewrite batch
|
|
352
418
|
rewriter.rewrite_batch(["url1", "url2"])
|
|
353
419
|
|
|
354
|
-
# Rewrite URLs within HTML (src, href, srcset, data-src, poster)
|
|
420
|
+
# Rewrite URLs within HTML (src, href, srcset, data-src, poster, action, data-url)
|
|
355
421
|
# Also rewrites inline style url() and <style> element url()
|
|
356
422
|
rewritten_html = rewriter.rewrite_html(html_content)
|
|
423
|
+
|
|
424
|
+
# Enhanced rewriting with JS strings, absolute URLs, and server extensions
|
|
425
|
+
rewriter = Archaeo::UrlRewriter.new(
|
|
426
|
+
"https://web.archive.org/web/20220615000000/",
|
|
427
|
+
"local",
|
|
428
|
+
rewrite_js: true, # rewrite URLs inside JS string literals
|
|
429
|
+
rewrite_absolute: true, # rewrite all absolute archive URLs (not just prefix match)
|
|
430
|
+
server_extensions: true, # handle .php/.asp/.jsp URLs specially
|
|
431
|
+
)
|
|
432
|
+
|
|
433
|
+
# Standalone CSS file rewriting
|
|
434
|
+
rewritten_css = rewriter.rewrite_css(css_content)
|
|
357
435
|
----
|
|
358
436
|
|
|
359
437
|
=== Snapshot Convenience
|
|
@@ -468,6 +546,218 @@ client.pool_stats
|
|
|
468
546
|
# idle_times: { "web.archive.org": 12 } }
|
|
469
547
|
----
|
|
470
548
|
|
|
549
|
+
=== Snapshot Comparison (Diff)
|
|
550
|
+
|
|
551
|
+
[source,ruby]
|
|
552
|
+
----
|
|
553
|
+
diff = Archaeo::SnapshotDiff.new(
|
|
554
|
+
url: "https://example.com/",
|
|
555
|
+
page_a: page_a, page_b: page_b,
|
|
556
|
+
timestamp_a: "20220101", timestamp_b: "20220615",
|
|
557
|
+
)
|
|
558
|
+
|
|
559
|
+
diff.content_changed? # => true/false (SHA256 digest comparison)
|
|
560
|
+
diff.text_diff # => unified diff of content lines
|
|
561
|
+
diff.link_changes # => { added: [...], removed: [...], unchanged: N }
|
|
562
|
+
diff.asset_changes # => { added: [...], removed: [...], unchanged: N }
|
|
563
|
+
diff.structural_changes # => { "a" => { from: 1, to: 2 }, ... }
|
|
564
|
+
diff.to_h # => Hash with all fields
|
|
565
|
+
----
|
|
566
|
+
|
|
567
|
+
=== Coverage Analysis
|
|
568
|
+
|
|
569
|
+
[source,ruby]
|
|
570
|
+
----
|
|
571
|
+
analyzer = Archaeo::CoverageAnalyzer.new
|
|
572
|
+
report = analyzer.analyze("example.com", from: "20220101", to: "20221231")
|
|
573
|
+
|
|
574
|
+
report.url # => "example.com"
|
|
575
|
+
report.total_urls # => unique URLs found
|
|
576
|
+
report.archived_urls # => URLs with at least one capture
|
|
577
|
+
report.coverage_percent # => 87.3
|
|
578
|
+
report.temporal_gaps # => [{ from: ts, to: ts, gap_days: 45 }, ...]
|
|
579
|
+
report.has_gaps? # => true/false
|
|
580
|
+
report.status_distribution # => { 200 => 150, 404 => 10 }
|
|
581
|
+
report.missing_assets # => resources referenced but not archived
|
|
582
|
+
----
|
|
583
|
+
|
|
584
|
+
=== Archive Health Check
|
|
585
|
+
|
|
586
|
+
[source,ruby]
|
|
587
|
+
----
|
|
588
|
+
checker = Archaeo::ArchiveHealthCheck.new
|
|
589
|
+
report = checker.check("example.com", from: "20220101", to: "20221231")
|
|
590
|
+
|
|
591
|
+
report.total # => 150
|
|
592
|
+
report.accessible # => 148
|
|
593
|
+
report.missing # => 2
|
|
594
|
+
report.errors # => 0
|
|
595
|
+
report.details # => [HealthDetail, ...]
|
|
596
|
+
|
|
597
|
+
# Sample a subset (for large collections)
|
|
598
|
+
report = checker.check("example.com", sample: 50)
|
|
599
|
+
----
|
|
600
|
+
|
|
601
|
+
=== Content Tracking
|
|
602
|
+
|
|
603
|
+
[source,ruby]
|
|
604
|
+
----
|
|
605
|
+
tracker = Archaeo::ContentTracker.new
|
|
606
|
+
report = tracker.track("example.com", from: "20220101", to: "20221231")
|
|
607
|
+
|
|
608
|
+
report.changed_urls # => URLs whose digest changed over time
|
|
609
|
+
report.new_urls # => URLs that appeared in the second half
|
|
610
|
+
report.removed_urls # => URLs that disappeared in the second half
|
|
611
|
+
report.content_frequency # => { "url" => unique_digest_count }
|
|
612
|
+
report.any_changes? # => true if any changes detected
|
|
613
|
+
----
|
|
614
|
+
|
|
615
|
+
=== Archive Search
|
|
616
|
+
|
|
617
|
+
[source,ruby]
|
|
618
|
+
----
|
|
619
|
+
searcher = Archaeo::ArchiveSearch.new
|
|
620
|
+
results = searcher.search("example.com",
|
|
621
|
+
query: "contact us",
|
|
622
|
+
from: "20220101",
|
|
623
|
+
to: "20221231",
|
|
624
|
+
case_sensitive: false,
|
|
625
|
+
max_results: 10)
|
|
626
|
+
|
|
627
|
+
results.each do |match|
|
|
628
|
+
puts match.snapshot.timestamp # => when it was archived
|
|
629
|
+
puts match.url # => the page URL
|
|
630
|
+
puts match.context # => "...contact us..." with surrounding text
|
|
631
|
+
end
|
|
632
|
+
----
|
|
633
|
+
|
|
634
|
+
=== WARC Support
|
|
635
|
+
|
|
636
|
+
[source,ruby]
|
|
637
|
+
----
|
|
638
|
+
# Export snapshots to WARC format
|
|
639
|
+
writer = Archaeo::WarcWriter.new
|
|
640
|
+
writer.write("archive/output.warc", pages)
|
|
641
|
+
|
|
642
|
+
# Gzip-compressed output
|
|
643
|
+
writer.write("archive/output.warc.gz", pages, compress: true)
|
|
644
|
+
|
|
645
|
+
# Read WARC files
|
|
646
|
+
reader = Archaeo::WarcReader.new
|
|
647
|
+
records = reader.read_records("archive/output.warc")
|
|
648
|
+
|
|
649
|
+
records.each do |record|
|
|
650
|
+
record.warc_type # => "response" or "warcinfo"
|
|
651
|
+
record.target_uri # => original URL
|
|
652
|
+
record.body # => archived content
|
|
653
|
+
record.response? # => true for response records
|
|
654
|
+
end
|
|
655
|
+
----
|
|
656
|
+
|
|
657
|
+
=== Configuration
|
|
658
|
+
|
|
659
|
+
[source,ruby]
|
|
660
|
+
----
|
|
661
|
+
# Load .archaeo.yml config
|
|
662
|
+
config = Archaeo::Configuration.new
|
|
663
|
+
|
|
664
|
+
config.get("output_dir") # => "archive" (default)
|
|
665
|
+
config.get("rate_limit") # => 0.5
|
|
666
|
+
config.get("concurrency", profile: "fast") # => 8
|
|
667
|
+
|
|
668
|
+
# Persist settings
|
|
669
|
+
config.set("rate_limit", 1.0)
|
|
670
|
+
config.set("concurrency", 4, profile: "fast")
|
|
671
|
+
|
|
672
|
+
# List profiles
|
|
673
|
+
config.profiles # => ["fast", "careful"]
|
|
674
|
+
----
|
|
675
|
+
|
|
676
|
+
=== Encoding Detection
|
|
677
|
+
|
|
678
|
+
[source,ruby]
|
|
679
|
+
----
|
|
680
|
+
detector = Archaeo::EncodingDetector.new
|
|
681
|
+
|
|
682
|
+
# Detect encoding from content + content-type charset
|
|
683
|
+
encoding = detector.detect(binary_content, content_type: "text/html; charset=iso-8859-1")
|
|
684
|
+
# => Encoding::ISO_8859_1
|
|
685
|
+
|
|
686
|
+
# Detect from HTML meta tag
|
|
687
|
+
encoding = detector.detect("<html><head><meta charset='utf-8'>...")
|
|
688
|
+
# => Encoding::UTF_8
|
|
689
|
+
|
|
690
|
+
# Multi-encoding fallback chain
|
|
691
|
+
detector.detect(content) # tries UTF-8, ISO-8859-1, Windows-1252
|
|
692
|
+
----
|
|
693
|
+
|
|
694
|
+
=== Path Sanitization
|
|
695
|
+
|
|
696
|
+
[source,ruby]
|
|
697
|
+
----
|
|
698
|
+
sanitizer = Archaeo::PathSanitizer.new
|
|
699
|
+
safe_path = sanitizer.sanitize("https://example.com/path?q=1&r=2")
|
|
700
|
+
# => "path_q_1_r_2"
|
|
701
|
+
|
|
702
|
+
# Handles query string hashing, recursive percent-decoding,
|
|
703
|
+
# and file/directory conflict resolution
|
|
704
|
+
----
|
|
705
|
+
|
|
706
|
+
=== Pattern Filtering
|
|
707
|
+
|
|
708
|
+
[source,ruby]
|
|
709
|
+
----
|
|
710
|
+
# Include/exclude URL patterns
|
|
711
|
+
filter = Archaeo::PatternFilter.new(
|
|
712
|
+
only: ".*\\.html$", # regex string or %r{} Regexp
|
|
713
|
+
exclude: %r{/api/},
|
|
714
|
+
)
|
|
715
|
+
|
|
716
|
+
filter.match?("https://example.com/page.html") # => true
|
|
717
|
+
filter.match?("https://example.com/style.css") # => false
|
|
718
|
+
filter.match?("https://example.com/api/data") # => false (excluded)
|
|
719
|
+
----
|
|
720
|
+
|
|
721
|
+
=== Subdomain Discovery
|
|
722
|
+
|
|
723
|
+
[source,ruby]
|
|
724
|
+
----
|
|
725
|
+
discovery = Archaeo::SubdomainDiscovery.new("example.com", max_depth: 2)
|
|
726
|
+
|
|
727
|
+
# Scan downloaded files to discover subdomains
|
|
728
|
+
subdomains = discovery.scan_files("archive/")
|
|
729
|
+
# => ["cdn.example.com", "blog.example.com"]
|
|
730
|
+
|
|
731
|
+
# Scan raw content (HTML, CSS, JS)
|
|
732
|
+
subdomains = discovery.scan_content("<a href='https://blog.example.com/post'>")
|
|
733
|
+
# => ["blog.example.com"]
|
|
734
|
+
----
|
|
735
|
+
|
|
736
|
+
=== Rate Limiting
|
|
737
|
+
|
|
738
|
+
[source,ruby]
|
|
739
|
+
----
|
|
740
|
+
# Per-host rate limiter with adaptive backoff
|
|
741
|
+
limiter = Archaeo::RateLimiter.new(min_interval: 0.5)
|
|
742
|
+
limiter.wait(host: "web.archive.org") # sleeps if needed
|
|
743
|
+
limiter.wait(host: "api.example.com") # independent per-host tracking
|
|
744
|
+
----
|
|
745
|
+
|
|
746
|
+
=== Color Output
|
|
747
|
+
|
|
748
|
+
[source,ruby]
|
|
749
|
+
----
|
|
750
|
+
color = Archaeo::ColorOutput.new(enabled: true)
|
|
751
|
+
|
|
752
|
+
color.success("Done!") # green + bold
|
|
753
|
+
color.warning("Careful") # yellow + bold
|
|
754
|
+
color.error("Failed!") # red + bold
|
|
755
|
+
color.info("Info") # cyan
|
|
756
|
+
|
|
757
|
+
# Auto-detects from TTY, NO_COLOR env, TERM=dumb
|
|
758
|
+
color = Archaeo::ColorOutput.new # enabled: auto-detected
|
|
759
|
+
----
|
|
760
|
+
|
|
471
761
|
=== Command-Line Interface
|
|
472
762
|
|
|
473
763
|
[source,bash]
|
|
@@ -543,11 +833,67 @@ archaeo download --concurrency 4 example.com --output ./archive
|
|
|
543
833
|
# Resume interrupted download
|
|
544
834
|
archaeo download example.com --resume
|
|
545
835
|
|
|
836
|
+
# Download with page requisites (linked assets)
|
|
837
|
+
archaeo download --page-requisites example.com
|
|
838
|
+
|
|
839
|
+
# Point-in-time composite snapshot
|
|
840
|
+
archaeo download --snapshot-at 20220615 example.com
|
|
841
|
+
|
|
842
|
+
# All timestamps (not just latest)
|
|
843
|
+
archaeo download --all-timestamps example.com
|
|
844
|
+
|
|
845
|
+
# URL pattern filtering
|
|
846
|
+
archaeo download --only '.*\.html$' --exclude '/api/' example.com
|
|
847
|
+
|
|
848
|
+
# Download scheduling
|
|
849
|
+
archaeo download --strategy newest_first --max-snapshots 10 example.com
|
|
850
|
+
|
|
851
|
+
# Reset download state
|
|
852
|
+
archaeo download --reset example.com
|
|
853
|
+
|
|
854
|
+
# Rate limiting
|
|
855
|
+
archaeo download --rate-limit 0.5 example.com
|
|
856
|
+
|
|
857
|
+
# Recursive subdomain discovery
|
|
858
|
+
archaeo download --recursive-subdomains --subdomain-depth 2 example.com
|
|
859
|
+
|
|
546
860
|
# Suppress progress messages
|
|
547
861
|
archaeo --quiet download example.com
|
|
548
862
|
|
|
863
|
+
# Disable colored output
|
|
864
|
+
archaeo --no-color download example.com
|
|
865
|
+
|
|
549
866
|
# Discover all known URLs for a domain
|
|
550
867
|
archaeo known_urls example.com
|
|
868
|
+
archaeo known_urls --file urls.txt example.com
|
|
869
|
+
archaeo known_urls --subdomain example.com
|
|
870
|
+
|
|
871
|
+
# Check archive health
|
|
872
|
+
archaeo health example.com
|
|
873
|
+
archaeo health --from 20220101 --to 20221231 --sample 50 example.com
|
|
874
|
+
|
|
875
|
+
# Analyze archive coverage
|
|
876
|
+
archaeo coverage example.com
|
|
877
|
+
archaeo coverage --from 20220101 --to 20221231 --format json example.com
|
|
878
|
+
|
|
879
|
+
# Compare two snapshots
|
|
880
|
+
archaeo snapshot-diff example.com 20220101 20220615
|
|
881
|
+
archaeo snapshot-diff --format json example.com 20220101 20220615
|
|
882
|
+
|
|
883
|
+
# Search archived content
|
|
884
|
+
archaeo search example.com "contact us"
|
|
885
|
+
archaeo search --from 20220101 --to 20221231 --max-results 10 example.com "about"
|
|
886
|
+
|
|
887
|
+
# Track content changes over time
|
|
888
|
+
archaeo track-changes example.com
|
|
889
|
+
archaeo track-changes --from 20220101 --to 20221231 --format json example.com
|
|
890
|
+
|
|
891
|
+
# Export to WARC format
|
|
892
|
+
archaeo warc-export --output archive.warc example.com
|
|
893
|
+
archaeo warc-export --output archive.warc.gz --gzip example.com
|
|
894
|
+
|
|
895
|
+
# Save API with headers
|
|
896
|
+
archaeo save --headers https://example.com/
|
|
551
897
|
----
|
|
552
898
|
|
|
553
899
|
=== Error Handling
|
|
@@ -585,28 +931,32 @@ Archaeo follows a model-driven, OOP design:
|
|
|
585
931
|
| Layer | Classes | Purpose
|
|
586
932
|
|
|
587
933
|
| *Models*
|
|
588
|
-
| `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`, `CdxTimeline`
|
|
934
|
+
| `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`, `CdxTimeline`, `ProgressReport`, `CoverageReport`, `ContentChangeReport`, `SearchResult`, `WarcRecord`, `HealthReport`
|
|
589
935
|
| Domain value objects with `to_h`, `as_json`, `inspect` support
|
|
590
936
|
|
|
591
937
|
| *URL Processing*
|
|
592
|
-
| `UrlNormalizer`, `CdxFilter`, `UrlRewriter`
|
|
593
|
-
| URL sanitization, validated filtering
|
|
938
|
+
| `UrlNormalizer`, `CdxFilter`, `UrlRewriter`, `PatternFilter`, `PathSanitizer`
|
|
939
|
+
| URL sanitization, validated filtering, regex include/exclude, path conflict resolution, and HTML/JS/CSS URL rewriting
|
|
594
940
|
|
|
595
941
|
| *Asset Extraction*
|
|
596
942
|
| `AssetExtractor`, `AssetList`
|
|
597
943
|
| Parse HTML for resource URLs including preloads and modulepreload
|
|
598
944
|
|
|
599
945
|
| *APIs*
|
|
600
|
-
| `CdxApi`, `AvailabilityApi`, `SaveApi`
|
|
601
|
-
| Query and mutate the archive
|
|
946
|
+
| `CdxApi`, `ParallelCdx`, `AvailabilityApi`, `SaveApi`, `ArchiveSearch`
|
|
947
|
+
| Query and mutate the archive, parallel CDX fetching, full-text search
|
|
602
948
|
|
|
603
949
|
| *Operations*
|
|
604
|
-
| `Fetcher`, `BulkDownloader`, `DownloadState`
|
|
605
|
-
| Download content with resume,
|
|
950
|
+
| `Fetcher`, `BulkDownloader`, `DownloadState`, `DownloadScheduler`, `SubdomainDiscovery`
|
|
951
|
+
| Download content with resume, scheduling strategies, subdomain discovery, and digest verification
|
|
952
|
+
|
|
953
|
+
| *Analysis*
|
|
954
|
+
| `SnapshotDiff`, `CoverageAnalyzer`, `ContentTracker`, `ArchiveHealthCheck`
|
|
955
|
+
| Compare snapshots, analyze coverage, track changes over time, verify accessibility
|
|
606
956
|
|
|
607
957
|
| *Infrastructure*
|
|
608
|
-
| `HttpClient`
|
|
609
|
-
| HTTP transport
|
|
958
|
+
| `HttpClient`, `RateLimiter`, `EncodingDetector`, `CdxCache`, `Configuration`, `ColorOutput`, `WarcWriter`, `WarcReader`
|
|
959
|
+
| HTTP transport, rate limiting, encoding detection, caching, config management, WARC I/O, and color output
|
|
610
960
|
|===
|
|
611
961
|
|
|
612
962
|
All API classes accept an `HttpClient` via dependency injection for testability.
|
data/lib/archaeo/cli.rb
CHANGED
|
@@ -194,6 +194,32 @@ module Archaeo
|
|
|
194
194
|
end
|
|
195
195
|
end
|
|
196
196
|
|
|
197
|
+
desc "rewrite-local INPUT_DIR",
|
|
198
|
+
"Rewrite previously downloaded files to use local paths"
|
|
199
|
+
option :output, desc: "Output directory (default: rewrite in-place)",
|
|
200
|
+
required: false
|
|
201
|
+
option :prefix, desc: "Local path prefix", default: "local"
|
|
202
|
+
option :rewrite_js, type: :boolean, default: false,
|
|
203
|
+
desc: "Rewrite URLs in JavaScript strings"
|
|
204
|
+
option :rewrite_absolute, type: :boolean, default: false,
|
|
205
|
+
desc: "Rewrite all absolute archive URLs"
|
|
206
|
+
def rewrite_local(input_dir)
|
|
207
|
+
handle_errors do
|
|
208
|
+
output_dir = options[:output] || input_dir
|
|
209
|
+
local_rewriter = LocalRewriter.new(
|
|
210
|
+
prefix: options[:prefix],
|
|
211
|
+
rewrite_js: options[:rewrite_js],
|
|
212
|
+
rewrite_absolute: options[:rewrite_absolute],
|
|
213
|
+
)
|
|
214
|
+
summary = local_rewriter.rewrite_directory(input_dir, output_dir)
|
|
215
|
+
color = build_color
|
|
216
|
+
warn color.success(
|
|
217
|
+
"Rewrote #{summary.rewritten}/#{summary.total} files " \
|
|
218
|
+
"in #{summary.elapsed.round(1)}s",
|
|
219
|
+
)
|
|
220
|
+
end
|
|
221
|
+
end
|
|
222
|
+
|
|
197
223
|
desc "diff URL TIMESTAMP_A TIMESTAMP_B",
|
|
198
224
|
"Compare assets of two archived snapshots"
|
|
199
225
|
option :format, desc: "Output format (table, json)", default: "table"
|
|
data/lib/archaeo/local_rewriter.rb
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
|
|
5
|
+
module Archaeo
|
|
6
|
+
LocalRewriteSummary = Struct.new(
|
|
7
|
+
:total, :rewritten, :skipped, :elapsed,
|
|
8
|
+
keyword_init: true
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
# Rewrites previously downloaded files by converting archive URLs
|
|
12
|
+
# to local paths. Operates on files already on disk without fetching.
|
|
13
|
+
class LocalRewriter
|
|
14
|
+
def initialize(prefix: "local", rewrite_js: false,
|
|
15
|
+
rewrite_absolute: false)
|
|
16
|
+
@prefix = prefix
|
|
17
|
+
@rewrite_js = rewrite_js
|
|
18
|
+
@rewrite_absolute = rewrite_absolute
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def rewrite_directory(input_dir, output_dir)
|
|
22
|
+
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
23
|
+
files = gather_files(input_dir)
|
|
24
|
+
rewritten = 0
|
|
25
|
+
skipped = 0
|
|
26
|
+
|
|
27
|
+
files.each do |path|
|
|
28
|
+
rel = path.sub(%r{\A#{Regexp.escape(input_dir)}/?}, "")
|
|
29
|
+
out_path = File.join(output_dir, rel)
|
|
30
|
+
|
|
31
|
+
result = rewrite_file(path, out_path)
|
|
32
|
+
result ? rewritten += 1 : skipped += 1
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
36
|
+
LocalRewriteSummary.new(
|
|
37
|
+
total: files.size, rewritten: rewritten,
|
|
38
|
+
skipped: skipped, elapsed: elapsed
|
|
39
|
+
)
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def rewrite_file(input_path, output_path)
|
|
43
|
+
content = File.read(input_path)
|
|
44
|
+
return nil unless rewrite_candidate?(content)
|
|
45
|
+
|
|
46
|
+
FileUtils.mkdir_p(File.dirname(output_path))
|
|
47
|
+
rewriter = build_rewriter
|
|
48
|
+
rewritten = apply_rewriting(rewriter, content, input_path)
|
|
49
|
+
File.write(output_path, rewritten)
|
|
50
|
+
true
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
private
|
|
54
|
+
|
|
55
|
+
def gather_files(dir)
|
|
56
|
+
Dir.glob(File.join(dir, "**", "*"))
|
|
57
|
+
.select { |f| File.file?(f) && text_file?(f) }
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def text_file?(path)
|
|
61
|
+
ext = File.extname(path).downcase
|
|
62
|
+
TEXT_EXTENSIONS.include?(ext)
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
TEXT_EXTENSIONS = %w[
|
|
66
|
+
.html .htm .xhtml .css .js .json .xml .txt
|
|
67
|
+
.svg .md .yaml .yml .rss .atom
|
|
68
|
+
].freeze
|
|
69
|
+
|
|
70
|
+
def rewrite_candidate?(content)
|
|
71
|
+
content.include?("web.archive.org")
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def build_rewriter
|
|
75
|
+
UrlRewriter.new(
|
|
76
|
+
"https://web.archive.org", @prefix,
|
|
77
|
+
rewrite_js: @rewrite_js,
|
|
78
|
+
rewrite_absolute: @rewrite_absolute
|
|
79
|
+
)
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def apply_rewriting(rewriter, content, path)
|
|
83
|
+
ext = File.extname(path).downcase
|
|
84
|
+
case ext
|
|
85
|
+
when ".html", ".htm", ".xhtml"
|
|
86
|
+
rewriter.rewrite_html(content)
|
|
87
|
+
when ".css"
|
|
88
|
+
rewriter.rewrite_css(content)
|
|
89
|
+
when ".js"
|
|
90
|
+
rewriter.rewrite_js(content)
|
|
91
|
+
else
|
|
92
|
+
rewrite_mixed(rewriter, content)
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
def rewrite_mixed(rewriter, content)
|
|
97
|
+
if content.include?("<") && content.include?(">")
|
|
98
|
+
rewriter.rewrite_html(content)
|
|
99
|
+
elsif content.include?("url(")
|
|
100
|
+
rewriter.rewrite_css(content)
|
|
101
|
+
else
|
|
102
|
+
rewriter.rewrite_js(content)
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
end
|
data/lib/archaeo/page.rb
CHANGED
|
@@ -139,6 +139,16 @@ module Archaeo
|
|
|
139
139
|
end
|
|
140
140
|
end
|
|
141
141
|
|
|
142
|
+
def microposts
|
|
143
|
+
return [] unless html?
|
|
144
|
+
|
|
145
|
+
@microposts ||= begin
|
|
146
|
+
doc = Nokogiri::HTML(@raw_content)
|
|
147
|
+
containers = find_article_containers(doc)
|
|
148
|
+
containers.filter_map { |el| extract_micropost(el) }
|
|
149
|
+
end
|
|
150
|
+
end
|
|
151
|
+
|
|
142
152
|
def to_h
|
|
143
153
|
{
|
|
144
154
|
content_type: @content_type,
|
|
@@ -255,5 +265,58 @@ module Archaeo
|
|
|
255
265
|
end
|
|
256
266
|
inputs.reject { |f| f[:name].empty? }
|
|
257
267
|
end
|
|
268
|
+
|
|
269
|
+
ARTICLE_SELECTORS = %w[
|
|
270
|
+
article [role=article] .post .entry .blog-post
|
|
271
|
+
.hentry .post-content .entry-content .article-content
|
|
272
|
+
.story .story-body .news-article
|
|
273
|
+
].freeze
|
|
274
|
+
|
|
275
|
+
def find_article_containers(doc)
|
|
276
|
+
found = ARTICLE_SELECTORS
|
|
277
|
+
.filter_map { |sel| doc.css(sel) }
|
|
278
|
+
.flat_map(&:to_a)
|
|
279
|
+
found.any? ? found.uniq : [doc.at_css("body") || doc]
|
|
280
|
+
end
|
|
281
|
+
|
|
282
|
+
def extract_micropost(element)
|
|
283
|
+
title = extract_micropost_title(element)
|
|
284
|
+
body = extract_micropost_body(element)
|
|
285
|
+
return nil if body.nil? || body.strip.empty?
|
|
286
|
+
|
|
287
|
+
{ title: title, body: body.strip,
|
|
288
|
+
date: extract_micropost_date(element),
|
|
289
|
+
author: extract_micropost_author(element) }
|
|
290
|
+
end
|
|
291
|
+
|
|
292
|
+
def extract_micropost_title(el)
|
|
293
|
+
heading = el.at_css("h1, h2, h3, [class*=title], [class*=heading]")
|
|
294
|
+
heading&.text&.strip
|
|
295
|
+
end
|
|
296
|
+
|
|
297
|
+
def extract_micropost_body(el)
|
|
298
|
+
paragraphs = el.css("p").map(&:text).join("\n")
|
|
299
|
+
return nil if paragraphs.strip.empty?
|
|
300
|
+
|
|
301
|
+
paragraphs
|
|
302
|
+
end
|
|
303
|
+
|
|
304
|
+
def extract_micropost_date(el)
|
|
305
|
+
time = el.at_css("time[datetime]")
|
|
306
|
+
return time["datetime"] if time
|
|
307
|
+
|
|
308
|
+
date_el = el.at_css(
|
|
309
|
+
"[class*=date], [class*=time], [class*=published], " \
|
|
310
|
+
"[property='datePublished']",
|
|
311
|
+
)
|
|
312
|
+
date_el&.text&.strip
|
|
313
|
+
end
|
|
314
|
+
|
|
315
|
+
def extract_micropost_author(el)
|
|
316
|
+
author_el = el.at_css(
|
|
317
|
+
"[class*=author], [rel=author], [property='author']",
|
|
318
|
+
)
|
|
319
|
+
author_el&.text&.strip
|
|
320
|
+
end
|
|
258
321
|
end
|
|
259
322
|
end
|
data/lib/archaeo/version.rb
CHANGED
data/lib/archaeo.rb
CHANGED
|
@@ -69,4 +69,6 @@ module Archaeo
|
|
|
69
69
|
autoload :ContentChangeReport, "archaeo/content_tracker"
|
|
70
70
|
autoload :ArchiveSearch, "archaeo/archive_search"
|
|
71
71
|
autoload :SearchResult, "archaeo/archive_search"
|
|
72
|
+
autoload :LocalRewriter, "archaeo/local_rewriter"
|
|
73
|
+
autoload :LocalRewriteSummary, "archaeo/local_rewriter"
|
|
72
74
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: archaeo
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.10
|
|
4
|
+
version: 0.2.11
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
@@ -93,6 +93,7 @@ files:
|
|
|
93
93
|
- lib/archaeo/encoding_detector.rb
|
|
94
94
|
- lib/archaeo/fetcher.rb
|
|
95
95
|
- lib/archaeo/http_client.rb
|
|
96
|
+
- lib/archaeo/local_rewriter.rb
|
|
96
97
|
- lib/archaeo/page.rb
|
|
97
98
|
- lib/archaeo/page_bundle.rb
|
|
98
99
|
- lib/archaeo/parallel_cdx.rb
|