archaeo 0.2.10 → 0.2.11

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 235d2cba1b1e071156a873d7a63cf0fdb6ba8079eb6083e21755e723727db6d9
- data.tar.gz: 65c040c3a5984fdc1a68ca106d9ae10eab64b212ce6a72b37bec39ec57d383e2
+ metadata.gz: 76a36571f0747712c2abda1a4aef93c7ade9a83b42590e23f0148b89138451b0
+ data.tar.gz: a9eed4768d084756fbb10eda17b1f2098246fd56a93cbe91b55f693850e5008a
  SHA512:
- metadata.gz: e6eb3cdb88abb87332bbba762bf566643da717ce17557e31ed90a012bd7c164939b5eb719420f74dfa908215e0f604e71b5fb2bb8bcc7de2940e36b80524e963
- data.tar.gz: f52bc54fe3c425eeae28093810f1d90c4200391696ffe9af0e3f91366d619e4e57a425ec1e6a6a8b9aa2465d337bb2b960782fac1c3ce068d7d4b673b8306641
+ metadata.gz: fa8e01a6aa31aa678a17ce2fc4f59e324c4e8779716b7c41d876dbd366af06dda30296af446919eedc3136efe5bc2527abef60d5aa4274745e94ef7415a775fa
+ data.tar.gz: b3fd25ec4d3b10c759992226dd2d699276dbd9def9318ef343f632b69faa5c4fb0017f78ae7aa87b1b85c4fb48a642d4d667c3f43e0e768162486d63f1bf7be1
data/README.adoc CHANGED
@@ -4,7 +4,7 @@
 
  Archaeo is a Ruby client for the Internet Archive's https://web.archive.org[Wayback Machine] APIs.
 
- It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs, fetching archived content, and bulk downloading with resume support.
+ It provides a model-driven interface for querying archived snapshots, checking availability, saving URLs, fetching archived content, bulk downloading with resume support, snapshot comparison, coverage analysis, content tracking, full-text search, WARC format I/O, and more.
 
  == Installation
 
@@ -100,6 +100,18 @@ cdx.num_pages("example.com")
 
  # Discover all known URLs for a domain
  cdx.known_urls("example.com")
+
+ # Composite snapshot (point-in-time site reconstruction)
+ cdx.composite_snapshot("example.com", timestamp: "20220615",
+                        collapse: ["digest"])
+ # => picks newest snapshot per URL at or before the given timestamp
+
+ # CDX caching (speeds up repeated queries)
+ cdx = Archaeo::CdxApi.new(cache_dir: ".cache")
+
+ # Parallel CDX fetching (thread pool for multi-page queries)
+ parallel = Archaeo::ParallelCdx.new(concurrency: 4)
+ snapshots = parallel.snapshots("example.com")
  ----
 
  === Check Availability
@@ -141,6 +153,14 @@ result.as_json # => JSON-serializable Hash
  results = save.batch_save(%w[https://a.com https://b.com],
                            delay: 2, stop_on_error: false)
  results.each { |r| puts "#{r.url}: #{r.success?}" }
+
+ # Inspect response details
+ result.status_code      # => HTTP status from Save API
+ result.response_url     # => redirect URL if any
+ result.response_headers # => Hash of response headers
+
+ # With rate limiter
+ save = Archaeo::SaveApi.new(rate_limiter: Archaeo::RateLimiter.new(min_interval: 1.0))
  ----
 
  === Fetch Archived Content
@@ -182,6 +202,12 @@ page = fetcher.fetch!("https://example.com/",
  # Page links and meta extraction
  page.links     # => [{ href: "...", text: "...", external: true/false }]
  page.meta_tags # => { "description" => "...", "og:title" => "...", "canonical" => "..." }
+
+ # Structured content extraction (HTML pages only)
+ page.headings # => [{ level: 1, text: "Title" }, { level: 2, text: "Subtitle" }]
+ page.images   # => [{ src: "photo.jpg", alt: "...", width: 800, height: 600 }]
+ page.forms    # => [{ action: "/submit", method: "POST", fields: [{ name: "q", type: "text" }] }]
+ page.scripts  # => [{ src: "app.js", type: "text/javascript" }]
  ----
 
  === Fetch Page with Assets
@@ -260,6 +286,46 @@ downloader = Archaeo::BulkDownloader.new(
    output_dir: "archive", concurrency: 4,
  )
  downloader.download("example.com")
+
+ # Download with page requisites (CSS/JS/images)
+ downloader.download("example.com", page_requisites: true)
+
+ # Point-in-time composite snapshot
+ downloader.download("example.com", snapshot_at: "20220615")
+
+ # All timestamps (not just latest per URL)
+ downloader.download("example.com", all_timestamps: true)
+
+ # URL pattern filtering
+ filter = Archaeo::PatternFilter.new(only: ".*\\.html$", exclude: nil)
+ downloader.download("example.com", filter: filter)
+
+ # Download scheduling strategies
+ scheduler = Archaeo::DownloadScheduler.new(
+   strategy: :breadth_first, # or :depth_first, :newest_first, :oldest_first
+   priority: :html_first,
+   max_file_size: 50 * 1024 * 1024,
+ )
+ # Integrates with BulkDownloader via strategy: option
+
+ # Rate limiting
+ limiter = Archaeo::RateLimiter.new(min_interval: 0.5)
+ downloader = Archaeo::BulkDownloader.new(
+   output_dir: "archive", rate_limiter: limiter,
+ )
+
+ # Limit snapshots
+ downloader.download("example.com", max_snapshots: 10, strategy: :newest_first)
+
+ # Progress reporting
+ downloader.download("example.com") do |current, total, snap|
+   report = Archaeo::ProgressReport.new(
+     current: current, total: total,
+     downloaded_bytes: current * 1024, elapsed: 10.0,
+     current_url: snap.original_url,
+   )
+   puts "#{report.percent_complete}% — ETA #{report.eta}s"
+ end
  ----
 
  === Download State (Resume Tracking)
@@ -351,9 +417,21 @@ rewriter.rewrite("https://web.archive.org/web/20220615000000/style.css")
  # Rewrite batch
  rewriter.rewrite_batch(["url1", "url2"])
 
- # Rewrite URLs within HTML (src, href, srcset, data-src, poster)
+ # Rewrite URLs within HTML (src, href, srcset, data-src, poster, action, data-url)
  # Also rewrites inline style url() and <style> element url()
  rewritten_html = rewriter.rewrite_html(html_content)
+
+ # Enhanced rewriting with JS strings, absolute URLs, and server extensions
+ rewriter = Archaeo::UrlRewriter.new(
+   "https://web.archive.org/web/20220615000000/",
+   "local",
+   rewrite_js: true,        # rewrite URLs inside JS string literals
+   rewrite_absolute: true,  # rewrite all absolute archive URLs (not just prefix match)
+   server_extensions: true, # handle .php/.asp/.jsp URLs specially
+ )
+
+ # Standalone CSS file rewriting
+ rewritten_css = rewriter.rewrite_css(css_content)
  ----
 
  === Snapshot Convenience
@@ -468,6 +546,218 @@ client.pool_stats
  # idle_times: { "web.archive.org": 12 } }
  ----
 
+ === Snapshot Comparison (Diff)
+
+ [source,ruby]
+ ----
+ diff = Archaeo::SnapshotDiff.new(
+   url: "https://example.com/",
+   page_a: page_a, page_b: page_b,
+   timestamp_a: "20220101", timestamp_b: "20220615",
+ )
+
+ diff.content_changed?   # => true/false (SHA256 digest comparison)
+ diff.text_diff          # => unified diff of content lines
+ diff.link_changes       # => { added: [...], removed: [...], unchanged: N }
+ diff.asset_changes      # => { added: [...], removed: [...], unchanged: N }
+ diff.structural_changes # => { "a" => { from: 1, to: 2 }, ... }
+ diff.to_h               # => Hash with all fields
+ ----
+
+ === Coverage Analysis
+
+ [source,ruby]
+ ----
+ analyzer = Archaeo::CoverageAnalyzer.new
+ report = analyzer.analyze("example.com", from: "20220101", to: "20221231")
+
+ report.url                 # => "example.com"
+ report.total_urls          # => unique URLs found
+ report.archived_urls       # => URLs with at least one capture
+ report.coverage_percent    # => 87.3
+ report.temporal_gaps       # => [{ from: ts, to: ts, gap_days: 45 }, ...]
+ report.has_gaps?           # => true/false
+ report.status_distribution # => { 200 => 150, 404 => 10 }
+ report.missing_assets      # => resources referenced but not archived
+ ----
+
+ === Archive Health Check
+
+ [source,ruby]
+ ----
+ checker = Archaeo::ArchiveHealthCheck.new
+ report = checker.check("example.com", from: "20220101", to: "20221231")
+
+ report.total      # => 150
+ report.accessible # => 148
+ report.missing    # => 2
+ report.errors     # => 0
+ report.details    # => [HealthDetail, ...]
+
+ # Sample a subset (for large collections)
+ report = checker.check("example.com", sample: 50)
+ ----
+
+ === Content Tracking
+
+ [source,ruby]
+ ----
+ tracker = Archaeo::ContentTracker.new
+ report = tracker.track("example.com", from: "20220101", to: "20221231")
+
+ report.changed_urls      # => URLs whose digest changed over time
+ report.new_urls          # => URLs that appeared in the second half
+ report.removed_urls      # => URLs that disappeared in the second half
+ report.content_frequency # => { "url" => unique_digest_count }
+ report.any_changes?      # => true if any changes detected
+ ----
+
+ === Archive Search
+
+ [source,ruby]
+ ----
+ searcher = Archaeo::ArchiveSearch.new
+ results = searcher.search("example.com",
+                           query: "contact us",
+                           from: "20220101",
+                           to: "20221231",
+                           case_sensitive: false,
+                           max_results: 10)
+
+ results.each do |match|
+   puts match.snapshot.timestamp # => when it was archived
+   puts match.url                # => the page URL
+   puts match.context            # => "...contact us..." with surrounding text
+ end
+ ----
+
+ === WARC Support
+
+ [source,ruby]
+ ----
+ # Export snapshots to WARC format
+ writer = Archaeo::WarcWriter.new
+ writer.write("archive/output.warc", pages)
+
+ # Gzip-compressed output
+ writer.write("archive/output.warc.gz", pages, compress: true)
+
+ # Read WARC files
+ reader = Archaeo::WarcReader.new
+ records = reader.read_records("archive/output.warc")
+
+ records.each do |record|
+   record.warc_type  # => "response" or "warcinfo"
+   record.target_uri # => original URL
+   record.body       # => archived content
+   record.response?  # => true for response records
+ end
+ ----
+
+ === Configuration
+
+ [source,ruby]
+ ----
+ # Load .archaeo.yml config
+ config = Archaeo::Configuration.new
+
+ config.get("output_dir")                   # => "archive" (default)
+ config.get("rate_limit")                   # => 0.5
+ config.get("concurrency", profile: "fast") # => 8
+
+ # Persist settings
+ config.set("rate_limit", 1.0)
+ config.set("concurrency", 4, profile: "fast")
+
+ # List profiles
+ config.profiles # => ["fast", "careful"]
+ ----
+
+ === Encoding Detection
+
+ [source,ruby]
+ ----
+ detector = Archaeo::EncodingDetector.new
+
+ # Detect encoding from content + content-type charset
+ encoding = detector.detect(binary_content, content_type: "text/html; charset=iso-8859-1")
+ # => Encoding::ISO_8859_1
+
+ # Detect from HTML meta tag
+ encoding = detector.detect("<html><head><meta charset='utf-8'>...")
+ # => Encoding::UTF_8
+
+ # Multi-encoding fallback chain
+ detector.detect(content) # tries UTF-8, ISO-8859-1, Windows-1252
+ ----
+
+ === Path Sanitization
+
+ [source,ruby]
+ ----
+ sanitizer = Archaeo::PathSanitizer.new
+ safe_path = sanitizer.sanitize("https://example.com/path?q=1&r=2")
+ # => "path_q_1_r_2"
+
+ # Handles query string hashing, recursive percent-decoding,
+ # and file/directory conflict resolution
+ ----
+
+ === Pattern Filtering
+
+ [source,ruby]
+ ----
+ # Include/exclude URL patterns
+ filter = Archaeo::PatternFilter.new(
+   only: ".*\\.html$", # regex string or %r{} Regexp
+   exclude: /\/api\//,
+ )
+
+ filter.match?("https://example.com/page.html") # => true
+ filter.match?("https://example.com/style.css") # => false
+ filter.match?("https://example.com/api/data")  # => false (excluded)
+ ----
+
+ === Subdomain Discovery
+
+ [source,ruby]
+ ----
+ discovery = Archaeo::SubdomainDiscovery.new("example.com", max_depth: 2)
+
+ # Scan downloaded files to discover subdomains
+ subdomains = discovery.scan_files("archive/")
+ # => ["cdn.example.com", "blog.example.com"]
+
+ # Scan raw content (HTML, CSS, JS)
+ subdomains = discovery.scan_content("<a href='https://blog.example.com/post'>")
+ # => ["blog.example.com"]
+ ----
+
+ === Rate Limiting
+
+ [source,ruby]
+ ----
+ # Per-host rate limiter with adaptive backoff
+ limiter = Archaeo::RateLimiter.new(min_interval: 0.5)
+ limiter.wait(host: "web.archive.org") # sleeps if needed
+ limiter.wait(host: "api.example.com") # independent per-host tracking
+ ----
+
+ === Color Output
+
+ [source,ruby]
+ ----
+ color = Archaeo::ColorOutput.new(enabled: true)
+
+ color.success("Done!")   # green + bold
+ color.warning("Careful") # yellow + bold
+ color.error("Failed!")   # red + bold
+ color.info("Info")       # cyan
+
+ # Auto-detects from TTY, NO_COLOR env, TERM=dumb
+ color = Archaeo::ColorOutput.new # enabled: auto-detected
+ ----
+
  === Command-Line Interface
 
  [source,bash]
@@ -543,11 +833,67 @@ archaeo download --concurrency 4 example.com --output ./archive
  # Resume interrupted download
  archaeo download example.com --resume
 
+ # Download with page requisites (linked assets)
+ archaeo download --page-requisites example.com
+
+ # Point-in-time composite snapshot
+ archaeo download --snapshot-at 20220615 example.com
+
+ # All timestamps (not just latest)
+ archaeo download --all-timestamps example.com
+
+ # URL pattern filtering
+ archaeo download --only '.*\.html$' --exclude '/api/' example.com
+
+ # Download scheduling
+ archaeo download --strategy newest_first --max-snapshots 10 example.com
+
+ # Reset download state
+ archaeo download --reset example.com
+
+ # Rate limiting
+ archaeo download --rate-limit 0.5 example.com
+
+ # Recursive subdomain discovery
+ archaeo download --recursive-subdomains --subdomain-depth 2 example.com
+
  # Suppress progress messages
  archaeo --quiet download example.com
 
+ # Disable colored output
+ archaeo --no-color download example.com
+
  # Discover all known URLs for a domain
  archaeo known_urls example.com
+ archaeo known_urls --file urls.txt example.com
+ archaeo known_urls --subdomain example.com
+
+ # Check archive health
+ archaeo health example.com
+ archaeo health --from 20220101 --to 20221231 --sample 50 example.com
+
+ # Analyze archive coverage
+ archaeo coverage example.com
+ archaeo coverage --from 20220101 --to 20221231 --format json example.com
+
+ # Compare two snapshots
+ archaeo snapshot-diff example.com 20220101 20220615
+ archaeo snapshot-diff --format json example.com 20220101 20220615
+
+ # Search archived content
+ archaeo search example.com "contact us"
+ archaeo search --from 20220101 --to 20221231 --max-results 10 example.com "about"
+
+ # Track content changes over time
+ archaeo track-changes example.com
+ archaeo track-changes --from 20220101 --to 20221231 --format json example.com
+
+ # Export to WARC format
+ archaeo warc-export --output archive.warc example.com
+ archaeo warc-export --output archive.warc.gz --gzip example.com
+
+ # Save API with headers
+ archaeo save --headers https://example.com/
  ----
 
  === Error Handling
@@ -585,28 +931,32 @@ Archaeo follows a model-driven, OOP design:
  | Layer | Classes | Purpose
 
  | *Models*
- | `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`, `CdxTimeline`
+ | `Timestamp`, `ArchiveUrl`, `Snapshot`, `Page`, `PageBundle`, `SaveResult`, `AvailabilityResult`, `CdxTimeline`, `ProgressReport`, `CoverageReport`, `ContentChangeReport`, `SearchResult`, `WarcRecord`, `HealthReport`
  | Domain value objects with `to_h`, `as_json`, `inspect` support
 
  | *URL Processing*
- | `UrlNormalizer`, `CdxFilter`, `UrlRewriter`
- | URL sanitization, validated filtering with composition, and HTML URL rewriting
+ | `UrlNormalizer`, `CdxFilter`, `UrlRewriter`, `PatternFilter`, `PathSanitizer`
+ | URL sanitization, validated filtering, regex include/exclude, path conflict resolution, and HTML/JS/CSS URL rewriting
 
  | *Asset Extraction*
  | `AssetExtractor`, `AssetList`
  | Parse HTML for resource URLs including preloads and modulepreload
 
  | *APIs*
- | `CdxApi`, `AvailabilityApi`, `SaveApi`
- | Query and mutate the archive
+ | `CdxApi`, `ParallelCdx`, `AvailabilityApi`, `SaveApi`, `ArchiveSearch`
+ | Query and mutate the archive, with parallel CDX fetching and full-text search
 
  | *Operations*
- | `Fetcher`, `BulkDownloader`, `DownloadState`
- | Download content with resume, dry-run, digest verification, and download summaries
+ | `Fetcher`, `BulkDownloader`, `DownloadState`, `DownloadScheduler`, `SubdomainDiscovery`
+ | Download content with resume, scheduling strategies, subdomain discovery, and digest verification
+
+ | *Analysis*
+ | `SnapshotDiff`, `CoverageAnalyzer`, `ContentTracker`, `ArchiveHealthCheck`
+ | Compare snapshots, analyze coverage, track changes over time, verify accessibility
 
  | *Infrastructure*
- | `HttpClient`
- | HTTP transport with retries, gzip, 429/503 handling, connection pooling, and per-request observability
+ | `HttpClient`, `RateLimiter`, `EncodingDetector`, `CdxCache`, `Configuration`, `ColorOutput`, `WarcWriter`, `WarcReader`
+ | HTTP transport, rate limiting, encoding detection, caching, config management, WARC I/O, and color output
  |===
 
  All API classes accept an `HttpClient` via dependency injection for testability.
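
That injection point is the natural testing seam. A minimal sketch of injecting a stub transport follows; the `http_client:` keyword and the `get(url)` response shape are assumptions for illustration, since this diff does not show the actual constructor signature:

[source,ruby]
----
# Hypothetical stub standing in for Archaeo::HttpClient in tests.
# The http_client: keyword and the response shape are assumed, not
# confirmed by this diff.
FakeResponse = Struct.new(:status, :body, keyword_init: true)

class FakeClient
  def get(url)
    FakeResponse.new(status: 200, body: "[]") # canned empty CDX reply
  end
end

cdx = Archaeo::CdxApi.new(http_client: FakeClient.new)
----
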
data/lib/archaeo/cli.rb CHANGED
@@ -194,6 +194,32 @@ module Archaeo
    end
  end
 
+ desc "rewrite-local INPUT_DIR",
+      "Rewrite previously downloaded files to use local paths"
+ option :output, desc: "Output directory (default: rewrite in-place)",
+                 required: false
+ option :prefix, desc: "Local path prefix", default: "local"
+ option :rewrite_js, type: :boolean, default: false,
+                     desc: "Rewrite URLs in JavaScript strings"
+ option :rewrite_absolute, type: :boolean, default: false,
+                           desc: "Rewrite all absolute archive URLs"
+ def rewrite_local(input_dir)
+   handle_errors do
+     output_dir = options[:output] || input_dir
+     local_rewriter = LocalRewriter.new(
+       prefix: options[:prefix],
+       rewrite_js: options[:rewrite_js],
+       rewrite_absolute: options[:rewrite_absolute],
+     )
+     summary = local_rewriter.rewrite_directory(input_dir, output_dir)
+     color = build_color
+     warn color.success(
+       "Rewrote #{summary.rewritten}/#{summary.total} files " \
+       "in #{summary.elapsed.round(1)}s",
+     )
+   end
+ end
+
  desc "diff URL TIMESTAMP_A TIMESTAMP_B",
       "Compare assets of two archived snapshots"
  option :format, desc: "Output format (table, json)", default: "table"
data/lib/archaeo/local_rewriter.rb ADDED
@@ -0,0 +1,106 @@
+ # frozen_string_literal: true
+
+ require "fileutils"
+
+ module Archaeo
+   LocalRewriteSummary = Struct.new(
+     :total, :rewritten, :skipped, :elapsed,
+     keyword_init: true
+   )
+
+   # Rewrites previously downloaded files by converting archive URLs
+   # to local paths. Operates on files already on disk without fetching.
+   class LocalRewriter
+     def initialize(prefix: "local", rewrite_js: false,
+                    rewrite_absolute: false)
+       @prefix = prefix
+       @rewrite_js = rewrite_js
+       @rewrite_absolute = rewrite_absolute
+     end
+
+     def rewrite_directory(input_dir, output_dir)
+       start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+       files = gather_files(input_dir)
+       rewritten = 0
+       skipped = 0
+
+       files.each do |path|
+         rel = path.sub(%r{\A#{Regexp.escape(input_dir)}/?}, "")
+         out_path = File.join(output_dir, rel)
+
+         result = rewrite_file(path, out_path)
+         result ? rewritten += 1 : skipped += 1
+       end
+
+       elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
+       LocalRewriteSummary.new(
+         total: files.size, rewritten: rewritten,
+         skipped: skipped, elapsed: elapsed
+       )
+     end
+
+     def rewrite_file(input_path, output_path)
+       content = File.read(input_path)
+       return nil unless rewrite_candidate?(content)
+
+       FileUtils.mkdir_p(File.dirname(output_path))
+       rewriter = build_rewriter
+       rewritten = apply_rewriting(rewriter, content, input_path)
+       File.write(output_path, rewritten)
+       true
+     end
+
+     private
+
+     def gather_files(dir)
+       Dir.glob(File.join(dir, "**", "*"))
+          .select { |f| File.file?(f) && text_file?(f) }
+     end
+
+     def text_file?(path)
+       ext = File.extname(path).downcase
+       TEXT_EXTENSIONS.include?(ext)
+     end
+
+     TEXT_EXTENSIONS = %w[
+       .html .htm .xhtml .css .js .json .xml .txt
+       .svg .md .yaml .yml .rss .atom
+     ].freeze
+
+     def rewrite_candidate?(content)
+       content.include?("web.archive.org")
+     end
+
+     def build_rewriter
+       UrlRewriter.new(
+         "https://web.archive.org", @prefix,
+         rewrite_js: @rewrite_js,
+         rewrite_absolute: @rewrite_absolute
+       )
+     end
+
+     def apply_rewriting(rewriter, content, path)
+       ext = File.extname(path).downcase
+       case ext
+       when ".html", ".htm", ".xhtml"
+         rewriter.rewrite_html(content)
+       when ".css"
+         rewriter.rewrite_css(content)
+       when ".js"
+         rewriter.rewrite_js(content)
+       else
+         rewrite_mixed(rewriter, content)
+       end
+     end
+
+     def rewrite_mixed(rewriter, content)
+       if content.include?("<") && content.include?(">")
+         rewriter.rewrite_html(content)
+       elsif content.include?("url(")
+         rewriter.rewrite_css(content)
+       else
+         rewriter.rewrite_js(content)
+       end
+     end
+   end
+ end
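
Beyond the new `rewrite-local` CLI command defined in `cli.rb` above, the class can be driven directly from Ruby. A minimal sketch based on the diffed source, assuming `archive/` holds files from a prior `archaeo download` run:

[source,ruby]
----
require "archaeo"

# Rewrite a downloaded tree so pages reference local copies instead of
# web.archive.org URLs; mirrors the defaults of the rewrite-local command.
rewriter = Archaeo::LocalRewriter.new(
  prefix: "local",        # local path prefix substituted for the archive host
  rewrite_js: true,       # also rewrite URLs inside JS string literals
  rewrite_absolute: false # leave non-prefix absolute URLs untouched
)

summary = rewriter.rewrite_directory("archive", "archive_local")
puts "rewrote #{summary.rewritten}/#{summary.total} files " \
     "(#{summary.skipped} skipped) in #{summary.elapsed.round(1)}s"
----
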
data/lib/archaeo/page.rb CHANGED
@@ -139,6 +139,16 @@ module Archaeo
    end
  end
 
+ def microposts
+   return [] unless html?
+
+   @microposts ||= begin
+     doc = Nokogiri::HTML(@raw_content)
+     containers = find_article_containers(doc)
+     containers.filter_map { |el| extract_micropost(el) }
+   end
+ end
+
  def to_h
    {
      content_type: @content_type,
@@ -255,5 +265,58 @@ module Archaeo
    end
    inputs.reject { |f| f[:name].empty? }
  end
+
+ ARTICLE_SELECTORS = %w[
+   article [role=article] .post .entry .blog-post
+   .hentry .post-content .entry-content .article-content
+   .story .story-body .news-article
+ ].freeze
+
+ def find_article_containers(doc)
+   found = ARTICLE_SELECTORS
+           .filter_map { |sel| doc.css(sel) }
+           .flat_map(&:to_a)
+   found.any? ? found.uniq : [doc.at_css("body") || doc]
+ end
+
+ def extract_micropost(element)
+   title = extract_micropost_title(element)
+   body = extract_micropost_body(element)
+   return nil if body.nil? || body.strip.empty?
+
+   { title: title, body: body.strip,
+     date: extract_micropost_date(element),
+     author: extract_micropost_author(element) }
+ end
+
+ def extract_micropost_title(el)
+   heading = el.at_css("h1, h2, h3, [class*=title], [class*=heading]")
+   heading&.text&.strip
+ end
+
+ def extract_micropost_body(el)
+   paragraphs = el.css("p").map(&:text).join("\n")
+   return nil if paragraphs.strip.empty?
+
+   paragraphs
+ end
+
+ def extract_micropost_date(el)
+   time = el.at_css("time[datetime]")
+   return time["datetime"] if time
+
+   date_el = el.at_css(
+     "[class*=date], [class*=time], [class*=published], " \
+     "[property='datePublished']",
+   )
+   date_el&.text&.strip
+ end
+
+ def extract_micropost_author(el)
+   author_el = el.at_css(
+     "[class*=author], [rel=author], [property='author']",
+   )
+   author_el&.text&.strip
+ end
  end
  end
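
The new `Page#microposts` reader returns an array of hashes keyed by `:title`, `:body`, `:date`, and `:author`. A short usage sketch, assuming `page` was obtained via `Fetcher#fetch!` as in the README examples:

[source,ruby]
----
# Extract article-like entries from an archived HTML page.
page.microposts.each do |post|
  puts post[:title]  # heading text, or nil if none was found
  puts post[:date]   # <time datetime> value or date-like element text
  puts post[:author] # author element text, if present
  puts post[:body]   # concatenated paragraph text
end
----
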
data/lib/archaeo/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true
 
  module Archaeo
-   VERSION = "0.2.10"
+   VERSION = "0.2.11"
  end
data/lib/archaeo.rb CHANGED
@@ -69,4 +69,6 @@ module Archaeo
  autoload :ContentChangeReport, "archaeo/content_tracker"
  autoload :ArchiveSearch, "archaeo/archive_search"
  autoload :SearchResult, "archaeo/archive_search"
+ autoload :LocalRewriter, "archaeo/local_rewriter"
+ autoload :LocalRewriteSummary, "archaeo/local_rewriter"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: archaeo
  version: !ruby/object:Gem::Version
-   version: 0.2.10
+   version: 0.2.11
  platform: ruby
  authors:
  - Ribose Inc.
@@ -93,6 +93,7 @@ files:
  - lib/archaeo/encoding_detector.rb
  - lib/archaeo/fetcher.rb
  - lib/archaeo/http_client.rb
+ - lib/archaeo/local_rewriter.rb
  - lib/archaeo/page.rb
  - lib/archaeo/page_bundle.rb
  - lib/archaeo/parallel_cdx.rb