archaeo 0.2.7 → 0.2.8

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those versions as they appear in their respective public registries.
@@ -8,16 +8,32 @@ module Archaeo
8
8
  # Used for saving archived pages and their assets for offline
9
9
  # browsing. Converts absolute archive URLs into relative paths
10
10
  # rooted at a configurable local directory.
11
+ #
12
+ # Supports HTML attributes, srcset, inline styles, CSS url(),
13
+ # JavaScript string URLs, and server-side extension handling.
11
14
  class UrlRewriter
12
- URL_ATTRS = %w[src href data-src poster].freeze
15
+ URL_ATTRS = %w[src href data-src data-url poster action].freeze
13
16
  CSS_URL_RE = /url\(\s*['"]?([^'")\s]+)['"]?\s*\)/
17
+ ARCHIVE_RE = %r{https?://web\.archive\.org/web/\d+(?:id_)?/}
18
+ JS_URL_RE = /['"](https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/[^'"]+)['"]/
19
+
20
+ SERVER_EXTENSIONS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze
14
21
 
15
- def initialize(archive_prefix, local_prefix)
22
+ def initialize(archive_prefix, local_prefix,
23
+ rewrite_js: false, rewrite_absolute: false,
24
+ server_extensions: false)
16
25
  @archive_prefix = archive_prefix.to_s
17
26
  @local_prefix = local_prefix.to_s
27
+ @rewrite_js = rewrite_js
28
+ @rewrite_absolute = rewrite_absolute
29
+ @server_extensions = server_extensions
18
30
  end
19
31
 
20
32
  def rewrite(url)
33
+ if @rewrite_absolute && url.match?(ARCHIVE_RE)
34
+ return rewrite_absolute_url(url)
35
+ end
36
+
21
37
  return url unless url.start_with?(@archive_prefix)
22
38
 
23
39
  relative = url.sub(@archive_prefix, "")
@@ -37,13 +53,39 @@ module Archaeo
37
53
  doc.to_html
38
54
  end
39
55
 
56
+ def rewrite_js(js_content)
57
+ return js_content unless @rewrite_js
58
+
59
+ js_content.gsub(JS_URL_RE) do
60
+ quote = Regexp.last_match[0][0]
61
+ url = Regexp.last_match[1]
62
+ rewritten = rewrite(url)
63
+ "#{quote}#{rewritten}#{quote}"
64
+ end
65
+ end
66
+
67
+ def rewrite_css(css_content)
68
+ css_content.gsub(CSS_URL_RE) do
69
+ url = Regexp.last_match[1]
70
+ if url.match?(ARCHIVE_RE) || url.start_with?(@archive_prefix)
71
+ "url('#{rewrite(url)}')"
72
+ else
73
+ Regexp.last_match[0]
74
+ end
75
+ end
76
+ end
77
+
40
78
  def rewrite_url_attrs(doc)
41
79
  URL_ATTRS.each do |attr|
42
80
  doc.css("[#{attr}]").each do |el|
43
81
  original = el[attr]
44
- next unless original&.start_with?(@archive_prefix)
82
+ next unless original
45
83
 
46
- el[attr] = rewrite(original)
84
+ if @rewrite_absolute && original.match?(ARCHIVE_RE)
85
+ el[attr] = rewrite_absolute_url(original)
86
+ elsif original.start_with?(@archive_prefix)
87
+ el[attr] = rewrite(original)
88
+ end
47
89
  end
48
90
  end
49
91
  end
@@ -73,8 +115,11 @@ module Archaeo
73
115
  def rewrite_css_urls(css_text)
74
116
  css_text.gsub(CSS_URL_RE) do
75
117
  url = Regexp.last_match[1]
76
- rewritten = url.start_with?(@archive_prefix) ? rewrite(url) : url
77
- "url('#{rewritten}')"
118
+ if url.match?(ARCHIVE_RE) || url.start_with?(@archive_prefix)
119
+ "url('#{rewrite(url)}')"
120
+ else
121
+ Regexp.last_match[0]
122
+ end
78
123
  end
79
124
  end
80
125
 
@@ -85,9 +130,21 @@ module Archaeo
85
130
  parts = entry.strip.split(/\s+/, 2)
86
131
  url = parts[0]
87
132
  descriptor = parts[1]
88
- rewritten = url.start_with?(@archive_prefix) ? rewrite(url) : url
133
+
134
+ rewritten = if @rewrite_absolute && url.match?(ARCHIVE_RE)
135
+ rewrite_absolute_url(url)
136
+ elsif url.start_with?(@archive_prefix)
137
+ rewrite(url)
138
+ else
139
+ url
140
+ end
89
141
  descriptor ? "#{rewritten} #{descriptor}" : rewritten
90
142
  end.join(", ")
91
143
  end
144
+
145
+ def rewrite_absolute_url(url)
146
+ rest = url.sub(ARCHIVE_RE, "")
147
+ File.join(@local_prefix, rest)
148
+ end
92
149
  end
93
150
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Archaeo
4
- VERSION = "0.2.7"
4
+ VERSION = "0.2.8"
5
5
  end
data/lib/archaeo.rb CHANGED
@@ -48,4 +48,11 @@ module Archaeo
48
48
  autoload :Fetcher, "archaeo/fetcher"
49
49
  autoload :BulkDownloader, "archaeo/bulk_downloader"
50
50
  autoload :Cli, "archaeo/cli"
51
+ autoload :EncodingDetector, "archaeo/encoding_detector"
52
+ autoload :PathSanitizer, "archaeo/path_sanitizer"
53
+ autoload :RateLimiter, "archaeo/rate_limiter"
54
+ autoload :PatternFilter, "archaeo/pattern_filter"
55
+ autoload :CdxCache, "archaeo/cdx_cache"
56
+ autoload :SubdomainDiscovery, "archaeo/subdomain_discovery"
57
+ autoload :ArchiveHealthCheck, "archaeo/archive_health_check"
51
58
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: archaeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.7
4
+ version: 0.2.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-11 00:00:00.000000000 Z
11
+ date: 2026-05-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: csv
@@ -71,6 +71,7 @@ files:
71
71
  - bin/setup
72
72
  - exe/archaeo
73
73
  - lib/archaeo.rb
74
+ - lib/archaeo/archive_health_check.rb
74
75
  - lib/archaeo/archive_url.rb
75
76
  - lib/archaeo/asset_extractor.rb
76
77
  - lib/archaeo/asset_list.rb
@@ -78,17 +79,23 @@ files:
78
79
  - lib/archaeo/availability_result.rb
79
80
  - lib/archaeo/bulk_downloader.rb
80
81
  - lib/archaeo/cdx_api.rb
82
+ - lib/archaeo/cdx_cache.rb
81
83
  - lib/archaeo/cdx_filter.rb
82
84
  - lib/archaeo/cdx_timeline.rb
83
85
  - lib/archaeo/cli.rb
84
86
  - lib/archaeo/download_state.rb
87
+ - lib/archaeo/encoding_detector.rb
85
88
  - lib/archaeo/fetcher.rb
86
89
  - lib/archaeo/http_client.rb
87
90
  - lib/archaeo/page.rb
88
91
  - lib/archaeo/page_bundle.rb
92
+ - lib/archaeo/path_sanitizer.rb
93
+ - lib/archaeo/pattern_filter.rb
94
+ - lib/archaeo/rate_limiter.rb
89
95
  - lib/archaeo/save_api.rb
90
96
  - lib/archaeo/save_result.rb
91
97
  - lib/archaeo/snapshot.rb
98
+ - lib/archaeo/subdomain_discovery.rb
92
99
  - lib/archaeo/timestamp.rb
93
100
  - lib/archaeo/url_normalizer.rb
94
101
  - lib/archaeo/url_rewriter.rb