archaeo 0.2.7 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/archaeo/archive_health_check.rb +77 -0
- data/lib/archaeo/bulk_downloader.rb +82 -24
- data/lib/archaeo/cdx_api.rb +39 -7
- data/lib/archaeo/cdx_cache.rb +105 -0
- data/lib/archaeo/cli.rb +109 -8
- data/lib/archaeo/download_state.rb +35 -0
- data/lib/archaeo/encoding_detector.rb +91 -0
- data/lib/archaeo/page.rb +1 -1
- data/lib/archaeo/path_sanitizer.rb +152 -0
- data/lib/archaeo/pattern_filter.rb +80 -0
- data/lib/archaeo/rate_limiter.rb +86 -0
- data/lib/archaeo/save_api.rb +7 -2
- data/lib/archaeo/save_result.rb +26 -8
- data/lib/archaeo/subdomain_discovery.rb +117 -0
- data/lib/archaeo/url_rewriter.rb +64 -7
- data/lib/archaeo/version.rb +1 -1
- data/lib/archaeo.rb +7 -0
- metadata +9 -2
data/lib/archaeo/url_rewriter.rb
CHANGED
|
@@ -8,16 +8,32 @@ module Archaeo
|
|
|
8
8
|
# Used for saving archived pages and their assets for offline
|
|
9
9
|
# browsing. Converts absolute archive URLs into relative paths
|
|
10
10
|
# rooted at a configurable local directory.
|
|
11
|
+
#
|
|
12
|
+
# Supports HTML attributes, srcset, inline styles, CSS url(),
|
|
13
|
+
# JavaScript string URLs, and server-side extension handling.
|
|
11
14
|
class UrlRewriter
|
|
12
|
-
URL_ATTRS = %w[src href data-src poster].freeze
|
|
15
|
+
URL_ATTRS = %w[src href data-src data-url poster action].freeze
|
|
13
16
|
CSS_URL_RE = /url\(\s*['"]?([^'")\s]+)['"]?\s*\)/
|
|
17
|
+
ARCHIVE_RE = %r{https?://web\.archive\.org/web/\d+(?:id_)?/}
|
|
18
|
+
JS_URL_RE = /['"](https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/[^'"]+)['"]/
|
|
19
|
+
|
|
20
|
+
SERVER_EXTENSIONS = %w[.php .asp .aspx .jsp .cgi .pl .py].freeze
|
|
14
21
|
|
|
15
|
-
def initialize(archive_prefix, local_prefix)
|
|
22
|
+
def initialize(archive_prefix, local_prefix,
|
|
23
|
+
rewrite_js: false, rewrite_absolute: false,
|
|
24
|
+
server_extensions: false)
|
|
16
25
|
@archive_prefix = archive_prefix.to_s
|
|
17
26
|
@local_prefix = local_prefix.to_s
|
|
27
|
+
@rewrite_js = rewrite_js
|
|
28
|
+
@rewrite_absolute = rewrite_absolute
|
|
29
|
+
@server_extensions = server_extensions
|
|
18
30
|
end
|
|
19
31
|
|
|
20
32
|
def rewrite(url)
|
|
33
|
+
if @rewrite_absolute && url.match?(ARCHIVE_RE)
|
|
34
|
+
return rewrite_absolute_url(url)
|
|
35
|
+
end
|
|
36
|
+
|
|
21
37
|
return url unless url.start_with?(@archive_prefix)
|
|
22
38
|
|
|
23
39
|
relative = url.sub(@archive_prefix, "")
|
|
@@ -37,13 +53,39 @@ module Archaeo
|
|
|
37
53
|
doc.to_html
|
|
38
54
|
end
|
|
39
55
|
|
|
56
|
+
def rewrite_js(js_content)
|
|
57
|
+
return js_content unless @rewrite_js
|
|
58
|
+
|
|
59
|
+
js_content.gsub(JS_URL_RE) do
|
|
60
|
+
quote = Regexp.last_match[0][0]
|
|
61
|
+
url = Regexp.last_match[1]
|
|
62
|
+
rewritten = rewrite(url)
|
|
63
|
+
"#{quote}#{rewritten}#{quote}"
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def rewrite_css(css_content)
|
|
68
|
+
css_content.gsub(CSS_URL_RE) do
|
|
69
|
+
url = Regexp.last_match[1]
|
|
70
|
+
if url.match?(ARCHIVE_RE) || url.start_with?(@archive_prefix)
|
|
71
|
+
"url('#{rewrite(url)}')"
|
|
72
|
+
else
|
|
73
|
+
Regexp.last_match[0]
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
|
|
40
78
|
def rewrite_url_attrs(doc)
|
|
41
79
|
URL_ATTRS.each do |attr|
|
|
42
80
|
doc.css("[#{attr}]").each do |el|
|
|
43
81
|
original = el[attr]
|
|
44
|
-
next unless original
|
|
82
|
+
next unless original
|
|
45
83
|
|
|
46
|
-
|
|
84
|
+
if @rewrite_absolute && original.match?(ARCHIVE_RE)
|
|
85
|
+
el[attr] = rewrite_absolute_url(original)
|
|
86
|
+
elsif original.start_with?(@archive_prefix)
|
|
87
|
+
el[attr] = rewrite(original)
|
|
88
|
+
end
|
|
47
89
|
end
|
|
48
90
|
end
|
|
49
91
|
end
|
|
@@ -73,8 +115,11 @@ module Archaeo
|
|
|
73
115
|
def rewrite_css_urls(css_text)
|
|
74
116
|
css_text.gsub(CSS_URL_RE) do
|
|
75
117
|
url = Regexp.last_match[1]
|
|
76
|
-
|
|
77
|
-
|
|
118
|
+
if url.match?(ARCHIVE_RE) || url.start_with?(@archive_prefix)
|
|
119
|
+
"url('#{rewrite(url)}')"
|
|
120
|
+
else
|
|
121
|
+
Regexp.last_match[0]
|
|
122
|
+
end
|
|
78
123
|
end
|
|
79
124
|
end
|
|
80
125
|
|
|
@@ -85,9 +130,21 @@ module Archaeo
|
|
|
85
130
|
parts = entry.strip.split(/\s+/, 2)
|
|
86
131
|
url = parts[0]
|
|
87
132
|
descriptor = parts[1]
|
|
88
|
-
|
|
133
|
+
|
|
134
|
+
rewritten = if @rewrite_absolute && url.match?(ARCHIVE_RE)
|
|
135
|
+
rewrite_absolute_url(url)
|
|
136
|
+
elsif url.start_with?(@archive_prefix)
|
|
137
|
+
rewrite(url)
|
|
138
|
+
else
|
|
139
|
+
url
|
|
140
|
+
end
|
|
89
141
|
descriptor ? "#{rewritten} #{descriptor}" : rewritten
|
|
90
142
|
end.join(", ")
|
|
91
143
|
end
|
|
144
|
+
|
|
145
|
+
def rewrite_absolute_url(url)
|
|
146
|
+
rest = url.sub(ARCHIVE_RE, "")
|
|
147
|
+
File.join(@local_prefix, rest)
|
|
148
|
+
end
|
|
92
149
|
end
|
|
93
150
|
end
|
data/lib/archaeo/version.rb
CHANGED
data/lib/archaeo.rb
CHANGED
|
@@ -48,4 +48,11 @@ module Archaeo
|
|
|
48
48
|
autoload :Fetcher, "archaeo/fetcher"
|
|
49
49
|
autoload :BulkDownloader, "archaeo/bulk_downloader"
|
|
50
50
|
autoload :Cli, "archaeo/cli"
|
|
51
|
+
autoload :EncodingDetector, "archaeo/encoding_detector"
|
|
52
|
+
autoload :PathSanitizer, "archaeo/path_sanitizer"
|
|
53
|
+
autoload :RateLimiter, "archaeo/rate_limiter"
|
|
54
|
+
autoload :PatternFilter, "archaeo/pattern_filter"
|
|
55
|
+
autoload :CdxCache, "archaeo/cdx_cache"
|
|
56
|
+
autoload :SubdomainDiscovery, "archaeo/subdomain_discovery"
|
|
57
|
+
autoload :ArchiveHealthCheck, "archaeo/archive_health_check"
|
|
51
58
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: archaeo
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.7
|
|
4
|
+
version: 0.2.8
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: csv
|
|
@@ -71,6 +71,7 @@ files:
|
|
|
71
71
|
- bin/setup
|
|
72
72
|
- exe/archaeo
|
|
73
73
|
- lib/archaeo.rb
|
|
74
|
+
- lib/archaeo/archive_health_check.rb
|
|
74
75
|
- lib/archaeo/archive_url.rb
|
|
75
76
|
- lib/archaeo/asset_extractor.rb
|
|
76
77
|
- lib/archaeo/asset_list.rb
|
|
@@ -78,17 +79,23 @@ files:
|
|
|
78
79
|
- lib/archaeo/availability_result.rb
|
|
79
80
|
- lib/archaeo/bulk_downloader.rb
|
|
80
81
|
- lib/archaeo/cdx_api.rb
|
|
82
|
+
- lib/archaeo/cdx_cache.rb
|
|
81
83
|
- lib/archaeo/cdx_filter.rb
|
|
82
84
|
- lib/archaeo/cdx_timeline.rb
|
|
83
85
|
- lib/archaeo/cli.rb
|
|
84
86
|
- lib/archaeo/download_state.rb
|
|
87
|
+
- lib/archaeo/encoding_detector.rb
|
|
85
88
|
- lib/archaeo/fetcher.rb
|
|
86
89
|
- lib/archaeo/http_client.rb
|
|
87
90
|
- lib/archaeo/page.rb
|
|
88
91
|
- lib/archaeo/page_bundle.rb
|
|
92
|
+
- lib/archaeo/path_sanitizer.rb
|
|
93
|
+
- lib/archaeo/pattern_filter.rb
|
|
94
|
+
- lib/archaeo/rate_limiter.rb
|
|
89
95
|
- lib/archaeo/save_api.rb
|
|
90
96
|
- lib/archaeo/save_result.rb
|
|
91
97
|
- lib/archaeo/snapshot.rb
|
|
98
|
+
- lib/archaeo/subdomain_discovery.rb
|
|
92
99
|
- lib/archaeo/timestamp.rb
|
|
93
100
|
- lib/archaeo/url_normalizer.rb
|
|
94
101
|
- lib/archaeo/url_rewriter.rb
|