@pagepocket/lib 0.6.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +265 -3
- package/dist/builtin-blacklist.d.ts +3 -0
- package/dist/builtin-blacklist.js +6 -0
- package/dist/debug.d.ts +2 -0
- package/dist/debug.js +18 -0
- package/dist/hackers/index.js +6 -0
- package/dist/hackers/replay-block-text-fragment.d.ts +2 -0
- package/dist/hackers/replay-block-text-fragment.js +71 -0
- package/dist/hackers/replay-css-proxy.d.ts +2 -0
- package/dist/hackers/replay-css-proxy.js +206 -0
- package/dist/hackers/replay-dom-rewrite.js +103 -32
- package/dist/hackers/replay-history-path.d.ts +2 -0
- package/dist/hackers/replay-history-path.js +25 -0
- package/dist/index.d.ts +3 -1
- package/dist/index.js +18 -1
- package/dist/inflight-tracker.d.ts +19 -0
- package/dist/inflight-tracker.js +48 -0
- package/dist/pagepocket.d.ts +3 -1
- package/dist/pagepocket.js +150 -35
- package/dist/path-resolver.js +14 -4
- package/dist/replace-elements.d.ts +9 -0
- package/dist/replace-elements.js +258 -0
- package/dist/replay-script.js +308 -6
- package/dist/resource-proxy.d.ts +34 -0
- package/dist/resource-proxy.js +284 -0
- package/dist/rewrite-links.d.ts +8 -0
- package/dist/rewrite-links.js +122 -12
- package/dist/snapshot-builder.d.ts +2 -1
- package/dist/snapshot-builder.js +75 -2
- package/dist/types.d.ts +88 -1
- package/dist/writers.d.ts +2 -2
- package/dist/writers.js +56 -4
- package/package.json +3 -3
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.resolveToLocalPath = exports.buildResourceProxyIndex = void 0;
|
|
4
|
+
/**
 * Record `value` under `key` in a map that may hold several values per key.
 *
 * The first value for a key is stored bare; a second value promotes the slot
 * to a two-element array, and later values are appended to that array.
 *
 * @param {Map} map - Map to update in place.
 * @param {*} key - Lookup key.
 * @param {*} value - Value to record.
 */
const addMulti = (map, key, value) => {
    const current = map.get(key);
    if (Array.isArray(current)) {
        current.push(value);
    }
    else if (current) {
        map.set(key, [current, value]);
    }
    else {
        // Nothing (or a falsy placeholder) stored yet: keep the value bare.
        map.set(key, value);
    }
};
|
|
16
|
+
/**
 * Normalize a multi-map slot to an array.
 *
 * @param {*} value - A bare value, an array of values, or a falsy value.
 * @returns {Array} The array itself, a one-element array, or `[]` for falsy input.
 */
const toArray = (value) => {
    if (Array.isArray(value)) {
        return value;
    }
    return value ? [value] : [];
};
|
|
22
|
+
/**
 * Drop everything from the first "#" onward.
 *
 * @param {string} value - URL or path string.
 * @returns {string} The input without its fragment; unchanged when no "#" is present.
 */
const stripHash = (value) => {
    const hashAt = value.indexOf("#");
    if (hashAt < 0) {
        return value;
    }
    return value.substring(0, hashAt);
};
|
|
26
|
+
/**
 * Remove a single trailing "/" from a path.
 *
 * Falsy input and the root path "/" are returned untouched.
 *
 * @param {string} value - Path to trim.
 * @returns {string} The path without one trailing slash.
 */
const stripTrailingSlash = (value) => {
    const isStrippable = Boolean(value) && value !== "/" && value.endsWith("/");
    return isStrippable ? value.slice(0, -1) : value;
};
|
|
32
|
+
/**
 * Heuristic check for a path whose "%" characters were already escaped to
 * "%25" once (i.e. it contains a "%25" followed by two hex digits).
 *
 * Deliberately conservative: escaping a second time would break lookups.
 *
 * @param {string} value - Path to inspect.
 * @returns {boolean} True when a "%25XX" sequence is present.
 */
const looksAlreadyEscapedForStaticServers = (value) => {
    const escapedOncePattern = /%25[0-9a-fA-F]{2}/;
    return escapedOncePattern.test(value);
};
|
|
39
|
+
/**
 * Escape every "%" as "%25", unless the value is falsy or already looks
 * escaped according to looksAlreadyEscapedForStaticServers.
 *
 * @param {string} value - Snapshot-local path.
 * @returns {string} The escaped path, or the input unchanged.
 */
const escapePercentForStaticServersOnce = (value) => {
    const needsEscaping = Boolean(value) && !looksAlreadyEscapedForStaticServers(value);
    return needsEscaping ? value.split("%").join("%25") : value;
};
|
|
48
|
+
/**
 * Loose hostname test, used only as a guard for embedded-URL detection.
 *
 * @param {string} value - Candidate host segment.
 * @returns {boolean} True for "localhost" or any non-empty string containing a dot.
 */
const isLikelyHostname = (value) => {
    if (!value) {
        return false;
    }
    return value === "localhost" || value.includes(".");
};
|
|
56
|
+
/**
 * Build an alternate pathname in which an embedded absolute URL tail is
 * collapsed into a single encodeURIComponent-encoded segment.
 *
 * Some CDNs embed a full URL into one path segment in encoded form
 * (".../https%3A%2F%2Fexample.com%2Fa.png"), while other runtimes request the
 * decoded form (".../https://example.com/a.png"), which changes the segment
 * structure. This turns the decoded form back into the encoded one.
 *
 * @param {string} pathname - Request pathname (falsy values are treated as "").
 * @returns {string|null} The rebuilt pathname with a leading "/", or null when
 *   no "http:"/"https:" segment followed by an empty segment and a
 *   likely hostname is found.
 */
const encodeEmbeddedUrlTailIfPresent = (pathname) => {
    const raw = String(pathname || "");
    if (!raw.includes("/http")) {
        return null;
    }
    const segments = raw.split("/");
    // A decoded in-path URL splits like ["...", "https:", "", "example.com", "a", "b.png"].
    const schemeAt = segments.findIndex((segment, at) => {
        if (segment !== "http:" && segment !== "https:") {
            return false;
        }
        return segments[at + 1] === "" && isLikelyHostname(segments[at + 2] || "");
    });
    if (schemeAt === -1) {
        return null;
    }
    const embeddedUrl = segments[schemeAt] + "//" + segments.slice(schemeAt + 2).join("/");
    const collapsed = [...segments.slice(0, schemeAt), encodeURIComponent(embeddedUrl)].join("/") || "/";
    return collapsed.startsWith("/") ? collapsed : "/" + collapsed;
};
|
|
91
|
+
/**
 * List the distinct pathname spellings to try when looking up a request.
 *
 * In order: the pathname itself, the pathname without a trailing slash, and
 * (when an embedded URL tail is detected) the encoded-tail form with and
 * without a trailing slash. Falsy entries and duplicates are dropped.
 *
 * @param {string} pathname - Request pathname.
 * @returns {string[]} Unique variants in first-seen order.
 */
const makePathnameVariants = (pathname) => {
    const candidates = [pathname, stripTrailingSlash(pathname)];
    const encodedTail = encodeEmbeddedUrlTailIfPresent(pathname);
    if (encodedTail && encodedTail !== pathname) {
        candidates.push(encodedTail, stripTrailingSlash(encodedTail));
    }
    // A Set keeps insertion order while removing duplicates.
    return [...new Set(candidates.filter(Boolean))];
};
|
|
107
|
+
/**
 * Return the last non-empty "/"-separated segment of a path, ignoring
 * anything after the first "?".
 *
 * @param {string} pathname - Path, optionally followed by a query string.
 * @returns {string} The final segment, or "" when there is none.
 */
const getBasename = (pathname) => {
    const [beforeQuery = ""] = pathname.split("?");
    const segments = beforeQuery.split("/").filter((segment) => segment !== "");
    return segments.length > 0 ? segments[segments.length - 1] : "";
};
|
|
112
|
+
/**
 * Parse a string as an absolute URL without throwing.
 *
 * @param {string} value - Candidate URL.
 * @returns {URL|null} The parsed URL, or null when the URL constructor rejects it.
 */
const toUrlOrNull = (value) => {
    let parsed = null;
    try {
        parsed = new URL(value);
    }
    catch {
        // Not a valid absolute URL: fall through with null.
    }
    return parsed;
};
|
|
120
|
+
/**
 * Build lookup tables over a snapshot's resource items.
 *
 * Items lacking `url` or `path`, or whose `url` is not a parseable absolute
 * URL, are skipped. Every indexed entry is a copy of the item extended with
 * `parsed`, `pathname`, `pathnameWithSearch` and `basename`.
 *
 * @param {{items?: Array<{url: string, path: string}>}} snapshot - Snapshot whose `items` are indexed.
 * @returns {{byExactUrl: Map, byPathnameWithSearch: Map, byPathname: Map, byBasename: Map}}
 *   `byExactUrl` keeps the first entry seen per URL; the other maps are
 *   multi-valued (a bare entry or an array of entries per key).
 */
const buildResourceProxyIndex = (snapshot) => {
    const index = {
        byExactUrl: new Map(),
        byPathnameWithSearch: new Map(),
        byPathname: new Map(),
        byBasename: new Map()
    };
    for (const item of snapshot.items || []) {
        const parsed = item && item.url && item.path ? toUrlOrNull(item.url) : null;
        if (!parsed) {
            continue;
        }
        const pathname = parsed.pathname || "/";
        const entry = {
            ...item,
            parsed,
            pathname,
            pathnameWithSearch: pathname + (parsed.search || ""),
            basename: getBasename(pathname)
        };
        // First-seen item wins for an exact URL.
        const exactKey = parsed.toString();
        if (!index.byExactUrl.has(exactKey)) {
            index.byExactUrl.set(exactKey, entry);
        }
        addMulti(index.byPathnameWithSearch, entry.pathnameWithSearch, entry);
        addMulti(index.byPathname, pathname, entry);
        if (entry.basename) {
            addMulti(index.byBasename, entry.basename, entry);
        }
    }
    return index;
};
|
|
160
|
+
exports.buildResourceProxyIndex = buildResourceProxyIndex;
|
|
161
|
+
/**
 * Remove items that share a `path` with an earlier item.
 *
 * @param {Iterable<{path: *}>} items - Candidate entries.
 * @returns {Array} The first item seen for each distinct `path`, in input order.
 */
const uniqByPath = (items) => {
    const firstByPath = new Map();
    for (const item of items) {
        if (!firstByPath.has(item.path)) {
            firstByPath.set(item.path, item);
        }
    }
    return Array.from(firstByPath.values());
};
|
|
172
|
+
/**
 * Pick one entry out of several candidates, or refuse when ambiguous.
 *
 * With zero or one candidate the answer is trivial. Otherwise the only
 * tie-breaker is origin: if exactly one candidate shares the origin of
 * `baseUrl`, it wins. Every other situation is treated as ambiguous.
 *
 * @param {Array<{parsed: URL}>} items - Candidate index entries.
 * @param {string} baseUrl - URL whose origin is used as the tie-breaker; an
 *   unparseable value simply disables the origin check.
 * @param {number} suffixLength - Strength of the key that produced the
 *   candidates. Accepted for interface compatibility; it currently has no
 *   influence on the result because ambiguous matches are always rejected.
 * @returns {object|null} The chosen entry, or null when none or ambiguous.
 */
const preferSingle = (items, baseUrl, suffixLength) => {
    if (items.length <= 1) {
        return items[0] ?? null;
    }
    const baseOrigin = (() => {
        try {
            return new URL(baseUrl).origin;
        }
        catch {
            return null;
        }
    })();
    if (baseOrigin !== null) {
        const sameOrigin = items.filter((item) => item.parsed.origin === baseOrigin);
        if (sameOrigin.length === 1) {
            return sameOrigin[0];
        }
    }
    // Still ambiguous (several same-origin matches, or none): do not guess.
    return null;
};
|
|
200
|
+
/**
 * Reduce a candidate list to a single entry, if that can be done safely.
 *
 * Candidates are first de-duplicated by `path`; a lone survivor is returned
 * directly, and multiple survivors are delegated to preferSingle.
 *
 * @param {Array} items - Candidate index entries.
 * @param {string} baseUrl - Passed through to preferSingle.
 * @param {number} suffixLength - Passed through to preferSingle.
 * @returns {object|null} The selected entry, or null.
 */
const tryCandidates = (items, baseUrl, suffixLength) => {
    const distinct = uniqByPath(items);
    switch (distinct.length) {
        case 0:
            return null;
        case 1:
            return distinct[0];
        default:
            return preferSingle(distinct, baseUrl, suffixLength);
    }
};
|
|
210
|
+
/**
 * Enumerate path suffixes from longest to shortest by progressively
 * dropping leading segments.
 *
 * @param {string} pathname - Path to split on "/" (empty segments ignored).
 * @returns {Array<{key: string, depth: number}>} Each suffix as "/seg/..."
 *   together with the number of segments it contains.
 */
const makeSuffixes = (pathname) => {
    const segments = pathname.split("/").filter(Boolean);
    return segments.map((_, start) => ({
        key: "/" + segments.slice(start).join("/"),
        depth: segments.length - start
    }));
};
|
|
219
|
+
/**
 * Map a requested URL to the snapshot-local path of a captured resource.
 *
 * Lookup strategy, first hit wins:
 *   1. exact absolute URL, then the same URL without its fragment;
 *   2. pathname + search, for every pathname variant;
 *   3. pathname only, for every pathname variant;
 *   4. progressively shorter pathname suffixes, for every variant;
 *   5. basename only.
 * Steps 2-5 accept a match only when tryCandidates can narrow the candidates
 * to a single entry.
 *
 * @param {{requestUrl: string, baseUrl: string, index: object}} options
 *   `requestUrl` is resolved against `baseUrl`; `index` is the structure
 *   produced by buildResourceProxyIndex.
 * @returns {string|undefined} The matched entry's `path` passed through
 *   escapePercentForStaticServersOnce, or undefined when the URL is empty,
 *   unparseable, or unmatched.
 */
const resolveToLocalPath = (options) => {
    const { requestUrl, baseUrl, index } = options;
    if (!requestUrl) {
        return undefined;
    }
    let abs;
    try {
        abs = new URL(requestUrl, baseUrl);
    }
    catch {
        return undefined;
    }
    const toLocal = (entry) => escapePercentForStaticServersOnce(entry.path);
    const absString = abs.toString();
    const exact = index.byExactUrl.get(absString);
    if (exact) {
        return toLocal(exact);
    }
    const withoutHash = stripHash(absString);
    if (withoutHash !== absString) {
        const found = index.byExactUrl.get(withoutHash);
        if (found) {
            return toLocal(found);
        }
    }
    const pathname = abs.pathname || "/";
    const pathnameVariants = makePathnameVariants(pathname);
    const search = abs.search || "";
    // Lazily yields [map, key, strength] lookups in priority order.
    function* attempts() {
        for (const variant of pathnameVariants) {
            yield [index.byPathnameWithSearch, variant + search, 99];
        }
        for (const variant of pathnameVariants) {
            yield [index.byPathname, variant, 99];
        }
        // Suffix fallback: progressively remove leading segments.
        for (const variant of pathnameVariants) {
            for (const suffix of makeSuffixes(variant)) {
                yield [index.byPathname, suffix.key, suffix.depth];
            }
        }
        const basename = getBasename(pathname);
        if (basename) {
            yield [index.byBasename, basename, 1];
        }
    }
    for (const [table, key, strength] of attempts()) {
        const match = tryCandidates(toArray(table.get(key)), baseUrl, strength);
        if (match) {
            return toLocal(match);
        }
    }
    return undefined;
};
|
|
284
|
+
exports.resolveToLocalPath = resolveToLocalPath;
|
package/dist/rewrite-links.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import type { ReplaceElementsConfig } from "./types";
|
|
1
2
|
type UrlResolver = (absoluteUrl: string) => string | null;
|
|
2
3
|
export declare const rewriteJsText: (source: string, resolve: UrlResolver, baseUrl: string) => Promise<string>;
|
|
3
4
|
export declare const rewriteEntryHtml: (input: {
|
|
@@ -6,6 +7,13 @@ export declare const rewriteEntryHtml: (input: {
|
|
|
6
7
|
apiPath: string;
|
|
7
8
|
resolve: UrlResolver;
|
|
8
9
|
rewriteLinks?: boolean;
|
|
10
|
+
replaceElements?: ReplaceElementsConfig;
|
|
11
|
+
isEntryDocument?: boolean;
|
|
12
|
+
/**
|
|
13
|
+
* The top-level entry URL for this snapshot (may differ from entryUrl when
|
|
14
|
+
* multiple documents/frames are captured).
|
|
15
|
+
*/
|
|
16
|
+
snapshotEntryUrl?: string;
|
|
9
17
|
}) => Promise<{
|
|
10
18
|
html: string;
|
|
11
19
|
title?: string;
|
package/dist/rewrite-links.js
CHANGED
|
@@ -37,6 +37,7 @@ exports.rewriteEntryHtml = exports.rewriteJsText = void 0;
|
|
|
37
37
|
const cheerio = __importStar(require("cheerio"));
|
|
38
38
|
const css_rewrite_1 = require("./css-rewrite");
|
|
39
39
|
const hack_html_1 = require("./hack-html");
|
|
40
|
+
const replace_elements_1 = require("./replace-elements");
|
|
40
41
|
const shouldSkipValue = (value) => {
|
|
41
42
|
const trimmed = value.trim();
|
|
42
43
|
return (!trimmed ||
|
|
@@ -59,18 +60,83 @@ const resolveUrlValue = (value, baseUrl, resolve) => {
|
|
|
59
60
|
return null;
|
|
60
61
|
}
|
|
61
62
|
};
|
|
63
|
+
/**
 * Detect srcset values that cannot be parsed safely.
 *
 * Some sites (notably Substack) emit image-transform URLs with commas inside
 * the URL itself. Because commas separate srcset candidates, such values
 * parse into garbage URLs. For offline snapshots it is better to drop the
 * srcset and rely on the already-rewritten img[src].
 *
 * @param {string} value - Raw srcset attribute value.
 * @returns {boolean} True when the value contains "/image/fetch/", an encoded
 *   "https%3A%2F%2F" tail, and at least one ",w_" / ",h_" / ",c_" style token
 *   (with or without a space after the comma).
 */
const isUnsafeSrcsetValue = (value) => {
    const text = value.trim();
    if (!text) {
        return false;
    }
    if (!text.includes("/image/fetch/") || !text.includes("https%3A%2F%2F")) {
        return false;
    }
    const transformMarkers = [",w_", ", w_", ",h_", ", h_", ",c_", ", c_"];
    return transformMarkers.some((marker) => text.includes(marker));
};
|
|
87
|
+
/**
 * Recognize the common srcset descriptors: pixel density ("1x", "1.5x") and
 * width ("320w"), case-insensitively.
 *
 * @param {string} token - Single whitespace-delimited token.
 * @returns {boolean} True when the trimmed token is a density or width descriptor.
 */
const isDescriptorToken = (token) => {
    const text = token.trim();
    if (text === "") {
        return false;
    }
    const isDensity = /^\d+(\.\d+)?x$/i.test(text);
    return isDensity || /^\d+w$/i.test(text);
};
|
|
94
|
+
/**
 * Minimal srcset parser.
 *
 * The input is split on commas; each non-empty piece is one candidate. When a
 * candidate has at least two whitespace-separated tokens and the last one is
 * a descriptor (per isDescriptorToken), everything before that descriptor is
 * the URL. Otherwise the whole candidate is treated as the URL, which keeps
 * URLs containing spaces intact.
 *
 * @param {string} input - Raw srcset attribute value.
 * @returns {Array<{url: string, descriptor?: string}>} Parsed candidates in order.
 */
const parseSrcset = (input) => {
    const parsed = [];
    for (const piece of input.split(",")) {
        const candidate = piece.trim();
        if (!candidate) {
            continue;
        }
        const tokens = candidate.split(/\s+/).filter(Boolean);
        const trailing = tokens.length >= 2 ? tokens[tokens.length - 1] : undefined;
        if (trailing !== undefined && isDescriptorToken(trailing)) {
            // Locate the descriptor from the end so the URL part is kept verbatim.
            const url = candidate.slice(0, candidate.lastIndexOf(trailing)).trim();
            parsed.push({ url, descriptor: trailing });
        }
        else {
            parsed.push({ url: candidate });
        }
    }
    return parsed;
};
|
|
118
|
+
/**
 * Serialize srcset candidates back into an attribute value.
 *
 * Candidates are joined with a bare "," (no space) so that no whitespace is
 * introduced after commas that were originally inside a URL token. Candidates
 * that serialize to an empty string are dropped.
 *
 * @param {Array<{url: string, descriptor?: string}>} candidates - Parsed candidates.
 * @returns {string} The srcset string.
 */
const stringifySrcset = (candidates) => {
    const parts = [];
    for (const candidate of candidates) {
        const url = candidate.url.trim();
        const text = candidate.descriptor ? `${url} ${candidate.descriptor.trim()}` : url;
        if (text) {
            parts.push(text);
        }
    }
    return parts.join(",");
};
|
|
62
130
|
const rewriteSrcsetValue = (value, baseUrl, resolve) => {
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
const resolved = resolveUrlValue(
|
|
69
|
-
|
|
70
|
-
return part;
|
|
71
|
-
return descriptor ? `${resolved} ${descriptor}` : resolved;
|
|
131
|
+
if (isUnsafeSrcsetValue(value)) {
|
|
132
|
+
return "";
|
|
133
|
+
}
|
|
134
|
+
const candidates = parseSrcset(value);
|
|
135
|
+
const rewritten = candidates.map((c) => {
|
|
136
|
+
const resolved = resolveUrlValue(c.url, baseUrl, resolve);
|
|
137
|
+
return { url: resolved ?? c.url, descriptor: c.descriptor };
|
|
72
138
|
});
|
|
73
|
-
return rewritten
|
|
139
|
+
return stringifySrcset(rewritten);
|
|
74
140
|
};
|
|
75
141
|
const rewriteMetaRefresh = (content, baseUrl, resolve) => {
|
|
76
142
|
const parts = content.split(";");
|
|
@@ -80,7 +146,13 @@ const rewriteMetaRefresh = (content, baseUrl, resolve) => {
|
|
|
80
146
|
if (urlPartIndex === -1)
|
|
81
147
|
return content;
|
|
82
148
|
const urlPart = parts[urlPartIndex];
|
|
83
|
-
|
|
149
|
+
let rawUrl = urlPart.split("=").slice(1).join("=").trim();
|
|
150
|
+
// Some pages quote the URL value (url="/next" or url='/next').
|
|
151
|
+
// Strip a single pair of surrounding quotes to improve rewrite coverage.
|
|
152
|
+
if ((rawUrl.startsWith('"') && rawUrl.endsWith('"')) ||
|
|
153
|
+
(rawUrl.startsWith("'") && rawUrl.endsWith("'"))) {
|
|
154
|
+
rawUrl = rawUrl.slice(1, -1).trim();
|
|
155
|
+
}
|
|
84
156
|
const resolved = resolveUrlValue(rawUrl, baseUrl, resolve);
|
|
85
157
|
if (!resolved)
|
|
86
158
|
return content;
|
|
@@ -89,6 +161,18 @@ const rewriteMetaRefresh = (content, baseUrl, resolve) => {
|
|
|
89
161
|
nextParts[urlPartIndex] = next;
|
|
90
162
|
return nextParts.join(";");
|
|
91
163
|
};
|
|
164
|
+
/**
 * Decide whether a <link> element's href should be rewritten to a local path.
 *
 * Only rels that are expected to load a resource qualify; navigational/SEO
 * links and connection hints (canonical, preconnect, dns-prefetch, ...) are
 * left alone. A link without a rel is rewritten.
 *
 * The rel value is evaluated token by token. A token qualifies when it
 * contains "stylesheet", "preload", "prefetch" or "icon" (so "modulepreload"
 * and "apple-touch-icon" qualify too). "dns-prefetch" is excluded explicitly:
 * it contains "prefetch" but only names an origin to resolve, not a resource.
 *
 * @param {{attr: (name: string) => (string|undefined)}} $element - Wrapped
 *   element exposing an `attr(name)` getter.
 * @returns {boolean} True when the href should be rewritten.
 */
const shouldRewriteLinkHref = ($element) => {
    const rel = ($element.attr("rel") || "").trim().toLowerCase();
    if (!rel) {
        return true;
    }
    const resourceHints = ["stylesheet", "preload", "prefetch", "icon"];
    return rel
        .split(/\s+/)
        .filter((token) => token !== "dns-prefetch")
        .some((token) => resourceHints.some((hint) => token.includes(hint)));
};
|
|
92
176
|
const rewriteJsText = async (source, resolve, baseUrl) => {
|
|
93
177
|
const replaceSpecifier = async (specifier) => {
|
|
94
178
|
const trimmed = specifier.trim();
|
|
@@ -171,7 +255,19 @@ const rewriteEntryHtml = async (input) => {
|
|
|
171
255
|
rewriteAttr("iframe[src]", "src");
|
|
172
256
|
rewriteAttr("embed[src]", "src");
|
|
173
257
|
rewriteAttr("object[data]", "data");
|
|
174
|
-
|
|
258
|
+
$("link[href]").each((_, element) => {
|
|
259
|
+
const el = $(element);
|
|
260
|
+
if (!shouldRewriteLinkHref(el)) {
|
|
261
|
+
return;
|
|
262
|
+
}
|
|
263
|
+
const value = el.attr("href");
|
|
264
|
+
if (!value)
|
|
265
|
+
return;
|
|
266
|
+
const resolved = resolveUrlValue(value, baseUrl, resolve);
|
|
267
|
+
if (resolved) {
|
|
268
|
+
el.attr("href", resolved);
|
|
269
|
+
}
|
|
270
|
+
});
|
|
175
271
|
rewriteAttr("[poster]", "poster");
|
|
176
272
|
rewriteDataAttrs("[data-src]", "data-src");
|
|
177
273
|
rewriteDataAttrs("[data-href]", "data-href");
|
|
@@ -184,6 +280,13 @@ const rewriteEntryHtml = async (input) => {
|
|
|
184
280
|
const rewritten = rewriteSrcsetValue(value, baseUrl, resolve);
|
|
185
281
|
$(element).attr("srcset", rewritten);
|
|
186
282
|
});
|
|
283
|
+
$("link[imagesrcset]").each((_, element) => {
|
|
284
|
+
const value = $(element).attr("imagesrcset");
|
|
285
|
+
if (!value)
|
|
286
|
+
return;
|
|
287
|
+
const rewritten = rewriteSrcsetValue(value, baseUrl, resolve);
|
|
288
|
+
$(element).attr("imagesrcset", rewritten);
|
|
289
|
+
});
|
|
187
290
|
$("meta[http-equiv]").each((_, element) => {
|
|
188
291
|
const httpEquiv = ($(element).attr("http-equiv") || "").toLowerCase();
|
|
189
292
|
if (httpEquiv !== "refresh")
|
|
@@ -245,6 +348,13 @@ const rewriteEntryHtml = async (input) => {
|
|
|
245
348
|
baseUrl: baseUrl,
|
|
246
349
|
apiPath: input.apiPath
|
|
247
350
|
});
|
|
351
|
+
await (0, replace_elements_1.applyReplaceElements)({
|
|
352
|
+
$,
|
|
353
|
+
entryUrl: input.snapshotEntryUrl ?? baseUrl,
|
|
354
|
+
url: baseUrl,
|
|
355
|
+
replaceElements: input.replaceElements,
|
|
356
|
+
isEntryDocument: input.isEntryDocument ?? true
|
|
357
|
+
});
|
|
248
358
|
const title = $("title").first().text() || undefined;
|
|
249
359
|
return { html: $.html(), title };
|
|
250
360
|
};
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { ApiEntry, StoredResource } from "./network-store";
|
|
2
|
-
import type { ContentStore, PageSnapshot, PathResolver } from "./types";
|
|
2
|
+
import type { ContentStore, PageSnapshot, PathResolver, ReplaceElementsConfig } from "./types";
|
|
3
3
|
type BuildOptions = {
|
|
4
4
|
entryUrl: string;
|
|
5
5
|
createdAt: number;
|
|
@@ -9,6 +9,7 @@ type BuildOptions = {
|
|
|
9
9
|
pathResolver?: PathResolver;
|
|
10
10
|
rewriteEntry: boolean;
|
|
11
11
|
rewriteCSS: boolean;
|
|
12
|
+
replaceElements?: ReplaceElementsConfig;
|
|
12
13
|
warnings: string[];
|
|
13
14
|
};
|
|
14
15
|
export declare const buildSnapshot: (input: BuildOptions) => Promise<PageSnapshot>;
|
package/dist/snapshot-builder.js
CHANGED
|
@@ -6,6 +6,18 @@ const path_resolver_1 = require("./path-resolver");
|
|
|
6
6
|
const rewrite_links_1 = require("./rewrite-links");
|
|
7
7
|
const snapshot_1 = require("./snapshot");
|
|
8
8
|
const utils_1 = require("./utils");
|
|
9
|
+
/**
 * Escape every "%" in a snapshot-local path as "%25".
 *
 * Many static servers percent-decode the request path before mapping it to
 * the filesystem. Snapshot filenames can contain literal sequences such as
 * "%2F" (e.g. Substack image URLs embedded in one path segment); a server
 * decoding "%2F" to "/" changes the path structure and causes 404s. With the
 * "%" escaped, the decoded request equals the original filename on disk.
 *
 * @param {string} value - Snapshot-local path.
 * @returns {string} The path with each "%" replaced by "%25".
 */
const escapePercentForStaticServers = (value) => {
    return value.replace(/%/g, "%25");
};
|
|
9
21
|
const streamToUint8Array = async (stream) => {
|
|
10
22
|
const reader = stream.getReader();
|
|
11
23
|
const chunks = [];
|
|
@@ -119,6 +131,42 @@ const buildApiSnapshot = (url, createdAt, entries) => ({
|
|
|
119
131
|
createdAt,
|
|
120
132
|
records: entries.map((entry) => entry.record)
|
|
121
133
|
});
|
|
134
|
+
/**
 * Build the URL-to-path manifest of static resources in a snapshot.
 *
 * A file is listed only when it has an `originalUrl`, is not a "document"
 * resource, its `originalUrl` does not start with "/" (snapshot-local pseudo
 * URL), and that URL parses as an absolute URL.
 *
 * @param {number} createdAt - Timestamp copied into the manifest.
 * @param {Iterable<object>} files - Snapshot files; `originalUrl`, `path`,
 *   `resourceType`, `mimeType` and `size` are read from each.
 * @returns {{version: string, createdAt: number, items: Array<object>}}
 *   Manifest with version "1.0" and one item per retained file.
 */
const buildResourcesPathSnapshot = (createdAt, files) => {
    const isAbsoluteUrl = (candidate) => {
        try {
            new URL(candidate);
            return true;
        }
        catch {
            return false;
        }
    };
    const items = [];
    for (const file of files) {
        const { originalUrl, resourceType } = file;
        // Checks run in this order on purpose: presence, type, locality, validity.
        const isStaticRemoteResource = Boolean(originalUrl) &&
            resourceType !== "document" &&
            !originalUrl.startsWith("/") &&
            isAbsoluteUrl(originalUrl);
        if (!isStaticRemoteResource) {
            continue;
        }
        items.push({
            url: originalUrl,
            path: file.path,
            resourceType,
            mimeType: file.mimeType,
            size: file.size
        });
    }
    return { version: "1.0", createdAt, items };
};
|
|
122
170
|
const buildSnapshot = async (input) => {
|
|
123
171
|
const warnings = input.warnings;
|
|
124
172
|
const groups = groupResources({
|
|
@@ -147,7 +195,18 @@ const buildSnapshot = async (input) => {
|
|
|
147
195
|
});
|
|
148
196
|
urlToPath.set(resource.request.url, path);
|
|
149
197
|
}
|
|
150
|
-
const resolve = (absoluteUrl) =>
|
|
198
|
+
const resolve = (absoluteUrl) => {
|
|
199
|
+
const resolved = urlToPath.get(absoluteUrl);
|
|
200
|
+
if (!resolved) {
|
|
201
|
+
return null;
|
|
202
|
+
}
|
|
203
|
+
// Only escape snapshot-local paths. (Defensive: resolved should always be
|
|
204
|
+
// a path, but avoid breaking any unexpected absolute URLs.)
|
|
205
|
+
if (resolved.includes("://")) {
|
|
206
|
+
return resolved;
|
|
207
|
+
}
|
|
208
|
+
return escapePercentForStaticServers(resolved);
|
|
209
|
+
};
|
|
151
210
|
const apiPath = (0, utils_1.ensureLeadingSlash)(multiDoc ? `${(0, utils_1.sanitizePosixPath)(docDir)}/api.json` : "/api.json");
|
|
152
211
|
for (const resource of group.resources) {
|
|
153
212
|
if (resource.request.resourceType === "document") {
|
|
@@ -161,7 +220,10 @@ const buildSnapshot = async (input) => {
|
|
|
161
220
|
entryUrl: group.url,
|
|
162
221
|
apiPath,
|
|
163
222
|
resolve,
|
|
164
|
-
rewriteLinks: input.rewriteEntry
|
|
223
|
+
rewriteLinks: input.rewriteEntry,
|
|
224
|
+
replaceElements: input.replaceElements,
|
|
225
|
+
isEntryDocument: group.url === input.entryUrl,
|
|
226
|
+
snapshotEntryUrl: input.entryUrl
|
|
165
227
|
});
|
|
166
228
|
html = rewritten.html;
|
|
167
229
|
if (!title) {
|
|
@@ -253,6 +315,17 @@ const buildSnapshot = async (input) => {
|
|
|
253
315
|
originalUrl: apiPath
|
|
254
316
|
});
|
|
255
317
|
}
|
|
318
|
+
{
|
|
319
|
+
const resourcesPath = buildResourcesPathSnapshot(input.createdAt, files);
|
|
320
|
+
const bytes = new TextEncoder().encode(JSON.stringify(resourcesPath, null, 2));
|
|
321
|
+
// Snapshot-local artifact. It intentionally has no originalUrl.
|
|
322
|
+
files.push({
|
|
323
|
+
path: "/resources_path.json",
|
|
324
|
+
mimeType: "application/json",
|
|
325
|
+
size: bytes.byteLength,
|
|
326
|
+
source: { kind: "memory", data: bytes }
|
|
327
|
+
});
|
|
328
|
+
}
|
|
256
329
|
const totalBytes = files.reduce((sum, file) => sum + (file.size ?? 0), 0);
|
|
257
330
|
const totalFiles = files.length;
|
|
258
331
|
const snapshotUrl = input.entryUrl || groups[0]?.url || "";
|