@pagepocket/lib 0.6.1 → 0.6.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -4,14 +4,18 @@ exports.replayHackers = exports.preloadHackers = void 0;
4
4
  const preload_fetch_1 = require("./preload-fetch");
5
5
  const preload_xhr_1 = require("./preload-xhr");
6
6
  const replay_beacon_1 = require("./replay-beacon");
7
+ const replay_block_text_fragment_1 = require("./replay-block-text-fragment");
7
8
  const replay_dom_rewrite_1 = require("./replay-dom-rewrite");
8
9
  const replay_eventsource_1 = require("./replay-eventsource");
9
10
  const replay_fetch_1 = require("./replay-fetch");
11
+ const replay_history_path_1 = require("./replay-history-path");
10
12
  const replay_svg_image_1 = require("./replay-svg-image");
11
13
  const replay_websocket_1 = require("./replay-websocket");
12
14
  const replay_xhr_1 = require("./replay-xhr");
13
15
  exports.preloadHackers = [preload_fetch_1.preloadFetchRecorder, preload_xhr_1.preloadXhrRecorder];
14
16
  exports.replayHackers = [
17
+ replay_block_text_fragment_1.replayBlockTextFragment,
18
+ replay_history_path_1.replayHistoryPath,
15
19
  replay_fetch_1.replayFetchResponder,
16
20
  replay_xhr_1.replayXhrResponder,
17
21
  replay_dom_rewrite_1.replayDomRewriter,
@@ -0,0 +1,2 @@
1
+ import type { ScriptHacker } from "./types";
2
+ export declare const replayBlockTextFragment: ScriptHacker;
@@ -0,0 +1,71 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.replayBlockTextFragment = void 0;
4
+ exports.replayBlockTextFragment = {
5
+ id: "replay-block-text-fragment",
6
+ stage: "replay",
7
+ build: () => `
8
+ // Block navigations that only add a Text Fragment.
9
+ //
10
+ // Some sites (e.g. Substack) call document.location.replace("#:~:text=...")
11
+ // to trigger the browser's Text Fragment highlighting. On large pages this
12
+ // can cause severe jank/freezes while the browser scans the document.
13
+ //
14
+ // We intercept location.replace/assign. If the navigation target differs
15
+ // only by hash and the hash starts with "#:~:text=", we update history
16
+ // without triggering a navigation.
17
+ const shouldBlockTextFragmentNavigation = (targetUrl) => {
18
+ try {
19
+ const current = new URL(window.location.href);
20
+ const next = new URL(String(targetUrl), current);
21
+ if (!next.hash || !next.hash.startsWith("#:~:text=")) {
22
+ return false;
23
+ }
24
+
25
+ return (
26
+ current.origin === next.origin &&
27
+ current.pathname === next.pathname &&
28
+ current.search === next.search
29
+ );
30
+ } catch {
31
+ return false;
32
+ }
33
+ };
34
+
35
+ const patchLocationMethod = (name) => {
36
+ try {
37
+ const loc = window.location;
38
+ const original = loc && loc[name];
39
+ if (typeof original !== "function") {
40
+ return;
41
+ }
42
+
43
+ const patched = function(url) {
44
+ try {
45
+ if (shouldBlockTextFragmentNavigation(url)) {
46
+ const current = new URL(window.location.href);
47
+ const next = new URL(String(url), current);
48
+ history.replaceState(history.state, document.title, next.hash);
49
+ return;
50
+ }
51
+ } catch {}
52
+
53
+ return original.call(window.location, url);
54
+ };
55
+
56
+ try {
57
+ patched.__pagepocketOriginal = original;
58
+ } catch {}
59
+
60
+ try {
61
+ Object.defineProperty(loc, name, { configurable: true, value: patched });
62
+ } catch {
63
+ // Some browsers don't allow redefining Location methods.
64
+ }
65
+ } catch {}
66
+ };
67
+
68
+ patchLocationMethod("replace");
69
+ patchLocationMethod("assign");
70
+ `
71
+ };
@@ -31,9 +31,33 @@ exports.replayDomRewriter = {
31
31
  }
32
32
  };
33
33
 
34
+ const rewritten = new WeakMap();
35
+
34
36
  // Rewrite srcset values to local files only (avoid data: URLs in srcset).
35
37
  const rewriteSrcset = (value) => {
36
38
  if (!value) return value;
39
+
40
+ // Substack-style image/fetch URLs include commas inside the URL token
41
+ // (",w_40,h_40,..."). This makes the srcset invalid and browsers will
42
+ // parse it into garbage candidate URLs. Prefer dropping srcset and relying
43
+ // on the rewritten img[src].
44
+ try {
45
+ const trimmed = String(value || "").trim();
46
+ const hasFetchTransform = trimmed.includes("/image/fetch/");
47
+ const hasEncodedUrlTail = trimmed.includes("https%3A%2F%2F");
48
+ const hasCommaTokens =
49
+ trimmed.includes(",w_") ||
50
+ trimmed.includes(", w_") ||
51
+ trimmed.includes(",h_") ||
52
+ trimmed.includes(", h_") ||
53
+ trimmed.includes(",c_") ||
54
+ trimmed.includes(", c_");
55
+
56
+ if (hasFetchTransform && hasEncodedUrlTail && hasCommaTokens) {
57
+ return "";
58
+ }
59
+ } catch {}
60
+
37
61
  return value.split(",").map((part) => {
38
62
  const trimmed = part.trim();
39
63
  if (!trimmed) return trimmed;
@@ -46,7 +70,7 @@ exports.replayDomRewriter = {
46
70
  return descriptor ? localPath + " " + descriptor : localPath;
47
71
  }
48
72
  return trimmed;
49
- }).join(", ");
73
+ }).join(",");
50
74
  };
51
75
 
52
76
  // Rewrite element attributes to local files or data URLs.
@@ -56,9 +80,21 @@ exports.replayDomRewriter = {
56
80
  onReady(() => rewriteElement(element));
57
81
  return;
58
82
  }
83
+ const prev = rewritten.get(element);
84
+ const currentSrc = element.getAttribute("src");
85
+ const currentHref = element.getAttribute("href");
86
+ const currentSrcset = element.getAttribute("srcset");
87
+ if (
88
+ prev &&
89
+ prev.src === currentSrc &&
90
+ prev.href === currentHref &&
91
+ prev.srcset === currentSrcset
92
+ ) {
93
+ return;
94
+ }
59
95
  const tag = (element.tagName || "").toLowerCase();
60
96
  if (tag === "img" || tag === "source" || tag === "video" || tag === "audio" || tag === "script" || tag === "iframe") {
61
- const src = element.getAttribute("src");
97
+ const src = currentSrc;
62
98
  if (src && !isLocalResource(src) && !src.startsWith("data:") && !src.startsWith("blob:")) {
63
99
  const localPath = findLocalPath(src);
64
100
  if (localPath) {
@@ -72,7 +108,7 @@ exports.replayDomRewriter = {
72
108
  }
73
109
 
74
110
  if (tag === "link") {
75
- const href = element.getAttribute("href");
111
+ const href = currentHref;
76
112
  const rel = (element.getAttribute("rel") || "").toLowerCase();
77
113
  if (href && !isLocalResource(href) && !href.startsWith("data:") && !href.startsWith("blob:")) {
78
114
  const localPath = findLocalPath(href);
@@ -86,10 +122,16 @@ exports.replayDomRewriter = {
86
122
  }
87
123
  }
88
124
 
89
- const srcset = element.getAttribute("srcset");
125
+ const srcset = currentSrcset;
90
126
  if (srcset) {
91
127
  element.setAttribute("srcset", rewriteSrcset(srcset));
92
128
  }
129
+
130
+ rewritten.set(element, {
131
+ src: element.getAttribute("src"),
132
+ href: element.getAttribute("href"),
133
+ srcset: element.getAttribute("srcset")
134
+ });
93
135
  };
94
136
 
95
137
  // Intercept DOM attribute writes to keep resources local.
@@ -275,8 +317,6 @@ exports.replayDomRewriter = {
275
317
  mutation.addedNodes.forEach((node) => {
276
318
  if (node && node.nodeType === 1) {
277
319
  rewriteElement(node);
278
- const descendants = node.querySelectorAll ? node.querySelectorAll("img,source,video,audio,script,link,iframe") : [];
279
- descendants.forEach((el) => rewriteElement(el));
280
320
  }
281
321
  });
282
322
  }
@@ -0,0 +1,2 @@
1
+ import type { ScriptHacker } from "./types";
2
+ export declare const replayHistoryPath: ScriptHacker;
@@ -0,0 +1,25 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.replayHistoryPath = void 0;
4
/**
 * Replay-stage script hacker that restores the captured page's URL path.
 *
 * `build()` returns a script (as a string). That script parses `baseUrl`,
 * joins its pathname, search and hash, and calls `history.replaceState` with
 * the result when it differs from the current location. Any error is
 * swallowed, so the script is best-effort.
 *
 * NOTE(review): `baseUrl` is not defined in this module; the generated script
 * presumably runs in a scope that provides it. Confirm against the code that
 * assembles the replay script.
 */
exports.replayHistoryPath = {
  id: "replay-history-path",
  stage: "replay",
  build: () => `
// Ensure history/location reflects the original captured URL path.
//
// When a snapshot is served from a static server root (e.g. http://localhost:8080/index.html),
// SPA routers that read location.pathname will see "/".
// For a page captured from https://foo.com/bar/foo, the correct route should be "/bar/foo".
//
// We patch history early and replace the current URL without causing navigation.
try {
const parsed = new URL(baseUrl);
const desiredPath = parsed.pathname + (parsed.search || "") + (parsed.hash || "");
const currentPath = window.location.pathname + window.location.search + window.location.hash;

if (desiredPath && desiredPath !== currentPath) {
history.replaceState(history.state, "", desiredPath);
}
} catch {}
`
};
@@ -4,12 +4,22 @@ exports.isDocumentType = exports.withPrefixPathResolver = exports.resolveCrossOr
4
4
  const utils_1 = require("./utils");
5
5
  const normalizePathname = (pathname) => {
6
6
  const normalized = pathname || "/";
7
+ const hasTrailingSlash = normalized.endsWith("/");
7
8
  const clean = (0, utils_1.sanitizePosixPath)(normalized);
8
- const leading = clean ? `/${clean}` : "/";
9
- if (leading.endsWith("/")) {
10
- return `${leading}index`;
9
+ const parts = clean.split("/").filter(Boolean);
10
+ if (hasTrailingSlash) {
11
+ parts.push("index");
11
12
  }
12
- return leading;
13
+ if (parts.length === 0) {
14
+ return "/index";
15
+ }
16
+ const rewritten = parts.map((part, index) => {
17
+ if (index < parts.length - 1 && part.includes(".")) {
18
+ return `${part}__pp_dir`;
19
+ }
20
+ return part;
21
+ });
22
+ return `/${rewritten.join("/")}`;
13
23
  };
14
24
  const withSuffix = (path, suffix) => {
15
25
  const lastSlash = path.lastIndexOf("/");
@@ -242,9 +242,15 @@ const buildReplayScript = (apiPath, baseUrl) => {
242
242
  return matchAPI({ records, byKey, baseUrl, method, url, body });
243
243
  };
244
244
 
245
+ const urlLookupCache = new Map();
245
246
  const findByUrl = (url) => {
246
247
  if (isLocalResource(url)) return null;
247
- return matchAPI({ records, byKey, baseUrl, method: "GET", url, body: "" });
248
+ if (urlLookupCache.has(url)) {
249
+ return urlLookupCache.get(url);
250
+ }
251
+ const record = matchAPI({ records, byKey, baseUrl, method: "GET", url, body: "" });
252
+ urlLookupCache.set(url, record || null);
253
+ return record;
248
254
  };
249
255
 
250
256
  const findLocalPath = () => null;
@@ -292,16 +298,28 @@ const buildReplayScript = (apiPath, baseUrl) => {
292
298
  return "application/octet-stream";
293
299
  };
294
300
 
301
+ const dataUrlCache = new Map();
295
302
  const toDataUrl = (record, fallbackType) => {
296
303
  if (!record) return "";
297
304
  const contentType = getContentType(record) || fallbackType || "application/octet-stream";
305
+ const cacheKey = (record.url || "") + "|" + contentType + "|" + (record.responseEncoding || "") + "|" +
306
+ (record.responseBodyBase64 ? "b64:" + record.responseBodyBase64.length : "txt:" + (record.responseBody ? record.responseBody.length : 0));
307
+ if (dataUrlCache.has(cacheKey)) {
308
+ return dataUrlCache.get(cacheKey);
309
+ }
298
310
  if (record.responseEncoding === "base64" && record.responseBodyBase64) {
299
- return "data:" + contentType + ";base64," + record.responseBodyBase64;
311
+ const dataUrl = "data:" + contentType + ";base64," + record.responseBodyBase64;
312
+ dataUrlCache.set(cacheKey, dataUrl);
313
+ return dataUrl;
300
314
  }
301
315
  if (record.responseBody) {
302
- return "data:" + contentType + ";base64," + textToBase64(record.responseBody);
316
+ const dataUrl = "data:" + contentType + ";base64," + textToBase64(record.responseBody);
317
+ dataUrlCache.set(cacheKey, dataUrl);
318
+ return dataUrl;
303
319
  }
304
- return "data:" + (fallbackType || "application/octet-stream") + ",";
320
+ const dataUrl = "data:" + (fallbackType || "application/octet-stream") + ",";
321
+ dataUrlCache.set(cacheKey, dataUrl);
322
+ return dataUrl;
305
323
  };
306
324
 
307
325
  const responseFromRecord = (record) => {
@@ -59,18 +59,83 @@ const resolveUrlValue = (value, baseUrl, resolve) => {
59
59
  return null;
60
60
  }
61
61
  };
62
/**
 * Detects srcset values that cannot be rewritten safely and should be dropped.
 *
 * Some sites (notably Substack) emit image transform URLs that contain commas
 * inside the URL itself (e.g. "/image/fetch/w_40,h_40,.../https%3A...").
 * In the HTML srcset grammar commas separate candidates, so such a value is
 * parsed into garbage URLs like "https%3A%2F%2F...png" that the browser then
 * tries to fetch. For offline snapshots it is better to drop srcset entirely
 * and rely on the already-rewritten img[src].
 *
 * @param value Raw srcset attribute value.
 * @returns true when the value contains an "/image/fetch/" transform, a
 *   percent-encoded absolute URL tail, and comma-separated transform tokens.
 */
const isUnsafeSrcsetValue = (value) => {
    const trimmed = value.trim();
    if (!trimmed) {
        return false;
    }
    const hasFetchTransform = trimmed.includes("/image/fetch/");
    // Percent-encoding hex digits are case-insensitive and the embedded origin
    // may be plain http, so accept "http%3a%2f%2f" as well as "https%3A%2F%2F".
    const hasEncodedUrlTail = /https?%3A%2F%2F/i.test(trimmed);
    // ",w_" / ", w_" / ",h_" / ", h_" / ",c_" / ", c_"
    const hasCommaTokens = /, ?[whc]_/.test(trimmed);
    return hasFetchTransform && hasEncodedUrlTail && hasCommaTokens;
};
86
/**
 * Reports whether `token` looks like a srcset descriptor.
 *
 * Accepts pixel-density descriptors ("1x", "1.5x", ".5x") and width
 * descriptors ("320w"). Matching is case-insensitive.
 */
const isDescriptorToken = (token) => {
    const trimmed = token.trim();
    if (!trimmed)
        return false;
    return /^(?:\d+(?:\.\d+)?|\.\d+)x$/i.test(trimmed) || /^\d+w$/i.test(trimmed);
};
/**
 * Minimal srcset parser.
 *
 * Each candidate is "<url> [descriptor]". A comma ends a candidate only when
 * it is followed by whitespace or the end of the input, or when the text
 * before it already ends in a descriptor ("a.png 1x,b.png 2x"). Any other
 * comma is kept as part of the URL, which is how browsers treat commas inside
 * CDN transform strings such as ".../w_40,h_40/a.png".
 *
 * The descriptor is located from the *end* of the candidate so URLs that
 * contain spaces are kept intact.
 *
 * @param input Raw srcset attribute value.
 * @returns One `{ url, descriptor? }` object per non-empty candidate, in order.
 */
const parseSrcset = (input) => {
    // Returns the trailing descriptor token of a candidate, or null if none.
    const trailingDescriptor = (candidate) => {
        const tokens = candidate.split(/\s+/).filter(Boolean);
        const last = tokens[tokens.length - 1] ?? "";
        return tokens.length >= 2 && isDescriptorToken(last) ? last : null;
    };
    const toCandidate = (candidate) => {
        const descriptor = trailingDescriptor(candidate);
        if (descriptor === null) {
            return { url: candidate };
        }
        const url = candidate.slice(0, candidate.length - descriptor.length).trim();
        return { url, descriptor };
    };
    const candidates = [];
    let start = 0;
    // Emits the candidate spanning [start, end) and moves past the separator.
    const flush = (end) => {
        const candidate = input.slice(start, end).trim();
        start = end + 1;
        if (candidate) {
            candidates.push(toCandidate(candidate));
        }
    };
    for (let i = 0; i < input.length; i += 1) {
        if (input[i] !== ",")
            continue;
        const next = input[i + 1];
        const closesCandidate = next === undefined ||
            /\s/.test(next) ||
            trailingDescriptor(input.slice(start, i).trim()) !== null;
        if (closesCandidate) {
            flush(i);
        }
    }
    flush(input.length);
    return candidates;
};
117
/**
 * Serialises parsed srcset candidates back into an attribute value.
 *
 * Each candidate becomes "<url>" or "<url> <descriptor>" (both trimmed);
 * candidates that render to an empty string are skipped.
 *
 * @param candidates Array of `{ url, descriptor? }` objects.
 * @returns The candidates joined with "," and no following space, so that no
 *   whitespace is introduced after commas inside URL tokens.
 */
const stringifySrcset = (candidates) => {
    const rendered = [];
    for (const { url, descriptor } of candidates) {
        const cleanUrl = url.trim();
        const entry = descriptor ? cleanUrl + " " + descriptor.trim() : cleanUrl;
        if (entry) {
            rendered.push(entry);
        }
    }
    return rendered.join(",");
};
62
129
  const rewriteSrcsetValue = (value, baseUrl, resolve) => {
63
- const parts = value.split(",").map((part) => part.trim());
64
- const rewritten = parts.map((part) => {
65
- const [rawUrl, descriptor] = part.split(/\s+/, 2);
66
- if (!rawUrl)
67
- return part;
68
- const resolved = resolveUrlValue(rawUrl, baseUrl, resolve);
69
- if (!resolved)
70
- return part;
71
- return descriptor ? `${resolved} ${descriptor}` : resolved;
130
+ if (isUnsafeSrcsetValue(value)) {
131
+ return "";
132
+ }
133
+ const candidates = parseSrcset(value);
134
+ const rewritten = candidates.map((c) => {
135
+ const resolved = resolveUrlValue(c.url, baseUrl, resolve);
136
+ return { url: resolved ?? c.url, descriptor: c.descriptor };
72
137
  });
73
- return rewritten.join(", ");
138
+ return stringifySrcset(rewritten);
74
139
  };
75
140
  const rewriteMetaRefresh = (content, baseUrl, resolve) => {
76
141
  const parts = content.split(";");
@@ -80,7 +145,13 @@ const rewriteMetaRefresh = (content, baseUrl, resolve) => {
80
145
  if (urlPartIndex === -1)
81
146
  return content;
82
147
  const urlPart = parts[urlPartIndex];
83
- const rawUrl = urlPart.split("=").slice(1).join("=").trim();
148
+ let rawUrl = urlPart.split("=").slice(1).join("=").trim();
149
+ // Some pages quote the URL value (url="/next" or url='/next').
150
+ // Strip a single pair of surrounding quotes to improve rewrite coverage.
151
+ if ((rawUrl.startsWith('"') && rawUrl.endsWith('"')) ||
152
+ (rawUrl.startsWith("'") && rawUrl.endsWith("'"))) {
153
+ rawUrl = rawUrl.slice(1, -1).trim();
154
+ }
84
155
  const resolved = resolveUrlValue(rawUrl, baseUrl, resolve);
85
156
  if (!resolved)
86
157
  return content;
@@ -89,6 +160,18 @@ const rewriteMetaRefresh = (content, baseUrl, resolve) => {
89
160
  nextParts[urlPartIndex] = next;
90
161
  return nextParts.join(";");
91
162
  };
163
+ const shouldRewriteLinkHref = ($element) => {
164
+ const rel = ($element.attr("rel") || "").trim().toLowerCase();
165
+ if (!rel) {
166
+ return true;
167
+ }
168
+ // Only rewrite link rels that are expected to load a resource.
169
+ // Avoid rewriting navigational/SEO links like canonical, preconnect, etc.
170
+ return (rel.includes("stylesheet") ||
171
+ rel.includes("preload") ||
172
+ rel.includes("prefetch") ||
173
+ rel.includes("icon"));
174
+ };
92
175
  const rewriteJsText = async (source, resolve, baseUrl) => {
93
176
  const replaceSpecifier = async (specifier) => {
94
177
  const trimmed = specifier.trim();
@@ -171,7 +254,19 @@ const rewriteEntryHtml = async (input) => {
171
254
  rewriteAttr("iframe[src]", "src");
172
255
  rewriteAttr("embed[src]", "src");
173
256
  rewriteAttr("object[data]", "data");
174
- rewriteAttr("link[href]", "href");
257
+ $("link[href]").each((_, element) => {
258
+ const el = $(element);
259
+ if (!shouldRewriteLinkHref(el)) {
260
+ return;
261
+ }
262
+ const value = el.attr("href");
263
+ if (!value)
264
+ return;
265
+ const resolved = resolveUrlValue(value, baseUrl, resolve);
266
+ if (resolved) {
267
+ el.attr("href", resolved);
268
+ }
269
+ });
175
270
  rewriteAttr("[poster]", "poster");
176
271
  rewriteDataAttrs("[data-src]", "data-src");
177
272
  rewriteDataAttrs("[data-href]", "data-href");
@@ -184,6 +279,13 @@ const rewriteEntryHtml = async (input) => {
184
279
  const rewritten = rewriteSrcsetValue(value, baseUrl, resolve);
185
280
  $(element).attr("srcset", rewritten);
186
281
  });
282
+ $("link[imagesrcset]").each((_, element) => {
283
+ const value = $(element).attr("imagesrcset");
284
+ if (!value)
285
+ return;
286
+ const rewritten = rewriteSrcsetValue(value, baseUrl, resolve);
287
+ $(element).attr("imagesrcset", rewritten);
288
+ });
187
289
  $("meta[http-equiv]").each((_, element) => {
188
290
  const httpEquiv = ($(element).attr("http-equiv") || "").toLowerCase();
189
291
  if (httpEquiv !== "refresh")
@@ -6,6 +6,18 @@ const path_resolver_1 = require("./path-resolver");
6
6
  const rewrite_links_1 = require("./rewrite-links");
7
7
  const snapshot_1 = require("./snapshot");
8
8
  const utils_1 = require("./utils");
9
/**
 * Escapes every "%" in a snapshot-local path as "%25".
 *
 * Many static servers decode percent-encoding in the request path before
 * mapping it to the filesystem. Snapshot filenames can contain literal "%2F"
 * sequences (e.g. Substack image URLs embedded in a path segment); decoding
 * those to "/" changes the path structure and causes 404s. Escaping "%" first
 * makes the decoded request match the filename on disk.
 *
 * @param value Path to escape.
 * @returns The path with each "%" replaced by "%25".
 */
const escapePercentForStaticServers = (value) => value.replace(/%/g, "%25");
9
21
  const streamToUint8Array = async (stream) => {
10
22
  const reader = stream.getReader();
11
23
  const chunks = [];
@@ -147,7 +159,18 @@ const buildSnapshot = async (input) => {
147
159
  });
148
160
  urlToPath.set(resource.request.url, path);
149
161
  }
150
- const resolve = (absoluteUrl) => urlToPath.get(absoluteUrl) ?? null;
162
+ const resolve = (absoluteUrl) => {
163
+ const resolved = urlToPath.get(absoluteUrl);
164
+ if (!resolved) {
165
+ return null;
166
+ }
167
+ // Only escape snapshot-local paths. (Defensive: resolved should always be
168
+ // a path, but avoid breaking any unexpected absolute URLs.)
169
+ if (resolved.includes("://")) {
170
+ return resolved;
171
+ }
172
+ return escapePercentForStaticServers(resolved);
173
+ };
151
174
  const apiPath = (0, utils_1.ensureLeadingSlash)(multiDoc ? `${(0, utils_1.sanitizePosixPath)(docDir)}/api.json` : "/api.json");
152
175
  for (const resource of group.resources) {
153
176
  if (resource.request.resourceType === "document") {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pagepocket/lib",
3
- "version": "0.6.1",
3
+ "version": "0.6.3",
4
4
  "description": "Library for rewriting HTML snapshots and inlining local resources.",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
@@ -12,8 +12,8 @@
12
12
  "license": "ISC",
13
13
  "dependencies": {
14
14
  "cheerio": "^1.0.0-rc.12",
15
- "@pagepocket/interceptor": "0.6.1",
16
- "@pagepocket/uni-fs": "0.6.1"
15
+ "@pagepocket/interceptor": "0.6.3",
16
+ "@pagepocket/uni-fs": "0.6.3"
17
17
  },
18
18
  "devDependencies": {
19
19
  "@playwright/test": "^1.50.1",