npm - @pagepocket/lib - Versions diffs - 0.6.1 → 0.6.3 - Mend

@pagepocket/lib 0.6.1 → 0.6.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (11)

package/dist/hackers/index.js +4 -0
package/dist/hackers/replay-block-text-fragment.d.ts +2 -0
package/dist/hackers/replay-block-text-fragment.js +71 -0
package/dist/hackers/replay-dom-rewrite.js +46 -6
package/dist/hackers/replay-history-path.d.ts +2 -0
package/dist/hackers/replay-history-path.js +25 -0
package/dist/path-resolver.js +14 -4
package/dist/replay-script.js +22 -4
package/dist/rewrite-links.js +114 -12
package/dist/snapshot-builder.js +24 -1
package/package.json +3 -3

package/dist/hackers/index.js CHANGED Viewed

@@ -4,14 +4,18 @@ exports.replayHackers = exports.preloadHackers = void 0;
 const preload_fetch_1 = require("./preload-fetch");
 const preload_xhr_1 = require("./preload-xhr");
 const replay_beacon_1 = require("./replay-beacon");
+const replay_block_text_fragment_1 = require("./replay-block-text-fragment");
 const replay_dom_rewrite_1 = require("./replay-dom-rewrite");
 const replay_eventsource_1 = require("./replay-eventsource");
 const replay_fetch_1 = require("./replay-fetch");
+const replay_history_path_1 = require("./replay-history-path");
 const replay_svg_image_1 = require("./replay-svg-image");
 const replay_websocket_1 = require("./replay-websocket");
 const replay_xhr_1 = require("./replay-xhr");
 exports.preloadHackers = [preload_fetch_1.preloadFetchRecorder, preload_xhr_1.preloadXhrRecorder];
 exports.replayHackers = [
+    replay_block_text_fragment_1.replayBlockTextFragment,
+    replay_history_path_1.replayHistoryPath,
     replay_fetch_1.replayFetchResponder,
     replay_xhr_1.replayXhrResponder,
     replay_dom_rewrite_1.replayDomRewriter,

package/dist/hackers/replay-block-text-fragment.d.ts ADDED Viewed

@@ -0,0 +1,2 @@
+import type { ScriptHacker } from "./types";
+export declare const replayBlockTextFragment: ScriptHacker;

package/dist/hackers/replay-block-text-fragment.js ADDED Viewed

@@ -0,0 +1,71 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.replayBlockTextFragment = void 0;
+exports.replayBlockTextFragment = {
+    id: "replay-block-text-fragment",
+    stage: "replay",
+    build: () => `
+  // Block navigations that only add a Text Fragment.
+  //
+  // Some sites (e.g. Substack) call document.location.replace("#:~:text=...")
+  // to trigger the browser's Text Fragment highlighting. On large pages this
+  // can cause severe jank/freezes while the browser scans the document.
+  //
+  // We intercept location.replace/assign. If the navigation target differs
+  // only by hash and the hash starts with "#:~:text=", we update history
+  // without triggering a navigation.
+  const shouldBlockTextFragmentNavigation = (targetUrl) => {
+    try {
+      const current = new URL(window.location.href);
+      const next = new URL(String(targetUrl), current);
+      if (!next.hash || !next.hash.startsWith("#:~:text=")) {
+        return false;
+      }
+      return (
+        current.origin === next.origin &&
+        current.pathname === next.pathname &&
+        current.search === next.search
+      );
+    } catch {
+      return false;
+    }
+  };
+  const patchLocationMethod = (name) => {
+    try {
+      const loc = window.location;
+      const original = loc && loc[name];
+      if (typeof original !== "function") {
+        return;
+      }
+      const patched = function(url) {
+        try {
+          if (shouldBlockTextFragmentNavigation(url)) {
+            const current = new URL(window.location.href);
+            const next = new URL(String(url), current);
+            history.replaceState(history.state, document.title, next.hash);
+            return;
+          }
+        } catch {}
+        return original.call(window.location, url);
+      };
+      try {
+        patched.__pagepocketOriginal = original;
+      } catch {}
+      try {
+        Object.defineProperty(loc, name, { configurable: true, value: patched });
+      } catch {
+        // Some browsers don't allow redefining Location methods.
+      }
+    } catch {}
+  };
+  patchLocationMethod("replace");
+  patchLocationMethod("assign");
+  `
+};

package/dist/hackers/replay-dom-rewrite.js CHANGED Viewed

@@ -31,9 +31,33 @@ exports.replayDomRewriter = {
     }
   };
+  const rewritten = new WeakMap();
   // Rewrite srcset values to local files only (avoid data: URLs in srcset).
   const rewriteSrcset = (value) => {
     if (!value) return value;
+    // Substack-style image/fetch URLs include commas inside the URL token
+    // (",w_40,h_40,..."). This makes the srcset invalid and browsers will
+    // parse it into garbage candidate URLs. Prefer dropping srcset and relying
+    // on the rewritten img[src].
+    try {
+      const trimmed = String(value || "").trim();
+      const hasFetchTransform = trimmed.includes("/image/fetch/");
+      const hasEncodedUrlTail = trimmed.includes("https%3A%2F%2F");
+      const hasCommaTokens =
+        trimmed.includes(",w_") ||
+        trimmed.includes(", w_") ||
+        trimmed.includes(",h_") ||
+        trimmed.includes(", h_") ||
+        trimmed.includes(",c_") ||
+        trimmed.includes(", c_");
+      if (hasFetchTransform && hasEncodedUrlTail && hasCommaTokens) {
+        return "";
+      }
+    } catch {}
     return value.split(",").map((part) => {
       const trimmed = part.trim();
       if (!trimmed) return trimmed;
@@ -46,7 +70,7 @@ exports.replayDomRewriter = {
         return descriptor ? localPath + " " + descriptor : localPath;
       }
       return trimmed;
-    }).join(", ");
+    }).join(",");
   };
   // Rewrite element attributes to local files or data URLs.
@@ -56,9 +80,21 @@ exports.replayDomRewriter = {
       onReady(() => rewriteElement(element));
       return;
     }
+    const prev = rewritten.get(element);
+    const currentSrc = element.getAttribute("src");
+    const currentHref = element.getAttribute("href");
+    const currentSrcset = element.getAttribute("srcset");
+    if (
+      prev &&
+      prev.src === currentSrc &&
+      prev.href === currentHref &&
+      prev.srcset === currentSrcset
+    ) {
+      return;
+    }
     const tag = (element.tagName || "").toLowerCase();
     if (tag === "img" || tag === "source" || tag === "video" || tag === "audio" || tag === "script" || tag === "iframe") {
-      const src = element.getAttribute("src");
+      const src = currentSrc;
       if (src && !isLocalResource(src) && !src.startsWith("data:") && !src.startsWith("blob:")) {
         const localPath = findLocalPath(src);
         if (localPath) {
@@ -72,7 +108,7 @@ exports.replayDomRewriter = {
     }
     if (tag === "link") {
-      const href = element.getAttribute("href");
+      const href = currentHref;
       const rel = (element.getAttribute("rel") || "").toLowerCase();
       if (href && !isLocalResource(href) && !href.startsWith("data:") && !href.startsWith("blob:")) {
         const localPath = findLocalPath(href);
@@ -86,10 +122,16 @@ exports.replayDomRewriter = {
       }
     }
-    const srcset = element.getAttribute("srcset");
+    const srcset = currentSrcset;
     if (srcset) {
       element.setAttribute("srcset", rewriteSrcset(srcset));
     }
+    rewritten.set(element, {
+      src: element.getAttribute("src"),
+      href: element.getAttribute("href"),
+      srcset: element.getAttribute("srcset")
+    });
   };
   // Intercept DOM attribute writes to keep resources local.
@@ -275,8 +317,6 @@ exports.replayDomRewriter = {
         mutation.addedNodes.forEach((node) => {
           if (node && node.nodeType === 1) {
             rewriteElement(node);
-            const descendants = node.querySelectorAll ? node.querySelectorAll("img,source,video,audio,script,link,iframe") : [];
-            descendants.forEach((el) => rewriteElement(el));
           }
         });
       }

package/dist/hackers/replay-history-path.d.ts ADDED Viewed

@@ -0,0 +1,2 @@
+import type { ScriptHacker } from "./types";
+export declare const replayHistoryPath: ScriptHacker;

package/dist/hackers/replay-history-path.js ADDED Viewed

@@ -0,0 +1,25 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.replayHistoryPath = void 0;
+exports.replayHistoryPath = {
+    id: "replay-history-path",
+    stage: "replay",
+    build: () => `
+  // Ensure history/location reflects the original captured URL path.
+  //
+  // When a snapshot is served from a static server root (e.g. http://localhost:8080/index.html),
+  // SPA routers that read location.pathname will see "/".
+  // For a page captured from https://foo.com/bar/foo, the correct route should be "/bar/foo".
+  //
+  // We patch history early and replace the current URL without causing navigation.
+  try {
+    const parsed = new URL(baseUrl);
+    const desiredPath = parsed.pathname + (parsed.search || "") + (parsed.hash || "");
+    const currentPath = window.location.pathname + window.location.search + window.location.hash;
+    if (desiredPath && desiredPath !== currentPath) {
+      history.replaceState(history.state, "", desiredPath);
+    }
+  } catch {}
+  `
+};

package/dist/path-resolver.js CHANGED Viewed

@@ -4,12 +4,22 @@ exports.isDocumentType = exports.withPrefixPathResolver = exports.resolveCrossOr
 const utils_1 = require("./utils");
 const normalizePathname = (pathname) => {
     const normalized = pathname || "/";
+    const hasTrailingSlash = normalized.endsWith("/");
     const clean = (0, utils_1.sanitizePosixPath)(normalized);
-    const leading = clean ? `/${clean}` : "/";
-    if (leading.endsWith("/")) {
-        return `${leading}index`;
+    const parts = clean.split("/").filter(Boolean);
+    if (hasTrailingSlash) {
+        parts.push("index");
     }
-    return leading;
+    if (parts.length === 0) {
+        return "/index";
+    }
+    const rewritten = parts.map((part, index) => {
+        if (index < parts.length - 1 && part.includes(".")) {
+            return `${part}__pp_dir`;
+        }
+        return part;
+    });
+    return `/${rewritten.join("/")}`;
 };
 const withSuffix = (path, suffix) => {
     const lastSlash = path.lastIndexOf("/");

package/dist/replay-script.js CHANGED Viewed

@@ -242,9 +242,15 @@ const buildReplayScript = (apiPath, baseUrl) => {
     return matchAPI({ records, byKey, baseUrl, method, url, body });
   };
+  const urlLookupCache = new Map();
   const findByUrl = (url) => {
     if (isLocalResource(url)) return null;
-    return matchAPI({ records, byKey, baseUrl, method: "GET", url, body: "" });
+    if (urlLookupCache.has(url)) {
+      return urlLookupCache.get(url);
+    }
+    const record = matchAPI({ records, byKey, baseUrl, method: "GET", url, body: "" });
+    urlLookupCache.set(url, record || null);
+    return record;
   };
   const findLocalPath = () => null;
@@ -292,16 +298,28 @@ const buildReplayScript = (apiPath, baseUrl) => {
     return "application/octet-stream";
   };
+  const dataUrlCache = new Map();
   const toDataUrl = (record, fallbackType) => {
     if (!record) return "";
     const contentType = getContentType(record) || fallbackType || "application/octet-stream";
+    const cacheKey = (record.url || "") + "|" + contentType + "|" + (record.responseEncoding || "") + "|" +
+      (record.responseBodyBase64 ? "b64:" + record.responseBodyBase64.length : "txt:" + (record.responseBody ? record.responseBody.length : 0));
+    if (dataUrlCache.has(cacheKey)) {
+      return dataUrlCache.get(cacheKey);
+    }
     if (record.responseEncoding === "base64" && record.responseBodyBase64) {
-      return "data:" + contentType + ";base64," + record.responseBodyBase64;
+      const dataUrl = "data:" + contentType + ";base64," + record.responseBodyBase64;
+      dataUrlCache.set(cacheKey, dataUrl);
+      return dataUrl;
     }
     if (record.responseBody) {
-      return "data:" + contentType + ";base64," + textToBase64(record.responseBody);
+      const dataUrl = "data:" + contentType + ";base64," + textToBase64(record.responseBody);
+      dataUrlCache.set(cacheKey, dataUrl);
+      return dataUrl;
     }
-    return "data:" + (fallbackType || "application/octet-stream") + ",";
+    const dataUrl = "data:" + (fallbackType || "application/octet-stream") + ",";
+    dataUrlCache.set(cacheKey, dataUrl);
+    return dataUrl;
   };
   const responseFromRecord = (record) => {

package/dist/rewrite-links.js CHANGED Viewed

@@ -59,18 +59,83 @@ const resolveUrlValue = (value, baseUrl, resolve) => {
         return null;
     }
 };
+const isUnsafeSrcsetValue = (value) => {
+    const trimmed = value.trim();
+    if (!trimmed) {
+        return false;
+    }
+    // Some sites (notably Substack) emit image transform URLs that contain commas
+    // inside the URL itself (e.g. "/image/fetch/...,$w_40,$h_40,.../https%3A...").
+    //
+    // In the HTML srcset grammar, commas separate candidates, so unescaped commas
+    // inside a URL make the srcset invalid. Browsers will parse it into garbage
+    // URLs like "https%3A%2F%2F...png" and try to fetch them.
+    //
+    // For offline snapshots, it's better to drop srcset entirely and rely on
+    // the already-rewritten img[src].
+    const hasFetchTransform = trimmed.includes("/image/fetch/");
+    const hasEncodedUrlTail = trimmed.includes("https%3A%2F%2F");
+    const hasCommaTokens = trimmed.includes(",w_") ||
+        trimmed.includes(", w_") ||
+        trimmed.includes(",h_") ||
+        trimmed.includes(", h_") ||
+        trimmed.includes(",c_") ||
+        trimmed.includes(", c_");
+    return hasFetchTransform && hasEncodedUrlTail && hasCommaTokens;
+};
+const isDescriptorToken = (token) => {
+    const trimmed = token.trim();
+    if (!trimmed)
+        return false;
+    // Common srcset descriptors: 1x, 2x, 320w
+    return /^\d+(\.\d+)?x$/i.test(trimmed) || /^\d+w$/i.test(trimmed);
+};
+const parseSrcset = (input) => {
+    // Minimal srcset parser:
+    // - Candidates are separated by commas.
+    // - Each candidate is "<url> [descriptor]".
+    // - URLs may contain spaces/commas (e.g. CDN transform strings). To avoid
+    //   breaking those, we locate the descriptor from the *end* of the candidate.
+    const rawCandidates = input
+        .split(",")
+        .map((c) => c.trim())
+        .filter(Boolean);
+    return rawCandidates.map((candidate) => {
+        const tokens = candidate.split(/\s+/).filter(Boolean);
+        if (tokens.length === 0) {
+            return { url: candidate };
+        }
+        const last = tokens[tokens.length - 1] ?? "";
+        if (tokens.length >= 2 && isDescriptorToken(last)) {
+            const descriptor = last;
+            const url = candidate.slice(0, candidate.lastIndexOf(descriptor)).trim();
+            return { url, descriptor };
+        }
+        return { url: candidate };
+    });
+};
+const stringifySrcset = (candidates) => {
+    return (candidates
+        .map((c) => {
+        const url = c.url.trim();
+        if (!c.descriptor)
+            return url;
+        return `${url} ${c.descriptor.trim()}`;
+    })
+        .filter(Boolean)
+        // Don't introduce spaces after commas inside URL tokens.
+        .join(","));
+};
 const rewriteSrcsetValue = (value, baseUrl, resolve) => {
-    const parts = value.split(",").map((part) => part.trim());
-    const rewritten = parts.map((part) => {
-        const [rawUrl, descriptor] = part.split(/\s+/, 2);
-        if (!rawUrl)
-            return part;
-        const resolved = resolveUrlValue(rawUrl, baseUrl, resolve);
-        if (!resolved)
-            return part;
-        return descriptor ? `${resolved} ${descriptor}` : resolved;
+    if (isUnsafeSrcsetValue(value)) {
+        return "";
+    }
+    const candidates = parseSrcset(value);
+    const rewritten = candidates.map((c) => {
+        const resolved = resolveUrlValue(c.url, baseUrl, resolve);
+        return { url: resolved ?? c.url, descriptor: c.descriptor };
     });
-    return rewritten.join(", ");
+    return stringifySrcset(rewritten);
 };
 const rewriteMetaRefresh = (content, baseUrl, resolve) => {
     const parts = content.split(";");
@@ -80,7 +145,13 @@ const rewriteMetaRefresh = (content, baseUrl, resolve) => {
     if (urlPartIndex === -1)
         return content;
     const urlPart = parts[urlPartIndex];
-    const rawUrl = urlPart.split("=").slice(1).join("=").trim();
+    let rawUrl = urlPart.split("=").slice(1).join("=").trim();
+    // Some pages quote the URL value (url="/next" or url='/next').
+    // Strip a single pair of surrounding quotes to improve rewrite coverage.
+    if ((rawUrl.startsWith('"') && rawUrl.endsWith('"')) ||
+        (rawUrl.startsWith("'") && rawUrl.endsWith("'"))) {
+        rawUrl = rawUrl.slice(1, -1).trim();
+    }
     const resolved = resolveUrlValue(rawUrl, baseUrl, resolve);
     if (!resolved)
         return content;
@@ -89,6 +160,18 @@ const rewriteMetaRefresh = (content, baseUrl, resolve) => {
     nextParts[urlPartIndex] = next;
     return nextParts.join(";");
 };
+const shouldRewriteLinkHref = ($element) => {
+    const rel = ($element.attr("rel") || "").trim().toLowerCase();
+    if (!rel) {
+        return true;
+    }
+    // Only rewrite link rels that are expected to load a resource.
+    // Avoid rewriting navigational/SEO links like canonical, preconnect, etc.
+    return (rel.includes("stylesheet") ||
+        rel.includes("preload") ||
+        rel.includes("prefetch") ||
+        rel.includes("icon"));
+};
 const rewriteJsText = async (source, resolve, baseUrl) => {
     const replaceSpecifier = async (specifier) => {
         const trimmed = specifier.trim();
@@ -171,7 +254,19 @@ const rewriteEntryHtml = async (input) => {
         rewriteAttr("iframe[src]", "src");
         rewriteAttr("embed[src]", "src");
         rewriteAttr("object[data]", "data");
-        rewriteAttr("link[href]", "href");
+        $("link[href]").each((_, element) => {
+            const el = $(element);
+            if (!shouldRewriteLinkHref(el)) {
+                return;
+            }
+            const value = el.attr("href");
+            if (!value)
+                return;
+            const resolved = resolveUrlValue(value, baseUrl, resolve);
+            if (resolved) {
+                el.attr("href", resolved);
+            }
+        });
         rewriteAttr("[poster]", "poster");
         rewriteDataAttrs("[data-src]", "data-src");
         rewriteDataAttrs("[data-href]", "data-href");
@@ -184,6 +279,13 @@ const rewriteEntryHtml = async (input) => {
             const rewritten = rewriteSrcsetValue(value, baseUrl, resolve);
             $(element).attr("srcset", rewritten);
         });
+        $("link[imagesrcset]").each((_, element) => {
+            const value = $(element).attr("imagesrcset");
+            if (!value)
+                return;
+            const rewritten = rewriteSrcsetValue(value, baseUrl, resolve);
+            $(element).attr("imagesrcset", rewritten);
+        });
         $("meta[http-equiv]").each((_, element) => {
             const httpEquiv = ($(element).attr("http-equiv") || "").toLowerCase();
             if (httpEquiv !== "refresh")

package/dist/snapshot-builder.js CHANGED Viewed

@@ -6,6 +6,18 @@ const path_resolver_1 = require("./path-resolver");
 const rewrite_links_1 = require("./rewrite-links");
 const snapshot_1 = require("./snapshot");
 const utils_1 = require("./utils");
+const escapePercentForStaticServers = (value) => {
+    // Many static servers decode percent-encoding in the request path before
+    // resolving it to a filesystem path.
+    //
+    // Our snapshots can contain literal "%2F" sequences in filenames (e.g.
+    // Substack image URLs embedded into a path segment). When a server decodes
+    // "%2F" to "/", it changes the path structure and causes 404s.
+    //
+    // Escaping "%" to "%25" makes the request decode back to the original
+    // filename on disk.
+    return value.split("%").join("%25");
+};
 const streamToUint8Array = async (stream) => {
     const reader = stream.getReader();
     const chunks = [];
@@ -147,7 +159,18 @@ const buildSnapshot = async (input) => {
             });
             urlToPath.set(resource.request.url, path);
         }
-        const resolve = (absoluteUrl) => urlToPath.get(absoluteUrl) ?? null;
+        const resolve = (absoluteUrl) => {
+            const resolved = urlToPath.get(absoluteUrl);
+            if (!resolved) {
+                return null;
+            }
+            // Only escape snapshot-local paths. (Defensive: resolved should always be
+            // a path, but avoid breaking any unexpected absolute URLs.)
+            if (resolved.includes("://")) {
+                return resolved;
+            }
+            return escapePercentForStaticServers(resolved);
+        };
         const apiPath = (0, utils_1.ensureLeadingSlash)(multiDoc ? `${(0, utils_1.sanitizePosixPath)(docDir)}/api.json` : "/api.json");
         for (const resource of group.resources) {
             if (resource.request.resourceType === "document") {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@pagepocket/lib",
-  "version": "0.6.1",
+  "version": "0.6.3",
   "description": "Library for rewriting HTML snapshots and inlining local resources.",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -12,8 +12,8 @@
   "license": "ISC",
   "dependencies": {
     "cheerio": "^1.0.0-rc.12",
-    "@pagepocket/interceptor": "0.6.1",
-    "@pagepocket/uni-fs": "0.6.1"
+    "@pagepocket/interceptor": "0.6.3",
+    "@pagepocket/uni-fs": "0.6.3"
   },
   "devDependencies": {
     "@playwright/test": "^1.50.1",