@pagepocket/lib 0.6.3 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -30,17 +30,19 @@ await snapshot.toDirectory("./out");
30
30
  class PagePocket {
31
31
  static fromURL(url: string, options?: PagePocketOptions): PagePocket;
32
32
  static fromTarget(target: InterceptTarget, options?: PagePocketOptions): PagePocket;
33
+ interceptedRequestEvents(): NetworkEventStream;
33
34
  capture(options?: CaptureOptions): Promise<PageSnapshot>;
34
35
  }
35
36
  ```
36
37
 
37
38
  ### CaptureOptions (core)
38
39
 
39
- ```ts
40
+ ````ts
40
41
  interface CaptureOptions {
41
42
  interceptor: NetworkInterceptorAdapter;
42
43
  completion?: CompletionStrategy | CompletionStrategy[];
43
44
  filter?: ResourceFilter;
45
+ blacklist?: RegExp[];
44
46
  pathResolver?: PathResolver;
45
47
  contentStore?: ContentStore;
46
48
  rewriteEntry?: boolean;
@@ -51,8 +53,33 @@ interface CaptureOptions {
51
53
  maxResources?: number;
52
54
  };
53
55
  }
56
+
57
+ type NetworkEventStream = AsyncIterable<NetworkEvent>;
58
+ ````
+
59
+ ### Built-in blacklist
60
+
61
+ ```ts
62
+ import { PagePocket, gaBlacklist } from "@pagepocket/lib";
63
+
64
+ const snapshot = await PagePocket.fromURL("https://example.com").capture({
65
+ interceptor,
66
+ blacklist: gaBlacklist.ga
67
+ });
68
+ ````
69
+
70
+ You can combine multiple built-in lists:
71
+
72
+ ```ts
73
+ import { PagePocket, builtinBlacklist } from "@pagepocket/lib";
74
+
75
+ const snapshot = await PagePocket.fromURL("https://example.com").capture({
76
+ interceptor,
77
+ blacklist: [...builtinBlacklist.ga, ...builtinBlacklist.ns]
78
+ });
54
79
  ```
55
80
 
81
+
82
+
56
83
  ### PageSnapshot output
57
84
 
58
85
  ```ts
@@ -63,18 +90,30 @@ interface PageSnapshot {
63
90
  entry: string;
64
91
  files: SnapshotFile[];
65
92
  toDirectory(outDir: string, options?: WriteFSOptions): Promise<WriteResult>;
66
- toZip(options?: ZipOptions): Promise<Uint8Array | Blob>;
93
+ toZip(options?: ZipOptions): Promise<ZipResult>;
67
94
  }
68
95
 
69
96
  interface WriteFSOptions {
70
97
  clearCache?: boolean;
98
+ overwrite?: boolean;
99
+ suffix?: string;
71
100
  }
72
101
 
73
102
  interface ZipOptions {
74
103
  asBlob?: boolean;
75
104
  clearCache?: boolean;
105
+ overwrite?: boolean;
106
+ suffix?: string;
107
+ outputPath?: string;
76
108
  }
77
- ```
109
+
110
+ interface ZipWriteResult {
111
+ data: Uint8Array | Blob;
112
+ outputPath: string;
113
+ }
114
+
115
+ type ZipResult = Uint8Array | Blob | ZipWriteResult;
116
+ ````
78
117
 
79
118
  Snapshot layout:
80
119
 
@@ -92,3 +131,226 @@ directory based on the document URL path (e.g. `foo/bar/index.html`).
92
131
 
93
132
  - Uses `@pagepocket/uni-fs` for file IO so it works in Node and OPFS contexts.
94
133
  - Network data comes only from the interceptor events.
134
+
135
+ ## HTML element replacement (replaceElements)
136
+
137
+ During capture, `@pagepocket/lib` rewrites the captured HTML (the raw Document response body) before writing the final snapshot HTML.
138
+
139
+ `replaceElements` lets you declaratively or programmatically replace parts of that HTML.
140
+
141
+ Typical use-cases:
142
+
143
+ - Replace embedded players/widgets with a placeholder (e.g. YouTube iframe → `<div>Player</div>`)
144
+ - Rename tags at scale (e.g. all `<div>` → `<p>`)
145
+ - Replace a specific element (e.g. `<div id="foo">` → `<span>hello</span>`)
146
+
147
+ ### Where this runs
148
+
149
+ `replaceElements` runs during the **HTML rewrite stage (Cheerio)**, not on a live browser DOM.
150
+
151
+ - Input is the captured HTML string.
152
+ - You can match using CSS selectors.
153
+ - Function rules receive a Cheerio element context.
154
+
155
+ If you need to modify the live DOM after scripts run, that is a different capture mode and is not supported by this option.
156
+
157
+ ### Configuration
158
+
159
+ Add `replaceElements` to `CaptureOptions`:
160
+
161
+ ```ts
162
+ import type {
163
+ CaptureOptions,
164
+ ReplaceElementsConfig,
165
+ ReplaceAction,
166
+ ReplaceElementContext,
167
+ ReplaceElementRule
168
+ } from "@pagepocket/lib";
169
+
170
+ const replaceElements: ReplaceElementsConfig = [
171
+ {
172
+ name: "replace-youtube-embed",
173
+ match: 'iframe[src*="youtube.com/embed"]',
174
+ replace: { type: "replaceWithHtml", html: "<div>Player</div>" }
175
+ }
176
+ ];
177
+
178
+ const options: CaptureOptions = {
179
+ interceptor,
180
+ replaceElements
181
+ };
182
+ ```
183
+
184
+ ### Data structure
185
+
186
+ `replaceElements` is an array of items. Each item can be:
187
+
188
+ 1. A **rule object** (declarative)
189
+ 2. A **function** (imperative; convenience form)
190
+ 3. A **function rule with query** (imperative; recommended for performance)
191
+
192
+ ```ts
193
+ export type ReplaceElementsConfig = Array<
194
+ ReplaceElementRule | ReplaceElementFn | ReplaceElementFnWithQuery
195
+ >;
196
+
197
+ export interface ReplaceElementRule {
198
+ name?: string;
199
+ match: MatchQuery;
200
+ replace: ReplaceAction;
201
+ apply?: ApplyOptions;
202
+ }
203
+
204
+ export type ReplaceElementFn = (
205
+ ctx: ReplaceElementContext
206
+ ) => void | ReplaceAction | ReplaceAction[] | Promise<void | ReplaceAction | ReplaceAction[]>;
207
+
208
+ export interface ReplaceElementFnWithQuery {
209
+ name?: string;
210
+ query: string;
211
+ run: ReplaceElementFn;
212
+ apply?: ApplyOptions;
213
+ }
214
+ ```
215
+
216
+ #### MatchQuery
217
+
218
+ CSS selectors (as supported by Cheerio) are the first-class way to match elements.
219
+
220
+ ```ts
221
+ export type MatchQuery =
222
+ | string
223
+ | {
224
+ selector?: string;
225
+ tagName?: string;
226
+ id?: string;
227
+ attrs?: Record<string, string | RegExp | true>;
228
+ };
229
+ ```
230
+
231
+ #### ReplaceAction
232
+
233
+ ```ts
234
+ export type ReplaceAction =
235
+ | { type: "replaceWithHtml"; html: string }
236
+ | {
237
+ type: "replaceWithElement";
238
+ tagName: string;
239
+ textContent?: string;
240
+ html?: string;
241
+ attrs?: Record<string, string | null>;
242
+ }
243
+ | {
244
+ type: "renameTag";
245
+ to: string;
246
+ keepAttributes?: boolean;
247
+ keepChildren?: boolean;
248
+ }
249
+ | { type: "remove" };
250
+ ```
251
+
252
+ #### ApplyOptions
253
+
254
+ ```ts
255
+ export interface ApplyOptions {
256
+ /** default: "document" */
257
+ scope?: "document" | "allFrames";
258
+ /** default: "all" */
259
+ limit?: number | "all";
260
+ /** default: "stop" */
261
+ onReplaced?: "stop" | "continue";
262
+ }
263
+ ```
264
+
265
+ Notes:
266
+
267
+ - `scope: "allFrames"` applies to multiple captured documents (if your interceptor captures frames / subdocuments). Cross-origin frames may not be available depending on capture.
268
+ - `limit` limits how many matched elements are replaced for that rule.
269
+
270
+ ### Semantics
271
+
272
+ - Rules run **in array order**.
273
+ - Each rule matches against the **current HTML state** (earlier replacements affect later matches).
274
+ - Default `onReplaced: "stop"` makes results predictable: once an element is replaced by a rule, later rules won't attempt to re-process that same original element.
275
+
276
+ ### Examples
277
+
278
+ Replace YouTube iframe embeds with a placeholder:
279
+
280
+ ```ts
281
+ replaceElements: [
282
+ {
283
+ match: 'iframe[src*="youtube.com/embed"]',
284
+ replace: { type: "replaceWithHtml", html: "<div>Player</div>" }
285
+ }
286
+ ];
287
+ ```
288
+
289
+ Rename all `div` tags to `p`:
290
+
291
+ ```ts
292
+ replaceElements: [
293
+ {
294
+ match: "div",
295
+ replace: { type: "renameTag", to: "p", keepAttributes: true, keepChildren: true }
296
+ }
297
+ ];
298
+ ```
299
+
300
+ Replace `<div id="foo">` with `<span>hello</span>`:
301
+
302
+ ```ts
303
+ replaceElements: [
304
+ {
305
+ match: "div#foo",
306
+ replace: { type: "replaceWithElement", tagName: "span", textContent: "hello" }
307
+ }
308
+ ];
309
+ ```
310
+
311
+ Complex matching via function (recommended form with `query`):
312
+
313
+ ```ts
314
+ replaceElements: [
315
+ {
316
+ query: "iframe",
317
+ run: (ctx) => {
318
+ const src = ctx.$el.attr("src") || "";
319
+ if (!/youtube\.com\/embed/.test(src)) return;
320
+
321
+ return { type: "replaceWithHtml", html: "<div>Player</div>" };
322
+ }
323
+ }
324
+ ];
325
+ ```
326
+
327
+ ### Function context
328
+
329
+ Function rules receive a Cheerio-aware context:
330
+
331
+ ```ts
332
+ export interface ReplaceElementContext {
333
+ /** Cheerio root ($) for the current document */
334
+ $: CheerioAPI;
335
+ /** Cheerio wrapper for the matched element */
336
+ $el: Cheerio<any>;
337
+
338
+ /** Document URL being rewritten */
339
+ url: string;
340
+ /** Entry URL (top-level) */
341
+ entryUrl: string;
342
+
343
+ ruleIndex: number;
344
+ matchIndex: number;
345
+ }
346
+ ```
347
+
348
+ ### Manual validation
349
+
350
+ Run the CLI against a page containing the target elements:
351
+
352
+ ```bash
353
+ pnpm -F @pagepocket/cli start -- https://example.com
354
+ ```
355
+
356
+ Then inspect the output `*.html` for the expected replacements.
@@ -0,0 +1,3 @@
1
+ export declare const ga: RegExp[];
2
+ export declare const sw_iframe: RegExp[];
3
+ export declare const ns: RegExp[];
@@ -0,0 +1,6 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.ns = exports.sw_iframe = exports.ga = void 0;
4
+ exports.ga = [/google-analytics/i, /analytics\.google/i];
5
+ exports.sw_iframe = [/sw_iframe\.html/i];
6
+ exports.ns = [/googletagmanager/i];
@@ -0,0 +1,2 @@
1
+ export declare const debug_log: (...args: unknown[]) => void;
2
+ export declare const debugLog: (...args: unknown[]) => void;
package/dist/debug.js ADDED
@@ -0,0 +1,18 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.debugLog = exports.debug_log = void 0;
4
+ const isDebugEnabled = () => {
5
+ const globalProcess = globalThis
6
+ .process;
7
+ const value = globalProcess?.env?.PAGEPOCKET_DEBUG;
8
+ return Boolean(value);
9
+ };
10
+ const debug_log = (...args) => {
11
+ if (!isDebugEnabled()) {
12
+ return;
13
+ }
14
+ // eslint-disable-next-line no-console
15
+ console.log(...args);
16
+ };
17
+ exports.debug_log = debug_log;
18
+ exports.debugLog = exports.debug_log;
@@ -5,6 +5,7 @@ const preload_fetch_1 = require("./preload-fetch");
5
5
  const preload_xhr_1 = require("./preload-xhr");
6
6
  const replay_beacon_1 = require("./replay-beacon");
7
7
  const replay_block_text_fragment_1 = require("./replay-block-text-fragment");
8
+ const replay_css_proxy_1 = require("./replay-css-proxy");
8
9
  const replay_dom_rewrite_1 = require("./replay-dom-rewrite");
9
10
  const replay_eventsource_1 = require("./replay-eventsource");
10
11
  const replay_fetch_1 = require("./replay-fetch");
@@ -18,6 +19,7 @@ exports.replayHackers = [
18
19
  replay_history_path_1.replayHistoryPath,
19
20
  replay_fetch_1.replayFetchResponder,
20
21
  replay_xhr_1.replayXhrResponder,
22
+ replay_css_proxy_1.replayCssProxy,
21
23
  replay_dom_rewrite_1.replayDomRewriter,
22
24
  replay_svg_image_1.replaySvgImageRewriter,
23
25
  replay_beacon_1.replayBeaconStub,
@@ -0,0 +1,2 @@
1
+ import type { ScriptHacker } from "./types";
2
+ export declare const replayCssProxy: ScriptHacker;
@@ -0,0 +1,206 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.replayCssProxy = void 0;
4
+ exports.replayCssProxy = {
5
+ id: "replay-css-proxy",
6
+ stage: "replay",
7
+ build: () => `
8
+ const shouldSkipCssUrl = (value) => {
9
+ const trimmed = String(value || "").trim();
10
+ return (
11
+ !trimmed ||
12
+ trimmed.startsWith("data:") ||
13
+ trimmed.startsWith("blob:") ||
14
+ trimmed.startsWith("mailto:") ||
15
+ trimmed.startsWith("tel:") ||
16
+ trimmed.startsWith("javascript:") ||
17
+ trimmed.startsWith("#")
18
+ );
19
+ };
20
+
21
+ const rewriteCssUrlsSync = (cssText, cssBaseUrl) => {
22
+ try {
23
+ const base = cssBaseUrl || (document && document.baseURI) || baseUrl;
24
+ const text = String(cssText || "");
25
+
26
+ // Rewrite url(...)
27
+ const urlPattern = /url\(\s*(['"]?)([^'")]+)\\1\s*\)/g;
28
+ const updated = text.replace(urlPattern, (full, quote, rawUrl) => {
29
+ try {
30
+ const raw = String(rawUrl || "").trim();
31
+ if (shouldSkipCssUrl(raw)) {
32
+ return full;
33
+ }
34
+ const absolute = new URL(raw, base).toString();
35
+ const localPath = findLocalPath(absolute);
36
+ if (!localPath) {
37
+ return full;
38
+ }
39
+ const q = quote || "";
40
+ return "url(" + q + localPath + q + ")";
41
+ } catch {
42
+ return full;
43
+ }
44
+ });
45
+
46
+ // Rewrite @import
47
+ const importPattern = /@import\s+(?:url\()?['"]?([^'")]+)['"]?\)?/g;
48
+ const final = updated.replace(importPattern, (full, rawUrl) => {
49
+ try {
50
+ const raw = String(rawUrl || "").trim();
51
+ if (shouldSkipCssUrl(raw)) {
52
+ return full;
53
+ }
54
+ const absolute = new URL(raw, base).toString();
55
+ const localPath = findLocalPath(absolute);
56
+ if (!localPath) {
57
+ return full;
58
+ }
59
+ return full.replace(rawUrl, localPath);
60
+ } catch {
61
+ return full;
62
+ }
63
+ });
64
+
65
+ return final;
66
+ } catch {
67
+ return String(cssText || "");
68
+ }
69
+ };
70
+
71
+ // <style> text mutations.
72
+ const rewriteStyleElement = (styleEl) => {
73
+ try {
74
+ if (!styleEl || !styleEl.tagName) return;
75
+ const tag = String(styleEl.tagName || "").toLowerCase();
76
+ if (tag !== "style") return;
77
+
78
+ const current = styleEl.textContent || "";
79
+ const next = rewriteCssUrlsSync(current, document && document.baseURI);
80
+ if (next !== current) {
81
+ styleEl.textContent = next;
82
+ }
83
+ } catch {}
84
+ };
85
+
86
+ // Inline style: element.setAttribute('style', ...)
87
+ const rewriteInlineStyleValue = (value) => {
88
+ try {
89
+ return rewriteCssUrlsSync(String(value || ""), document && document.baseURI);
90
+ } catch {
91
+ return String(value || "");
92
+ }
93
+ };
94
+
95
+ // Patch setAttribute to rewrite inline style values.
96
+ try {
97
+ const originalSetAttribute = Element.prototype.setAttribute;
98
+ Element.prototype.setAttribute = function(name, value) {
99
+ const attr = String(name).toLowerCase();
100
+ if (attr === "style") {
101
+ const next = rewriteInlineStyleValue(value);
102
+ return originalSetAttribute.call(this, name, next);
103
+ }
104
+ return originalSetAttribute.call(this, name, value);
105
+ };
106
+ } catch {}
107
+
108
+ // Patch CSSStyleDeclaration.setProperty.
109
+ try {
110
+ const originalSetProperty = CSSStyleDeclaration.prototype.setProperty;
111
+ CSSStyleDeclaration.prototype.setProperty = function(propertyName, value, priority) {
112
+ const next = rewriteInlineStyleValue(value);
113
+ return originalSetProperty.call(this, propertyName, next, priority);
114
+ };
115
+ } catch {}
116
+
117
+ // Patch CSSStyleDeclaration.cssText setter.
118
+ try {
119
+ const desc = Object.getOwnPropertyDescriptor(CSSStyleDeclaration.prototype, "cssText");
120
+ if (desc && desc.set) {
121
+ Object.defineProperty(CSSStyleDeclaration.prototype, "cssText", {
122
+ configurable: true,
123
+ get: desc.get,
124
+ set: function(value) {
125
+ const next = rewriteInlineStyleValue(value);
126
+ return desc.set.call(this, next);
127
+ }
128
+ });
129
+ }
130
+ } catch {}
131
+
132
+ // Patch CSSStyleSheet.insertRule.
133
+ try {
134
+ const originalInsertRule = CSSStyleSheet.prototype.insertRule;
135
+ CSSStyleSheet.prototype.insertRule = function(rule, index) {
136
+ const next = rewriteCssUrlsSync(String(rule || ""), document && document.baseURI);
137
+ return originalInsertRule.call(this, next, index);
138
+ };
139
+ } catch {}
140
+
141
+ // Patch constructable stylesheet APIs.
142
+ try {
143
+ if (CSSStyleSheet.prototype.replaceSync) {
144
+ const originalReplaceSync = CSSStyleSheet.prototype.replaceSync;
145
+ CSSStyleSheet.prototype.replaceSync = function(text) {
146
+ const next = rewriteCssUrlsSync(String(text || ""), document && document.baseURI);
147
+ return originalReplaceSync.call(this, next);
148
+ };
149
+ }
150
+ } catch {}
151
+
152
+ try {
153
+ if (CSSStyleSheet.prototype.replace) {
154
+ const originalReplace = CSSStyleSheet.prototype.replace;
155
+ CSSStyleSheet.prototype.replace = function(text) {
156
+ const next = rewriteCssUrlsSync(String(text || ""), document && document.baseURI);
157
+ return originalReplace.call(this, next);
158
+ };
159
+ }
160
+ } catch {}
161
+
162
+ // Observe <style> elements being added/updated.
163
+ try {
164
+ const rewriteAllStyles = () => {
165
+ try {
166
+ document.querySelectorAll("style").forEach((el) => rewriteStyleElement(el));
167
+ } catch {}
168
+ };
169
+
170
+ const observer = new MutationObserver((mutations) => {
171
+ for (const mutation of mutations) {
172
+ if (mutation.type === "childList") {
173
+ mutation.addedNodes.forEach((node) => {
174
+ try {
175
+ if (!node) return;
176
+ if (node.nodeType === 1) {
177
+ // Element
178
+ rewriteStyleElement(node);
179
+ if (node.querySelectorAll) {
180
+ node.querySelectorAll("style").forEach((el) => rewriteStyleElement(el));
181
+ }
182
+ }
183
+ } catch {}
184
+ });
185
+ }
186
+
187
+ if (mutation.type === "characterData") {
188
+ const parent = mutation.target && mutation.target.parentElement;
189
+ if (parent && parent.tagName && String(parent.tagName).toLowerCase() === "style") {
190
+ rewriteStyleElement(parent);
191
+ }
192
+ }
193
+ }
194
+ });
195
+
196
+ observer.observe(document.documentElement, {
197
+ subtree: true,
198
+ childList: true,
199
+ characterData: true
200
+ });
201
+
202
+ // Initial pass.
203
+ onReady(() => rewriteAllStyles());
204
+ } catch {}
205
+ `
206
+ };