@pagepocket/lib 0.8.5 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/dist/cheerio/types.d.ts +5 -0
  2. package/dist/cheerio/types.js +1 -0
  3. package/dist/core/completion.js +2 -1
  4. package/dist/core/content-store.js +2 -2
  5. package/dist/css-rewrite.d.ts +1 -1
  6. package/dist/css-rewrite.js +2 -2
  7. package/dist/hackers/replay-dom-rewrite/script-part-1.d.ts +1 -0
  8. package/dist/hackers/replay-dom-rewrite/script-part-1.js +202 -0
  9. package/dist/hackers/replay-dom-rewrite/script-part-2.d.ts +1 -0
  10. package/dist/hackers/replay-dom-rewrite/script-part-2.js +173 -0
  11. package/dist/hackers/replay-dom-rewrite.js +3 -373
  12. package/dist/hackers/replay-svg-image.js +32 -8
  13. package/dist/hackers/replay-xhr.js +3 -3
  14. package/dist/path-resolver.js +2 -2
  15. package/dist/replace-elements/actions.d.ts +3 -3
  16. package/dist/replace-elements/actions.js +36 -16
  17. package/dist/replace-elements/match.d.ts +2 -2
  18. package/dist/replace-elements/match.js +22 -11
  19. package/dist/replace-elements/normalize.d.ts +1 -1
  20. package/dist/replace-elements/normalize.js +4 -2
  21. package/dist/replay/match-api.js +29 -17
  22. package/dist/replay/templates/replay-script-template.js +16 -332
  23. package/dist/replay/templates/replay-script-template.part-1.d.ts +5 -0
  24. package/dist/replay/templates/replay-script-template.part-1.js +101 -0
  25. package/dist/replay/templates/replay-script-template.part-2.d.ts +3 -0
  26. package/dist/replay/templates/replay-script-template.part-2.js +222 -0
  27. package/dist/replay/templates/replay-script-template.part-3.d.ts +3 -0
  28. package/dist/replay/templates/replay-script-template.part-3.js +9 -0
  29. package/dist/resource-proxy/pathname-variants.js +8 -5
  30. package/dist/resource-proxy.js +10 -10
  31. package/dist/resources.d.ts +3 -2
  32. package/dist/resources.js +6 -3
  33. package/dist/rewrite-links/js-imports.d.ts +1 -1
  34. package/dist/rewrite-links/link-rel.d.ts +2 -2
  35. package/dist/rewrite-links/meta-refresh.d.ts +1 -1
  36. package/dist/rewrite-links/meta-refresh.js +6 -3
  37. package/dist/rewrite-links/srcset.d.ts +1 -1
  38. package/dist/rewrite-links/srcset.js +4 -2
  39. package/dist/rewrite-links/url-resolve.d.ts +2 -2
  40. package/dist/rewrite-links/url-resolve.js +2 -2
  41. package/dist/rewrite-links.d.ts +1 -1
  42. package/dist/rewrite-links.js +12 -6
  43. package/dist/snapshot-builder/build-snapshot.js +2 -3
  44. package/dist/snapshot-builder/capture-index/index-capture.js +2 -1
  45. package/dist/snapshot-builder/emit-document.d.ts +1 -1
  46. package/dist/snapshot-builder/emit-document.js +1 -1
  47. package/dist/snapshot-builder/grouping.js +2 -2
  48. package/dist/snapshot-builder/path-map.d.ts +1 -1
  49. package/dist/snapshot-builder/path-map.js +1 -1
  50. package/dist/snapshot-builder/resources-path.js +8 -4
  51. package/dist/snapshot-builder/rewrite-resource.d.ts +2 -2
  52. package/dist/snapshot-builder/rewrite-resource.js +2 -2
  53. package/dist/types.d.ts +3 -3
  54. package/dist/units/internal/async-queue.d.ts +9 -0
  55. package/dist/units/internal/async-queue.js +57 -0
  56. package/dist/units/internal/deferred-tracker.d.ts +5 -0
  57. package/dist/units/internal/deferred-tracker.js +13 -0
  58. package/dist/units/internal/runtime.d.ts +37 -0
  59. package/dist/units/internal/runtime.js +113 -0
  60. package/dist/units/runner.js +3 -184
  61. package/dist/utils.d.ts +1 -1
  62. package/dist/utils.js +6 -6
  63. package/package.json +5 -4
  64. package/README.md +0 -357
package/README.md DELETED
@@ -1,357 +0,0 @@
1
- # @pagepocket/lib
2
-
3
- Core library for capturing a page via NetworkInterceptorAdapter events and
4
- producing a virtual snapshot (HTML/CSS/JS rewritten to absolute snapshot paths).
5
- No network fetch happens in the core library.
6
-
7
- ## Install
8
-
9
- ```bash
10
- pnpm add @pagepocket/lib
11
- ```
12
-
13
- ## Quick Start
14
-
15
- ```ts
16
- import { PagePocket } from "@pagepocket/lib";
17
- import { CaptureHttpCdpUnit } from "@pagepocket/capture-http-cdp-unit";
18
- import { BuildSnapshotUnit } from "@pagepocket/build-snapshot-unit";
19
- import { WriteDownUnit } from "@pagepocket/write-down-unit";
20
-
21
- const result = await PagePocket.fromCDPTab(123).capture({
22
- units: [new CaptureHttpCdpUnit(), new BuildSnapshotUnit(), new WriteDownUnit({ type: "raw", outputPath: "./out" })]
23
- });
24
-
25
- // result.kind === "raw" | "zip" | ...
26
- ```
27
-
28
- ## API
29
-
30
- ```ts
31
- class PagePocket {
32
- static fromURL(url: string, options?: PagePocketOptions): PagePocket;
33
- static fromTarget(target: InterceptTarget, options?: PagePocketOptions): PagePocket;
34
- // Network progress events are available via the units runner channel (network@1).
35
- capture(options?: CaptureOptions): Promise<PageSnapshot>;
36
- }
37
- ```
38
-
39
- ### CaptureOptions (core)
40
-
41
- ````ts
42
- interface CaptureOptions {
43
- interceptor: NetworkInterceptorAdapter;
44
- completion?: CompletionStrategy | CompletionStrategy[];
45
- filter?: ResourceFilter;
46
- blacklist?: RegExp[];
47
- pathResolver?: PathResolver;
48
- contentStore?: ContentStore;
49
- rewriteEntry?: boolean;
50
- rewriteCSS?: boolean;
51
- limits?: {
52
- maxTotalBytes?: number;
53
- maxSingleResourceBytes?: number;
54
- maxResources?: number;
55
- };
56
- }
57
-
58
- type NetworkEventStream = AsyncIterable<NetworkEvent>;
59
- ````
60
- ### Built-in blacklist
61
-
62
- ```ts
63
- import { PagePocket, gaBlacklist } from "@pagepocket/lib";
64
-
65
- const snapshot = await PagePocket.fromURL("https://example.com").capture({
66
- interceptor,
67
- blacklist: gaBlacklist.ga
68
- });
69
- ````
70
-
71
- You can combine multiple built-in lists:
72
-
73
- ```ts
74
- import { PagePocket, builtinBlacklist } from "@pagepocket/lib";
75
-
76
- const snapshot = await PagePocket.fromURL("https://example.com").capture({
77
- interceptor,
78
- blacklist: [...builtinBlacklist.ga]
79
- });
80
- ```
81
-
82
-
83
-
84
- ### PageSnapshot output
85
-
86
- ```ts
87
- interface PageSnapshot {
88
- version: "1.0";
89
- createdAt: number;
90
- url: string;
91
- entry: string;
92
- files: SnapshotFile[];
93
- toDirectory(outDir: string, options?: WriteFSOptions): Promise<WriteResult>;
94
- toZip(options?: ZipOptions): Promise<ZipResult>;
95
- }
96
-
97
- interface WriteFSOptions {
98
- clearCache?: boolean;
99
- overwrite?: boolean;
100
- suffix?: string;
101
- }
102
-
103
- interface ZipOptions {
104
- asBlob?: boolean;
105
- clearCache?: boolean;
106
- overwrite?: boolean;
107
- suffix?: string;
108
- outputPath?: string;
109
- }
110
-
111
- interface ZipWriteResult {
112
- data: Uint8Array | Blob;
113
- outputPath: string;
114
- }
115
-
116
- type ZipResult = Uint8Array | Blob | ZipWriteResult;
117
- ````
118
-
119
- Snapshot layout:
120
-
121
- ```
122
- /index.html
123
- /api.json
124
- /<same-origin paths>
125
- /external_resources/<cross-origin paths>
126
- ```
127
-
128
- If multiple documents are captured, each document is written to its own output
129
- directory based on the document URL path (e.g. `foo/bar/index.html`).
130
-
131
- ## Notes
132
-
133
- - Uses `@pagepocket/uni-fs` for file IO so it works in Node and OPFS contexts.
134
- - Network data comes only from the interceptor events.
135
-
136
- ## HTML element replacement (replaceElements)
137
-
138
- During capture, `@pagepocket/lib` rewrites the captured HTML (the raw Document response body) before writing the final snapshot HTML.
139
-
140
- `replaceElements` lets you declaratively or programmatically replace parts of that HTML.
141
-
142
- Typical use-cases:
143
-
144
- - Replace embedded players/widgets with a placeholder (e.g. YouTube iframe → `<div>Player</div>`)
145
- - Rename tags at scale (e.g. all `<div>` → `<p>`)
146
- - Replace a specific element (e.g. `<div id="foo">` → `<span>hello</span>`)
147
-
148
- ### Where this runs
149
-
150
- `replaceElements` runs during the **HTML rewrite stage (Cheerio)**, not on a live browser DOM.
151
-
152
- - Input is the captured HTML string.
153
- - You can match using CSS selectors.
154
- - Function rules receive a Cheerio element context.
155
-
156
- If you need to modify the live DOM after scripts run, that is a different capture mode and is not supported by this option.
157
-
158
- ### Configuration
159
-
160
- Add `replaceElements` to `CaptureOptions`:
161
-
162
- ```ts
163
- import type {
164
- CaptureOptions,
165
- ReplaceElementsConfig,
166
- ReplaceAction,
167
- ReplaceElementContext,
168
- ReplaceElementRule
169
- } from "@pagepocket/lib";
170
-
171
- const replaceElements: ReplaceElementsConfig = [
172
- {
173
- name: "replace-youtube-embed",
174
- match: 'iframe[src*="youtube.com/embed"]',
175
- replace: { type: "replaceWithHtml", html: "<div>Player</div>" }
176
- }
177
- ];
178
-
179
- const options: CaptureOptions = {
180
- interceptor,
181
- replaceElements
182
- };
183
- ```
184
-
185
- ### Data structure
186
-
187
- `replaceElements` is an array of items. Each item can be:
188
-
189
- 1. A **rule object** (declarative)
190
- 2. A **function** (imperative; convenience form)
191
- 3. A **function rule with query** (imperative; recommended for performance)
192
-
193
- ```ts
194
- export type ReplaceElementsConfig = Array<
195
- ReplaceElementRule | ReplaceElementFn | ReplaceElementFnWithQuery
196
- >;
197
-
198
- export interface ReplaceElementRule {
199
- name?: string;
200
- match: MatchQuery;
201
- replace: ReplaceAction;
202
- apply?: ApplyOptions;
203
- }
204
-
205
- export type ReplaceElementFn = (
206
- ctx: ReplaceElementContext
207
- ) => void | ReplaceAction | ReplaceAction[] | Promise<void | ReplaceAction | ReplaceAction[]>;
208
-
209
- export interface ReplaceElementFnWithQuery {
210
- name?: string;
211
- query: string;
212
- run: ReplaceElementFn;
213
- apply?: ApplyOptions;
214
- }
215
- ```
216
-
217
- #### MatchQuery
218
-
219
- CSS selectors (Cheerio selector syntax) are the first-class way to match elements.
220
-
221
- ```ts
222
- export type MatchQuery =
223
- | string
224
- | {
225
- selector?: string;
226
- tagName?: string;
227
- id?: string;
228
- attrs?: Record<string, string | RegExp | true>;
229
- };
230
- ```
231
-
232
- #### ReplaceAction
233
-
234
- ```ts
235
- export type ReplaceAction =
236
- | { type: "replaceWithHtml"; html: string }
237
- | {
238
- type: "replaceWithElement";
239
- tagName: string;
240
- textContent?: string;
241
- html?: string;
242
- attrs?: Record<string, string | null>;
243
- }
244
- | {
245
- type: "renameTag";
246
- to: string;
247
- keepAttributes?: boolean;
248
- keepChildren?: boolean;
249
- }
250
- | { type: "remove" };
251
- ```
252
-
253
- #### ApplyOptions
254
-
255
- ```ts
256
- export interface ApplyOptions {
257
- /** default: "document" */
258
- scope?: "document" | "allFrames";
259
- /** default: "all" */
260
- limit?: number | "all";
261
- /** default: "stop" */
262
- onReplaced?: "stop" | "continue";
263
- }
264
- ```
265
-
266
- Notes:
267
-
268
- - `scope: "allFrames"` applies to multiple captured documents (if your interceptor captures frames / subdocuments). Cross-origin frames may not be available depending on capture.
269
- - `limit` limits how many matched elements are replaced for that rule.
270
-
271
- ### Semantics
272
-
273
- - Rules run **in array order**.
274
- - Each rule matches against the **current HTML state** (earlier replacements affect later matches).
275
- - Default `onReplaced: "stop"` makes results predictable: once an element is replaced by a rule, later rules won't attempt to re-process that same original element.
276
-
277
- ### Examples
278
-
279
- Replace YouTube iframe embeds with a placeholder:
280
-
281
- ```ts
282
- replaceElements: [
283
- {
284
- match: 'iframe[src*="youtube.com/embed"]',
285
- replace: { type: "replaceWithHtml", html: "<div>Player</div>" }
286
- }
287
- ];
288
- ```
289
-
290
- Rename all `div` tags to `p`:
291
-
292
- ```ts
293
- replaceElements: [
294
- {
295
- match: "div",
296
- replace: { type: "renameTag", to: "p", keepAttributes: true, keepChildren: true }
297
- }
298
- ];
299
- ```
300
-
301
- Replace `<div id="foo">` with `<span>hello</span>`:
302
-
303
- ```ts
304
- replaceElements: [
305
- {
306
- match: "div#foo",
307
- replace: { type: "replaceWithElement", tagName: "span", textContent: "hello" }
308
- }
309
- ];
310
- ```
311
-
312
- Complex matching via function (recommended form with `query`):
313
-
314
- ```ts
315
- replaceElements: [
316
- {
317
- query: "iframe",
318
- run: (ctx) => {
319
- const src = ctx.$el.attr("src") || "";
320
- if (!/youtube\.com\/embed/.test(src)) return;
321
-
322
- return { type: "replaceWithHtml", html: "<div>Player</div>" };
323
- }
324
- }
325
- ];
326
- ```
327
-
328
- ### Function context
329
-
330
- Function rules receive a Cheerio-aware context:
331
-
332
- ```ts
333
- export interface ReplaceElementContext {
334
- /** Cheerio root ($) for the current document */
335
- $: CheerioAPI;
336
- /** Cheerio wrapper for the matched element */
337
- $el: Cheerio<any>;
338
-
339
- /** Document URL being rewritten */
340
- url: string;
341
- /** Entry URL (top-level) */
342
- entryUrl: string;
343
-
344
- ruleIndex: number;
345
- matchIndex: number;
346
- }
347
- ```
348
-
349
- ### Manual validation
350
-
351
- Run the CLI against a page containing the target elements:
352
-
353
- ```bash
354
- pnpm -F @pagepocket/cli start -- https://example.com
355
- ```
356
-
357
- Then inspect the output `*.html` for the expected replacements.