@pagepocket/lib 0.8.6 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cheerio/types.d.ts +5 -0
- package/dist/cheerio/types.js +1 -0
- package/dist/core/completion.js +2 -1
- package/dist/core/content-store.js +2 -2
- package/dist/css-rewrite.d.ts +1 -1
- package/dist/css-rewrite.js +2 -2
- package/dist/hackers/replay-dom-rewrite/script-part-1.d.ts +1 -0
- package/dist/hackers/replay-dom-rewrite/script-part-1.js +202 -0
- package/dist/hackers/replay-dom-rewrite/script-part-2.d.ts +1 -0
- package/dist/hackers/replay-dom-rewrite/script-part-2.js +173 -0
- package/dist/hackers/replay-dom-rewrite.js +3 -373
- package/dist/hackers/replay-svg-image.js +32 -8
- package/dist/hackers/replay-xhr.js +3 -3
- package/dist/path-resolver.js +2 -2
- package/dist/replace-elements/actions.d.ts +3 -3
- package/dist/replace-elements/actions.js +36 -16
- package/dist/replace-elements/match.d.ts +2 -2
- package/dist/replace-elements/match.js +22 -11
- package/dist/replace-elements/normalize.d.ts +1 -1
- package/dist/replace-elements/normalize.js +4 -2
- package/dist/replay/match-api.js +29 -17
- package/dist/replay/templates/replay-script-template.js +16 -332
- package/dist/replay/templates/replay-script-template.part-1.d.ts +5 -0
- package/dist/replay/templates/replay-script-template.part-1.js +101 -0
- package/dist/replay/templates/replay-script-template.part-2.d.ts +3 -0
- package/dist/replay/templates/replay-script-template.part-2.js +222 -0
- package/dist/replay/templates/replay-script-template.part-3.d.ts +3 -0
- package/dist/replay/templates/replay-script-template.part-3.js +9 -0
- package/dist/resource-proxy/pathname-variants.js +8 -5
- package/dist/resource-proxy.js +10 -10
- package/dist/resources.d.ts +3 -2
- package/dist/resources.js +6 -3
- package/dist/rewrite-links/js-imports.d.ts +1 -1
- package/dist/rewrite-links/link-rel.d.ts +2 -2
- package/dist/rewrite-links/meta-refresh.d.ts +1 -1
- package/dist/rewrite-links/meta-refresh.js +6 -3
- package/dist/rewrite-links/srcset.d.ts +1 -1
- package/dist/rewrite-links/srcset.js +4 -2
- package/dist/rewrite-links/url-resolve.d.ts +2 -2
- package/dist/rewrite-links/url-resolve.js +2 -2
- package/dist/rewrite-links.d.ts +1 -1
- package/dist/rewrite-links.js +12 -6
- package/dist/snapshot-builder/build-snapshot.js +2 -3
- package/dist/snapshot-builder/capture-index/index-capture.js +2 -1
- package/dist/snapshot-builder/emit-document.d.ts +1 -1
- package/dist/snapshot-builder/emit-document.js +1 -1
- package/dist/snapshot-builder/grouping.js +2 -2
- package/dist/snapshot-builder/path-map.d.ts +1 -1
- package/dist/snapshot-builder/path-map.js +1 -1
- package/dist/snapshot-builder/resources-path.js +8 -4
- package/dist/snapshot-builder/rewrite-resource.d.ts +2 -2
- package/dist/snapshot-builder/rewrite-resource.js +2 -2
- package/dist/types.d.ts +3 -3
- package/dist/units/internal/async-queue.d.ts +9 -0
- package/dist/units/internal/async-queue.js +57 -0
- package/dist/units/internal/deferred-tracker.d.ts +5 -0
- package/dist/units/internal/deferred-tracker.js +13 -0
- package/dist/units/internal/runtime.d.ts +37 -0
- package/dist/units/internal/runtime.js +113 -0
- package/dist/units/runner.js +3 -184
- package/dist/utils.d.ts +1 -1
- package/dist/utils.js +6 -6
- package/package.json +5 -4
- package/README.md +0 -357
package/README.md
DELETED
|
@@ -1,357 +0,0 @@
|
|
|
1
|
-
# @pagepocket/lib
|
|
2
|
-
|
|
3
|
-
Core library for capturing a page via NetworkInterceptorAdapter events and
|
|
4
|
-
producing a virtual snapshot (HTML/CSS/JS rewritten to absolute snapshot paths).
|
|
5
|
-
No network fetch happens in the core library.
|
|
6
|
-
|
|
7
|
-
## Install
|
|
8
|
-
|
|
9
|
-
```bash
|
|
10
|
-
pnpm add @pagepocket/lib
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
## Quick Start
|
|
14
|
-
|
|
15
|
-
```ts
|
|
16
|
-
import { PagePocket } from "@pagepocket/lib";
|
|
17
|
-
import { CaptureHttpCdpUnit } from "@pagepocket/capture-http-cdp-unit";
|
|
18
|
-
import { BuildSnapshotUnit } from "@pagepocket/build-snapshot-unit";
|
|
19
|
-
import { WriteDownUnit } from "@pagepocket/write-down-unit";
|
|
20
|
-
|
|
21
|
-
const result = await PagePocket.fromCDPTab(123).capture({
|
|
22
|
-
units: [new CaptureHttpCdpUnit(), new BuildSnapshotUnit(), new WriteDownUnit({ type: "raw", outputPath: "./out" })]
|
|
23
|
-
});
|
|
24
|
-
|
|
25
|
-
// result.kind === "raw" | "zip" | ...
|
|
26
|
-
```
|
|
27
|
-
|
|
28
|
-
## API
|
|
29
|
-
|
|
30
|
-
```ts
|
|
31
|
-
class PagePocket {
|
|
32
|
-
static fromURL(url: string, options?: PagePocketOptions): PagePocket;
|
|
33
|
-
static fromTarget(target: InterceptTarget, options?: PagePocketOptions): PagePocket;
|
|
34
|
-
// Network progress events are available via the units runner channel (network@1).
|
|
35
|
-
capture(options?: CaptureOptions): Promise<PageSnapshot>;
|
|
36
|
-
}
|
|
37
|
-
```
|
|
38
|
-
|
|
39
|
-
### CaptureOptions (core)
|
|
40
|
-
|
|
41
|
-
````ts
|
|
42
|
-
interface CaptureOptions {
|
|
43
|
-
interceptor: NetworkInterceptorAdapter;
|
|
44
|
-
completion?: CompletionStrategy | CompletionStrategy[];
|
|
45
|
-
filter?: ResourceFilter;
|
|
46
|
-
blacklist?: RegExp[];
|
|
47
|
-
pathResolver?: PathResolver;
|
|
48
|
-
contentStore?: ContentStore;
|
|
49
|
-
rewriteEntry?: boolean;
|
|
50
|
-
rewriteCSS?: boolean;
|
|
51
|
-
limits?: {
|
|
52
|
-
maxTotalBytes?: number;
|
|
53
|
-
maxSingleResourceBytes?: number;
|
|
54
|
-
maxResources?: number;
|
|
55
|
-
};
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
type NetworkEventStream = AsyncIterable<NetworkEvent>;
|
|
59
|
-
````
|
|
60
|
-
### Built-in blacklist
|
|
61
|
-
|
|
62
|
-
```ts
|
|
63
|
-
import { PagePocket, gaBlacklist } from "@pagepocket/lib";
|
|
64
|
-
|
|
65
|
-
const snapshot = await PagePocket.fromURL("https://example.com").capture({
|
|
66
|
-
interceptor,
|
|
67
|
-
blacklist: gaBlacklist.ga
|
|
68
|
-
});
|
|
69
|
-
````
|
|
70
|
-
|
|
71
|
-
You can combine multiple built-in lists:
|
|
72
|
-
|
|
73
|
-
```ts
|
|
74
|
-
import { PagePocket, builtinBlacklist } from "@pagepocket/lib";
|
|
75
|
-
|
|
76
|
-
const snapshot = await PagePocket.fromURL("https://example.com").capture({
|
|
77
|
-
interceptor,
|
|
78
|
-
blacklist: [...builtinBlacklist.ga]
|
|
79
|
-
});
|
|
80
|
-
```
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
### PageSnapshot output
|
|
85
|
-
|
|
86
|
-
```ts
|
|
87
|
-
interface PageSnapshot {
|
|
88
|
-
version: "1.0";
|
|
89
|
-
createdAt: number;
|
|
90
|
-
url: string;
|
|
91
|
-
entry: string;
|
|
92
|
-
files: SnapshotFile[];
|
|
93
|
-
toDirectory(outDir: string, options?: WriteFSOptions): Promise<WriteResult>;
|
|
94
|
-
toZip(options?: ZipOptions): Promise<ZipResult>;
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
interface WriteFSOptions {
|
|
98
|
-
clearCache?: boolean;
|
|
99
|
-
overwrite?: boolean;
|
|
100
|
-
suffix?: string;
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
interface ZipOptions {
|
|
104
|
-
asBlob?: boolean;
|
|
105
|
-
clearCache?: boolean;
|
|
106
|
-
overwrite?: boolean;
|
|
107
|
-
suffix?: string;
|
|
108
|
-
outputPath?: string;
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
interface ZipWriteResult {
|
|
112
|
-
data: Uint8Array | Blob;
|
|
113
|
-
outputPath: string;
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
type ZipResult = Uint8Array | Blob | ZipWriteResult;
|
|
117
|
-
````
|
|
118
|
-
|
|
119
|
-
Snapshot layout:
|
|
120
|
-
|
|
121
|
-
```
|
|
122
|
-
/index.html
|
|
123
|
-
/api.json
|
|
124
|
-
/<same-origin paths>
|
|
125
|
-
/external_resources/<cross-origin paths>
|
|
126
|
-
```
|
|
127
|
-
|
|
128
|
-
If multiple documents are captured, each document is written to its own output
|
|
129
|
-
directory based on the document URL path (e.g. `foo/bar/index.html`).
|
|
130
|
-
|
|
131
|
-
## Notes
|
|
132
|
-
|
|
133
|
-
- Uses `@pagepocket/uni-fs` for file IO so it works in Node and OPFS contexts.
|
|
134
|
-
- Network data comes only from the interceptor events.
|
|
135
|
-
|
|
136
|
-
## HTML element replacement (replaceElements)
|
|
137
|
-
|
|
138
|
-
During capture, `@pagepocket/lib` rewrites the captured HTML (the raw Document response body) before writing the final snapshot HTML.
|
|
139
|
-
|
|
140
|
-
`replaceElements` lets you declaratively or programmatically replace parts of that HTML.
|
|
141
|
-
|
|
142
|
-
Typical use-cases:
|
|
143
|
-
|
|
144
|
-
- Replace embedded players/widgets with a placeholder (e.g. YouTube iframe → `<div>Player</div>`)
|
|
145
|
-
- Rename tags at scale (e.g. all `<div>` → `<p>`)
|
|
146
|
-
- Replace a specific element (e.g. `<div id="foo">` → `<span>hello</span>`)
|
|
147
|
-
|
|
148
|
-
### Where this runs
|
|
149
|
-
|
|
150
|
-
`replaceElements` runs during the **HTML rewrite stage (Cheerio)**, not on a live browser DOM.
|
|
151
|
-
|
|
152
|
-
- Input is the captured HTML string.
|
|
153
|
-
- You can match using CSS selectors.
|
|
154
|
-
- Function rules receive a Cheerio element context.
|
|
155
|
-
|
|
156
|
-
If you need to modify the live DOM after scripts run, that is a different capture mode and is not supported by this option.
|
|
157
|
-
|
|
158
|
-
### Configuration
|
|
159
|
-
|
|
160
|
-
Add `replaceElements` to `CaptureOptions`:
|
|
161
|
-
|
|
162
|
-
```ts
|
|
163
|
-
import type {
|
|
164
|
-
CaptureOptions,
|
|
165
|
-
ReplaceElementsConfig,
|
|
166
|
-
ReplaceAction,
|
|
167
|
-
ReplaceElementContext,
|
|
168
|
-
ReplaceElementRule
|
|
169
|
-
} from "@pagepocket/lib";
|
|
170
|
-
|
|
171
|
-
const replaceElements: ReplaceElementsConfig = [
|
|
172
|
-
{
|
|
173
|
-
name: "replace-youtube-embed",
|
|
174
|
-
match: 'iframe[src*="youtube.com/embed"]',
|
|
175
|
-
replace: { type: "replaceWithHtml", html: "<div>Player</div>" }
|
|
176
|
-
}
|
|
177
|
-
];
|
|
178
|
-
|
|
179
|
-
const options: CaptureOptions = {
|
|
180
|
-
interceptor,
|
|
181
|
-
replaceElements
|
|
182
|
-
};
|
|
183
|
-
```
|
|
184
|
-
|
|
185
|
-
### Data structure
|
|
186
|
-
|
|
187
|
-
`replaceElements` is an array of items. Each item can be:
|
|
188
|
-
|
|
189
|
-
1. A **rule object** (declarative)
|
|
190
|
-
2. A **function** (imperative; convenience form)
|
|
191
|
-
3. A **function rule with query** (imperative; recommended for performance)
|
|
192
|
-
|
|
193
|
-
```ts
|
|
194
|
-
export type ReplaceElementsConfig = Array<
|
|
195
|
-
ReplaceElementRule | ReplaceElementFn | ReplaceElementFnWithQuery
|
|
196
|
-
>;
|
|
197
|
-
|
|
198
|
-
export interface ReplaceElementRule {
|
|
199
|
-
name?: string;
|
|
200
|
-
match: MatchQuery;
|
|
201
|
-
replace: ReplaceAction;
|
|
202
|
-
apply?: ApplyOptions;
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
export type ReplaceElementFn = (
|
|
206
|
-
ctx: ReplaceElementContext
|
|
207
|
-
) => void | ReplaceAction | ReplaceAction[] | Promise<void | ReplaceAction | ReplaceAction[]>;
|
|
208
|
-
|
|
209
|
-
export interface ReplaceElementFnWithQuery {
|
|
210
|
-
name?: string;
|
|
211
|
-
query: string;
|
|
212
|
-
run: ReplaceElementFn;
|
|
213
|
-
apply?: ApplyOptions;
|
|
214
|
-
}
|
|
215
|
-
```
|
|
216
|
-
|
|
217
|
-
#### MatchQuery
|
|
218
|
-
|
|
219
|
-
CSS selectors (as supported by Cheerio) are the primary, first-class way to match elements.
|
|
220
|
-
|
|
221
|
-
```ts
|
|
222
|
-
export type MatchQuery =
|
|
223
|
-
| string
|
|
224
|
-
| {
|
|
225
|
-
selector?: string;
|
|
226
|
-
tagName?: string;
|
|
227
|
-
id?: string;
|
|
228
|
-
attrs?: Record<string, string | RegExp | true>;
|
|
229
|
-
};
|
|
230
|
-
```
|
|
231
|
-
|
|
232
|
-
#### ReplaceAction
|
|
233
|
-
|
|
234
|
-
```ts
|
|
235
|
-
export type ReplaceAction =
|
|
236
|
-
| { type: "replaceWithHtml"; html: string }
|
|
237
|
-
| {
|
|
238
|
-
type: "replaceWithElement";
|
|
239
|
-
tagName: string;
|
|
240
|
-
textContent?: string;
|
|
241
|
-
html?: string;
|
|
242
|
-
attrs?: Record<string, string | null>;
|
|
243
|
-
}
|
|
244
|
-
| {
|
|
245
|
-
type: "renameTag";
|
|
246
|
-
to: string;
|
|
247
|
-
keepAttributes?: boolean;
|
|
248
|
-
keepChildren?: boolean;
|
|
249
|
-
}
|
|
250
|
-
| { type: "remove" };
|
|
251
|
-
```
|
|
252
|
-
|
|
253
|
-
#### ApplyOptions
|
|
254
|
-
|
|
255
|
-
```ts
|
|
256
|
-
export interface ApplyOptions {
|
|
257
|
-
/** default: "document" */
|
|
258
|
-
scope?: "document" | "allFrames";
|
|
259
|
-
/** default: "all" */
|
|
260
|
-
limit?: number | "all";
|
|
261
|
-
/** default: "stop" */
|
|
262
|
-
onReplaced?: "stop" | "continue";
|
|
263
|
-
}
|
|
264
|
-
```
|
|
265
|
-
|
|
266
|
-
Notes:
|
|
267
|
-
|
|
268
|
-
- `scope: "allFrames"` applies to multiple captured documents (if your interceptor captures frames / subdocuments). Cross-origin frames may not be available depending on capture.
|
|
269
|
-
- `limit` limits how many matched elements are replaced for that rule.
|
|
270
|
-
|
|
271
|
-
### Semantics
|
|
272
|
-
|
|
273
|
-
- Rules run **in array order**.
|
|
274
|
-
- Each rule matches against the **current HTML state** (earlier replacements affect later matches).
|
|
275
|
-
- Default `onReplaced: "stop"` makes results predictable: once an element is replaced by a rule, later rules won't attempt to re-process that same original element.
|
|
276
|
-
|
|
277
|
-
### Examples
|
|
278
|
-
|
|
279
|
-
Replace YouTube iframe embeds with a placeholder:
|
|
280
|
-
|
|
281
|
-
```ts
|
|
282
|
-
replaceElements: [
|
|
283
|
-
{
|
|
284
|
-
match: 'iframe[src*="youtube.com/embed"]',
|
|
285
|
-
replace: { type: "replaceWithHtml", html: "<div>Player</div>" }
|
|
286
|
-
}
|
|
287
|
-
];
|
|
288
|
-
```
|
|
289
|
-
|
|
290
|
-
Rename all `div` tags to `p`:
|
|
291
|
-
|
|
292
|
-
```ts
|
|
293
|
-
replaceElements: [
|
|
294
|
-
{
|
|
295
|
-
match: "div",
|
|
296
|
-
replace: { type: "renameTag", to: "p", keepAttributes: true, keepChildren: true }
|
|
297
|
-
}
|
|
298
|
-
];
|
|
299
|
-
```
|
|
300
|
-
|
|
301
|
-
Replace `<div id="foo">` with `<span>hello</span>`:
|
|
302
|
-
|
|
303
|
-
```ts
|
|
304
|
-
replaceElements: [
|
|
305
|
-
{
|
|
306
|
-
match: "div#foo",
|
|
307
|
-
replace: { type: "replaceWithElement", tagName: "span", textContent: "hello" }
|
|
308
|
-
}
|
|
309
|
-
];
|
|
310
|
-
```
|
|
311
|
-
|
|
312
|
-
Complex matching via function (recommended form with `query`):
|
|
313
|
-
|
|
314
|
-
```ts
|
|
315
|
-
replaceElements: [
|
|
316
|
-
{
|
|
317
|
-
query: "iframe",
|
|
318
|
-
run: (ctx) => {
|
|
319
|
-
const src = ctx.$el.attr("src") || "";
|
|
320
|
-
if (!/youtube\.com\/embed/.test(src)) return;
|
|
321
|
-
|
|
322
|
-
return { type: "replaceWithHtml", html: "<div>Player</div>" };
|
|
323
|
-
}
|
|
324
|
-
}
|
|
325
|
-
];
|
|
326
|
-
```
|
|
327
|
-
|
|
328
|
-
### Function context
|
|
329
|
-
|
|
330
|
-
Function rules receive a Cheerio-aware context:
|
|
331
|
-
|
|
332
|
-
```ts
|
|
333
|
-
export interface ReplaceElementContext {
|
|
334
|
-
/** Cheerio root ($) for the current document */
|
|
335
|
-
$: CheerioAPI;
|
|
336
|
-
/** Cheerio wrapper for the matched element */
|
|
337
|
-
$el: Cheerio<any>;
|
|
338
|
-
|
|
339
|
-
/** Document URL being rewritten */
|
|
340
|
-
url: string;
|
|
341
|
-
/** Entry URL (top-level) */
|
|
342
|
-
entryUrl: string;
|
|
343
|
-
|
|
344
|
-
ruleIndex: number;
|
|
345
|
-
matchIndex: number;
|
|
346
|
-
}
|
|
347
|
-
```
|
|
348
|
-
|
|
349
|
-
### Manual validation
|
|
350
|
-
|
|
351
|
-
Run the CLI against a page containing the target elements:
|
|
352
|
-
|
|
353
|
-
```bash
|
|
354
|
-
pnpm -F @pagepocket/cli start -- https://example.com
|
|
355
|
-
```
|
|
356
|
-
|
|
357
|
-
Then inspect the output `*.html` for the expected replacements.
|