@pagepocket/lib 0.6.3 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +265 -3
- package/dist/builtin-blacklist.d.ts +3 -0
- package/dist/builtin-blacklist.js +6 -0
- package/dist/debug.d.ts +2 -0
- package/dist/debug.js +18 -0
- package/dist/hackers/index.js +2 -0
- package/dist/hackers/replay-css-proxy.d.ts +2 -0
- package/dist/hackers/replay-css-proxy.js +206 -0
- package/dist/hackers/replay-dom-rewrite.js +57 -26
- package/dist/index.d.ts +3 -1
- package/dist/index.js +18 -1
- package/dist/inflight-tracker.d.ts +19 -0
- package/dist/inflight-tracker.js +48 -0
- package/dist/pagepocket.d.ts +3 -1
- package/dist/pagepocket.js +150 -35
- package/dist/replace-elements.d.ts +9 -0
- package/dist/replace-elements.js +258 -0
- package/dist/replay-script.js +286 -2
- package/dist/resource-proxy.d.ts +34 -0
- package/dist/resource-proxy.js +284 -0
- package/dist/rewrite-links.d.ts +8 -0
- package/dist/rewrite-links.js +8 -0
- package/dist/snapshot-builder.d.ts +2 -1
- package/dist/snapshot-builder.js +51 -1
- package/dist/types.d.ts +88 -1
- package/dist/writers.d.ts +2 -2
- package/dist/writers.js +56 -4
- package/package.json +3 -3
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.resolveToLocalPath = exports.buildResourceProxyIndex = void 0;
|
|
4
|
+
/**
 * Records `value` under `key` in a multi-map whose entries hold either a
 * single value or an array of values.
 *
 * The first value for a key is stored bare, a second value upgrades the entry
 * to a two-element array, and later values are appended to that array.
 *
 * @param {Map<any, any>} map - Multi-map to update in place.
 * @param {any} key - Key to record the value under.
 * @param {any} value - Value to record.
 * @returns {void}
 */
const addMulti = (map, key, value) => {
    // Test for presence with has() rather than truthiness, so a stored falsy
    // value (0, "", false) is accumulated instead of silently overwritten.
    if (!map.has(key)) {
        map.set(key, value);
        return;
    }
    const existing = map.get(key);
    if (Array.isArray(existing)) {
        existing.push(value);
        return;
    }
    map.set(key, [existing, value]);
};
|
|
16
|
+
/**
 * Normalizes a multi-map entry to an array.
 *
 * @param {any} value - A missing entry (null/undefined), a single value, or
 *   an array of values.
 * @returns {any[]} An empty array for a missing entry, the array itself when
 *   `value` is already an array, otherwise a one-element array.
 */
const toArray = (value) => {
    // Only null/undefined mean "no entry"; other falsy values (0, "", false)
    // are real values and must not be dropped.
    if (value == null) {
        return [];
    }
    return Array.isArray(value) ? value : [value];
};
|
|
22
|
+
/**
 * Drops everything from the first "#" onward.
 *
 * @param {string} value - URL or path text.
 * @returns {string} The text before the first "#", or the whole input when it
 *   contains no "#".
 */
const stripHash = (value) => {
    const [beforeHash] = value.split("#");
    return beforeHash;
};
|
|
26
|
+
/**
 * Removes a single trailing "/" from a path.
 *
 * Falsy input and the root path "/" are returned unchanged.
 *
 * @param {string} value - Path to trim.
 * @returns {string} The path without one trailing slash.
 */
const stripTrailingSlash = (value) => {
    const removable = Boolean(value) && value !== "/" && value.endsWith("/");
    return removable ? value.slice(0, -1) : value;
};
|
|
32
|
+
/**
 * Heuristic check for a path whose "%" characters were already escaped once
 * ("%" -> "%25") to survive static-server decoding.
 *
 * Deliberately conservative, because double-escaping breaks lookups.
 *
 * @param {string} value - Path to inspect.
 * @returns {boolean} True when the path contains a "%25XX" sequence, where XX
 *   are two hex digits.
 */
const looksAlreadyEscapedForStaticServers = (value) => {
    const escapedPercentPattern = /%25[0-9a-fA-F]{2}/;
    return escapedPercentPattern.test(value);
};
|
|
39
|
+
/**
 * Escapes every "%" in a path as "%25", unless the path already looks escaped.
 *
 * Falsy input, and input that looksAlreadyEscapedForStaticServers() accepts,
 * is returned unchanged so a path is never escaped twice.
 *
 * @param {string} value - Local path to escape.
 * @returns {string} The escaped path, or the input unchanged.
 */
const escapePercentForStaticServersOnce = (value) => {
    const needsEscaping = Boolean(value) && !looksAlreadyEscapedForStaticServers(value);
    return needsEscaping ? value.replace(/%/g, "%25") : value;
};
|
|
48
|
+
/**
 * Loose hostname check, used only as a guard for embedded-URL detection.
 *
 * @param {string} value - Candidate host segment.
 * @returns {boolean} True for "localhost" or any non-empty text containing a
 *   ".", otherwise false.
 */
const isLikelyHostname = (value) => Boolean(value) && (value === "localhost" || value.includes("."));
|
|
56
|
+
/**
 * Produces an alternate pathname in which an embedded absolute URL tail is
 * collapsed into a single encodeURIComponent-encoded segment.
 *
 * Some CDNs embed a full absolute URL into one path segment using
 * encodeURIComponent (e.g. ".../https%3A%2F%2Fexample.com%2Fa.png"), while
 * other runtimes may request the decoded form in-path (e.g.
 * ".../https://example.com/a.png"), which changes the path segments. This
 * helper converts the decoded form back to the encoded one.
 *
 * @param {string} pathname - Request pathname to inspect.
 * @returns {string|null} The rebuilt pathname (always starting with "/"), or
 *   null when no embedded "http(s)://host" tail is found.
 */
const encodeEmbeddedUrlTailIfPresent = (pathname) => {
    const text = String(pathname || "");
    if (text.indexOf("/http") === -1) {
        return null;
    }
    const segments = text.split("/");
    // A real absolute URL in-path is typically split like:
    // ["...", "https:", "", "example.com", "a", "b.png"]
    const schemeAt = segments.findIndex((segment, position) => {
        const isScheme = segment === "http:" || segment === "https:";
        return (isScheme &&
            segments[position + 1] === "" &&
            isLikelyHostname(segments[position + 2] || ""));
    });
    if (schemeAt === -1) {
        return null;
    }
    const embeddedUrl = segments[schemeAt] + "//" + segments.slice(schemeAt + 2).join("/");
    const leading = segments.slice(0, schemeAt);
    const joined = [...leading, encodeURIComponent(embeddedUrl)].join("/") || "/";
    return joined.startsWith("/") ? joined : "/" + joined;
};
|
|
91
|
+
/**
 * Lists the distinct pathname spellings to try when looking up a request.
 *
 * In order: the pathname itself, the pathname without a trailing slash, and,
 * when an embedded absolute URL tail is detected, the encoded-tail form with
 * and without a trailing slash. Falsy entries and duplicates are dropped.
 *
 * @param {string} pathname - Request pathname.
 * @returns {string[]} Unique variants in first-seen order.
 */
const makePathnameVariants = (pathname) => {
    const candidates = [pathname, stripTrailingSlash(pathname)];
    const collapsed = encodeEmbeddedUrlTailIfPresent(pathname);
    if (collapsed && collapsed !== pathname) {
        candidates.push(collapsed, stripTrailingSlash(collapsed));
    }
    // A Set keeps insertion order, so the first occurrence of each variant wins.
    return [...new Set(candidates.filter(Boolean))];
};
|
|
107
|
+
/**
 * Returns the last non-empty "/"-separated segment of a path, ignoring
 * anything from the first "?" onward.
 *
 * @param {string} pathname - Path, optionally followed by a query string.
 * @returns {string} The final segment, or "" when the path has none.
 */
const getBasename = (pathname) => {
    const [beforeQuery = ""] = pathname.split("?");
    const segments = beforeQuery.split("/").filter((segment) => segment.length > 0);
    return segments.length > 0 ? segments[segments.length - 1] : "";
};
|
|
112
|
+
/**
 * Parses an absolute URL without throwing.
 *
 * @param {string} value - Text to parse with the URL constructor.
 * @returns {URL|null} The parsed URL, or null when the constructor throws.
 */
const toUrlOrNull = (value) => {
    let parsed = null;
    try {
        parsed = new URL(value);
    }
    catch {
        // Unparseable input is reported as null instead of being thrown.
    }
    return parsed;
};
|
|
120
|
+
/**
 * Builds the lookup tables used to map a request URL to a captured resource.
 *
 * Items lacking `url` or `path`, or whose `url` does not parse as an absolute
 * URL, are skipped. Each remaining item is copied and extended with `parsed`,
 * `pathname`, `pathnameWithSearch` and `basename`, then indexed four ways.
 *
 * @param {{ items?: Array<{ url: string, path: string }> }} snapshot - Object
 *   whose `items` list the captured resources.
 * @returns {{ byExactUrl: Map, byPathnameWithSearch: Map, byPathname: Map, byBasename: Map }}
 *   `byExactUrl` holds one item per serialized URL (first seen wins); the
 *   other maps are multi-maps holding a single item or an array of items.
 */
const buildResourceProxyIndex = (snapshot) => {
    const index = {
        byExactUrl: new Map(),
        byPathnameWithSearch: new Map(),
        byPathname: new Map(),
        byBasename: new Map()
    };
    for (const entry of snapshot.items || []) {
        const parsed = entry && entry.url && entry.path ? toUrlOrNull(entry.url) : null;
        if (!parsed) {
            continue;
        }
        const pathname = parsed.pathname || "/";
        const pathnameWithSearch = pathname + (parsed.search || "");
        const basename = getBasename(pathname);
        const indexed = { ...entry, parsed, pathname, pathnameWithSearch, basename };
        // Prefer the first-seen item for an exact URL.
        const exactKey = parsed.toString();
        if (!index.byExactUrl.has(exactKey)) {
            index.byExactUrl.set(exactKey, indexed);
        }
        addMulti(index.byPathnameWithSearch, pathnameWithSearch, indexed);
        addMulti(index.byPathname, pathname, indexed);
        if (basename) {
            addMulti(index.byBasename, basename, indexed);
        }
    }
    return index;
};
|
|
160
|
+
exports.buildResourceProxyIndex = buildResourceProxyIndex;
|
|
161
|
+
/**
 * Removes items whose `path` duplicates that of an earlier item.
 *
 * @param {Iterable<{ path: string }>} items - Candidate items.
 * @returns {Array<{ path: string }>} The first item seen for each distinct
 *   `path`, in original order.
 */
const uniqByPath = (items) => {
    const seenPaths = new Set();
    return [...items].filter((candidate) => {
        if (seenPaths.has(candidate.path)) {
            return false;
        }
        seenPaths.add(candidate.path);
        return true;
    });
};
|
|
172
|
+
/**
 * Picks one item from a list of candidates, or null when the choice is
 * ambiguous.
 *
 * With zero or one candidate the result is that candidate (or null). With
 * several, the result is the only candidate whose origin equals the origin of
 * `baseUrl`. If `baseUrl` does not parse, or zero or several candidates share
 * its origin, the result is null.
 *
 * @param {Array<{ parsed: URL }>} items - Candidates carrying a parsed URL.
 * @param {string} baseUrl - URL whose origin is used as the tie-breaker.
 * @param {number} suffixLength - Strength of the match key. Currently unused:
 *   an ambiguous match is rejected whatever its strength. Kept so callers
 *   need not change.
 * @returns {object|null} The chosen candidate, or null.
 */
const preferSingle = (items, baseUrl, suffixLength) => {
    if (items.length <= 1) {
        return items[0] ?? null;
    }
    let baseOrigin = null;
    try {
        baseOrigin = new URL(baseUrl).origin;
    }
    catch {
        // An unparseable base URL simply disables the same-origin tie-breaker.
        baseOrigin = null;
    }
    if (baseOrigin !== null) {
        const sameOrigin = items.filter((candidate) => candidate.parsed.origin === baseOrigin);
        if (sameOrigin.length === 1) {
            return sameOrigin[0];
        }
    }
    // Still ambiguous. The earlier `suffixLength < 2` check returned null on
    // both paths, so it is dropped and the conservative outcome stated once.
    return null;
};
|
|
200
|
+
/**
 * Reduces a list of candidates to a single match, if possible.
 *
 * Candidates are first de-duplicated by `path`. Zero candidates give null, a
 * single candidate is returned as is, and several are handed to
 * preferSingle() to break the tie.
 *
 * @param {Array<object>} items - Candidate items from an index lookup.
 * @param {string} baseUrl - Passed through to preferSingle().
 * @param {number} suffixLength - Passed through to preferSingle().
 * @returns {object|null} The selected item, or null.
 */
const tryCandidates = (items, baseUrl, suffixLength) => {
    const distinct = uniqByPath(items);
    switch (distinct.length) {
        case 0:
            return null;
        case 1:
            return distinct[0];
        default:
            return preferSingle(distinct, baseUrl, suffixLength);
    }
};
|
|
210
|
+
/**
 * Lists every trailing sub-path of a pathname, longest first.
 *
 * @param {string} pathname - Path to split on "/"; empty segments are ignored.
 * @returns {Array<{ key: string, depth: number }>} One entry per starting
 *   segment, where `key` is "/" plus the remaining segments and `depth` is
 *   the number of segments in `key`.
 */
const makeSuffixes = (pathname) => {
    const segments = pathname.split("/").filter(Boolean);
    return segments.map((_, start) => ({
        key: "/" + segments.slice(start).join("/"),
        depth: segments.length - start
    }));
};
|
|
219
|
+
/**
 * Maps a request URL to the local path of a captured resource.
 *
 * Lookups run from strongest to weakest and the first match wins:
 *   1. exact serialized URL, then the same URL without its "#fragment";
 *   2. pathname + query string, for every pathname variant;
 *   3. pathname only, for every pathname variant;
 *   4. progressively shorter trailing sub-paths of every variant;
 *   5. basename only.
 * Apart from the exact-URL lookups, ambiguous results are settled by
 * tryCandidates(). The returned path has "%" escaped once via
 * escapePercentForStaticServersOnce().
 *
 * @param {{ requestUrl: string, baseUrl: string, index: object }} options -
 *   `requestUrl` is resolved against `baseUrl`; `index` is the object
 *   returned by buildResourceProxyIndex().
 * @returns {string|undefined} The local path, or undefined when `requestUrl`
 *   is empty, cannot be resolved to a URL, or nothing matches.
 */
const resolveToLocalPath = (options) => {
    const { requestUrl, baseUrl, index } = options;
    if (!requestUrl) {
        return undefined;
    }
    let absolute;
    try {
        absolute = new URL(requestUrl, baseUrl);
    }
    catch {
        return undefined;
    }
    const toLocal = (item) => escapePercentForStaticServersOnce(item.path);
    const lookup = (table, key, strength) => tryCandidates(toArray(table.get(key)), baseUrl, strength);
    // 1. Exact URL, with and without the fragment.
    const href = absolute.toString();
    const exactHit = index.byExactUrl.get(href);
    if (exactHit) {
        return toLocal(exactHit);
    }
    const hashless = stripHash(href);
    if (hashless !== href) {
        const hashlessHit = index.byExactUrl.get(hashless);
        if (hashlessHit) {
            return toLocal(hashlessHit);
        }
    }
    const pathname = absolute.pathname || "/";
    const variants = makePathnameVariants(pathname);
    const query = absolute.search || "";
    // 2. Pathname plus query string.
    for (const variant of variants) {
        const hit = lookup(index.byPathnameWithSearch, variant + query, 99);
        if (hit) {
            return toLocal(hit);
        }
    }
    // 3. Pathname only.
    for (const variant of variants) {
        const hit = lookup(index.byPathname, variant, 99);
        if (hit) {
            return toLocal(hit);
        }
    }
    // 4. Suffix fallback: progressively remove leading segments.
    for (const variant of variants) {
        for (const { key, depth } of makeSuffixes(variant)) {
            const hit = lookup(index.byPathname, key, depth);
            if (hit) {
                return toLocal(hit);
            }
        }
    }
    // 5. Basename only, the weakest key.
    const basename = getBasename(pathname);
    if (basename) {
        const hit = lookup(index.byBasename, basename, 1);
        if (hit) {
            return toLocal(hit);
        }
    }
    return undefined;
};
|
|
284
|
+
exports.resolveToLocalPath = resolveToLocalPath;
|
package/dist/rewrite-links.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import type { ReplaceElementsConfig } from "./types";
|
|
1
2
|
type UrlResolver = (absoluteUrl: string) => string | null;
|
|
2
3
|
export declare const rewriteJsText: (source: string, resolve: UrlResolver, baseUrl: string) => Promise<string>;
|
|
3
4
|
export declare const rewriteEntryHtml: (input: {
|
|
@@ -6,6 +7,13 @@ export declare const rewriteEntryHtml: (input: {
|
|
|
6
7
|
apiPath: string;
|
|
7
8
|
resolve: UrlResolver;
|
|
8
9
|
rewriteLinks?: boolean;
|
|
10
|
+
replaceElements?: ReplaceElementsConfig;
|
|
11
|
+
isEntryDocument?: boolean;
|
|
12
|
+
/**
|
|
13
|
+
* The top-level entry URL for this snapshot (may differ from entryUrl when
|
|
14
|
+
* multiple documents/frames are captured).
|
|
15
|
+
*/
|
|
16
|
+
snapshotEntryUrl?: string;
|
|
9
17
|
}) => Promise<{
|
|
10
18
|
html: string;
|
|
11
19
|
title?: string;
|
package/dist/rewrite-links.js
CHANGED
|
@@ -37,6 +37,7 @@ exports.rewriteEntryHtml = exports.rewriteJsText = void 0;
|
|
|
37
37
|
const cheerio = __importStar(require("cheerio"));
|
|
38
38
|
const css_rewrite_1 = require("./css-rewrite");
|
|
39
39
|
const hack_html_1 = require("./hack-html");
|
|
40
|
+
const replace_elements_1 = require("./replace-elements");
|
|
40
41
|
const shouldSkipValue = (value) => {
|
|
41
42
|
const trimmed = value.trim();
|
|
42
43
|
return (!trimmed ||
|
|
@@ -347,6 +348,13 @@ const rewriteEntryHtml = async (input) => {
|
|
|
347
348
|
baseUrl: baseUrl,
|
|
348
349
|
apiPath: input.apiPath
|
|
349
350
|
});
|
|
351
|
+
await (0, replace_elements_1.applyReplaceElements)({
|
|
352
|
+
$,
|
|
353
|
+
entryUrl: input.snapshotEntryUrl ?? baseUrl,
|
|
354
|
+
url: baseUrl,
|
|
355
|
+
replaceElements: input.replaceElements,
|
|
356
|
+
isEntryDocument: input.isEntryDocument ?? true
|
|
357
|
+
});
|
|
350
358
|
const title = $("title").first().text() || undefined;
|
|
351
359
|
return { html: $.html(), title };
|
|
352
360
|
};
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { ApiEntry, StoredResource } from "./network-store";
|
|
2
|
-
import type { ContentStore, PageSnapshot, PathResolver } from "./types";
|
|
2
|
+
import type { ContentStore, PageSnapshot, PathResolver, ReplaceElementsConfig } from "./types";
|
|
3
3
|
type BuildOptions = {
|
|
4
4
|
entryUrl: string;
|
|
5
5
|
createdAt: number;
|
|
@@ -9,6 +9,7 @@ type BuildOptions = {
|
|
|
9
9
|
pathResolver?: PathResolver;
|
|
10
10
|
rewriteEntry: boolean;
|
|
11
11
|
rewriteCSS: boolean;
|
|
12
|
+
replaceElements?: ReplaceElementsConfig;
|
|
12
13
|
warnings: string[];
|
|
13
14
|
};
|
|
14
15
|
export declare const buildSnapshot: (input: BuildOptions) => Promise<PageSnapshot>;
|
package/dist/snapshot-builder.js
CHANGED
|
@@ -131,6 +131,42 @@ const buildApiSnapshot = (url, createdAt, entries) => ({
|
|
|
131
131
|
createdAt,
|
|
132
132
|
records: entries.map((entry) => entry.record)
|
|
133
133
|
});
|
|
134
|
+
/**
 * Builds the URL-to-local-path listing for a snapshot's static resources.
 *
 * A file is listed only when it has an `originalUrl`, is not an HTML document
 * (`resourceType === "document"`), is not a snapshot-local pseudo URL (one
 * starting with "/"), and its `originalUrl` parses as an absolute URL.
 *
 * @param {number} createdAt - Copied verbatim into the result.
 * @param {Iterable<object>} files - Snapshot files to inspect.
 * @returns {{ version: string, createdAt: number, items: Array<object> }}
 *   Listing whose items carry `url`, `path`, `resourceType`, `mimeType` and
 *   `size`.
 */
const buildResourcesPathSnapshot = (createdAt, files) => {
    // Be defensive: only valid absolute URLs are listed.
    const isAbsoluteUrl = (candidate) => {
        try {
            new URL(candidate);
            return true;
        }
        catch {
            return false;
        }
    };
    const items = [...files]
        .filter((file) => {
        const { originalUrl, resourceType } = file;
        return (Boolean(originalUrl) &&
            resourceType !== "document" &&
            !originalUrl.startsWith("/") &&
            isAbsoluteUrl(originalUrl));
    })
        .map((file) => ({
        url: file.originalUrl,
        path: file.path,
        resourceType: file.resourceType,
        mimeType: file.mimeType,
        size: file.size
    }));
    return { version: "1.0", createdAt, items };
};
|
|
134
170
|
const buildSnapshot = async (input) => {
|
|
135
171
|
const warnings = input.warnings;
|
|
136
172
|
const groups = groupResources({
|
|
@@ -184,7 +220,10 @@ const buildSnapshot = async (input) => {
|
|
|
184
220
|
entryUrl: group.url,
|
|
185
221
|
apiPath,
|
|
186
222
|
resolve,
|
|
187
|
-
rewriteLinks: input.rewriteEntry
|
|
223
|
+
rewriteLinks: input.rewriteEntry,
|
|
224
|
+
replaceElements: input.replaceElements,
|
|
225
|
+
isEntryDocument: group.url === input.entryUrl,
|
|
226
|
+
snapshotEntryUrl: input.entryUrl
|
|
188
227
|
});
|
|
189
228
|
html = rewritten.html;
|
|
190
229
|
if (!title) {
|
|
@@ -276,6 +315,17 @@ const buildSnapshot = async (input) => {
|
|
|
276
315
|
originalUrl: apiPath
|
|
277
316
|
});
|
|
278
317
|
}
|
|
318
|
+
{
|
|
319
|
+
const resourcesPath = buildResourcesPathSnapshot(input.createdAt, files);
|
|
320
|
+
const bytes = new TextEncoder().encode(JSON.stringify(resourcesPath, null, 2));
|
|
321
|
+
// Snapshot-local artifact. It intentionally has no originalUrl.
|
|
322
|
+
files.push({
|
|
323
|
+
path: "/resources_path.json",
|
|
324
|
+
mimeType: "application/json",
|
|
325
|
+
size: bytes.byteLength,
|
|
326
|
+
source: { kind: "memory", data: bytes }
|
|
327
|
+
});
|
|
328
|
+
}
|
|
279
329
|
const totalBytes = files.reduce((sum, file) => sum + (file.size ?? 0), 0);
|
|
280
330
|
const totalFiles = files.length;
|
|
281
331
|
const snapshotUrl = input.entryUrl || groups[0]?.url || "";
|
package/dist/types.d.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import type { BodySource, NetworkInterceptorAdapter, NetworkRequestEvent, NetworkResponseEvent, ResourceType } from "@pagepocket/interceptor";
|
|
2
|
+
import type { Cheerio, CheerioAPI } from "cheerio";
|
|
2
3
|
export type { BodySource, InterceptOptions, InterceptSession, InterceptTarget, InterceptorActions, InterceptorCapabilities, NavigateOptions, NetworkEvent, NetworkEventHandlers, NetworkInterceptorAdapter, NetworkRequestEvent, NetworkRequestFailedEvent, NetworkResponseEvent, ResourceType, TriggerAction } from "@pagepocket/interceptor";
|
|
4
|
+
import type { NetworkEvent } from "@pagepocket/interceptor";
|
|
3
5
|
export interface PathResolver {
|
|
4
6
|
resolve(input: {
|
|
5
7
|
url: string;
|
|
@@ -47,20 +49,94 @@ export interface CompletionStrategy {
|
|
|
47
49
|
}
|
|
48
50
|
export interface PagePocketOptions {
|
|
49
51
|
}
|
|
52
|
+
export type NetworkEventStream = AsyncIterable<NetworkEvent>;
|
|
50
53
|
export interface CaptureOptions {
|
|
51
54
|
interceptor: NetworkInterceptorAdapter;
|
|
52
55
|
completion?: CompletionStrategy | CompletionStrategy[];
|
|
56
|
+
/**
|
|
57
|
+
* Network idle duration (ms) used to determine capture completion.
|
|
58
|
+
*
|
|
59
|
+
* If `completion` is not provided, PagePocket will wait until the network has
|
|
60
|
+
* been idle (no inflight requests) for this duration.
|
|
61
|
+
*
|
|
62
|
+
* Note: this is NOT a wall-clock timeout from capture start.
|
|
63
|
+
*/
|
|
64
|
+
timeoutMs?: number;
|
|
65
|
+
/**
|
|
66
|
+
* Hard wall-clock limit (ms) for the overall capture session.
|
|
67
|
+
*
|
|
68
|
+
* When `completion` is not provided, PagePocket will stop after either:
|
|
69
|
+
* - network has been idle for `timeoutMs`, OR
|
|
70
|
+
* - `maxDurationMs` has elapsed.
|
|
71
|
+
*/
|
|
72
|
+
maxDurationMs?: number;
|
|
53
73
|
filter?: ResourceFilter;
|
|
54
74
|
pathResolver?: PathResolver;
|
|
55
75
|
contentStore?: ContentStore;
|
|
56
76
|
rewriteEntry?: boolean;
|
|
57
77
|
rewriteCSS?: boolean;
|
|
78
|
+
blacklist?: RegExp[];
|
|
79
|
+
/**
|
|
80
|
+
* Replace parts of the captured HTML (Document response body) during the HTML
|
|
81
|
+
* rewrite stage (Cheerio).
|
|
82
|
+
*/
|
|
83
|
+
replaceElements?: ReplaceElementsConfig;
|
|
58
84
|
limits?: {
|
|
59
85
|
maxTotalBytes?: number;
|
|
60
86
|
maxSingleResourceBytes?: number;
|
|
61
87
|
maxResources?: number;
|
|
62
88
|
};
|
|
63
89
|
}
|
|
90
|
+
export type ReplaceElementsConfig = Array<ReplaceElementRule | ReplaceElementFn | ReplaceElementFnWithQuery>;
|
|
91
|
+
export type MatchQuery = string | {
|
|
92
|
+
selector?: string;
|
|
93
|
+
tagName?: string;
|
|
94
|
+
id?: string;
|
|
95
|
+
attrs?: Record<string, string | RegExp | true>;
|
|
96
|
+
};
|
|
97
|
+
export type ReplaceAction = {
|
|
98
|
+
type: "replaceWithHtml";
|
|
99
|
+
html: string;
|
|
100
|
+
} | {
|
|
101
|
+
type: "replaceWithElement";
|
|
102
|
+
tagName: string;
|
|
103
|
+
textContent?: string;
|
|
104
|
+
html?: string;
|
|
105
|
+
attrs?: Record<string, string | null>;
|
|
106
|
+
} | {
|
|
107
|
+
type: "renameTag";
|
|
108
|
+
to: string;
|
|
109
|
+
keepAttributes?: boolean;
|
|
110
|
+
keepChildren?: boolean;
|
|
111
|
+
} | {
|
|
112
|
+
type: "remove";
|
|
113
|
+
};
|
|
114
|
+
export interface ApplyOptions {
|
|
115
|
+
scope?: "document" | "allFrames";
|
|
116
|
+
limit?: number | "all";
|
|
117
|
+
onReplaced?: "stop" | "continue";
|
|
118
|
+
}
|
|
119
|
+
export interface ReplaceElementRule {
|
|
120
|
+
name?: string;
|
|
121
|
+
match: MatchQuery;
|
|
122
|
+
replace: ReplaceAction;
|
|
123
|
+
apply?: ApplyOptions;
|
|
124
|
+
}
|
|
125
|
+
export interface ReplaceElementContext {
|
|
126
|
+
$: CheerioAPI;
|
|
127
|
+
$el: Cheerio<any>;
|
|
128
|
+
url: string;
|
|
129
|
+
entryUrl: string;
|
|
130
|
+
ruleIndex: number;
|
|
131
|
+
matchIndex: number;
|
|
132
|
+
}
|
|
133
|
+
export type ReplaceElementFn = (ctx: ReplaceElementContext) => void | ReplaceAction | ReplaceAction[] | Promise<void | ReplaceAction | ReplaceAction[]>;
|
|
134
|
+
export interface ReplaceElementFnWithQuery {
|
|
135
|
+
name?: string;
|
|
136
|
+
query: string;
|
|
137
|
+
run: ReplaceElementFn;
|
|
138
|
+
apply?: ApplyOptions;
|
|
139
|
+
}
|
|
64
140
|
export interface SnapshotFile {
|
|
65
141
|
path: string;
|
|
66
142
|
mimeType?: string;
|
|
@@ -84,19 +160,30 @@ export interface PageSnapshot {
|
|
|
84
160
|
};
|
|
85
161
|
content: ContentStoreHandle;
|
|
86
162
|
toDirectory(outDir: string, options?: WriteFSOptions): Promise<WriteResult>;
|
|
87
|
-
toZip(options?: ZipOptions): Promise<
|
|
163
|
+
toZip(options?: ZipOptions): Promise<ZipResult>;
|
|
88
164
|
}
|
|
89
165
|
export interface WriteFSOptions {
|
|
90
166
|
clearCache?: boolean;
|
|
167
|
+
overwrite?: boolean;
|
|
168
|
+
suffix?: string;
|
|
91
169
|
}
|
|
92
170
|
export interface WriteResult {
|
|
93
171
|
filesWritten: number;
|
|
94
172
|
totalBytes: number;
|
|
173
|
+
outputDir?: string;
|
|
95
174
|
}
|
|
96
175
|
export interface ZipOptions {
|
|
97
176
|
asBlob?: boolean;
|
|
98
177
|
clearCache?: boolean;
|
|
178
|
+
overwrite?: boolean;
|
|
179
|
+
suffix?: string;
|
|
180
|
+
outputPath?: string;
|
|
181
|
+
}
|
|
182
|
+
export interface ZipWriteResult {
|
|
183
|
+
data: Uint8Array | Blob;
|
|
184
|
+
outputPath: string;
|
|
99
185
|
}
|
|
186
|
+
export type ZipResult = Uint8Array | Blob | ZipWriteResult;
|
|
100
187
|
export interface ApiRecord {
|
|
101
188
|
url: string;
|
|
102
189
|
method: string;
|
package/dist/writers.d.ts
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import type { PageSnapshot, WriteFSOptions, WriteResult, ZipOptions } from "./types";
|
|
1
|
+
import type { PageSnapshot, WriteFSOptions, WriteResult, ZipOptions, ZipResult } from "./types";
|
|
2
2
|
export declare const writeToFS: (snapshot: PageSnapshot, outDir: string, options?: WriteFSOptions) => Promise<WriteResult>;
|
|
3
|
-
export declare const toZip: (snapshot: PageSnapshot, options?: ZipOptions) => Promise<
|
|
3
|
+
export declare const toZip: (snapshot: PageSnapshot, options?: ZipOptions) => Promise<ZipResult>;
|
package/dist/writers.js
CHANGED
|
@@ -3,6 +3,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
3
3
|
exports.toZip = exports.writeToFS = void 0;
|
|
4
4
|
const uni_fs_1 = require("@pagepocket/uni-fs");
|
|
5
5
|
const utils_1 = require("./utils");
|
|
6
|
+
const DEFAULT_SUFFIX_PATTERN = "_{num}";
|
|
6
7
|
const normalizePath = (value) => value.replace(/\\/g, "/");
|
|
7
8
|
const joinPath = (base, relative) => {
|
|
8
9
|
const cleanBase = normalizePath(base).replace(/\/+$/, "");
|
|
@@ -24,6 +25,39 @@ const splitPathExtension = (value) => {
|
|
|
24
25
|
}
|
|
25
26
|
return { filename: clean, extension: "" };
|
|
26
27
|
};
|
|
28
|
+
/**
 * Normalizes a path with normalizePath() and strips all trailing "/"
 * characters, leaving the root path "/" untouched.
 *
 * @param {string} value - Path to trim.
 * @returns {string} The normalized path without trailing slashes.
 */
const trimTrailingSlash = (value) => {
    const forwardSlashed = normalizePath(value);
    return forwardSlashed === "/" ? forwardSlashed : forwardSlashed.replace(/\/+$/, "");
};
|
|
35
|
+
/**
 * Renders the de-duplication suffix for the given attempt number.
 *
 * @param {string|undefined|null} pattern - Suffix template; falls back to
 *   DEFAULT_SUFFIX_PATTERN when null or undefined.
 * @param {number} index - Attempt number to insert.
 * @returns {string} The template with every "{num}" placeholder replaced by
 *   `index`, or the template followed by `index` when it has no placeholder.
 */
const buildSuffix = (pattern, index) => {
    const template = pattern ?? DEFAULT_SUFFIX_PATTERN;
    if (!template.includes("{num}")) {
        return `${template}${index}`;
    }
    // split/join replaces every occurrence; String#replace with a string
    // pattern would replace only the first and leave later "{num}" literal.
    return template.split("{num}").join(String(index));
};
|
|
39
|
+
/**
 * Appends a suffix to a directory path after removing its trailing slashes.
 *
 * @param {string} basePath - Directory path.
 * @param {string} suffix - Text to append.
 * @returns {string} The trimmed path followed by the suffix.
 */
const appendDirectorySuffix = (basePath, suffix) => {
    const trimmedBase = trimTrailingSlash(basePath);
    return `${trimmedBase}${suffix}`;
};
|
|
42
|
+
/**
 * Inserts a suffix between a file path's name and its extension.
 *
 * The path is split with splitPathExtension(). When that yields no extension
 * the suffix is simply appended to the name.
 *
 * @param {string} basePath - File path.
 * @param {string} suffix - Text to insert.
 * @returns {string} The suffixed file path.
 */
const appendFileSuffix = (basePath, suffix) => {
    const pieces = splitPathExtension(basePath);
    const extensionTail = pieces.extension ? `.${pieces.extension}` : "";
    return `${pieces.filename}${suffix}${extensionTail}`;
};
|
|
49
|
+
/**
 * Finds an output path that does not collide with an existing one.
 *
 * When `options.overwrite` is set the base path is returned unchanged.
 * Otherwise the base path is tried first, followed by the base path with
 * suffixes built for 1, 2, 3, ... until existsPath() reports a free one.
 *
 * @param {string} basePath - Desired output path.
 * @param {{ overwrite?: boolean, suffix?: string, kind: string }} options -
 *   `kind === "directory"` applies the directory suffix; any other value
 *   applies the file suffix (before the extension). `suffix` is the template
 *   handed to buildSuffix().
 * @returns {Promise<string>} The first candidate path that does not exist.
 */
const resolveUniquePath = async (basePath, options) => {
    if (options.overwrite) {
        return basePath;
    }
    const withSuffix = options.kind === "directory" ? appendDirectorySuffix : appendFileSuffix;
    let attempt = 0;
    let candidate = basePath;
    // Keep generating suffixed names while the current candidate is taken.
    while (await (0, uni_fs_1.existsPath)(candidate, "")) {
        attempt += 1;
        candidate = withSuffix(basePath, buildSuffix(options.suffix, attempt));
    }
    return candidate;
};
|
|
27
61
|
const streamToUint8Array = async (stream) => {
|
|
28
62
|
const reader = stream.getReader();
|
|
29
63
|
const chunks = [];
|
|
@@ -48,9 +82,14 @@ const streamToUint8Array = async (stream) => {
|
|
|
48
82
|
const writeToFS = async (snapshot, outDir, options) => {
|
|
49
83
|
let filesWritten = 0;
|
|
50
84
|
let totalBytes = 0;
|
|
85
|
+
const outputDir = await resolveUniquePath(outDir, {
|
|
86
|
+
overwrite: options?.overwrite ?? false,
|
|
87
|
+
suffix: options?.suffix,
|
|
88
|
+
kind: "directory"
|
|
89
|
+
});
|
|
51
90
|
for (const file of snapshot.files) {
|
|
52
91
|
const relative = (0, utils_1.stripLeadingSlash)(file.path);
|
|
53
|
-
const outputPath = joinPath(
|
|
92
|
+
const outputPath = joinPath(outputDir, relative);
|
|
54
93
|
const { filename, extension } = splitPathExtension(outputPath);
|
|
55
94
|
const stream = await snapshot.content.open(file.source);
|
|
56
95
|
const data = await streamToUint8Array(stream);
|
|
@@ -61,7 +100,7 @@ const writeToFS = async (snapshot, outDir, options) => {
|
|
|
61
100
|
if (options?.clearCache ?? true) {
|
|
62
101
|
await snapshot.content.dispose?.();
|
|
63
102
|
}
|
|
64
|
-
return { filesWritten, totalBytes };
|
|
103
|
+
return { filesWritten, totalBytes, outputDir };
|
|
65
104
|
};
|
|
66
105
|
exports.writeToFS = writeToFS;
|
|
67
106
|
const crc32Table = (() => {
|
|
@@ -164,12 +203,25 @@ const toZip = async (snapshot, options) => {
|
|
|
164
203
|
writeUint16(0)
|
|
165
204
|
]);
|
|
166
205
|
const zipBytes = concatBytes([...localChunks, centralDirectory, endRecord]);
|
|
167
|
-
const
|
|
206
|
+
const outputData = options?.asBlob && typeof Blob !== "undefined"
|
|
168
207
|
? new Blob([zipBytes], { type: "application/zip" })
|
|
169
208
|
: zipBytes;
|
|
209
|
+
if (options?.outputPath) {
|
|
210
|
+
const outputPath = await resolveUniquePath(options.outputPath, {
|
|
211
|
+
overwrite: options?.overwrite ?? false,
|
|
212
|
+
suffix: options?.suffix,
|
|
213
|
+
kind: "file"
|
|
214
|
+
});
|
|
215
|
+
const { filename, extension } = splitPathExtension(outputPath);
|
|
216
|
+
await (0, uni_fs_1.write)(filename, extension, outputData);
|
|
217
|
+
if (options?.clearCache ?? true) {
|
|
218
|
+
await snapshot.content.dispose?.();
|
|
219
|
+
}
|
|
220
|
+
return { data: outputData, outputPath };
|
|
221
|
+
}
|
|
170
222
|
if (options?.clearCache ?? true) {
|
|
171
223
|
await snapshot.content.dispose?.();
|
|
172
224
|
}
|
|
173
|
-
return
|
|
225
|
+
return outputData;
|
|
174
226
|
};
|
|
175
227
|
exports.toZip = toZip;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pagepocket/lib",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.7.0",
|
|
4
4
|
"description": "Library for rewriting HTML snapshots and inlining local resources.",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -12,8 +12,8 @@
|
|
|
12
12
|
"license": "ISC",
|
|
13
13
|
"dependencies": {
|
|
14
14
|
"cheerio": "^1.0.0-rc.12",
|
|
15
|
-
"@pagepocket/interceptor": "0.
|
|
16
|
-
"@pagepocket/uni-fs": "0.
|
|
15
|
+
"@pagepocket/interceptor": "0.7.0",
|
|
16
|
+
"@pagepocket/uni-fs": "0.7.0"
|
|
17
17
|
},
|
|
18
18
|
"devDependencies": {
|
|
19
19
|
"@playwright/test": "^1.50.1",
|