playwright-archaeologist 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +392 -0
- package/bin/cli.js +2 -0
- package/dist/chunk-7ZQGW5OV.js +255 -0
- package/dist/chunk-7ZQGW5OV.js.map +1 -0
- package/dist/chunk-F5WCXM7I.js +4469 -0
- package/dist/chunk-F5WCXM7I.js.map +1 -0
- package/dist/chunk-RWPEKZOW.js +118 -0
- package/dist/chunk-RWPEKZOW.js.map +1 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +310 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +1948 -0
- package/dist/index.js +789 -0
- package/dist/index.js.map +1 -0
- package/dist/page-scanner-Q76HROEW.js +8 -0
- package/dist/page-scanner-Q76HROEW.js.map +1 -0
- package/package.json +83 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,789 @@
|
|
|
1
|
+
import {
|
|
2
|
+
ArchaeologistError,
|
|
3
|
+
AuthError,
|
|
4
|
+
BundleError,
|
|
5
|
+
CollectorError,
|
|
6
|
+
ConfigError,
|
|
7
|
+
CrawlConfigSchema,
|
|
8
|
+
CrawlError,
|
|
9
|
+
DiffConfigSchema,
|
|
10
|
+
DiffError,
|
|
11
|
+
Frontier,
|
|
12
|
+
Logger,
|
|
13
|
+
NavigationError,
|
|
14
|
+
ViewportSchema,
|
|
15
|
+
buildFlowGraph,
|
|
16
|
+
captureScreenshots,
|
|
17
|
+
createBundle,
|
|
18
|
+
createNetworkLogger,
|
|
19
|
+
detectCycles,
|
|
20
|
+
diffBundles,
|
|
21
|
+
dig,
|
|
22
|
+
escapeAttribute,
|
|
23
|
+
escapeHtml,
|
|
24
|
+
escapeJsonInHtml,
|
|
25
|
+
generateDiffReportHtml,
|
|
26
|
+
generateMermaidDefinition,
|
|
27
|
+
generateOpenApiSpec,
|
|
28
|
+
generateReportHtml,
|
|
29
|
+
groupEndpoints,
|
|
30
|
+
logger,
|
|
31
|
+
normalizeEntryUrl,
|
|
32
|
+
parseViewport,
|
|
33
|
+
probeForms,
|
|
34
|
+
writeOpenApiSpec
|
|
35
|
+
} from "./chunk-F5WCXM7I.js";
|
|
36
|
+
import {
|
|
37
|
+
scanPage
|
|
38
|
+
} from "./chunk-7ZQGW5OV.js";
|
|
39
|
+
import "./chunk-RWPEKZOW.js";
|
|
40
|
+
|
|
41
|
+
// src/auth/auth-handler.ts
|
|
42
|
+
import { resolve } from "path";
|
|
43
|
+
import { readFile, writeFile, access } from "fs/promises";
|
|
44
|
+
import { constants } from "fs";
|
|
45
|
+
|
|
46
|
+
// src/security/auth-validator.ts
|
|
47
|
+
import { readFileSync } from "fs";
|
|
48
|
+
import { pathToFileURL } from "url";
|
|
49
|
+
// Module specifiers that validateAuthScript reports as dangerous when an
// auth script imports or requires them (with or without the "node:" prefix).
var DANGEROUS_MODULES = [
  "child_process",
  "fs", "fs/promises",
  "net", "dgram",
  "cluster", "worker_threads",
  "vm", "v8", "perf_hooks",
  "dns", "tls", "http", "https"
];

// Source-text patterns that validateAuthScript reports, each paired with the
// warning text it produces.
var DANGEROUS_PATTERNS = [
  { pattern: /\beval\s*\(/g, description: "eval() usage detected -- potential code injection" },
  { pattern: /\bnew\s+Function\s*\(/g, description: "Function constructor detected -- potential code injection" },
  { pattern: /\bprocess\.exit\b/g, description: "process.exit() detected -- could terminate the tool unexpectedly" }
];
|
|
79
|
+
/**
 * Build the regular expressions used to spot imports of dangerous modules
 * in the source text of an auth script.
 *
 * Three global patterns are produced per module, in this order:
 *   1. static `import ... "<module>"` (anything up to the next `;`),
 *   2. dynamic `import("<module>")`,
 *   3. CommonJS `require("<module>")`.
 * Each accepts single or double quotes and an optional `node:` prefix.
 *
 * @param {readonly string[]} [modules=DANGEROUS_MODULES] Module specifiers to
 *   build patterns for. Defaults to the file-level deny list.
 * @returns {{ pattern: RegExp, moduleName: string }[]} Three entries per module.
 */
function buildModulePatterns(modules = DANGEROUS_MODULES) {
  return modules.flatMap((moduleName) => {
    // Escape every regex metacharacter (not only "/") so a specifier such as
    // "a.b" can never match unrelated text like "aXb".
    const escaped = moduleName.replace(/[.*+?^${}()|[\]\\\/]/g, "\\$&");
    const specifier = `['"](?:node:)?${escaped}['"]`;
    const sources = [
      `\\bimport\\b[^;]*${specifier}`,
      `\\bimport\\s*\\(\\s*${specifier}\\s*\\)`,
      `\\brequire\\s*\\(\\s*${specifier}\\s*\\)`
    ];
    return sources.map((source) => ({
      pattern: new RegExp(source, "g"),
      moduleName
    }));
  });
}
|
|
98
|
+
// Import/require detection patterns for the dangerous-module list, built once when this module is evaluated.
var MODULE_PATTERNS = buildModulePatterns();
|
|
99
|
+
/**
 * Statically scan an auth script for dangerous imports and code patterns.
 *
 * The file is read synchronously and tested against every entry of
 * MODULE_PATTERNS and DANGEROUS_PATTERNS; each match adds one warning.
 *
 * @param {string} scriptPath Path of the script to inspect.
 * @returns {{ valid: boolean, warnings: string[] }} `valid` is true only when
 *   no warning was produced.
 * @throws {Error} When the file does not exist or cannot be read.
 */
function validateAuthScript(scriptPath) {
  let source;
  try {
    source = readFileSync(scriptPath, "utf-8");
  } catch (err) {
    const reason = err?.code === "ENOENT"
      ? `Auth script not found: ${scriptPath}`
      : `Failed to read auth script '${scriptPath}': ${err?.message ?? String(err)}`;
    throw new Error(reason);
  }

  // The patterns carry the "g" flag, so rewind them before every test.
  const found = ({ pattern }) => {
    pattern.lastIndex = 0;
    return pattern.test(source);
  };

  const warnings = [
    ...MODULE_PATTERNS.filter(found).map(({ moduleName }) => `Dangerous import of '${moduleName}' detected`),
    ...DANGEROUS_PATTERNS.filter(found).map(({ description }) => description)
  ];

  return { valid: warnings.length === 0, warnings };
}
|
|
128
|
+
|
|
129
|
+
// src/auth/auth-handler.ts
|
|
130
|
+
// Case-insensitive URL path fragments that isLoginUrl treats as a login page.
var LOGIN_PATH_PATTERNS = [/\/login\b/i, /\/signin\b/i, /\/sign-in\b/i, /\/auth\b/i, /\/sso\b/i];
|
|
137
|
+
/**
 * Report whether a path can be opened for reading.
 *
 * @param {string} filePath Path to probe.
 * @returns {Promise<boolean>} true when the read-access check succeeds,
 *   false when it fails for any reason.
 */
async function fileExists(filePath) {
  let readable = true;
  try {
    await access(filePath, constants.R_OK);
  } catch {
    readable = false;
  }
  return readable;
}
|
|
145
|
+
/**
 * Decide whether a URL points at a login page, judged by its path only.
 *
 * @param {string} url Absolute URL to inspect.
 * @returns {boolean} true when the pathname matches any LOGIN_PATH_PATTERNS
 *   entry; false otherwise, including when the URL cannot be parsed.
 */
function isLoginUrl(url) {
  try {
    const { pathname } = new URL(url);
    for (const loginPattern of LOGIN_PATH_PATTERNS) {
      if (loginPattern.test(pathname)) {
        return true;
      }
    }
    return false;
  } catch {
    return false;
  }
}
|
|
153
|
+
/**
 * Count the cookies currently reported by a browser context.
 *
 * @param {{ cookies(): Promise<unknown[]> }} context Browser context to query.
 * @returns {Promise<number>} Length of the list returned by `context.cookies()`.
 */
async function getCookieCount(context) {
  const { length } = await context.cookies();
  return length;
}
|
|
157
|
+
/**
 * Restore cookies and localStorage from a saved storage-state JSON file.
 *
 * Cookies are added to the context directly. For every origin entry that
 * carries localStorage items, an init script is registered that writes those
 * items when a page of that exact origin loads.
 *
 * @param {object} context Browser context exposing `addCookies` and `addInitScript`.
 * @param {string} statePath Path of the storage-state JSON file.
 * @returns {Promise<void>}
 * @throws {Error} When the file cannot be read, is not valid JSON, or does
 *   not contain an object.
 */
async function loadStorageState(context, statePath) {
  const raw = await readFile(statePath, "utf-8");

  let state;
  try {
    state = JSON.parse(raw);
  } catch {
    // Same wording style as loadCookies so callers get a path, not a bare SyntaxError.
    throw new Error(`Invalid JSON in storage state file: ${statePath}`);
  }
  if (!state || typeof state !== "object") {
    throw new Error(`Invalid storage state file: ${statePath}`);
  }

  if (Array.isArray(state.cookies) && state.cookies.length > 0) {
    await context.addCookies(state.cookies);
  }
  if (!Array.isArray(state.origins)) {
    return;
  }

  for (const entry of state.origins) {
    // Skip malformed entries (null, missing origin, no items) instead of
    // letting one bad element abort the whole restore.
    if (!entry?.origin || !Array.isArray(entry.localStorage) || entry.localStorage.length === 0) {
      continue;
    }
    await context.addInitScript(
      // This callback is handed to addInitScript together with its data
      // argument, so it must not reference anything from this scope.
      (data) => {
        if (window.location.origin === data.origin) {
          for (const item of data.items) {
            try {
              localStorage.setItem(item.name, item.value);
            } catch {
              // Best effort: a single failing item must not stop the rest.
            }
          }
        }
      },
      { origin: entry.origin, items: entry.localStorage }
    );
  }
}
|
|
188
|
+
/**
 * Persist the context's current storage state as pretty-printed JSON.
 *
 * @param {{ storageState(): Promise<unknown> }} context Browser context to snapshot.
 * @param {string} statePath Destination file; overwritten if it exists.
 * @returns {Promise<void>}
 */
async function saveStorageState(context, statePath) {
  const snapshot = await context.storageState();
  const serialized = JSON.stringify(snapshot, null, 2);
  await writeFile(statePath, serialized, "utf-8");
}
|
|
192
|
+
/**
 * Read a JSON array of cookies from disk and add them to the context.
 *
 * @param {{ addCookies(cookies: unknown[]): Promise<void> }} context Target browser context.
 * @param {string} cookiesPath Path of the cookies JSON file.
 * @returns {Promise<number>} Number of cookies in the file (0 for an empty
 *   array, in which case the context is not touched).
 * @throws {Error} When the file cannot be read, is not valid JSON, or does
 *   not hold an array.
 */
async function loadCookies(context, cookiesPath) {
  const text = await readFile(cookiesPath, "utf-8");

  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch {
    throw new Error(`Invalid JSON in cookies file: ${cookiesPath}`);
  }

  if (!Array.isArray(parsed)) {
    throw new Error(`Cookies file must contain a JSON array: ${cookiesPath}`);
  }

  if (parsed.length > 0) {
    await context.addCookies(parsed);
  }
  return parsed.length;
}
|
|
213
|
+
/**
 * Validate, load and run a user-supplied authentication script.
 *
 * The script is resolved to an absolute path, scanned with
 * validateAuthScript, imported as a module, and its default export is
 * awaited with `page` as the only argument.
 *
 * @param {object} page Page object handed to the script's default export.
 * @param {string} scriptPath Path of the script, absolute or relative to the
 *   current working directory.
 * @returns {Promise<void>}
 * @throws {Error} When validation reports warnings, the file is missing or
 *   unreadable, the import fails, the default export is not a function, or
 *   the script itself throws.
 */
async function runAuthScript(page, scriptPath) {
  const resolvedPath = resolve(scriptPath);
  const validation = validateAuthScript(resolvedPath);
  if (!validation.valid) {
    throw new Error(
      `Auth script failed validation: ${validation.warnings.join("; ")}`
    );
  }
  if (!await fileExists(resolvedPath)) {
    throw new Error(`Auth script not found: ${resolvedPath}`);
  }
  let mod;
  try {
    // import() takes a URL-like specifier, not a file-system path. A raw
    // absolute path fails on Windows ("C:\..." is read as a URL scheme) and
    // misreads "#", "?" and "%" on POSIX, so convert it to a file:// URL.
    mod = await import(pathToFileURL(resolvedPath).href);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to import auth script '${resolvedPath}': ${message}`);
  }
  const authFn = mod.default;
  if (typeof authFn !== "function") {
    throw new Error(
      `Auth script '${resolvedPath}' must export a default async function`
    );
  }
  await authFn(page);
}
|
|
239
|
+
/**
 * Heuristically decide whether authentication failed for the current page.
 *
 * Two signals are checked: the page URL still looks like a login page, or
 * the navigation timing entry reports an HTTP 401/403 status. Errors while
 * evaluating in the page are ignored and count as "no failure detected".
 *
 * @param {object} page Page exposing `url()` and `evaluate()`.
 * @returns {Promise<{ failed: boolean, reason?: string }>}
 */
async function detectAuthFailure(page) {
  const currentUrl = page.url();
  if (isLoginUrl(currentUrl)) {
    return { failed: true, reason: `Still on login page: ${currentUrl}` };
  }

  let status = null;
  try {
    // The callback is passed to page.evaluate and must stay self-contained.
    const result = await page.evaluate(() => {
      const navigation = performance.getEntriesByType("navigation");
      const responseStatus = navigation.length > 0 ? navigation[0].responseStatus ?? null : null;
      return { responseStatus };
    });
    status = result.responseStatus;
  } catch {
    // Best effort: a page that cannot be evaluated yields no status signal.
  }

  if (status === 401 || status === 403) {
    return { failed: true, reason: `Received HTTP ${status} response` };
  }
  return { failed: false };
}
|
|
265
|
+
/**
 * Authenticate a browser context using the first applicable method.
 *
 * Methods are tried in this order, and the first one attempted decides the
 * result (there is no fall-through after a failure):
 *   1. `storageStatePath`, only when that file is readable,
 *   2. `cookiesFile`,
 *   3. `authScript` (on success the storage state is saved to
 *      `storageStatePath` when given; save errors are ignored).
 *
 * @param {object} page Page used by the auth script and for `finalUrl`.
 * @param {object} context Browser context receiving cookies / storage.
 * @param {{ storageStatePath?: string, cookiesFile?: string, authScript?: string }} options
 * @returns {Promise<object>} Result with `success`, `method`, `cookieCount`
 *   and, depending on the path taken, `error` and `finalUrl`. Failures are
 *   reported through the result rather than thrown.
 */
async function executeAuth(page, context, options) {
  const describe = (err) => err instanceof Error ? err.message : String(err);
  const { storageStatePath, cookiesFile, authScript } = options;

  if (!storageStatePath && !cookiesFile && !authScript) {
    return {
      success: false,
      method: "script",
      error: "No authentication method provided",
      cookieCount: 0
    };
  }

  // 1. Previously saved storage state, when the file is there.
  if (storageStatePath && await fileExists(storageStatePath)) {
    try {
      await loadStorageState(context, storageStatePath);
      const cookieCount = await getCookieCount(context);
      return { success: true, method: "storageState", finalUrl: page.url(), cookieCount };
    } catch (err) {
      return {
        success: false,
        method: "storageState",
        error: `Failed to load storage state: ${describe(err)}`,
        cookieCount: 0
      };
    }
  }

  // 2. Cookie injection.
  if (cookiesFile) {
    try {
      const injectedCount = await loadCookies(context, cookiesFile);
      const liveCount = await getCookieCount(context);
      return {
        success: true,
        method: "cookies",
        finalUrl: page.url(),
        cookieCount: liveCount > 0 ? liveCount : injectedCount
      };
    } catch (err) {
      return {
        success: false,
        method: "cookies",
        error: `Failed to inject cookies: ${describe(err)}`,
        cookieCount: 0
      };
    }
  }

  if (!authScript) {
    return {
      success: false,
      method: "script",
      error: "No authentication method could be applied",
      cookieCount: 0
    };
  }

  // 3. User-supplied login script.
  try {
    await runAuthScript(page, authScript);
    const { failed, reason } = await detectAuthFailure(page);
    if (failed) {
      return {
        success: false,
        method: "script",
        error: reason,
        finalUrl: page.url(),
        cookieCount: await getCookieCount(context)
      };
    }
    if (storageStatePath) {
      try {
        await saveStorageState(context, storageStatePath);
      } catch {
        // Saving the state for later runs is optional; ignore failures.
      }
    }
    const cookieCount = await getCookieCount(context);
    return { success: true, method: "script", finalUrl: page.url(), cookieCount };
  } catch (err) {
    return {
      success: false,
      method: "script",
      error: `Auth script failed: ${describe(err)}`,
      finalUrl: page.url(),
      cookieCount: 0
    };
  }
}
|
|
361
|
+
/**
 * Drop the context's cookies and authenticate again from scratch.
 *
 * @param {object} page Page forwarded to executeAuth.
 * @param {{ clearCookies(): Promise<void> }} context Browser context to reset.
 * @param {object} options Authentication options forwarded to executeAuth.
 * @returns {Promise<object>} The result produced by executeAuth.
 */
async function refreshAuth(page, context, options) {
  await context.clearCookies();
  const outcome = await executeAuth(page, context, options);
  return outcome;
}
|
|
365
|
+
|
|
366
|
+
// src/crawl/context-pool.ts
|
|
367
|
+
/**
 * Bounded pool of browser contexts.
 *
 * At most `concurrency` contexts are handed out (or being created) at any
 * time. Released contexts are kept idle for reuse until they have been
 * released `recycleAfter` times, after which they are closed.
 */
var ContextPool = class {
  browser;
  concurrency;
  recycleAfter;
  contextOptions;
  storageState;
  /** Contexts currently in use by callers, keyed by the context itself */
  acquired = /* @__PURE__ */ new Map();
  /** Contexts sitting idle, ready to be acquired */
  idle = [];
  /** Waiters blocked on acquire() when pool is full */
  waiters = [];
  /** Number of contexts whose creation has started but not yet finished */
  creating = 0;
  /** Whether closeAll() has been called */
  closed = false;
  /**
   * @param {object} browser Browser exposing `newContext(options)`.
   * @param {{ concurrency: number, recycleAfter: number, contextOptions?: object, storageState?: unknown }} options
   * @throws {Error} When the browser is missing or a limit is below 1.
   */
  constructor(browser, options) {
    if (!browser) {
      throw new Error("Browser instance is required");
    }
    if (options.concurrency < 1) {
      throw new Error("Concurrency must be >= 1");
    }
    if (options.recycleAfter < 1) {
      throw new Error("recycleAfter must be >= 1");
    }
    this.browser = browser;
    this.concurrency = options.concurrency;
    this.recycleAfter = options.recycleAfter;
    this.contextOptions = options.contextOptions;
    this.storageState = options.storageState;
  }
  /**
   * Acquire a browser context from the pool.
   * Blocks if the pool is at full capacity until a context is released.
   *
   * @returns {Promise<object>} An idle context when one exists, otherwise a new one.
   * @throws {Error} When the pool is, or becomes, closed.
   */
  async acquire() {
    if (this.closed) {
      throw new Error("ContextPool is closed");
    }
    // In-flight creations count against the limit; otherwise several callers
    // could all pass this check while newContext() is still pending.
    while (this.idle.length === 0 && this.acquired.size + this.creating >= this.concurrency) {
      await new Promise((wake) => {
        this.waiters.push(wake);
      });
      if (this.closed) {
        throw new Error("ContextPool is closed");
      }
    }
    const reusable = this.idle.pop();
    if (reusable) {
      this.acquired.set(reusable.context, reusable);
      return reusable.context;
    }
    this.creating += 1;
    let entry;
    try {
      entry = await this.createEntry();
    } catch (err) {
      // The reserved slot is free again; let a waiter try.
      this.creating -= 1;
      this.drainWaiters();
      throw err;
    }
    this.creating -= 1;
    if (this.closed) {
      // closeAll() ran while the context was being created: it could not
      // see this context, so close it here instead of leaking it.
      await entry.context.close();
      throw new Error("ContextPool is closed");
    }
    this.acquired.set(entry.context, entry);
    return entry.context;
  }
  /**
   * Release a context back to the pool. Increments the page count
   * and recycles the context if it has exceeded recycleAfter.
   *
   * @param {object} context A context previously returned by acquire().
   * @throws {Error} When the context is unknown or was already released.
   */
  async release(context) {
    const entry = this.acquired.get(context);
    if (!entry) {
      throw new Error("Context is not owned by this pool or was already released");
    }
    this.acquired.delete(context);
    entry.pageCount += 1;
    if (this.closed) {
      await context.close();
      return;
    }
    try {
      if (entry.pageCount >= this.recycleAfter) {
        await context.close();
      } else {
        this.idle.push(entry);
      }
    } finally {
      // The slot is free even if close() rejected, so always wake a waiter.
      this.drainWaiters();
    }
  }
  /**
   * Close all contexts (both idle and acquired) and reject future operations.
   */
  async closeAll() {
    this.closed = true;
    // Wake every waiter; each one sees `closed` and rejects.
    for (const wake of this.waiters) {
      wake();
    }
    this.waiters.length = 0;
    const closeOps = [];
    for (const entry of this.idle) {
      closeOps.push(entry.context.close());
    }
    this.idle.length = 0;
    for (const [, entry] of this.acquired) {
      closeOps.push(entry.context.close());
    }
    this.acquired.clear();
    await Promise.all(closeOps);
  }
  /** Current number of active (acquired) contexts */
  get activeCount() {
    return this.acquired.size;
  }
  /** Current number of available (idle) contexts */
  get availableCount() {
    return this.idle.length;
  }
  // ---------------------------------------------------------------------------
  // Private
  // ---------------------------------------------------------------------------
  /** Create a fresh context, applying the pool's storageState when one was given. */
  async createEntry() {
    const opts = { ...this.contextOptions };
    if (this.storageState !== void 0) {
      opts.storageState = this.storageState;
    }
    const context = await this.browser.newContext(opts);
    return { context, pageCount: 0 };
  }
  /** Wake the longest-waiting acquire() call, if any. */
  drainWaiters() {
    if (this.waiters.length > 0) {
      const next = this.waiters.shift();
      next();
    }
  }
};
|
|
490
|
+
|
|
491
|
+
// src/crawl/checkpoint.ts
|
|
492
|
+
import { readFile as readFile2, writeFile as writeFile2, rename, unlink } from "fs/promises";
|
|
493
|
+
import { join } from "path";
|
|
494
|
+
import { z } from "zod";
|
|
495
|
+
// File names used inside the output directory: the checkpoint itself and the
// temporary file it is written to before being renamed into place.
var CHECKPOINT_FILENAME = ".checkpoint.json";
var CHECKPOINT_TMP_FILENAME = `${CHECKPOINT_FILENAME}.tmp`;
// Default period between automatic checkpoints: 30 seconds.
var DEFAULT_AUTO_CHECKPOINT_INTERVAL_MS = 30 * 1000;
|
|
498
|
+
// One URL still waiting in the crawl frontier.
var CheckpointFrontierEntrySchema = z.object({
  url: z.string(),
  depth: z.number(),
  referrer: z.string().optional()
});

// One error recorded during the crawl.
var CheckpointErrorEntrySchema = z.object({
  timestamp: z.string(),
  url: z.string(),
  code: z.string(),
  message: z.string(),
  collector: z.enum(["page-scanner", "form-prober", "network-logger", "screenshot-capturer"]).optional(),
  status: z.enum(["timeout", "network_error", "http_error", "no_response", "aborted", "redirect_loop"]).optional(),
  httpStatus: z.number().optional(),
  securityReason: z.enum([
    "private_ip",
    "blocked_protocol",
    "dns_rebinding",
    "metadata_endpoint",
    "redirect_to_private"
  ]).optional()
});

// Shape of the checkpoint file (format version 1) validated by readCheckpoint.
var CheckpointStateSchema = z.object({
  version: z.literal(1),
  startedAt: z.string(),
  checkpointedAt: z.string(),
  frontier: z.array(CheckpointFrontierEntrySchema),
  visited: z.array(z.string()),
  skipped: z.array(z.string()),
  artifactsDir: z.string(),
  pagesVisited: z.number(),
  errors: z.array(CheckpointErrorEntrySchema),
  configHash: z.string()
});
|
|
533
|
+
/**
 * Write the crawl state to the checkpoint file in `outputDir`.
 *
 * The JSON is written to a temporary file first and then renamed over the
 * real checkpoint, so a reader never sees a half-written file. Note that
 * `state.checkpointedAt` is updated on the object passed in.
 *
 * @param {object} state Checkpoint state; its `checkpointedAt` is overwritten.
 * @param {string} outputDir Directory that holds the checkpoint files.
 * @returns {Promise<void>}
 * @throws {Error} When `outputDir` is empty or a file operation fails.
 */
async function writeCheckpoint(state, outputDir) {
  if (!outputDir) {
    throw new Error("outputDir must be a non-empty string");
  }
  state.checkpointedAt = new Date().toISOString();
  const [tmpPath, finalPath] = [CHECKPOINT_TMP_FILENAME, CHECKPOINT_FILENAME].map(
    (fileName) => join(outputDir, fileName)
  );
  await writeFile2(tmpPath, JSON.stringify(state, null, 2), "utf-8");
  await rename(tmpPath, finalPath);
}
|
|
544
|
+
/**
 * Load and validate the checkpoint stored in `outputDir`.
 *
 * @param {string} outputDir Directory that holds the checkpoint file.
 * @returns {Promise<object|null>} The validated checkpoint state, or null
 *   when no checkpoint file exists.
 * @throws {Error} When `outputDir` is empty, the file cannot be read for a
 *   reason other than "not found", it is not valid JSON, or it does not
 *   match CheckpointStateSchema.
 */
async function readCheckpoint(outputDir) {
  if (!outputDir) {
    throw new Error("outputDir must be a non-empty string");
  }
  const filePath = join(outputDir, CHECKPOINT_FILENAME);

  let raw;
  try {
    raw = await readFile2(filePath, "utf-8");
  } catch (err) {
    const missing = isNodeError(err) && err.code === "ENOENT";
    if (!missing) {
      throw err;
    }
    return null;
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    throw new Error(`Checkpoint file is not valid JSON: ${filePath}`);
  }

  const outcome = CheckpointStateSchema.safeParse(parsed);
  if (outcome.success) {
    return outcome.data;
  }
  throw new Error(
    `Invalid checkpoint data in ${filePath}: ${outcome.error.message}`
  );
}
|
|
572
|
+
/**
 * Remove the checkpoint file and its temporary sibling from `outputDir`.
 * Files that are already absent are ignored.
 *
 * @param {string} outputDir Directory that holds the checkpoint files.
 * @returns {Promise<void>}
 * @throws {Error} When `outputDir` is empty or a removal fails for a reason
 *   other than "not found".
 */
async function deleteCheckpoint(outputDir) {
  if (!outputDir) {
    throw new Error("outputDir must be a non-empty string");
  }

  const removeIfPresent = async (filePath) => {
    try {
      await unlink(filePath);
    } catch (err) {
      const alreadyGone = isNodeError(err) && err.code === "ENOENT";
      if (!alreadyGone) {
        throw err;
      }
    }
  };

  // Both removals run in parallel.
  await Promise.all([
    removeIfPresent(join(outputDir, CHECKPOINT_FILENAME)),
    removeIfPresent(join(outputDir, CHECKPOINT_TMP_FILENAME))
  ]);
}
|
|
593
|
+
/**
 * Create an empty version-1 checkpoint state stamped with the current time.
 *
 * @param {{ configHash: string, artifactsDir: string }} params
 * @returns {object} Fresh state with empty frontier/visited/skipped/errors
 *   lists and `startedAt` equal to `checkpointedAt`.
 * @throws {Error} When `configHash` or `artifactsDir` is empty.
 */
function createCheckpointState(params) {
  const { configHash, artifactsDir } = params;
  if (!configHash) {
    throw new Error("configHash must be a non-empty string");
  }
  if (!artifactsDir) {
    throw new Error("artifactsDir must be a non-empty string");
  }
  const timestamp = new Date().toISOString();
  return {
    version: 1,
    startedAt: timestamp,
    checkpointedAt: timestamp,
    frontier: [],
    visited: [],
    skipped: [],
    artifactsDir,
    pagesVisited: 0,
    errors: [],
    configHash
  };
}
|
|
614
|
+
function setupAutoCheckpoint(getState, outputDir, intervalMs = DEFAULT_AUTO_CHECKPOINT_INTERVAL_MS) {
|
|
615
|
+
if (!outputDir) {
|
|
616
|
+
throw new Error("outputDir must be a non-empty string");
|
|
617
|
+
}
|
|
618
|
+
if (intervalMs <= 0) {
|
|
619
|
+
throw new Error("intervalMs must be a positive number");
|
|
620
|
+
}
|
|
621
|
+
const timer = setInterval(() => {
|
|
622
|
+
const state = getState();
|
|
623
|
+
writeCheckpoint(state, outputDir).catch(() => {
|
|
624
|
+
});
|
|
625
|
+
}, intervalMs);
|
|
626
|
+
if (typeof timer === "object" && "unref" in timer) {
|
|
627
|
+
timer.unref();
|
|
628
|
+
}
|
|
629
|
+
return {
|
|
630
|
+
stop: () => {
|
|
631
|
+
clearInterval(timer);
|
|
632
|
+
}
|
|
633
|
+
};
|
|
634
|
+
}
|
|
635
|
+
/**
 * Tell whether a caught value is an Error that carries a `code` property,
 * as Node.js system errors do.
 *
 * @param {unknown} err Value caught from a failed operation.
 * @returns {boolean}
 */
function isNodeError(err) {
  if (!(err instanceof Error)) {
    return false;
  }
  return "code" in err;
}
|
|
638
|
+
|
|
639
|
+
// src/utils/progress.ts
|
|
640
|
+
// Number of most recent visit timestamps kept for the throughput estimate.
var ROLLING_WINDOW_SIZE = 10;

/**
 * Tracks crawl progress: visited pages, errors, URLs in flight, throughput
 * over a rolling window of recent visits, and an estimated time remaining.
 */
var ProgressTracker = class {
  pagesVisited = 0;
  pagesTotal;
  errorsCount = 0;
  startTime;
  currentUrls = /* @__PURE__ */ new Set();
  /** Timestamps (ms) of the most recent visits, oldest first */
  visitTimestamps = [];
  /**
   * @param {number} [estimatedTotal] Expected number of pages; 0 when omitted.
   */
  constructor(estimatedTotal) {
    this.pagesTotal = estimatedTotal ?? 0;
    this.startTime = Date.now();
  }
  /** Count one visited page and remember when it happened. */
  recordVisit(url) {
    this.pagesVisited += 1;
    const window = this.visitTimestamps;
    window.push(Date.now());
    if (window.length > ROLLING_WINDOW_SIZE) {
      window.shift();
    }
  }
  /** Count one error. */
  recordError() {
    this.errorsCount += 1;
  }
  /** Add a URL to the set of pages being crawled right now. */
  startPage(url) {
    this.currentUrls.add(url);
  }
  /** Remove a URL from the set of pages being crawled right now. */
  endPage(url) {
    this.currentUrls.delete(url);
  }
  /**
   * Replace the estimated total page count.
   * @throws {Error} When `total` is negative.
   */
  updateTotal(total) {
    if (total < 0) {
      throw new Error("Estimated total must be >= 0");
    }
    this.pagesTotal = total;
  }
  /** Build a snapshot of the current progress. */
  getState() {
    const elapsedMs = Date.now() - this.startTime;
    const pagesPerSecond = this.calculatePagesPerSecond();
    const estimatedRemainingMs = this.calculateRemainingMs(pagesPerSecond);
    return {
      pagesVisited: this.pagesVisited,
      pagesTotal: this.pagesTotal,
      currentUrls: [...this.currentUrls],
      errorsCount: this.errorsCount,
      startTime: this.startTime,
      elapsedMs,
      estimatedRemainingMs,
      pagesPerSecond
    };
  }
  /**
   * Render the progress as a single terminal-friendly line.
   *
   * Example: `[12/50] 2.3 p/s | ETA: 16s | Errors: 0 | Crawling: /about, /api/users`
   */
  format() {
    const { pagesVisited, pagesTotal, pagesPerSecond, estimatedRemainingMs, errorsCount, currentUrls } = this.getState();
    const pps = pagesPerSecond.toFixed(1);
    let eta = "--";
    if (estimatedRemainingMs > 0) {
      eta = `${Math.ceil(estimatedRemainingMs / 1e3)}s`;
    }
    let crawling = "idle";
    if (currentUrls.length > 0) {
      crawling = currentUrls.join(", ");
    }
    return `[${pagesVisited}/${pagesTotal}] ${pps} p/s | ETA: ${eta} | Errors: ${errorsCount} | Crawling: ${crawling}`;
  }
  // ---------------------------------------------------------------------------
  // Private
  // ---------------------------------------------------------------------------
  /**
   * Throughput over the rolling window: visits after the oldest one divided
   * by the time between the oldest and newest timestamps. Returns 0 when
   * there are fewer than two timestamps or they span no time.
   */
  calculatePagesPerSecond() {
    const stamps = this.visitTimestamps;
    const count = stamps.length;
    if (count < 2) {
      return 0;
    }
    const spanMs = stamps[count - 1] - stamps[0];
    return spanMs <= 0 ? 0 : (count - 1) / spanMs * 1e3;
  }
  /**
   * Estimated milliseconds left: remaining pages / pagesPerSecond * 1000.
   * Returns 0 when the rate or total is unknown or nothing remains.
   */
  calculateRemainingMs(pps) {
    if (pps <= 0 || this.pagesTotal <= 0) {
      return 0;
    }
    const pagesLeft = this.pagesTotal - this.pagesVisited;
    return pagesLeft <= 0 ? 0 : pagesLeft / pps * 1e3;
  }
};
|
|
743
|
+
export {
|
|
744
|
+
ArchaeologistError,
|
|
745
|
+
AuthError,
|
|
746
|
+
BundleError,
|
|
747
|
+
CollectorError,
|
|
748
|
+
ConfigError,
|
|
749
|
+
ContextPool,
|
|
750
|
+
CrawlConfigSchema,
|
|
751
|
+
CrawlError,
|
|
752
|
+
DiffConfigSchema,
|
|
753
|
+
DiffError,
|
|
754
|
+
Frontier,
|
|
755
|
+
Logger,
|
|
756
|
+
NavigationError,
|
|
757
|
+
ProgressTracker,
|
|
758
|
+
ViewportSchema,
|
|
759
|
+
buildFlowGraph,
|
|
760
|
+
captureScreenshots,
|
|
761
|
+
createBundle,
|
|
762
|
+
createCheckpointState,
|
|
763
|
+
createNetworkLogger,
|
|
764
|
+
deleteCheckpoint,
|
|
765
|
+
detectAuthFailure,
|
|
766
|
+
detectCycles,
|
|
767
|
+
diffBundles,
|
|
768
|
+
dig,
|
|
769
|
+
escapeAttribute,
|
|
770
|
+
escapeHtml,
|
|
771
|
+
escapeJsonInHtml,
|
|
772
|
+
executeAuth,
|
|
773
|
+
generateDiffReportHtml,
|
|
774
|
+
generateMermaidDefinition,
|
|
775
|
+
generateOpenApiSpec,
|
|
776
|
+
generateReportHtml,
|
|
777
|
+
groupEndpoints,
|
|
778
|
+
logger,
|
|
779
|
+
normalizeEntryUrl,
|
|
780
|
+
parseViewport,
|
|
781
|
+
probeForms,
|
|
782
|
+
readCheckpoint,
|
|
783
|
+
refreshAuth,
|
|
784
|
+
scanPage,
|
|
785
|
+
setupAutoCheckpoint,
|
|
786
|
+
writeCheckpoint,
|
|
787
|
+
writeOpenApiSpec
|
|
788
|
+
};
|
|
789
|
+
//# sourceMappingURL=index.js.map
|