playwright-archaeologist 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1948 @@
1
+ import { z } from 'zod';
2
+ import { Page, Response, BrowserContext, Browser } from 'playwright';
3
+
4
+ /**
5
+ * playwright-archaeologist configuration schema.
6
+ *
7
+ * Validated with Zod at CLI entry before any crawl work begins.
8
+ * ConfigError is thrown for all validation failures with a clear
9
+ * user-facing message indicating which field failed and why.
10
+ *
11
+ * Validation rules and edge cases:
12
+ * --depth 0 Valid (crawl only the entry URL)
13
+ * --depth -1 Invalid: must be >= 0
14
+ * --depth 999999 Valid but warn: clamped to 100 with warning
15
+ * --concurrency 0 Invalid: must be >= 1
16
+ * --concurrency 100 Valid but warn: clamped to 20 with warning
17
+ * --viewport invalid Invalid: must match /^\d+x\d+$/
18
+ * --viewport 100x100 Valid
19
+ * --output /nonexistent Validated at startup: parent dir must exist and be writable
20
+ * --auth missing.ts AuthError('script_not_found')
21
+ * --cookies bad.json AuthError('cookie_file_malformed')
22
+ * URL without protocol Auto-prepend https:// with warning
23
+ * --timeout 0 Invalid: must be >= 1000
24
+ * --max-time 0 Invalid: must be >= 10
25
+ * --delay -1 Invalid: must be >= 0
26
+ * --max-pages 0 Invalid: must be >= 1
27
+ *
28
+ * The Zod schema produces a strongly-typed CrawlConfig object that is
29
+ * passed to the Orchestrator and threaded through to all collectors.
30
+ */
31
+
32
+ declare const ViewportSchema: z.ZodObject<{
33
+ width: z.ZodNumber;
34
+ height: z.ZodNumber;
35
+ }, "strip", z.ZodTypeAny, {
36
+ width: number;
37
+ height: number;
38
+ }, {
39
+ width: number;
40
+ height: number;
41
+ }>;
42
+ type Viewport = z.infer<typeof ViewportSchema>;
43
+ /**
44
+ * Parse a "WxH" string into a Viewport object.
45
+ * Returns null if the format is invalid.
46
+ */
47
+ declare function parseViewport(input: string): Viewport | null;
48
+ declare const OutputFormatSchema: z.ZodEnum<["html", "json", "openapi", "both"]>;
49
+ type OutputFormat = z.infer<typeof OutputFormatSchema>;
50
+ /**
51
+ * Normalize the user-provided entry URL.
52
+ * - Prepend https:// if no protocol is provided
53
+ * - Validate it is http or https
54
+ * Returns { url, warnings } where warnings contains any auto-corrections.
55
+ */
56
+ declare function normalizeEntryUrl(raw: string): {
57
+ url: string;
58
+ warnings: string[];
59
+ };
60
+ declare const CrawlConfigSchema: z.ZodObject<{
61
+ /** Entry URL to start crawling. Must be http or https. */
62
+ targetUrl: z.ZodString;
63
+ /** Maximum crawl depth from the entry URL. 0 = entry URL only. */
64
+ depth: z.ZodDefault<z.ZodNumber>;
65
+ /** Maximum total pages to visit. */
66
+ maxPages: z.ZodDefault<z.ZodNumber>;
67
+ /** URL glob patterns to include. Only URLs matching at least one pattern are crawled. */
68
+ include: z.ZodDefault<z.ZodArray<z.ZodString, "many">>;
69
+ /** URL glob patterns to exclude. URLs matching any pattern are skipped. */
70
+ exclude: z.ZodDefault<z.ZodArray<z.ZodString, "many">>;
71
+ /** Whether to follow links to external origins. Default: false (same-origin only). */
72
+ followExternal: z.ZodDefault<z.ZodBoolean>;
73
+ /** Enable Tier 3 clicking: click non-link interactive elements to discover SPA states. */
74
+ deepClick: z.ZodDefault<z.ZodBoolean>;
75
+ /** Crawl cross-origin iframe content. Default: false (record URL only). */
76
+ includeIframes: z.ZodDefault<z.ZodBoolean>;
77
+ /** Number of parallel browser contexts. */
78
+ concurrency: z.ZodDefault<z.ZodNumber>;
79
+ /** Delay in milliseconds between page visits per context. */
80
+ delay: z.ZodDefault<z.ZodNumber>;
81
+ /** Per-page navigation timeout in milliseconds. */
82
+ timeout: z.ZodDefault<z.ZodNumber>;
83
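+ /** Global crawl timeout in seconds. 0 = no limit. NOTE(review): the validation table in this file's header lists `--max-time 0` as invalid (must be >= 10), which conflicts with "0 = no limit" - confirm which is intended. */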
84
+ maxTime: z.ZodDefault<z.ZodNumber>;
85
+ /** Path to auth script (.ts or .js) exporting a default async function. */
86
+ authScript: z.ZodOptional<z.ZodString>;
87
+ /** Path to cookies JSON file for session injection. */
88
+ cookiesFile: z.ZodOptional<z.ZodString>;
89
+ /** Include cookies/auth headers in output (default: scrubbed). */
90
+ includeCookies: z.ZodDefault<z.ZodBoolean>;
91
+ /** Output directory path. Created if it does not exist. Parent must exist. */
92
+ outputDir: z.ZodDefault<z.ZodString>;
93
+ /** Output format(s). */
94
+ format: z.ZodDefault<z.ZodEnum<["html", "json", "openapi", "both"]>>;
95
+ /** Skip screenshot capture entirely. */
96
+ noScreenshots: z.ZodDefault<z.ZodBoolean>;
97
+ /** Skip HAR recording entirely. */
98
+ noHar: z.ZodDefault<z.ZodBoolean>;
99
+ /** Primary viewport dimensions. */
100
+ viewport: z.ZodDefault<z.ZodObject<{
101
+ width: z.ZodNumber;
102
+ height: z.ZodNumber;
103
+ }, "strip", z.ZodTypeAny, {
104
+ width: number;
105
+ height: number;
106
+ }, {
107
+ width: number;
108
+ height: number;
109
+ }>>;
110
+ /** Additional viewports for responsive screenshots (empty = primary only). */
111
+ additionalViewports: z.ZodDefault<z.ZodArray<z.ZodObject<{
112
+ width: z.ZodNumber;
113
+ height: z.ZodNumber;
114
+ }, "strip", z.ZodTypeAny, {
115
+ width: number;
116
+ height: number;
117
+ }, {
118
+ width: number;
119
+ height: number;
120
+ }>, "many">>;
121
+ /** Resume from last checkpoint in the output directory. */
122
+ resume: z.ZodDefault<z.ZodBoolean>;
123
+ /** Skip confirmation prompts (auth script execution, etc.). */
124
+ yes: z.ZodDefault<z.ZodBoolean>;
125
+ /** Allow crawling private/internal IP ranges (SSRF protection bypass). */
126
+ allowPrivate: z.ZodDefault<z.ZodBoolean>;
127
+ /** Pixel diff threshold (0-1 scale, lower = more sensitive). */
128
+ diffThreshold: z.ZodDefault<z.ZodNumber>;
129
+ /** Maximum diff ratio (0-100%) before a screenshot is considered "changed". */
130
+ diffMaxRatio: z.ZodDefault<z.ZodNumber>;
131
+ /** Fields to ignore during API diff. The schema accepts an array of strings only, so a comma-separated CLI value has to be split into an array before validation. */
132
+ diffIgnoreFields: z.ZodDefault<z.ZodArray<z.ZodString, "many">>;
133
+ }, "strip", z.ZodTypeAny, {
134
+ depth: number;
135
+ concurrency: number;
136
+ delay: number;
137
+ timeout: number;
138
+ viewport: {
139
+ width: number;
140
+ height: number;
141
+ };
142
+ outputDir: string;
143
+ targetUrl: string;
144
+ maxPages: number;
145
+ include: string[];
146
+ exclude: string[];
147
+ followExternal: boolean;
148
+ deepClick: boolean;
149
+ includeIframes: boolean;
150
+ maxTime: number;
151
+ includeCookies: boolean;
152
+ format: "html" | "json" | "openapi" | "both";
153
+ noScreenshots: boolean;
154
+ noHar: boolean;
155
+ additionalViewports: {
156
+ width: number;
157
+ height: number;
158
+ }[];
159
+ resume: boolean;
160
+ yes: boolean;
161
+ allowPrivate: boolean;
162
+ diffThreshold: number;
163
+ diffMaxRatio: number;
164
+ diffIgnoreFields: string[];
165
+ authScript?: string | undefined;
166
+ cookiesFile?: string | undefined;
167
+ }, {
168
+ targetUrl: string;
169
+ depth?: number | undefined;
170
+ concurrency?: number | undefined;
171
+ delay?: number | undefined;
172
+ timeout?: number | undefined;
173
+ viewport?: {
174
+ width: number;
175
+ height: number;
176
+ } | undefined;
177
+ outputDir?: string | undefined;
178
+ maxPages?: number | undefined;
179
+ include?: string[] | undefined;
180
+ exclude?: string[] | undefined;
181
+ followExternal?: boolean | undefined;
182
+ deepClick?: boolean | undefined;
183
+ includeIframes?: boolean | undefined;
184
+ maxTime?: number | undefined;
185
+ authScript?: string | undefined;
186
+ cookiesFile?: string | undefined;
187
+ includeCookies?: boolean | undefined;
188
+ format?: "html" | "json" | "openapi" | "both" | undefined;
189
+ noScreenshots?: boolean | undefined;
190
+ noHar?: boolean | undefined;
191
+ additionalViewports?: {
192
+ width: number;
193
+ height: number;
194
+ }[] | undefined;
195
+ resume?: boolean | undefined;
196
+ yes?: boolean | undefined;
197
+ allowPrivate?: boolean | undefined;
198
+ diffThreshold?: number | undefined;
199
+ diffMaxRatio?: number | undefined;
200
+ diffIgnoreFields?: string[] | undefined;
201
+ }>;
202
+ type CrawlConfig = z.infer<typeof CrawlConfigSchema>;
203
+ declare const DiffConfigSchema: z.ZodObject<{
204
+ /** Path to the "old" .archaeologist bundle. */
205
+ oldBundle: z.ZodString;
206
+ /** Path to the "new" .archaeologist bundle. */
207
+ newBundle: z.ZodString;
208
+ /** Output directory for diff report and artifacts. */
209
+ outputDir: z.ZodDefault<z.ZodString>;
210
+ /** Pixel diff threshold. */
211
+ diffThreshold: z.ZodDefault<z.ZodNumber>;
212
+ /** Maximum diff ratio. */
213
+ diffMaxRatio: z.ZodDefault<z.ZodNumber>;
214
+ /** Fields to ignore during API response diff. */
215
+ diffIgnoreFields: z.ZodDefault<z.ZodArray<z.ZodString, "many">>;
216
+ /** Normalize dynamic values (UUIDs, timestamps, JWTs) before diffing. */
217
+ normalizeDynamicValues: z.ZodDefault<z.ZodBoolean>;
218
+ /** Show detailed value-level diffs (default: schema-level only). */
219
+ detailed: z.ZodDefault<z.ZodBoolean>;
220
+ /** Output formats for the diff report. */
221
+ outputFormats: z.ZodDefault<z.ZodObject<{
222
+ html: z.ZodOptional<z.ZodString>;
223
+ json: z.ZodOptional<z.ZodString>;
224
+ junit: z.ZodOptional<z.ZodString>;
225
+ markdown: z.ZodOptional<z.ZodString>;
226
+ }, "strip", z.ZodTypeAny, {
227
+ html?: string | undefined;
228
+ json?: string | undefined;
229
+ junit?: string | undefined;
230
+ markdown?: string | undefined;
231
+ }, {
232
+ html?: string | undefined;
233
+ json?: string | undefined;
234
+ junit?: string | undefined;
235
+ markdown?: string | undefined;
236
+ }>>;
237
+ }, "strip", z.ZodTypeAny, {
238
+ outputDir: string;
239
+ diffThreshold: number;
240
+ diffMaxRatio: number;
241
+ diffIgnoreFields: string[];
242
+ oldBundle: string;
243
+ newBundle: string;
244
+ normalizeDynamicValues: boolean;
245
+ detailed: boolean;
246
+ outputFormats: {
247
+ html?: string | undefined;
248
+ json?: string | undefined;
249
+ junit?: string | undefined;
250
+ markdown?: string | undefined;
251
+ };
252
+ }, {
253
+ oldBundle: string;
254
+ newBundle: string;
255
+ outputDir?: string | undefined;
256
+ diffThreshold?: number | undefined;
257
+ diffMaxRatio?: number | undefined;
258
+ diffIgnoreFields?: string[] | undefined;
259
+ normalizeDynamicValues?: boolean | undefined;
260
+ detailed?: boolean | undefined;
261
+ outputFormats?: {
262
+ html?: string | undefined;
263
+ json?: string | undefined;
264
+ junit?: string | undefined;
265
+ markdown?: string | undefined;
266
+ } | undefined;
267
+ }>;
268
+ type DiffConfig = z.infer<typeof DiffConfigSchema>;
269
+ /**
270
+ * ResolvedConfig is the fully validated and resolved config passed to
271
+ * the Orchestrator. All paths are absolute, all defaults are applied,
272
+ * and the entry URL is normalized.
273
+ */
274
+ interface ResolvedConfig extends CrawlConfig {
275
+ /** Absolute path to the resolved output directory. */
276
+ outputDir: string;
277
+ /** Absolute path to the auth script, if provided. */
278
+ authScript?: string;
279
+ /** Absolute path to the cookies file, if provided. */
280
+ cookiesFile?: string;
281
+ /** Warnings generated during config resolution (e.g. auto-prepended https://). */
282
+ warnings: string[];
283
+ }
284
+
285
+ /**
286
+ * playwright-archaeologist error types.
287
+ *
288
+ * Error hierarchy:
289
+ * ArchaeologistError (base)
290
+ * +-- ConfigError Invalid CLI options or config file
291
+ * +-- AuthError Auth script loading, execution, or validation failure
292
+ * +-- CrawlError (base) Any error during the crawl phase
293
+ * | +-- NavigationError page.goto failure (timeout, network, HTTP >= 400)
294
+ * | +-- CollectorError A collector (scanner/prober/logger/capturer) failed on a page
295
+ * | +-- BrowserContextError Context crashed or was killed
296
+ * | +-- FrontierError URL queue corruption or checkpoint I/O failure
297
+ * | +-- SecurityBlockError SSRF or protocol violation blocked a request
298
+ * +-- AssemblerError Data merging or dedup failure
299
+ * +-- ReportError Report generation failure (HTML, JSON, OpenAPI, Mermaid)
300
+ * +-- DiffError Bundle loading, comparison, or diff report failure
301
+ * +-- BundleError .archaeologist bundle I/O or integrity failure
302
+ *
303
+ * Error propagation rules:
304
+ * - NavigationError on one page: log + skip page, crawl continues
305
+ * - CollectorError on one page: log + partial data, crawl continues
306
+ * - BrowserContextError: recycle context, re-enqueue URL, crawl continues
307
+ * - FrontierError (checkpoint): warn, crawl continues without checkpointing
308
+ * - SecurityBlockError: log + skip URL, crawl continues
309
+ * - ConfigError: abort with user-facing message before crawl starts
310
+ * - AuthError: abort with user-facing message before crawl starts
311
+ * - AssemblerError: abort, partial output may exist
312
+ * - ReportError: degrade gracefully (skip failed section, warn)
313
+ * - DiffError: abort with exit code 2
314
+ * - BundleError: abort with exit code 2
315
+ */
316
+ declare class ArchaeologistError extends Error {
317
+ /** Machine-readable error code for programmatic handling. */
318
+ readonly code: string;
319
+ constructor(message: string, code: string, options?: ErrorOptions);
320
+ }
321
+ declare class ConfigError extends ArchaeologistError {
322
+ /** The config field that failed validation (e.g. "depth", "viewport"). */
323
+ readonly field: string;
324
+ /** The invalid value the user supplied. */
325
+ readonly value: unknown;
326
+ constructor(field: string, message: string, value?: unknown);
327
+ }
328
+ type AuthErrorReason = 'script_not_found' | 'script_import_failed' | 'script_no_default_export' | 'script_execution_failed' | 'script_dangerous_import' | 'script_user_declined' | 'cookie_file_not_found' | 'cookie_file_malformed' | 'auth_verification_failed';
329
+ declare class AuthError extends ArchaeologistError {
330
+ readonly reason: AuthErrorReason;
331
+ readonly scriptPath?: string;
332
+ constructor(reason: AuthErrorReason, message: string, scriptPath?: string, options?: ErrorOptions);
333
+ }
334
+ declare class CrawlError extends ArchaeologistError {
335
+ /** The URL being processed when the error occurred. */
336
+ readonly url: string;
337
+ constructor(message: string, code: string, url: string, options?: ErrorOptions);
338
+ }
339
+ type NavigationStatus = 'timeout' | 'network_error' | 'http_error' | 'no_response' | 'aborted' | 'redirect_loop';
340
+ declare class NavigationError extends CrawlError {
341
+ readonly status: NavigationStatus;
342
+ /** HTTP status code, if applicable (e.g. 404, 500). */
343
+ readonly httpStatus?: number;
344
+ constructor(url: string, status: NavigationStatus, message: string, httpStatus?: number, options?: ErrorOptions);
345
+ }
346
+ type CollectorName = 'page-scanner' | 'form-prober' | 'network-logger' | 'screenshot-capturer';
347
+ declare class CollectorError extends CrawlError {
348
+ readonly collector: CollectorName;
349
+ constructor(collector: CollectorName, url: string, message: string, options?: ErrorOptions);
350
+ }
351
+ type SecurityBlockReason = 'private_ip' | 'blocked_protocol' | 'dns_rebinding' | 'metadata_endpoint' | 'redirect_to_private';
352
+ type DiffErrorReason = 'bundle_not_found' | 'bundle_corrupt' | 'manifest_invalid' | 'incompatible_config' | 'screenshot_size_mismatch' | 'diff_generation_failed';
353
+ declare class DiffError extends ArchaeologistError {
354
+ readonly reason: DiffErrorReason;
355
+ constructor(reason: DiffErrorReason, message: string, options?: ErrorOptions);
356
+ }
357
+ type BundleErrorReason = 'create_failed' | 'extract_failed' | 'integrity_check_failed' | 'missing_manifest' | 'unsupported_version' | 'output_dir_not_writable';
358
+ declare class BundleError extends ArchaeologistError {
359
+ readonly reason: BundleErrorReason;
360
+ readonly bundlePath?: string;
361
+ constructor(reason: BundleErrorReason, message: string, bundlePath?: string, options?: ErrorOptions);
362
+ }
363
+ interface CrawlErrorEntry {
364
+ timestamp: string;
365
+ url: string;
366
+ code: string;
367
+ message: string;
368
+ collector?: CollectorName;
369
+ status?: NavigationStatus;
370
+ httpStatus?: number;
371
+ securityReason?: SecurityBlockReason;
372
+ }
373
+
374
+ /**
375
+ * playwright-archaeologist artifact type definitions.
376
+ *
377
+ * Data flow through the system:
378
+ *
379
+ * CLI (ResolvedConfig)
380
+ * |
381
+ * v
382
+ * Orchestrator ----> per-page collectors (parallel)
383
+ * | |
384
+ * | PageScanner ---> PageScanResult
385
+ * | FormProber ---> FormProbeResult
386
+ * | NetworkLogger -> NetworkLogResult
387
+ * | ScreenshotCapturer -> ScreenshotResult
388
+ * | |
389
+ * v v
390
+ * PageVisitResult (aggregates all collector results for one URL)
391
+ * |
392
+ * v
393
+ * CrawlResult (all pages + crawl metadata + errors)
394
+ * |
395
+ * v
396
+ * Assembler ---> AssembledArtifacts
397
+ * | (deduplicated, correlated, structured)
398
+ * v
399
+ * ReportGenerator
400
+ * |-- HTML report (ReportData -> report.html)
401
+ * |-- JSON output (AssembledArtifacts -> *.json files)
402
+ * |-- OpenAPI spec (ApiEndpointGroup[] -> openapi.json)
403
+ * |-- Bundle (AssembledArtifacts + screenshots -> .archaeologist ZIP)
404
+ * v
405
+ * DiffModule
406
+ * |-- loads two BundleManifest + their contents
407
+ * |-- produces DiffResult
408
+ * |-- renders diff report (HTML, JSON, JUnit, Markdown)
409
+ */
410
+
411
+ interface HeadingEntry {
412
+ level: 1 | 2 | 3 | 4 | 5 | 6;
413
+ text: string;
414
+ }
415
+ interface MetaTag {
416
+ name?: string;
417
+ property?: string;
418
+ content: string;
419
+ }
420
+ interface LandmarkInfo {
421
+ role: string;
422
+ tagName: string;
423
+ label?: string;
424
+ }
425
+ interface LinkInfo {
426
+ href: string;
427
+ text: string;
428
+ /** Whether the link is internal (same-origin) or external. */
429
+ isExternal: boolean;
430
+ /** Relationship attribute value if present (e.g. "nofollow"). */
431
+ rel?: string;
432
+ }
433
+ interface InteractiveElement {
434
+ tagName: string;
435
+ type?: string;
436
+ text: string;
437
+ role?: string;
438
+ ariaLabel?: string;
439
+ /** CSS selector that uniquely identifies this element on the page. */
440
+ selector: string;
441
+ }
442
+ interface PageTiming {
443
+ /** Total time from navigation start to load event (ms). */
444
+ loadTime: number;
445
+ /** DOM content loaded time (ms). */
446
+ domContentLoaded: number;
447
+ /** First contentful paint (ms), if available. */
448
+ firstContentfulPaint?: number;
449
+ }
450
+ /**
451
+ * Output of the PageScanner collector for a single URL.
452
+ * The Orchestrator receives this and stores it in PageVisitResult.
453
+ */
454
+ interface PageScanResult {
455
+ url: string;
456
+ /** Canonical URL if different from requested URL (e.g. after redirect). */
457
+ canonicalUrl?: string;
458
+ /** HTTP status code of the final response. */
459
+ statusCode: number;
460
+ /** Page <title> text. */
461
+ title: string;
462
+ /** Meta description and other meta tags. */
463
+ metaTags: MetaTag[];
464
+ /** Heading hierarchy. */
465
+ headings: HeadingEntry[];
466
+ /** Semantic landmark elements found on the page. */
467
+ landmarks: LandmarkInfo[];
468
+ /** All links discovered on the page (both <a href> and SPA navigation targets). */
469
+ links: LinkInfo[];
470
+ /** Buttons, dropdowns, and other interactive elements (not links). */
471
+ interactiveElements: InteractiveElement[];
472
+ /** Navigation Timing API data. */
473
+ timing: PageTiming;
474
+ /** SHA-256 hash of the page's main text content (for change detection). */
475
+ contentHash: string;
476
+ /** Whether hash-based routing was detected on this page. */
477
+ hashRoutingDetected: boolean;
478
+ }
479
+ interface FormFieldOption {
480
+ value: string;
481
+ label: string;
482
+ selected: boolean;
483
+ }
484
+ interface FormField {
485
+ /** Input name attribute. */
486
+ name: string;
487
+ /** Input type (text, email, password, select, radio, checkbox, file, hidden, etc.). */
488
+ type: string;
489
+ /** Whether the field is required (HTML5 required attribute or aria-required). */
490
+ required: boolean;
491
+ /** Validation pattern attribute value. */
492
+ pattern?: string;
493
+ /** Placeholder text. */
494
+ placeholder?: string;
495
+ /** Associated label text (from <label>, aria-label, or aria-labelledby). */
496
+ label?: string;
497
+ /** For select, radio, and checkbox: the available options. */
498
+ options?: FormFieldOption[];
499
+ /** Min/max/step for numeric inputs. */
500
+ min?: string;
501
+ max?: string;
502
+ step?: string;
503
+ /** Maxlength constraint. */
504
+ maxLength?: number;
505
+ /** Whether the field accepts multiple values (e.g. multi-select, multiple file). */
506
+ multiple?: boolean;
507
+ /** Accept attribute for file inputs. */
508
+ accept?: string;
509
+ /** Default/current value (redacted for password fields). */
510
+ defaultValue?: string;
511
+ }
512
+ interface ValidationMessage {
513
+ fieldName: string;
514
+ message: string;
515
+ /** Whether this is a browser-native validation or custom JS validation. */
516
+ type: 'native' | 'custom';
517
+ }
518
+ /**
519
+ * Output of the FormProber collector for a single URL.
520
+ * A page may contain zero or more forms.
521
+ */
522
+ interface FormProbeResult {
523
+ /** The page URL where these forms were found. */
524
+ pageUrl: string;
525
+ forms: FormInfo[];
526
+ }
527
+ interface FormInfo {
528
+ /** Form action URL (resolved to absolute). Empty string if no action attribute. */
529
+ action: string;
530
+ /** HTTP method (GET, POST, etc.). Default: GET. */
531
+ method: string;
532
+ /** Form id attribute, if present. */
533
+ id?: string;
534
+ /** Form name attribute, if present. */
535
+ name?: string;
536
+ /** Whether this is an explicit <form> or an implicit form (input group without wrapper). */
537
+ isImplicit: boolean;
538
+ /** All fields in the form. */
539
+ fields: FormField[];
540
+ /** Submit button text. */
541
+ submitButtonText?: string;
542
+ /** Encoding type (application/x-www-form-urlencoded, multipart/form-data, text/plain). */
543
+ enctype?: string;
544
+ /** Validation messages captured by attempting an empty submission. */
545
+ validationMessages: ValidationMessage[];
546
+ /** ARIA attributes on the form element. */
547
+ ariaLabel?: string;
548
+ /** Whether the form was hidden and required clicking (tab/accordion) to reveal. */
549
+ wasHidden: boolean;
550
+ }
551
+ type RequestClassification = 'api' | 'static' | 'analytics' | 'third-party' | 'websocket' | 'other';
552
+ interface CapturedRequest {
553
+ /** Unique request ID within this crawl. */
554
+ requestId: string;
555
+ /** Request URL. */
556
+ url: string;
557
+ /** HTTP method. */
558
+ method: string;
559
+ /** Request headers (sensitive headers scrubbed unless --include-cookies). */
560
+ headers: Record<string, string>;
561
+ /** Request body (truncated to 100KB). Absent for GET: the field is optional, so the value is undefined rather than null. */
562
+ body?: string;
563
+ /** Content type of the request body. */
564
+ contentType?: string;
565
+ /** Resource type as classified by the browser. */
566
+ resourceType: string;
567
+ /** Our classification of the request. */
568
+ classification: RequestClassification;
569
+ /** What initiated this request, as one of the listed categories (not a user-vs-automatic flag). */
570
+ initiator: 'navigation' | 'script' | 'fetch' | 'xhr' | 'other';
571
+ }
572
+ interface CapturedResponse {
573
+ /** ID of the request this response belongs to. */
574
+ requestId: string;
575
+ /** HTTP status code. */
576
+ statusCode: number;
577
+ /** Response headers (sensitive headers scrubbed unless --include-cookies). */
578
+ headers: Record<string, string>;
579
+ /** Response body (truncated to 100KB). Absent for redirects and non-text responses: the field is optional, so the value is undefined rather than null. */
580
+ body?: string;
581
+ /** Content type of the response. */
582
+ contentType?: string;
583
+ /** Response body size in bytes (before truncation). */
584
+ bodySize: number;
585
+ /** Response timing in milliseconds. NOTE(review): the measured interval (e.g. time to first byte vs. full download) is not stated here - confirm. */
586
+ timing: number;
587
+ }
588
+ interface FailedRequest {
589
+ requestId: string;
590
+ url: string;
591
+ method: string;
592
+ /** Error message (e.g. "net::ERR_CONNECTION_REFUSED"). */
593
+ errorText: string;
594
+ classification: RequestClassification;
595
+ }
596
+ interface GraphQLOperation {
597
+ /** The page URL where this operation was captured. */
598
+ pageUrl: string;
599
+ /** Endpoint URL (typically /graphql). */
600
+ endpointUrl: string;
601
+ /** Operation name from the query. */
602
+ operationName: string;
603
+ /** Operation type. */
604
+ operationType: 'query' | 'mutation' | 'subscription';
605
+ /** GraphQL variables sent with the operation. */
606
+ variables?: Record<string, unknown>;
607
+ /** Full query string. */
608
+ query: string;
609
+ /** Response data (truncated). */
610
+ responseData?: unknown;
611
+ }
612
+ interface WebSocketConnection {
613
+ /** WebSocket URL. */
614
+ url: string;
615
+ /** The page URL that opened this WebSocket. */
616
+ pageUrl: string;
617
+ /** Whether the connection was successfully established. */
618
+ connected: boolean;
619
+ /** Number of messages observed (sent + received). */
620
+ messageCount: number;
621
+ /** Sample messages (first 5 sent, first 5 received). */
622
+ sampleMessages: Array<{
623
+ direction: 'sent' | 'received';
624
+ data: string;
625
+ timestamp: string;
626
+ }>;
627
+ }
628
+ /**
629
+ * Output of the NetworkLogger collector for a single URL.
630
+ */
631
+ interface NetworkLogResult {
632
+ /** The page URL this network activity was captured on. */
633
+ pageUrl: string;
634
+ /** Captured requests and, in `responses`, their captured responses. The two are separate arrays; a response refers to its request through `requestId`. */
635
+ requests: CapturedRequest[];
636
+ responses: CapturedResponse[];
637
+ /** Requests that failed (network error, CORS, timeout). */
638
+ failedRequests: FailedRequest[];
639
+ /** GraphQL operations detected. */
640
+ graphqlOperations: GraphQLOperation[];
641
+ /** WebSocket connections detected. */
642
+ webSocketConnections: WebSocketConnection[];
643
+ /** Cookie mutations observed (cookies set or deleted during page visit). */
644
+ cookieMutations: Array<{
645
+ name: string;
646
+ action: 'set' | 'deleted';
647
+ domain: string;
648
+ path: string;
649
+ }>;
650
+ }
651
+ interface ScreenshotResult {
652
+ /** URL of the page these screenshots were taken on. */
653
+ pageUrl: string;
654
+ /** Filesystem path where the full-page screenshot was saved. */
655
+ fullPagePath: string;
656
+ /** Filesystem path where the viewport-only screenshot was saved. */
657
+ viewportPath: string;
658
+ /** Viewport dimensions used for capture. */
659
+ viewport: Viewport;
660
+ /** SHA-256 hash of the full-page screenshot file. */
661
+ fullPageHash: string;
662
+ /** SHA-256 hash of the viewport screenshot file. */
663
+ viewportHash: string;
664
+ /** Width and height of the full-page screenshot (presumably pixels - confirm). */
665
+ dimensions: {
666
+ width: number;
667
+ height: number;
668
+ };
669
+ /** File size in bytes. NOTE(review): two files are recorded (full-page and viewport); which one this measures is not stated - confirm. */
670
+ fileSizeBytes: number;
671
+ /** Whether a dialog or modal was detected and captured. */
672
+ modalDetected: boolean;
673
+ }
674
+ type PageVisitStatus = 'ok' | 'http_error' | 'timeout' | 'network_error' | 'no_response' | 'security_blocked' | 'skipped';
675
+ /**
676
+ * The Orchestrator produces one PageVisitResult per visited URL.
677
+ * It aggregates results from all collectors plus navigation metadata.
678
+ * Passed to the Assembler in bulk via CrawlResult.
679
+ */
680
+ interface PageVisitResult {
681
+ url: string;
682
+ /** Final URL after redirects. */
683
+ finalUrl: string;
684
+ status: PageVisitStatus;
685
+ /** HTTP status code (undefined if navigation failed before response). */
686
+ httpStatus?: number;
687
+ /** Crawl depth from the entry URL. */
688
+ depth: number;
689
+ /** Which page linked to this one (undefined for the entry URL). */
690
+ referrer?: string;
691
+ /** How this URL was discovered. */
692
+ discoveryMethod: 'link' | 'pushState' | 'replaceState' | 'navigation-api' | 'click' | 'sitemap.xml' | 'entry';
693
+ /** Timestamp when the page visit started. */
694
+ visitedAt: string;
695
+ /** Duration of the entire page visit (navigation + all collectors) in ms. */
696
+ durationMs: number;
697
+ pageScan?: PageScanResult;
698
+ formProbe?: FormProbeResult;
699
+ networkLog?: NetworkLogResult;
700
+ screenshot?: ScreenshotResult;
701
+ /** Per-collector errors (collector ran but threw). */
702
+ collectorErrors: Array<{
703
+ collector: CollectorName;
704
+ message: string;
705
+ }>;
706
+ /** Navigation edges discovered from this page. */
707
+ navigationEdges: NavigationEdge[];
708
+ }
709
+ type NavigationTrigger = 'link' | 'form-submit' | 'redirect' | 'pushState' | 'replaceState' | 'navigation-api' | 'click' | 'meta-refresh' | 'js-redirect';
710
+ interface NavigationEdge {
711
+ from: string;
712
+ to: string;
713
+ trigger: NavigationTrigger;
714
+ /** Text of the element that triggered navigation (link text, button text). */
715
+ triggerText?: string;
716
+ /** CSS selector of the element that triggered navigation. */
717
+ triggerSelector?: string;
718
+ }
719
+ /**
720
+ * CrawlResult is the complete output of the Orchestrator.
721
+ * Passed to the Assembler for deduplication, correlation, and structuring.
722
+ */
723
+ interface CrawlResult {
724
+ /** Crawl configuration that was used. */
725
+ config: {
726
+ targetUrl: string;
727
+ depth: number;
728
+ maxPages: number;
729
+ concurrency: number;
730
+ viewport: Viewport;
731
+ followExternal: boolean;
732
+ deepClick: boolean;
733
+ };
734
+ /** When the crawl started. */
735
+ startedAt: string;
736
+ /** When the crawl finished. */
737
+ finishedAt: string;
738
+ /** Total crawl duration in milliseconds. */
739
+ durationMs: number;
740
+ /** All page visit results, one per visited URL. */
741
+ pages: PageVisitResult[];
742
+ /** URLs that were discovered but not visited (depth/max-pages limit reached). */
743
+ unvisitedUrls: string[];
744
+ /** Errors encountered during the crawl. */
745
+ errors: CrawlErrorEntry[];
746
+ /** Whether the crawl completed fully or was interrupted. */
747
+ completionStatus: 'complete' | 'max_pages_reached' | 'max_time_reached' | 'interrupted' | 'error';
748
+ }
749
+ interface RouteNode {
750
+ /** URL path segment (e.g. "users" or ":id"). */
751
+ segment: string;
752
+ /** Full URL for this route. */
753
+ url: string;
754
+ /** Page title. */
755
+ title: string;
756
+ /** HTTP status code. */
757
+ statusCode: number;
758
+ /** Content hash for change detection. */
759
+ contentHash: string;
760
+ /** Headings on this page. */
761
+ headings: HeadingEntry[];
762
+ /** Landmarks on this page. */
763
+ landmarks: LandmarkInfo[];
764
+ /** Depth from root. */
765
+ depth: number;
766
+ /** Child route nodes. */
767
+ children: RouteNode[];
768
+ /** Number of forms on this page. */
769
+ formCount: number;
770
+ /** Number of API calls made from this page. */
771
+ apiCallCount: number;
772
+ /** Whether this page has a screenshot. */
773
+ hasScreenshot: boolean;
774
+ }
775
+ /** Aggregate of the observed calls that share one URL pattern and HTTP method (element type of AssembledArtifacts.apiEndpoints). */ interface ApiEndpointGroup {
776
+ /** URL pattern with parameter placeholders (e.g. "/api/users/:id"). */
777
+ pattern: string;
778
+ /** HTTP method. */
779
+ method: string;
780
+ /** Request classification (see RequestClassification). */
781
+ classification: RequestClassification;
782
+ /** All observed concrete URLs that matched this pattern. */
783
+ observedUrls: string[];
784
+ /** Number of times this endpoint was called across the crawl. */
785
+ callCount: number;
786
+ /** Pages that called this endpoint. */
787
+ callingPages: string[];
788
+ /** Example request (first observed); body and contentType are optional. */
789
+ exampleRequest: {
790
+ url: string;
791
+ headers: Record<string, string>;
792
+ body?: string;
793
+ contentType?: string;
794
+ };
795
+ /** Example response (first observed); body and contentType are optional. */
796
+ exampleResponse: {
797
+ statusCode: number;
798
+ headers: Record<string, string>;
799
+ body?: string;
800
+ contentType?: string;
801
+ /** Response body size (presumably in bytes — confirm against the collector). */ bodySize: number;
802
+ };
803
+ /** All unique status codes observed. */
804
+ observedStatusCodes: number[];
805
+ /** Response content types observed. */
806
+ observedContentTypes: string[];
807
+ /** Whether this endpoint appears to be GraphQL. */
808
+ isGraphQL: boolean;
809
+ /** GraphQL operation names if applicable. */
810
+ graphqlOperations?: string[];
811
+ }
812
+ /** Navigation flow graph as stored in AssembledArtifacts.flowGraph. NOTE(review): the `$1` suffix presumably comes from the declaration bundler disambiguating this type from the separate FlowGraph interface declared later in this file — confirm. */ interface FlowGraph$1 {
813
+ /** All unique nodes (URLs). */
814
+ nodes: Array<{
815
+ url: string;
816
+ title: string;
817
+ /** Route group this node belongs to (for clustering). */
818
+ cluster: string;
819
+ }>;
820
+ /** All navigation edges. */
821
+ edges: NavigationEdge[];
822
+ /** Entry point URL. */
823
+ entryPoint: string;
824
+ /** Dead-end pages (no outgoing links). */
825
+ deadEnds: string[];
826
+ /** Pages that form cycles (presumably one inner array of URLs per cycle — confirm). */
827
+ cycles: string[][];
828
+ }
829
+ /** Mermaid diagrams for the navigation flow: one overview plus per-section sub-diagrams. */ interface FlowDiagramSet {
830
+ /** Top-level overview diagram (max 50 nodes). */
831
+ overview: MermaidDiagram;
832
+ /** Per-section sub-diagrams. */
833
+ sections: Array<{
834
+ /** Route prefix for this section (e.g. "/users"). */
835
+ prefix: string;
836
+ /** Diagram for this section. */ diagram: MermaidDiagram;
837
+ }>;
838
+ }
839
+ /** A single Mermaid diagram: its definition text plus an optional pre-rendered SVG. */ interface MermaidDiagram {
840
+ /** Raw Mermaid definition text (for .mmd file output). */
841
+ definition: string;
842
+ /** Pre-rendered SVG string (for HTML report embedding). Undefined if rendering failed. */
843
+ svg?: string;
844
+ }
845
+ /** Manifest record for one page's screenshots: file paths, dimensions, hash, and an inline thumbnail (element type of AssembledArtifacts.screenshots). */ interface ScreenshotManifestEntry {
846
+ /** Page URL. */
847
+ url: string;
848
+ /** Page title. */
849
+ title: string;
850
+ /** Relative path to the full-page screenshot (from output dir). */
851
+ fullPagePath: string;
852
+ /** Relative path to the viewport screenshot. */
853
+ viewportPath: string;
854
+ /** Viewport dimensions. */
855
+ viewport: Viewport;
856
+ /** Full-page image dimensions. */
857
+ dimensions: {
858
+ width: number;
859
+ height: number;
860
+ };
861
+ /** SHA-256 hash of the full-page screenshot. */
862
+ fullPageHash: string;
863
+ /** Base64-encoded JPEG thumbnail (320px wide, quality 60). */
864
+ thumbnailBase64: string;
865
+ }
866
+ /**
867
+ * AssembledArtifacts is the structured, deduplicated output of the Assembler.
868
+ * This is the primary input to the ReportGenerator; it is also exposed to API callers as DigResult.artifacts.
869
+ */
870
+ interface AssembledArtifacts {
871
+ /** Crawl metadata. */
872
+ meta: CrawlMeta;
873
+ /** Hierarchical route tree (root node). */
874
+ routeTree: RouteNode;
875
+ /** Flat list of all discovered routes (for search/filter). */
876
+ routes: RouteInfo[];
877
+ /** All forms discovered, deduplicated by (pageUrl + action + method). */
878
+ forms: FormInfo[];
879
+ /** API endpoints grouped by URL pattern. */
880
+ apiEndpoints: ApiEndpointGroup[];
881
+ /** GraphQL operations discovered. */
882
+ graphqlOperations: GraphQLOperation[];
883
+ /** WebSocket connections discovered. */
884
+ webSocketConnections: WebSocketConnection[];
885
+ /** Navigation flow graph. */
886
+ flowGraph: FlowGraph$1;
887
+ /** Mermaid diagrams (overview + sections). */
888
+ flowDiagrams: FlowDiagramSet;
889
+ /** Screenshot manifest with thumbnail data. */
890
+ screenshots: ScreenshotManifestEntry[];
891
+ /** Crawl errors summary. */
892
+ errors: CrawlErrorEntry[];
893
+ }
894
+ /** Flat per-route record (element type of AssembledArtifacts.routes): RouteNode's page-level fields without segment, headings, landmarks, or children, plus the discovery method. */ interface RouteInfo {
895
+ /** Full URL for this route. */ url: string;
896
+ /** Page title. */ title: string;
897
+ /** HTTP status code. */ statusCode: number;
898
+ /** Content hash for change detection. */ contentHash: string;
899
+ /** Depth from root. */ depth: number;
900
+ /** Number of forms on this page. */ formCount: number;
901
+ /** Number of API calls made from this page. */ apiCallCount: number;
902
+ /** Whether this page has a screenshot. */ hasScreenshot: boolean;
903
+ /** How this page was discovered (same type as PageVisitResult.discoveryMethod). */ discoveryMethod: PageVisitResult['discoveryMethod'];
904
+ }
905
+ /** Summary metadata and totals for one crawl (see AssembledArtifacts.meta). */ interface CrawlMeta {
906
+ /** Tool version (from package.json). */
907
+ toolVersion: string;
908
+ /** Target URL that was crawled. */
909
+ targetUrl: string;
910
+ /** When the crawl started (ISO 8601). */
911
+ crawlDate: string;
912
+ /** Total crawl duration in milliseconds. */
913
+ duration: number;
914
+ /** Total pages visited. */
915
+ pagesVisited: number;
916
+ /** Total pages discovered (visited + unvisited). */
917
+ pagesDiscovered: number;
918
+ /** Total forms found. */
919
+ formsFound: number;
920
+ /** Total unique API endpoint patterns found. */
921
+ apiEndpointsFound: number;
922
+ /** Total screenshots taken. */
923
+ screenshotsTaken: number;
924
+ /** Total errors encountered. */
925
+ errorCount: number;
926
+ /** How the crawl ended (same union as CrawlResult.completionStatus). */
927
+ completionStatus: CrawlResult['completionStatus'];
928
+ /** Viewport used for primary screenshots. */
929
+ viewport: Viewport;
930
+ }
931
+ /** Manifest describing a bundle: provenance, the crawl configuration, crawl statistics, and a checksummed file list. */ interface BundleManifest {
932
+ /** Schema version for forward compatibility (currently the literal 1). */
933
+ version: 1;
934
+ /** Tool identifier. */
935
+ tool: 'playwright-archaeologist';
936
+ /** Tool version that created this bundle. */
937
+ toolVersion: string;
938
+ /** When the bundle was created (ISO 8601). */
939
+ createdAt: string;
940
+ /** Crawl configuration (for diff compatibility checking). */
941
+ config: {
942
+ targetUrl: string;
943
+ depth: number;
944
+ viewport: Viewport;
945
+ concurrency: number;
946
+ maxPages: number;
947
+ followExternal: boolean;
948
+ deepClick: boolean;
949
+ };
950
+ /** Crawl statistics. */
951
+ stats: {
952
+ pagesVisited: number;
953
+ formsFound: number;
954
+ apiEndpoints: number;
955
+ screenshotCount: number;
956
+ /** Crawl duration (presumably milliseconds, as in CrawlMeta.duration — confirm). */ duration: number;
957
+ };
958
+ /** All files in the bundle with checksums. */
959
+ files: BundleFileEntry[];
960
+ }
961
+ /** One file listed in BundleManifest.files, with its checksum, size, and category. */ interface BundleFileEntry {
962
+ /** Relative path within the ZIP archive. */
963
+ path: string;
964
+ /** SHA-256 hex digest. */
965
+ sha256: string;
966
+ /** Uncompressed size in bytes. */
967
+ size: number;
968
+ /** File category. */
969
+ type: 'screenshot' | 'json' | 'api-snapshot' | 'manifest';
970
+ }
971
+ /** Result of comparing an old bundle against a new one: per-category diffs plus summary counts. */ interface DiffResult {
972
+ /** Metadata about the comparison. */
973
+ meta: {
974
+ /** Identification of the old (baseline) bundle. */ oldBundle: {
975
+ path: string;
976
+ createdAt: string;
977
+ targetUrl: string;
978
+ toolVersion: string;
979
+ };
980
+ /** Identification of the new bundle. */ newBundle: {
981
+ path: string;
982
+ createdAt: string;
983
+ targetUrl: string;
984
+ toolVersion: string;
985
+ };
986
+ /** When the comparison was made (presumably ISO 8601, like the other timestamps — confirm). */ comparedAt: string;
987
+ };
988
+ /** Overall change status (presumably true when any category reports a change — confirm). */
989
+ hasChanges: boolean;
990
+ /** Route/sitemap diff. */
991
+ sitemap: SitemapDiff;
992
+ /** Form diff. */
993
+ forms: FormDiff;
994
+ /** API endpoint diff. */
995
+ api: ApiDiff;
996
+ /** Screenshot diff. */
997
+ screenshots: ScreenshotDiff;
998
+ /** Summary counts for terminal and CI output. */
999
+ summary: DiffSummary;
1000
+ }
1001
+ /** Added/removed/changed/unchanged counts for each diff category (see DiffResult.summary). */ interface DiffSummary {
1002
+ /** Route counts. */ routes: {
1003
+ added: number;
1004
+ removed: number;
1005
+ changed: number;
1006
+ unchanged: number;
1007
+ };
1008
+ /** Form counts. */ forms: {
1009
+ added: number;
1010
+ removed: number;
1011
+ changed: number;
1012
+ unchanged: number;
1013
+ };
1014
+ /** API endpoint counts. */ api: {
1015
+ added: number;
1016
+ removed: number;
1017
+ changed: number;
1018
+ unchanged: number;
1019
+ };
1020
+ /** Screenshot counts. */ screenshots: {
1021
+ changed: number;
1022
+ unchanged: number;
1023
+ added: number;
1024
+ removed: number;
1025
+ };
1026
+ }
1027
+ /** Route-level differences between the two bundles (see DiffResult.sitemap). */ interface SitemapDiff {
1028
+ /** Routes reported as added (presumably present only in the new bundle — confirm). */ added: RouteInfo[];
1029
+ /** Routes reported as removed (presumably present only in the old bundle — confirm). */ removed: RouteInfo[];
1030
+ /** Routes whose attributes differ between the bundles. */ changed: SitemapRouteChange[];
1031
+ /** Number of routes with no detected change. */ unchangedCount: number;
1032
+ }
1033
+ /** Changes recorded for one route; each key of `changes` is optional and, when present, carries the old and new value. */ interface SitemapRouteChange {
1034
+ /** URL of the changed route. */ url: string;
1035
+ changes: {
1036
+ title?: {
1037
+ old: string;
1038
+ new: string;
1039
+ };
1040
+ statusCode?: {
1041
+ old: number;
1042
+ new: number;
1043
+ };
1044
+ contentHash?: {
1045
+ old: string;
1046
+ new: string;
1047
+ };
1048
+ };
1049
+ }
1050
+ /** Form-level differences between the two bundles (see DiffResult.forms). */ interface FormDiff {
1051
+ /** Forms reported as added. */ added: FormInfo[];
1052
+ /** Forms reported as removed. */ removed: FormInfo[];
1053
+ /** Forms present in both bundles whose definition differs. */ changed: FormChange[];
1054
+ /** Number of forms with no detected change. */ unchangedCount: number;
1055
+ }
1056
+ /** Changes recorded for one form; every key of `changes` is optional and present only for the aspect it describes. */ interface FormChange {
1057
+ /** Identity key (action:method or id). */
1058
+ formId: string;
1059
+ /** Page URL where the form lives. */
1060
+ pageUrl: string;
1061
+ changes: {
1062
+ /** Fields added to the form. */ fieldsAdded?: FormField[];
1063
+ /** Fields removed from the form. */ fieldsRemoved?: FormField[];
1064
+ /** Per-field attribute changes. */ fieldsChanged?: FieldChange[];
1065
+ /** Old and new form action. */ actionChanged?: {
1066
+ old: string;
1067
+ new: string;
1068
+ };
1069
+ /** Old and new form method. */ methodChanged?: {
1070
+ old: string;
1071
+ new: string;
1072
+ };
1073
+ };
1074
+ }
1075
+ /** Attribute-level changes for a single form field (element type of FormChange.changes.fieldsChanged). */ interface FieldChange {
1076
+ /** Name of the changed field. */ name: string;
1077
+ /** Each key is optional; scalar attributes carry old/new values, `options` carries added/removed lists. */ changes: {
1078
+ type?: {
1079
+ old: string;
1080
+ new: string;
1081
+ };
1082
+ required?: {
1083
+ old: boolean;
1084
+ new: boolean;
1085
+ };
1086
+ options?: {
1087
+ added: string[];
1088
+ removed: string[];
1089
+ };
1090
+ validationPattern?: {
1091
+ old: string;
1092
+ new: string;
1093
+ };
1094
+ placeholder?: {
1095
+ old: string;
1096
+ new: string;
1097
+ };
1098
+ };
1099
+ }
1100
+ /** API-endpoint differences between the two bundles (see DiffResult.api). */ interface ApiDiff {
1101
+ /** Endpoint groups reported as added. */ added: ApiEndpointGroup[];
1102
+ /** Endpoint groups reported as removed. */ removed: ApiEndpointGroup[];
1103
+ /** Endpoints present in both bundles whose observed behavior differs. */ changed: ApiEndpointChange[];
1104
+ /** Number of endpoints with no detected change. */ unchangedCount: number;
1105
+ }
1106
+ /** Changes recorded for one API endpoint; every key of `changes` is optional. */ interface ApiEndpointChange {
1107
+ /** Endpoint pattern (e.g. "GET /api/users/:id"). */
1108
+ endpointId: string;
1109
+ changes: {
1110
+ /** Status codes that appeared. */
1111
+ statusCodesAdded?: number[];
1112
+ /** Status codes that disappeared. */ statusCodesRemoved?: number[];
1113
+ /** Response content type changed. */
1114
+ contentTypeChanged?: {
1115
+ old: string;
1116
+ new: string;
1117
+ };
1118
+ /** Response schema fields added/removed (jsondiffpatch delta); typed `unknown`, so consumers must narrow before use. */
1119
+ schemaDelta?: unknown;
1120
+ /** Response body size changed significantly. */
1121
+ bodySizeChange?: {
1122
+ old: number;
1123
+ new: number;
1124
+ percentChange: number;
1125
+ };
1126
+ };
1127
+ }
1128
+ /** Screenshot differences between the two bundles (see DiffResult.screenshots). */ interface ScreenshotDiff {
1129
+ /** Pages whose screenshot is reported as added, with the screenshot path. */ added: Array<{
1130
+ url: string;
1131
+ screenshotPath: string;
1132
+ }>;
1133
+ /** Pages whose screenshot is reported as removed (URL only). */ removed: Array<{
1134
+ url: string;
1135
+ }>;
1136
+ /** Pages whose screenshot differs, with pixel-diff details. */ changed: ScreenshotChange[];
1137
+ /** Number of screenshots with no detected change. */ unchangedCount: number;
1138
+ }
1139
+ /** Pixel-diff details for one page whose screenshot changed (element type of ScreenshotDiff.changed). */ interface ScreenshotChange {
1140
+ /** URL of the page. */ url: string;
1141
+ /** Percentage of pixels that differ (0-100). */
1142
+ diffPercentage: number;
1143
+ /** Number of pixels that differ. */
1144
+ diffPixelCount: number;
1145
+ /** Total pixels in the image. */
1146
+ totalPixels: number;
1147
+ /** Path to the diff image (red overlay on semi-transparent original). */
1148
+ diffImagePath: string;
1149
+ /** Path to the old screenshot. */
1150
+ oldScreenshotPath: string;
1151
+ /** Path to the new screenshot. */
1152
+ newScreenshotPath: string;
1153
+ /** Old and new image dimensions; present only when the dimensions changed. */
1154
+ dimensionChange?: {
1155
+ old: {
1156
+ width: number;
1157
+ height: number;
1158
+ };
1159
+ new: {
1160
+ width: number;
1161
+ height: number;
1162
+ };
1163
+ };
1164
+ }
1165
+ /** Crawl state persisted by the checkpoint functions (writeCheckpoint / readCheckpoint) so an interrupted crawl can be resumed. */ interface CheckpointState {
1166
+ /** Schema version (currently the literal 1). */
1167
+ version: 1;
1168
+ /** When the crawl started. */
1169
+ startedAt: string;
1170
+ /** When this checkpoint was written. */
1171
+ checkpointedAt: string;
1172
+ /** URLs still in the frontier queue (to visit); entries have the same shape as FrontierEntry. */
1173
+ frontier: Array<{
1174
+ url: string;
1175
+ depth: number;
1176
+ referrer?: string;
1177
+ }>;
1178
+ /** URLs already visited (normalized). */
1179
+ visited: string[];
1180
+ /** URLs that were skipped (security block, scope violation, etc.). */
1181
+ skipped: string[];
1182
+ /** Partial artifacts path (relative to output dir). */
1183
+ artifactsDir: string;
1184
+ /** Number of pages visited so far. */
1185
+ pagesVisited: number;
1186
+ /** Errors accumulated so far. */
1187
+ errors: CrawlErrorEntry[];
1188
+ /** Crawl config hash to verify resume compatibility. */
1189
+ configHash: string;
1190
+ }
1191
+ /**
1192
+ * The public programmatic API (the `dig` function) returns AssembledArtifacts plus file paths
1193
+ * to the generated output, the crawl errors, and the completion status.
1194
+ */
1195
+ interface DigResult {
1196
+ /** Assembled crawl artifacts. */ artifacts: AssembledArtifacts;
1197
+ /** Paths of generated outputs; only outputDir is always present, the rest are optional. */ outputPaths: {
1198
+ outputDir: string;
1199
+ reportHtml?: string;
1200
+ sitemapJson?: string;
1201
+ formsJson?: string;
1202
+ apiMapJson?: string;
1203
+ openapiJson?: string;
1204
+ flowGraphMmd?: string;
1205
+ harFile?: string;
1206
+ screenshotsDir?: string;
1207
+ bundle?: string;
1208
+ };
1209
+ /** Crawl errors (NOTE(review): relationship to artifacts.errors is not visible here — confirm whether they are the same list). */ errors: CrawlErrorEntry[];
1210
+ /** How the crawl ended (same union as CrawlResult.completionStatus). */ completionStatus: CrawlResult['completionStatus'];
1211
+ }
1212
+
1213
+ /**
1214
+ * Crawl orchestrator — the heart of M1.
1215
+ *
1216
+ * Coordinates browser launch, secure context creation, BFS frontier loop,
1217
+ * per-page navigation and scanning, SSRF protection, graceful shutdown,
1218
+ * and output generation.
1219
+ *
1220
+ * Error strategy:
1221
+ * - Per-page errors are logged and the page is skipped (crawl continues).
1222
+ * - SIGINT triggers graceful shutdown: stop loop, write partial results, close browser.
1223
+ * - Fatal errors (browser crash, output dir unwritable) abort the crawl.
1224
+ */
1225
+
1226
+ /**
1227
+ * Run a full crawl of the target URL.
1228
+ *
1229
+ * @param config - Fully validated and resolved crawl configuration.
1230
+ * @returns A promise for the assembled artifacts, output paths, crawl errors, and completion status.
1231
+ */
1232
+ declare function dig(config: ResolvedConfig): Promise<DigResult>;
1233
+
1234
+ /**
1235
+ * Leveled logger for playwright-archaeologist.
1236
+ *
1237
+ * Provides debug, info, warn, error, and success log methods
1238
+ * with timestamps and ANSI color output. Debug messages are
1239
+ * suppressed unless verbose mode is enabled.
1240
+ */
1241
+ /** Log levels accepted by Logger as its minimum level. Note: 'success' is a Logger method, not a level. */ type LogLevel = 'debug' | 'info' | 'warn' | 'error';
1242
+ /** Leveled logger with a runtime-adjustable minimum level (see setLevel / isLevelEnabled). */ declare class Logger {
1243
+ /** Current minimum log level (read via getLevel, changed via setLevel). */ private level;
1244
+ /** @param level Initial minimum level; the default used when omitted is not visible in this declaration. */ constructor(level?: LogLevel);
1245
+ /** Update the minimum log level at runtime. */
1246
+ setLevel(level: LogLevel): void;
1247
+ /** Get the current minimum log level. */
1248
+ getLevel(): LogLevel;
1249
+ /** Check whether a given level would be emitted. */
1250
+ isLevelEnabled(level: LogLevel): boolean;
1251
+ /** Log at 'debug' level; extra args are passed through alongside the message. */ debug(message: string, ...args: unknown[]): void;
1252
+ /** Log at 'info' level. */ info(message: string, ...args: unknown[]): void;
1253
+ /** Log at 'warn' level. */ warn(message: string, ...args: unknown[]): void;
1254
+ /** Log at 'error' level. */ error(message: string, ...args: unknown[]): void;
1255
+ /** Success message — always shown at info level or below. */
1256
+ success(message: string, ...args: unknown[]): void;
1257
+ }
1258
+ /**
1259
+ * Shared singleton Logger instance.
1260
+ * The CLI sets its level to 'debug' when --verbose is passed.
1261
+ */
1262
+ declare const logger: Logger;
1263
+
1264
+ /**
1265
+ * PageScanner collector — extracts structural information from a page.
1266
+ *
1267
+ * Receives a Playwright Page that has already navigated to the target URL.
1268
+ * Does NOT navigate, click, or scroll — it only reads the current DOM state.
1269
+ *
1270
+ * Extracts:
1271
+ * - URL, canonical URL, title
1272
+ * - Meta tags (name, property, content)
1273
+ * - Heading hierarchy (h1-h6)
1274
+ * - Landmark elements (nav, main, aside, footer, header, [role])
1275
+ * - Links (<a href>) with internal/external classification
1276
+ * - Interactive elements (buttons, inputs, selects) with CSS selectors
1277
+ * - Navigation Timing data
1278
+ * - Content hash for change detection
1279
+ * - Hash-routing detection
1280
+ */
1281
+
1282
+ /**
1283
+ * Scan a page and extract its structural information.
1284
+ *
1285
+ * @param page Playwright Page already navigated to the target URL.
1286
+ * @param baseUrl The crawl's starting URL, used for same-origin checks.
1287
+ * @param response The navigation response (for HTTP status code).
1288
+ * Pass null/undefined if not available; statusCode defaults to 200.
+ * @returns A promise for the PageScanResult describing the page.
1289
+ */
1290
+ declare function scanPage(page: Page, baseUrl: string, response?: Response | null): Promise<PageScanResult>;
1291
+
1292
+ /**
1293
+ * ScreenshotCapturer collector — captures full-page and viewport screenshots.
1294
+ *
1295
+ * Receives a Playwright Page that has already navigated to the target URL.
1296
+ * Takes two screenshots per page:
1297
+ * 1. Full-page screenshot (captures entire scrollable area)
1298
+ * 2. Viewport-only screenshot (captures visible area only)
1299
+ *
1300
+ * Before capturing, performs limited scrolling (up to 3 viewport increments)
1301
+ * to trigger lazy-loaded content, then scrolls back to the top.
1302
+ *
1303
+ * Detects visible modals/dialogs on the page for metadata.
1304
+ * Gracefully degrades: returns an empty result if screenshots fail.
1305
+ */
1306
+
1307
+ /**
1308
+ * Capture full-page and viewport screenshots of a page.
1309
+ *
1310
+ * @param page Playwright Page already navigated to the target URL.
1311
+ * @param baseUrl The crawl's starting URL (unused here, kept for collector interface consistency).
1312
+ * @param outputDir Directory where screenshot files will be saved.
1313
+ * @param viewport Viewport dimensions used for capture. Defaults to 1280x720.
1314
+ * @returns A promise for a ScreenshotResult with paths, hashes, and metadata.
1315
+ */
1316
+ declare function captureScreenshots(page: Page, baseUrl: string, outputDir: string, viewport?: Viewport): Promise<ScreenshotResult>;
1317
+
1318
+ /**
1319
+ * FormProber collector — extracts form metadata from a page.
1320
+ *
1321
+ * Receives a Playwright Page that has already navigated to the target URL.
1322
+ * Does NOT submit forms or trigger validation — it only reads the current DOM state.
1323
+ *
1324
+ * Extracts:
1325
+ * - Explicit <form> elements with all fields, options, and attributes
1326
+ * - Implicit forms (orphaned input groups outside any <form> wrapper)
1327
+ * - Submit button text
1328
+ * - Field labels, validation attributes, and default values
1329
+ * - Password field values are redacted
1330
+ */
1331
+
1332
+ /**
1333
+ * Probe all forms on the page and extract their metadata.
1334
+ *
1335
+ * @param page Playwright Page already navigated to the target URL.
1336
+ * @param pageUrl The URL of the page being probed.
+ * @returns A promise for the FormProbeResult for this page.
1337
+ */
1338
+ declare function probeForms(page: Page, pageUrl: string): Promise<FormProbeResult>;
1339
+
1340
+ /**
1341
+ * NetworkLogger collector — captures all network activity on a page.
1342
+ *
1343
+ * Attaches to Playwright Page events to record:
1344
+ * - HTTP requests and responses (with header scrubbing)
1345
+ * - Failed requests (network errors, CORS, timeouts)
1346
+ * - GraphQL operations (detected from POST to /graphql endpoints)
1347
+ * - WebSocket connections and sample messages
1348
+ *
1349
+ * Usage:
1350
+ * const logger = createNetworkLogger(page, pageUrl);
1351
+ * logger.start();
1352
+ * // ... navigate, interact ...
1353
+ * const result = logger.stop();
1354
+ */
1355
+
1356
+ /** Optional settings for createNetworkLogger. */ interface NetworkLoggerOptions {
1357
+ /** Whether to include cookies in the captured data; the default when omitted is not visible in this declaration. */ includeCookies?: boolean;
1358
+ }
1359
+ /** Handle returned by createNetworkLogger: call start() before navigating or interacting, then stop() to obtain the result. */ interface NetworkLogger {
1360
+ /** Begin capturing network activity. */ start(): void;
1361
+ /** End capturing and return the collected NetworkLogResult (returned synchronously, not as a promise). */ stop(): NetworkLogResult;
1362
+ }
1363
+ /**
1364
+ * Create a NetworkLogger that captures all network activity on a page.
1365
+ *
1366
+ * @param page Playwright Page to attach listeners to.
1367
+ * @param pageUrl The page URL (used for origin comparison and GraphQL context).
1368
+ * @param options Optional configuration (e.g. whether to include cookies).
+ * @returns A NetworkLogger exposing start() and stop().
1369
+ */
1370
+ declare function createNetworkLogger(page: Page, pageUrl: string, options?: NetworkLoggerOptions): NetworkLogger;
1371
+
1372
+ /**
1373
+ * Auth Handler
1374
+ *
1375
+ * Manages authentication flows for the crawler. Supports three methods
1376
+ * in priority order: storage state, cookie injection, and auth scripts.
1377
+ *
1378
+ * Storage state is the most direct method (restores full browser state).
1379
+ * Cookie injection loads cookies from a JSON file.
1380
+ * Auth scripts execute user-provided Playwright automation to log in.
1381
+ */
1382
+
1383
+ /** Authentication inputs for executeAuth / refreshAuth; all fields are optional. */ interface AuthOptions {
1384
+ /** Path to auth script (JS/TS file that exports default async function(page)) */
1385
+ authScript?: string;
1386
+ /** Path to cookies JSON file */
1387
+ cookiesFile?: string;
1388
+ /** Path to save/load storage state */
1389
+ storageStatePath?: string;
1390
+ }
1391
+ /** Outcome of an authentication attempt (returned by executeAuth and refreshAuth). */ interface AuthResult {
1392
+ /** Whether authentication succeeded. */ success: boolean;
1393
+ /** Which auth method the result refers to. */ method: 'script' | 'cookies' | 'storageState';
1394
+ /** Error message if auth failed */
1395
+ error?: string;
1396
+ /** URL after auth completed */
1397
+ finalUrl?: string;
1398
+ /** Cookies set during auth */
1399
+ cookieCount: number;
1400
+ }
1401
+ /**
1402
+ * Detect whether the current page state indicates an auth failure.
1403
+ *
1404
+ * Checks:
1405
+ * - URL matches common login page patterns
1406
+ * - Page title or response status hints at auth problems
+ *
+ * @param page Playwright Page to inspect.
+ * @returns A promise for `{ failed, reason? }`; `reason` is optional.
1407
+ */
1408
+ declare function detectAuthFailure(page: Page): Promise<{
1409
+ failed: boolean;
1410
+ reason?: string;
1411
+ }>;
1412
+ /**
1413
+ * Execute an authentication flow.
1414
+ *
1415
+ * Priority order: storageState > cookies > authScript.
1416
+ * Uses the most direct method available. If storage state path is provided
1417
+ * but doesn't exist yet, falls through to script execution and saves state after.
1418
+ *
1419
+ * @param page - Playwright Page instance
1420
+ * @param context - Playwright BrowserContext instance
1421
+ * @param options - Auth configuration (see AuthOptions)
1422
+ * @returns A promise for the AuthResult describing the auth outcome
1423
+ */
1424
+ declare function executeAuth(page: Page, context: BrowserContext, options: AuthOptions): Promise<AuthResult>;
1425
+ /**
1426
+ * Re-run the authentication flow. Used when a session expires during a crawl.
1427
+ *
1428
+ * This is a thin wrapper around executeAuth that clears existing cookies
1429
+ * before re-running to avoid stale state. Parameters and return type match executeAuth.
1430
+ */
1431
+ declare function refreshAuth(page: Page, context: BrowserContext, options: AuthOptions): Promise<AuthResult>;
1432
+
1433
+ /**
1434
+ * URL Frontier -- BFS queue with deduplication.
1435
+ *
1436
+ * Maintains a queue of URLs to visit with:
1437
+ * - BFS ordering (FIFO)
1438
+ * - Deduplication using normalized URLs
1439
+ * - Depth tracking per URL
1440
+ *
1441
+ * Uses a dequeue pointer instead of Array.shift() to avoid O(n)
1442
+ * cost on large queues. Compacts the internal array when more than
1443
+ * half of its entries have been consumed.
1444
+ */
1445
+ /** One queued URL in the Frontier, with its crawl depth and optional referrer. */ interface FrontierEntry {
1446
+ /** URL to visit. */ url: string;
1447
+ /** Depth tracked for this URL. */ depth: number;
1448
+ /** Optional referrer (presumably the URL of the page that linked here — confirm). */ referrer?: string;
1449
+ }
1450
+ /** BFS (FIFO) URL queue with de-duplication and per-URL depth tracking. */ declare class Frontier {
1451
+ /** Backing array of queued entries. */ private queue;
1452
+ /** Dequeue pointer into `queue` (used instead of Array.shift(), per the module comment). */ private head;
1453
+ /** URLs seen so far, used for de-duplication. */ private seen;
1454
+ /** Maximum depth accepted by enqueue(). */ private maxDepth;
1455
+ /** @param options Optional settings; the default applied when maxDepth is omitted is not visible in this declaration. */ constructor(options?: {
1456
+ maxDepth?: number;
1457
+ });
1458
+ /**
1459
+ * Add a URL to the queue. Returns false if already seen or exceeds max depth.
1460
+ */
1461
+ enqueue(entry: FrontierEntry): boolean;
1462
+ /**
1463
+ * Remove and return the next URL from the queue (FIFO).
1464
+ * Returns undefined if queue is empty.
1465
+ */
1466
+ dequeue(): FrontierEntry | undefined;
1467
+ /**
1468
+ * Check if a URL has been seen (ever enqueued, whether still queued or already dequeued).
1469
+ */
1470
+ hasSeen(url: string): boolean;
1471
+ /**
1472
+ * Get the number of URLs remaining in the queue.
1473
+ */
1474
+ get size(): number;
1475
+ /**
1476
+ * Get the total number of URLs seen (visited + queued).
1477
+ */
1478
+ get totalSeen(): number;
1479
+ /**
1480
+ * Check if the queue is empty.
1481
+ */
1482
+ get isEmpty(): boolean;
1483
+ }
1484
+
1485
+ /**
1486
+ * Browser context pool for concurrent crawling.
1487
+ *
1488
+ * Manages a pool of Playwright BrowserContexts with:
1489
+ * - Concurrency limiting via async semaphore
1490
+ * - Automatic context recycling after N page visits
1491
+ * - Shared context options and storage state
1492
+ *
1493
+ * Callers acquire a context before visiting a page and release it
1494
+ * when done. If the pool is at capacity, acquire() blocks until a
1495
+ * context is released.
1496
+ */
1497
+
1498
+ /** Options for the ContextPool constructor. NOTE(review): the comments below mention defaults, but concurrency, recycleAfter, and contextOptions are declared as required here — presumably the caller supplies the defaults; confirm. */ interface ContextPoolOptions {
1499
+ /** Max concurrent contexts (default 3) */
1500
+ concurrency: number;
1501
+ /** Pages per context before recycling (default 50) */
1502
+ recycleAfter: number;
1503
+ /** Context options to apply to all contexts */
1504
+ contextOptions: Record<string, unknown>;
1505
+ /** Storage state to share across contexts (a string; presumably a file path — confirm) */
1506
+ storageState?: string;
1507
+ }
1508
+ /** Pool of Playwright BrowserContexts with a concurrency limit and recycling; use acquire() / release() around each page visit. */ declare class ContextPool {
1509
+ /** Browser passed to the constructor. */ private readonly browser;
1510
+ /** Concurrency limit (see ContextPoolOptions.concurrency). */ private readonly concurrency;
1511
+ /** Recycling threshold (see ContextPoolOptions.recycleAfter). */ private readonly recycleAfter;
1512
+ /** Options applied to all contexts (see ContextPoolOptions.contextOptions). */ private readonly contextOptions;
1513
+ /** Optional shared storage state (see ContextPoolOptions.storageState). */ private readonly storageState?;
1514
+ /** Contexts currently in use by callers */
1515
+ private acquired;
1516
+ /** Contexts sitting idle, ready to be acquired */
1517
+ private idle;
1518
+ /** Waiters blocked on acquire() when pool is full */
1519
+ private waiters;
1520
+ /** Whether closeAll() has been called */
1521
+ private closed;
1522
+ /** @param browser Playwright Browser to use. @param options Pool configuration (see ContextPoolOptions). */ constructor(browser: Browser, options: ContextPoolOptions);
1523
+ /**
1524
+ * Acquire a browser context from the pool.
1525
+ * Blocks if the pool is at full capacity until a context is released.
1526
+ */
1527
+ acquire(): Promise<BrowserContext>;
1528
+ /**
1529
+ * Release a context back to the pool. Increments the page count
1530
+ * and recycles the context if it has exceeded recycleAfter.
1531
+ */
1532
+ release(context: BrowserContext): Promise<void>;
1533
+ /**
1534
+ * Close all contexts (both idle and acquired) and reject future operations.
1535
+ */
1536
+ closeAll(): Promise<void>;
1537
+ /** Current number of active (acquired) contexts */
1538
+ get activeCount(): number;
1539
+ /** Current number of available (idle) contexts */
1540
+ get availableCount(): number;
1541
+ /** Private helper; signature not visible in this declaration. */ private createEntry;
1542
+ /** Private helper; signature not visible in this declaration. */ private drainWaiters;
1543
+ }
1544
+
1545
+ /**
1546
+ * Checkpoint module -- pause/resume support for large crawls.
1547
+ *
1548
+ * Persists crawl state to disk so that an interrupted crawl can be
1549
+ * resumed from the last checkpoint rather than restarting from scratch.
1550
+ *
1551
+ * Uses atomic writes (write to .tmp, then rename) to prevent corruption
1552
+ * if the process is killed mid-write.
1553
+ */
1554
+
1555
+ /**
1556
+ * Persist checkpoint state to disk using an atomic write strategy.
1557
+ *
1558
+ * Writes to a temporary file first, then renames it to the final path.
1559
+ * This prevents a half-written file if the process is killed mid-write.
1560
+ *
1561
+ * Updates `state.checkpointedAt` to the current timestamp before writing,
+ * so the object passed in as `state` is mutated.
+ *
+ * @param state Checkpoint state to persist.
+ * @param outputDir Output directory for the checkpoint (the file name inside it is not visible here).
1562
+ */
1563
+ declare function writeCheckpoint(state: CheckpointState, outputDir: string): Promise<void>;
1564
+ /**
1565
+ * Read and validate a checkpoint file from disk.
1566
+ *
1567
+ * Returns `null` if no checkpoint file exists.
1568
+ * Throws if the file exists but contains invalid data.
+ *
+ * @param outputDir Output directory to look for the checkpoint in.
+ * @returns A promise for the CheckpointState, or `null` when there is none.
1569
+ */
1570
+ declare function readCheckpoint(outputDir: string): Promise<CheckpointState | null>;
1571
+ /**
1572
+ * Remove checkpoint files (both the main file and any leftover tmp file).
1573
+ * Silently ignores files that do not exist.
+ *
+ * @param outputDir Output directory containing the checkpoint files.
1574
+ */
1575
+ declare function deleteCheckpoint(outputDir: string): Promise<void>;
1576
+ /**
1577
+ * Create an initial empty checkpoint state.
1578
+ *
1579
+ * Used at the start of a new crawl to initialize the checkpoint
1580
+ * before any pages have been visited.
+ *
+ * @param params `configHash` and `artifactsDir` for the new state (see the
+ * same-named CheckpointState fields).
+ * @returns A new CheckpointState (returned synchronously).
1581
+ */
1582
+ declare function createCheckpointState(params: {
1583
+ configHash: string;
1584
+ artifactsDir: string;
1585
+ }): CheckpointState;
1586
+ /**
1587
+ * Start a periodic auto-checkpoint timer.
1588
+ *
1589
+ * Calls `writeCheckpoint` every `intervalMs` milliseconds with the
1590
+ * current state obtained from `getState()`.
1591
+ *
1592
+ * Returns an object with a `stop()` method that clears the interval.
+ *
+ * @param getState Callback returning the current CheckpointState.
+ * @param outputDir Output directory forwarded to `writeCheckpoint`.
+ * @param intervalMs Optional interval in milliseconds; the default is not visible in this declaration.
1593
+ */
1594
+ declare function setupAutoCheckpoint(getState: () => CheckpointState, outputDir: string, intervalMs?: number): {
1595
+ stop: () => void;
1596
+ };
1597
+
1598
+ /**
1599
+ * API Endpoint Grouper
1600
+ *
1601
+ * Groups discovered API endpoints by URL pattern, replacing dynamic
1602
+ * segments with parameter placeholders:
1603
+ * - Numeric IDs: /users/123 -> /users/:id
1604
+ * - UUIDs: /items/550e8400-... -> /items/:id
1605
+ * - Slugs: /posts/hello-world -> /posts/:slug
1606
+ */
1607
+ /** One observed API call, as accepted by groupEndpoints. Distinct from ApiEndpointGroup, which is the assembled artifact type. */ interface ApiEndpoint {
1608
+ /** HTTP method. */ method: string;
1609
+ /** Concrete request URL. */ url: string;
1610
+ /** Response status code, if known. */ statusCode?: number;
1611
+ /** Request body, if any (typed `unknown`). */ requestBody?: unknown;
1612
+ /** Response body, if any (typed `unknown`). */ responseBody?: unknown;
1613
+ /** Headers, if any (NOTE(review): request vs. response headers is not visible here — confirm). */ headers?: Record<string, string>;
1614
+ }
1615
+ /** Endpoints sharing one parameterized pattern and method, as produced by groupEndpoints. */ interface ApiGroup {
1616
+ /** Parameterized URL pattern. */ pattern: string;
1617
+ /** HTTP method. */ method: string;
1618
+ /** Endpoints that fell into this group. */ examples: ApiEndpoint[];
1619
+ /** Kind assigned to each parameter (presumably keyed by placeholder name — confirm). */ parameterTypes: Record<string, 'id' | 'uuid' | 'slug' | 'unknown'>;
1620
+ }
1621
+ /**
1622
+ * Group a list of API endpoints by their parameterized pattern + method.
+ *
+ * @param endpoints Observed endpoints to group.
+ * @returns One ApiGroup per pattern + method combination.
1623
+ */
1624
+ declare function groupEndpoints(endpoints: ApiEndpoint[]): ApiGroup[];
1625
+
1626
+ /**
1627
+ * Flow Graph Builder
1628
+ *
1629
+ * Takes navigation edges from a crawl and produces:
1630
+ * - A FlowGraph with node metadata, cycle detection, and clustering
1631
+ * - A Mermaid diagram definition for visualization
1632
+ *
1633
+ * Used by the Assembler to generate flow diagrams in the HTML report.
1634
+ */
1635
+
1636
+ /** One node of the FlowGraph built by buildFlowGraph, with per-node degree metadata. */ interface FlowGraphNode {
1637
+ /** URL this node represents. */ url: string;
1638
+ /** Short label derived from URL path */
1639
+ label: string;
1640
+ /** Whether this is the entry point */
1641
+ isEntry: boolean;
1642
+ /** Whether this is a dead-end (no outgoing edges) */
1643
+ isExit: boolean;
1644
+ /** Number of incoming edges */
1645
+ inDegree: number;
1646
+ /** Number of outgoing edges */
1647
+ outDegree: number;
1648
+ /** Cluster/group based on URL path prefix */
1649
+ cluster?: string;
1650
+ }
1651
+ /** Flow graph produced by buildFlowGraph and consumed by generateMermaidDefinition. This is a different shape from FlowGraph$1, the type used in AssembledArtifacts. */ interface FlowGraph {
1652
+ /** Nodes with per-node metadata. */ nodes: FlowGraphNode[];
1653
+ /** Navigation edges. */ edges: NavigationEdge[];
1654
+ /** Entry URL */
1655
+ entryUrl: string;
1656
+ /** Whether cycles were detected */
1657
+ hasCycles: boolean;
1658
+ /** URLs involved in cycles */
1659
+ cycleNodes: string[];
1660
+ /** Cluster groups (path prefix -> URLs); a Map, so it needs explicit conversion before JSON serialization. */
1661
+ clusters: Map<string, string[]>;
1662
+ }
1663
+ /**
1664
+ * DFS-based cycle detection.
1665
+ * Returns an array of URLs that participate in at least one cycle.
+ *
+ * @param edges Navigation edges to analyze.
1666
+ */
1667
+ declare function detectCycles(edges: NavigationEdge[]): string[];
1668
+ /**
1669
+ * Build a FlowGraph from navigation edges and an entry URL.
1670
+ * Deduplicates edges, computes node metadata, detects cycles, and clusters URLs.
+ *
+ * @param edges Navigation edges collected during the crawl.
+ * @param entryUrl Entry URL of the graph.
+ * @returns The assembled FlowGraph.
1671
+ */
1672
+ declare function buildFlowGraph(edges: NavigationEdge[], entryUrl: string): FlowGraph;
1673
+ /**
1674
+ * Generate a Mermaid flowchart definition from a FlowGraph.
1675
+ *
1676
+ * If the graph exceeds maxNodes (default 50), URLs are auto-clustered
1677
+ * by first path segment and rendered as Mermaid subgraphs.
+ *
+ * @param graph FlowGraph to render (the buildFlowGraph shape).
+ * @param options Optional settings; `maxNodes` is the clustering threshold.
+ * @returns The Mermaid definition text.
1678
+ */
1679
+ declare function generateMermaidDefinition(graph: FlowGraph, options?: {
1680
+ maxNodes?: number;
1681
+ }): string;
1682
+
1683
+ /**
1684
+ * HTML Report Template
1685
+ *
1686
+ * Generates the self-contained HTML report from assembled artifacts.
1687
+ * All target-site data is HTML-entity-encoded via escape.ts.
1688
+ * Includes CSP meta tag for XSS prevention.
1689
+ */
1690
+ /** One page row for the HTML report (used by ReportInput.pages and ReportInput.sitemap). */ interface ReportPage {
1691
+ /** Page URL. */ url: string;
1692
+ /** Page title. */ title: string;
1693
+ /** Status code for the page (presumably the HTTP status — confirm). */ status: number;
1694
+ /** Optional depth of the page. */ depth?: number;
1695
+ /** Optional content hash of the page. */ contentHash?: string;
1696
+ }
1697
+ /** One form entry for the HTML report. */ interface ReportForm {
1698
+ /** URL associated with the form (presumably the page it was found on — confirm). */ url: string;
1699
+ /** Form action. */ action: string;
1700
+ /** Form method. */ method: string;
1701
+ /** Form fields, reduced to name, type, and required flag. */ fields: Array<{
1702
+ name: string;
1703
+ type: string;
1704
+ required: boolean;
1705
+ }>;
1706
+ }
1707
+ /** One API endpoint entry for the HTML report; only `method` is required, so an entry may carry a concrete url, a pattern, or both. */ interface ReportApiEndpoint {
1708
+ /** Concrete URL, if provided. */ url?: string;
1709
+ /** URL pattern, if provided. */ pattern?: string;
1710
+ /** HTTP method. */ method: string;
1711
+ /** Status code, if provided. */ status?: number;
1712
+ /** Example calls with their URL and status, if provided. */ examples?: Array<{
1713
+ url: string;
1714
+ status: number;
1715
+ }>;
1716
+ }
1717
+ /** One screenshot entry for the HTML report. */ interface ReportScreenshot {
1718
+ /** Page URL. */ url: string;
1719
+ /** Base64-encoded thumbnail image. */ thumbnailBase64: string;
1720
+ /** Path to the full screenshot (NOTE(review): relative vs. absolute is not visible here — confirm). */ fullPath: string;
1721
+ }
1722
+ /** Input accepted by generateReportHtml. */ interface ReportInput {
1723
+ /** Optional report title. */ title?: string;
1724
+ /** Target URL that was crawled. */ targetUrl: string;
1725
+ /** Crawl date string. */ crawlDate: string;
1726
+ /** Crawl duration (presumably milliseconds, as in CrawlMeta.duration — confirm). */ duration: number;
1727
+ /** Number of pages visited. */ pagesVisited: number;
1728
+ /** Error count (a number here, not a list of error entries). */ errors: number;
1729
+ /** Optional page list (NOTE(review): how this differs from `sitemap`, which has the same type, is not visible here — confirm). */ pages?: ReportPage[];
1730
+ /** Optional sitemap page list. */ sitemap?: ReportPage[];
1731
+ /** Forms to include. */ forms: ReportForm[];
1732
+ /** API endpoints to include. */ apiEndpoints: ReportApiEndpoint[];
1733
+ /** Optional screenshots to include. */ screenshots?: ReportScreenshot[];
1734
+ /** Optional flow diagram as an SVG string. */ flowDiagramSvg?: string;
1735
+ }
1736
+ /**
1737
+ * Generate the complete HTML report string.
1738
+ * All user-provided data is escaped to prevent XSS.
+ *
+ * @param input Report data (see ReportInput).
+ * @returns The full HTML document as a string.
1739
+ */
1740
+ declare function generateReportHtml(input: ReportInput): string;
1741
+
1742
+ /**
1743
+ * HTML Escaping — XSS Prevention
1744
+ *
1745
+ * All data from the target site MUST be HTML-entity-encoded
1746
+ * before embedding in reports. This is a critical security boundary.
1747
+ */
1748
+ /**
1749
+ * Escape a string for safe inclusion in HTML content.
1750
+ * Encodes: & < > " '
1751
+ * Strips null bytes.
1752
+ * Preserves Unicode characters.
+ *
+ * @param input - The string to escape.
+ * @returns The escaped string.
1753
+ */
1754
+ declare function escapeHtml(input: string): string;
1755
+ /**
1756
+ * Escape a string for safe inclusion in an HTML attribute value.
1757
+ * More aggressive than content escaping.
+ *
+ * NOTE(review): the exact set of characters encoded is not specified in
+ * this declaration — confirm against the implementation before relying on it.
+ *
+ * @param input - The string to escape.
+ * @returns The escaped string.
1758
+ */
1759
+ declare function escapeAttribute(input: string): string;
1760
+ /**
1761
+ * Sanitize a string for inclusion in a JSON block embedded in HTML.
1762
+ * Prevents </script> injection.
+ *
+ * @param jsonString - The JSON text to sanitize; it is taken as a string,
+ * so callers serialize their data first.
+ * @returns The sanitized string.
1763
+ */
1764
+ declare function escapeJsonInHtml(jsonString: string): string;
1765
+
1766
+ /**
1767
+ * Progress tracker for crawl operations.
1768
+ *
1769
+ * Tracks page visits, errors, currently-active URLs, and provides
1770
+ * ETA estimates using a rolling window of recent visit timestamps.
1771
+ */
1772
+ interface ProgressState { // Snapshot type returned by ProgressTracker.getState().
1773
+ pagesVisited: number;
1774
+ pagesTotal: number; // an estimate; ProgressTracker.updateTotal() revises the estimated total
1775
+ currentUrls: string[];
1776
+ errorsCount: number;
1777
+ startTime: number; // presumably an epoch timestamp in ms — TODO confirm
1778
+ elapsedMs: number;
1779
+ estimatedRemainingMs: number; // presumably from (remaining pages) / pagesPerSecond * 1000, per ProgressTracker's docs — TODO confirm
1780
+ pagesPerSecond: number; // presumably derived from the rolling window of visit timestamps — TODO confirm
1781
+ }
1782
+ declare class ProgressTracker { // Tracks crawl progress and exposes it via getState() (snapshot) and format() (terminal string).
1783
+ private pagesVisited; // private members carry no type annotation in this declaration
1784
+ private pagesTotal;
1785
+ private errorsCount;
1786
+ private readonly startTime;
1787
+ private readonly currentUrls; // presumably the URLs between startPage() and endPage() — TODO confirm
1788
+ /** Rolling window of timestamps (ms) when pages were recorded as visited */
1789
+ private readonly visitTimestamps;
1790
+ constructor(estimatedTotal?: number); // estimatedTotal is optional; the total can be revised later with updateTotal()
1791
+ /** Record a page visit. */
1792
+ recordVisit(url: string): void;
1793
+ /** Record an error. */
1794
+ recordError(): void;
1795
+ /** Mark a URL as currently being crawled. */
1796
+ startPage(url: string): void;
1797
+ /** Mark a URL as done crawling. */
1798
+ endPage(url: string): void;
1799
+ /** Update the estimated total page count. */
1800
+ updateTotal(total: number): void;
1801
+ /** Get the current progress state snapshot. */
1802
+ getState(): ProgressState;
1803
+ /**
1804
+ * Format the current progress as a terminal-friendly string.
1805
+ *
1806
+ * Example: `[12/50] 2.3 p/s | ETA: 16s | Errors: 0 | Crawling: /about, /api/users`
1807
+ */
1808
+ format(): string;
1809
+ /**
1810
+ * Calculate pages per second from the rolling window of visit timestamps.
1811
+ * Uses the time span between the oldest and newest entry in the window.
1812
+ */
1813
+ private calculatePagesPerSecond;
1814
+ /**
1815
+ * Estimate remaining milliseconds: (remaining pages) / pagesPerSecond * 1000
1816
+ */
1817
+ private calculateRemainingMs;
1818
+ }
1819
+
1820
+ /**
1821
+ * Bundle Creator
1822
+ *
1823
+ * Scans a crawl output directory and creates a structured bundle directory
1824
+ * with a manifest.json at the root. The bundle layout:
1825
+ *
1826
+ * manifest.json -- BundleManifest describing all files
1827
+ * screenshots/ -- PNG screenshot files
1828
+ * data/ -- JSON artifacts (sitemap.json, forms.json, api-map.json, etc.)
1829
+ * report.html -- HTML report (if present)
1830
+ *
1831
+ * No external dependencies -- uses Node.js built-in crypto and fs.
1832
+ */
1833
+
1834
+ /**
1835
+ * Create a bundle directory from crawl output.
1836
+ *
1837
+ * @param outputDir - The crawl output directory containing artifacts
1838
+ * @param bundlePath - The destination bundle directory to create
1839
+ * @returns A promise resolving to the BundleManifest describing the bundle contents
1840
+ */
1841
+ declare function createBundle(outputDir: string, bundlePath: string): Promise<BundleManifest>;
1842
+
1843
+ /**
1844
+ * Diff Engine
1845
+ *
1846
+ * Compares two bundle directories to detect regressions.
1847
+ * Produces a DiffResult with structured change information for:
1848
+ * - Routes (sitemap diff)
1849
+ * - Forms (field-level diff)
1850
+ * - API endpoints (status code, content type, body size diff)
1851
+ * - Screenshots (hash-based change detection)
1852
+ *
1853
+ * No external diff dependencies -- uses JSON comparison and SHA-256 hashes.
1854
+ */
1855
+
1856
+ /**
1857
+ * Compare two bundle directories and produce a DiffResult.
1858
+ *
1859
+ * @param oldDir - Path to the older bundle directory
1860
+ * @param newDir - Path to the newer bundle directory
1861
+ * @returns A promise resolving to the structured DiffResult
1862
+ */
1863
+ declare function diffBundles(oldDir: string, newDir: string): Promise<DiffResult>;
1864
+
1865
+ /**
1866
+ * Diff Report Generator
1867
+ *
1868
+ * Generates a self-contained HTML report showing all changes between
1869
+ * two crawl bundles. Color-coded: green for added, red for removed,
1870
+ * yellow for modified.
1871
+ *
1872
+ * Uses the same CSP and escaping as the main report.
1873
+ */
1874
+
1875
+ /**
1876
+ * Generate a self-contained HTML diff report.
1877
+ *
1878
+ * @param diff - The DiffResult to render
1879
+ * @returns Complete HTML document as a string
1880
+ */
1881
+ declare function generateDiffReportHtml(diff: DiffResult): string;
1882
+
1883
+ /**
1884
+ * OpenAPI 3.0.3 Specification Generator
1885
+ *
1886
+ * Converts discovered ApiEndpointGroup[] into a valid OpenAPI 3.0.3 spec.
1887
+ * Used to produce machine-readable API documentation from crawl results.
1888
+ */
1889
+
1890
+ interface OpenApiOptions { // Options accepted by generateOpenApiSpec(); title and targetUrl are required.
1891
+ /** Title for the API specification. */
1892
+ title: string;
1893
+ /** Base URL of the crawled site (used for servers[].url). */
1894
+ targetUrl: string;
1895
+ /** API version string. Defaults to "1.0.0". */
1896
+ version?: string;
1897
+ /** Optional human-readable description. */
1898
+ description?: string;
1899
+ }
1900
+ interface OpenApiSpec { // Shape returned by generateOpenApiSpec() and accepted by writeOpenApiSpec().
1901
+ openapi: '3.0.3'; // literal type: only this exact version string type-checks
1902
+ info: {
1903
+ title: string;
1904
+ version: string;
1905
+ description?: string;
1906
+ };
1907
+ servers: Array<{
1908
+ url: string;
1909
+ }>;
1910
+ paths: Record<string, Record<string, OpenApiOperation>>; // presumably path -> HTTP method -> operation — TODO confirm
1911
+ }
1912
+ interface OpenApiOperation { // Leaf value type of OpenApiSpec.paths; summary, operationId and responses are required.
1913
+ summary: string;
1914
+ operationId: string;
1915
+ parameters?: OpenApiParameter[];
1916
+ requestBody?: {
1917
+ content: Record<string, { // presumably keyed by media type — TODO confirm
1918
+ schema: unknown;
1919
+ example?: unknown;
1920
+ }>;
1921
+ };
1922
+ responses: Record<string, { // presumably keyed by status code — TODO confirm
1923
+ description: string;
1924
+ content?: Record<string, { // presumably keyed by media type — TODO confirm
1925
+ schema: unknown;
1926
+ example?: unknown;
1927
+ }>;
1928
+ }>;
1929
+ tags?: string[];
1930
+ }
1931
+ interface OpenApiParameter { // Element type of OpenApiOperation.parameters.
1932
+ name: string;
1933
+ in: 'path' | 'query'; // only path and query locations are representable with this type
1934
+ required: boolean;
1935
+ schema: {
1936
+ type: string;
1937
+ };
1938
+ }
1939
+ /**
1940
+ * Generate an OpenAPI 3.0.3 specification from discovered API endpoint groups.
+ *
+ * @param endpoints - The discovered API endpoint groups to describe.
+ * @param options - Spec metadata (see OpenApiOptions); title and targetUrl are required.
+ * @returns The generated specification object.
1941
+ */
1942
+ declare function generateOpenApiSpec(endpoints: ApiEndpointGroup[], options: OpenApiOptions): OpenApiSpec;
1943
+ /**
1944
+ * Write an OpenAPI spec to disk as JSON with 2-space indentation.
+ *
+ * @param spec - The specification object to serialize.
+ * @param outputPath - Destination path for the JSON file.
+ * @returns A promise with no resolved value; presumably it settles once the
+ * write has completed — TODO confirm.
1945
+ */
1946
+ declare function writeOpenApiSpec(spec: OpenApiSpec, outputPath: string): Promise<void>;
1947
+
1948
+ export { ArchaeologistError, type AssembledArtifacts, AuthError, type AuthOptions, type AuthResult, BundleError, type BundleManifest, CollectorError, ConfigError, ContextPool, type CrawlConfig, CrawlConfigSchema, CrawlError, type CrawlResult, type DiffConfig, DiffConfigSchema, DiffError, type DiffResult, type DigResult, type FormProbeResult, Frontier, Logger, NavigationError, type NetworkLogResult, type OutputFormat, type PageScanResult, type PageVisitResult, ProgressTracker, type ResolvedConfig, type ScreenshotResult, type Viewport, ViewportSchema, buildFlowGraph, captureScreenshots, createBundle, createCheckpointState, createNetworkLogger, deleteCheckpoint, detectAuthFailure, detectCycles, diffBundles, dig, escapeAttribute, escapeHtml, escapeJsonInHtml, executeAuth, generateDiffReportHtml, generateMermaidDefinition, generateOpenApiSpec, generateReportHtml, groupEndpoints, logger, normalizeEntryUrl, parseViewport, probeForms, readCheckpoint, refreshAuth, scanPage, setupAutoCheckpoint, writeCheckpoint, writeOpenApiSpec };