playwright-archaeologist 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +392 -0
- package/bin/cli.js +2 -0
- package/dist/chunk-7ZQGW5OV.js +255 -0
- package/dist/chunk-7ZQGW5OV.js.map +1 -0
- package/dist/chunk-F5WCXM7I.js +4469 -0
- package/dist/chunk-F5WCXM7I.js.map +1 -0
- package/dist/chunk-RWPEKZOW.js +118 -0
- package/dist/chunk-RWPEKZOW.js.map +1 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +310 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +1948 -0
- package/dist/index.js +789 -0
- package/dist/index.js.map +1 -0
- package/dist/page-scanner-Q76HROEW.js +8 -0
- package/dist/page-scanner-Q76HROEW.js.map +1 -0
- package/package.json +83 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/auth/auth-handler.ts","../src/security/auth-validator.ts","../src/crawl/context-pool.ts","../src/crawl/checkpoint.ts","../src/utils/progress.ts"],"sourcesContent":["/**\n * Auth Handler\n *\n * Manages authentication flows for the crawler. Supports three methods\n * in priority order: storage state, cookie injection, and auth scripts.\n *\n * Storage state is the most direct method (restores full browser state).\n * Cookie injection loads cookies from a JSON file.\n * Auth scripts execute user-provided Playwright automation to log in.\n */\n\nimport { resolve } from 'node:path';\nimport { readFile, writeFile, access } from 'node:fs/promises';\nimport { constants } from 'node:fs';\nimport { validateAuthScript } from '../security/auth-validator.js';\n\nimport type { Page, BrowserContext, Cookie } from 'playwright';\n\n// ---------------------------------------------------------------------------\n// Types\n// ---------------------------------------------------------------------------\n\nexport interface AuthOptions {\n /** Path to auth script (JS/TS file that exports default async function(page)) */\n authScript?: string;\n /** Path to cookies JSON file */\n cookiesFile?: string;\n /** Path to save/load storage state */\n storageStatePath?: string;\n}\n\nexport interface AuthResult {\n success: boolean;\n method: 'script' | 'cookies' | 'storageState';\n /** Error message if auth failed */\n error?: string;\n /** URL after auth completed */\n finalUrl?: string;\n /** Cookies set during auth */\n cookieCount: number;\n}\n\n// ---------------------------------------------------------------------------\n// Constants\n// ---------------------------------------------------------------------------\n\n/**\n * URL path patterns that indicate the user is still on a login page.\n * Used to detect auth failures after running an auth flow.\n */\nconst LOGIN_PATH_PATTERNS = [\n /\\/login\\b/i,\n /\\/signin\\b/i,\n /\\/sign-in\\b/i,\n /\\/auth\\b/i,\n /\\/sso\\b/i,\n];\n\n// ---------------------------------------------------------------------------\n// Internal helpers\n// ---------------------------------------------------------------------------\n\n/**\n * Check whether a file exists and is readable.\n */\nasync function fileExists(filePath: string): Promise<boolean> {\n try {\n await access(filePath, constants.R_OK);\n return true;\n } catch {\n return false;\n }\n}\n\n/**\n * Check whether the current page URL matches common login page patterns.\n */\nfunction isLoginUrl(url: string): boolean {\n try {\n const parsed = new URL(url);\n return LOGIN_PATH_PATTERNS.some((re) => re.test(parsed.pathname));\n } catch {\n return false;\n }\n}\n\n/**\n * Get the cookie count from the browser context.\n */\nasync function getCookieCount(context: BrowserContext): Promise<number> {\n const cookies = await context.cookies();\n return cookies.length;\n}\n\n// ---------------------------------------------------------------------------\n// Storage State\n// ---------------------------------------------------------------------------\n\ninterface StorageStateData {\n cookies: Cookie[];\n origins: Array<{\n origin: string;\n localStorage: Array<{ name: string; value: string }>;\n }>;\n}\n\n/**\n * Load storage state from a JSON file.\n * Restores cookies via context.addCookies() and localStorage via context.addInitScript().\n */\nasync function loadStorageState(\n context: BrowserContext,\n statePath: string,\n): Promise<void> {\n const raw = await readFile(statePath, 'utf-8');\n const state: StorageStateData = JSON.parse(raw);\n\n if (!state || typeof state !== 'object') {\n throw new Error(`Invalid storage state file: ${statePath}`);\n }\n\n // Restore cookies\n if (Array.isArray(state.cookies) && state.cookies.length > 0) {\n await context.addCookies(state.cookies);\n }\n\n // Restore localStorage via init script\n if (Array.isArray(state.origins)) {\n for (const origin of state.origins) {\n if (\n !origin.origin ||\n !Array.isArray(origin.localStorage) ||\n origin.localStorage.length === 0\n ) {\n continue;\n }\n const items = origin.localStorage;\n await context.addInitScript(\n (data: { origin: string; items: Array<{ name: string; value: string }> }) => {\n if (window.location.origin === data.origin) {\n for (const item of data.items) {\n try {\n localStorage.setItem(item.name, item.value);\n } catch {\n // localStorage may be unavailable\n }\n }\n }\n },\n { origin: origin.origin, items },\n );\n }\n }\n}\n\n/**\n * Save the current browser context state to a JSON file.\n */\nasync function saveStorageState(\n context: BrowserContext,\n statePath: string,\n): Promise<void> {\n const state = await context.storageState();\n await writeFile(statePath, JSON.stringify(state, null, 2), 'utf-8');\n}\n\n// ---------------------------------------------------------------------------\n// Cookie injection\n// ---------------------------------------------------------------------------\n\n/**\n * Load cookies from a JSON file and inject them into the browser context.\n * The file must contain a JSON array of Playwright Cookie objects.\n */\nasync function loadCookies(\n context: BrowserContext,\n cookiesPath: string,\n): Promise<number> {\n const raw = await readFile(cookiesPath, 'utf-8');\n let cookies: unknown;\n try {\n cookies = JSON.parse(raw);\n } catch {\n throw new Error(\n `Invalid JSON in cookies file: ${cookiesPath}`,\n );\n }\n\n if (!Array.isArray(cookies)) {\n throw new Error(\n `Cookies file must contain a JSON array: ${cookiesPath}`,\n );\n }\n\n if (cookies.length === 0) {\n return 0;\n }\n\n await context.addCookies(cookies as Cookie[]);\n return cookies.length;\n}\n\n// ---------------------------------------------------------------------------\n// Auth script execution\n// ---------------------------------------------------------------------------\n\n/**\n * Validate and execute a user-provided auth script.\n * The script must export a default async function that accepts a Playwright Page.\n */\nasync function runAuthScript(\n page: Page,\n scriptPath: string,\n): Promise<void> {\n const resolvedPath = resolve(scriptPath);\n\n // Validate script safety before execution\n const validation = validateAuthScript(resolvedPath);\n if (!validation.valid) {\n throw new Error(\n `Auth script failed validation: ${validation.warnings.join('; ')}`,\n );\n }\n\n // Check file exists\n if (!(await fileExists(resolvedPath))) {\n throw new Error(`Auth script not found: ${resolvedPath}`);\n }\n\n // Dynamic import\n let mod: Record<string, unknown>;\n try {\n mod = await import(resolvedPath) as Record<string, unknown>;\n } catch (err: unknown) {\n const message = err instanceof Error ? err.message : String(err);\n throw new Error(`Failed to import auth script '${resolvedPath}': ${message}`);\n }\n\n const authFn = mod.default;\n if (typeof authFn !== 'function') {\n throw new Error(\n `Auth script '${resolvedPath}' must export a default async function`,\n );\n }\n\n await authFn(page);\n}\n\n// ---------------------------------------------------------------------------\n// Public API\n// ---------------------------------------------------------------------------\n\n/**\n * Detect whether the current page state indicates an auth failure.\n *\n * Checks:\n * - URL matches common login page patterns\n * - Page title or response status hints at auth problems\n */\nexport async function detectAuthFailure(\n page: Page,\n): Promise<{ failed: boolean; reason?: string }> {\n const url = page.url();\n\n // Check if still on a login page\n if (isLoginUrl(url)) {\n return {\n failed: true,\n reason: `Still on login page: ${url}`,\n };\n }\n\n // Check for 401/403 response status on the main frame\n // page.mainFrame().response() is not always available,\n // so we check the page's last navigation response.\n try {\n const response = await page.evaluate(() => {\n // Access performance entries to check for HTTP status\n const entries = performance.getEntriesByType('navigation') as PerformanceNavigationTiming[];\n if (entries.length > 0) {\n return { responseStatus: (entries[0] as any).responseStatus ?? null };\n }\n return { responseStatus: null };\n });\n\n if (\n response.responseStatus === 401 ||\n response.responseStatus === 403\n ) {\n return {\n failed: true,\n reason: `Received HTTP ${response.responseStatus} response`,\n };\n }\n } catch {\n // evaluate may fail if page context is destroyed -- not a failure signal\n }\n\n return { failed: false };\n}\n\n/**\n * Execute an authentication flow.\n *\n * Priority order: storageState > cookies > authScript.\n * Uses the most direct method available. If storage state path is provided\n * but doesn't exist yet, falls through to script execution and saves state after.\n *\n * @param page - Playwright Page instance\n * @param context - Playwright BrowserContext instance\n * @param options - Auth configuration\n * @returns Result describing the auth outcome\n */\nexport async function executeAuth(\n page: Page,\n context: BrowserContext,\n options: AuthOptions,\n): Promise<AuthResult> {\n // Validate that at least one auth method is provided\n if (!options.storageStatePath && !options.cookiesFile && !options.authScript) {\n return {\n success: false,\n method: 'script',\n error: 'No authentication method provided',\n cookieCount: 0,\n };\n }\n\n // ------ Storage State (highest priority) ------\n if (options.storageStatePath) {\n const stateExists = await fileExists(options.storageStatePath);\n\n if (stateExists) {\n try {\n await loadStorageState(context, options.storageStatePath);\n const cookieCount = await getCookieCount(context);\n\n return {\n success: true,\n method: 'storageState',\n finalUrl: page.url(),\n cookieCount,\n };\n } catch (err: unknown) {\n const message = err instanceof Error ? err.message : String(err);\n return {\n success: false,\n method: 'storageState',\n error: `Failed to load storage state: ${message}`,\n cookieCount: 0,\n };\n }\n }\n\n // Storage state path provided but doesn't exist -- fall through to script,\n // then save state after successful auth.\n }\n\n // ------ Cookie Injection ------\n if (options.cookiesFile) {\n try {\n const injectedCount = await loadCookies(context, options.cookiesFile);\n const cookieCount = await getCookieCount(context);\n\n return {\n success: true,\n method: 'cookies',\n finalUrl: page.url(),\n cookieCount: cookieCount > 0 ? cookieCount : injectedCount,\n };\n } catch (err: unknown) {\n const message = err instanceof Error ? err.message : String(err);\n return {\n success: false,\n method: 'cookies',\n error: `Failed to inject cookies: ${message}`,\n cookieCount: 0,\n };\n }\n }\n\n // ------ Auth Script ------\n if (options.authScript) {\n try {\n await runAuthScript(page, options.authScript);\n\n // Check for auth failure after script execution\n const failureCheck = await detectAuthFailure(page);\n if (failureCheck.failed) {\n return {\n success: false,\n method: 'script',\n error: failureCheck.reason,\n finalUrl: page.url(),\n cookieCount: await getCookieCount(context),\n };\n }\n\n // Save storage state for future reuse if path was provided\n if (options.storageStatePath) {\n try {\n await saveStorageState(context, options.storageStatePath);\n } catch {\n // Failing to save state is non-fatal -- auth itself succeeded\n }\n }\n\n const cookieCount = await getCookieCount(context);\n return {\n success: true,\n method: 'script',\n finalUrl: page.url(),\n cookieCount,\n };\n } catch (err: unknown) {\n const message = err instanceof Error ? err.message : String(err);\n return {\n success: false,\n method: 'script',\n error: `Auth script failed: ${message}`,\n finalUrl: page.url(),\n cookieCount: 0,\n };\n }\n }\n\n // Should not reach here given the early validation, but be defensive\n return {\n success: false,\n method: 'script',\n error: 'No authentication method could be applied',\n cookieCount: 0,\n };\n}\n\n/**\n * Re-run the authentication flow. Used when a session expires during a crawl.\n *\n * This is a thin wrapper around executeAuth that clears existing cookies\n * before re-running to avoid stale state.\n */\nexport async function refreshAuth(\n page: Page,\n context: BrowserContext,\n options: AuthOptions,\n): Promise<AuthResult> {\n // Clear existing cookies to force a fresh auth\n await context.clearCookies();\n\n return executeAuth(page, context, options);\n}\n","/**\n * Auth Validator\n *\n * Static analysis of authentication scripts to detect dangerous imports\n * and patterns. Does not block execution -- produces warnings for the user.\n *\n * Also provides executeAuthScript for safe dynamic import and execution\n * of user-provided auth scripts.\n */\n\nimport { readFileSync } from 'node:fs';\nimport { pathToFileURL } from 'node:url';\n\nexport interface AuthValidationResult {\n valid: boolean;\n warnings: string[];\n}\n\n/**\n * Dangerous Node.js module patterns that should raise warnings in auth scripts.\n * Auth scripts should only interact with the browser via Playwright API,\n * not access the filesystem, spawn processes, or open network connections.\n */\nconst DANGEROUS_MODULES = [\n 'child_process',\n 'fs',\n 'fs/promises',\n 'net',\n 'dgram',\n 'cluster',\n 'worker_threads',\n 'vm',\n 'v8',\n 'perf_hooks',\n 'dns',\n 'tls',\n 'http',\n 'https',\n] as const;\n\n/**\n * Patterns that indicate dangerous code in auth scripts.\n */\nconst DANGEROUS_PATTERNS: Array<{ pattern: RegExp; description: string }> = [\n {\n pattern: /\\beval\\s*\\(/g,\n description: 'eval() usage detected -- potential code injection',\n },\n {\n pattern: /\\bnew\\s+Function\\s*\\(/g,\n description: 'Function constructor detected -- potential code injection',\n },\n {\n pattern: /\\bprocess\\.exit\\b/g,\n description: 'process.exit() detected -- could terminate the tool unexpectedly',\n },\n];\n\n/**\n * Build regex patterns for import/require of dangerous modules.\n * Handles both bare module names and node: prefixed names.\n */\nfunction buildModulePatterns(): Array<{ pattern: RegExp; moduleName: string }> {\n const results: Array<{ pattern: RegExp; moduleName: string }> = [];\n\n for (const mod of DANGEROUS_MODULES) {\n const escaped = mod.replace(/\\//g, '\\\\/');\n\n // import ... from 'module' or 'node:module'\n results.push({\n pattern: new RegExp(`\\\\bimport\\\\b[^;]*['\"](?:node:)?${escaped}['\"]`, 'g'),\n moduleName: mod,\n });\n\n // import('module') or import('node:module')\n results.push({\n pattern: new RegExp(`\\\\bimport\\\\s*\\\\(\\\\s*['\"](?:node:)?${escaped}['\"]\\\\s*\\\\)`, 'g'),\n moduleName: mod,\n });\n\n // require('module') or require('node:module')\n results.push({\n pattern: new RegExp(`\\\\brequire\\\\s*\\\\(\\\\s*['\"](?:node:)?${escaped}['\"]\\\\s*\\\\)`, 'g'),\n moduleName: mod,\n });\n }\n\n return results;\n}\n\nconst MODULE_PATTERNS = buildModulePatterns();\n\n/**\n * Validate an authentication script by reading it and performing\n * static analysis for dangerous imports and patterns.\n *\n * This is synchronous -- it reads the file and scans the source text.\n * It does NOT execute the script.\n *\n * @param scriptPath - Absolute or relative path to the auth script file.\n * @returns Validation result with valid flag and warnings array.\n * @throws If the file cannot be read.\n */\nexport function validateAuthScript(scriptPath: string): AuthValidationResult {\n const warnings: string[] = [];\n\n let content: string;\n try {\n content = readFileSync(scriptPath, 'utf-8');\n } catch (err: any) {\n const code = err?.code;\n if (code === 'ENOENT') {\n throw new Error(`Auth script not found: ${scriptPath}`);\n }\n throw new Error(`Failed to read auth script '${scriptPath}': ${err?.message ?? String(err)}`);\n }\n\n // Check for dangerous module imports\n for (const { pattern, moduleName } of MODULE_PATTERNS) {\n // Reset lastIndex for global regex\n pattern.lastIndex = 0;\n if (pattern.test(content)) {\n warnings.push(`Dangerous import of '${moduleName}' detected`);\n }\n }\n\n // Check for dangerous code patterns\n for (const { pattern, description } of DANGEROUS_PATTERNS) {\n pattern.lastIndex = 0;\n if (pattern.test(content)) {\n warnings.push(description);\n }\n }\n\n return {\n valid: warnings.length === 0,\n warnings,\n };\n}\n\n/**\n * Execute an authentication script by dynamically importing it and\n * calling its default export with the provided Playwright Page.\n *\n * @param scriptPath - Path to the auth script (.js, .mjs, or .ts via loader).\n * @param page - Playwright Page instance to pass to the auth function.\n * @param opts - Options. `yes: true` skips confirmation prompt.\n * @throws If the script has no default export, the default export is not a function,\n * or the auth function throws.\n */\nexport async function executeAuthScript(\n scriptPath: string,\n page: any,\n opts?: { yes?: boolean }\n): Promise<void> {\n // Convert to file URL for dynamic import\n const fileUrl = pathToFileURL(scriptPath).href;\n\n let mod: any;\n try {\n mod = await import(fileUrl);\n } catch (err: any) {\n throw new Error(\n `Failed to import auth script '${scriptPath}': ${err?.message ?? String(err)}`\n );\n }\n\n const authFn = mod.default;\n\n if (authFn === undefined || authFn === null) {\n throw new Error(\n `Auth script '${scriptPath}' has no default export. ` +\n `The script must export a default async function.`\n );\n }\n\n if (typeof authFn !== 'function') {\n throw new Error(\n `Auth script '${scriptPath}' default export is not a function ` +\n `(got ${typeof authFn}). The script must export a default async function.`\n );\n }\n\n // Execute the auth function\n await authFn(page);\n}\n","/**\n * Browser context pool for concurrent crawling.\n *\n * Manages a pool of Playwright BrowserContexts with:\n * - Concurrency limiting via async semaphore\n * - Automatic context recycling after N page visits\n * - Shared context options and storage state\n *\n * Callers acquire a context before visiting a page and release it\n * when done. If the pool is at capacity, acquire() blocks until a\n * context is released.\n */\n\nimport type { Browser, BrowserContext } from 'playwright';\n\nexport interface ContextPoolOptions {\n /** Max concurrent contexts (default 3) */\n concurrency: number;\n /** Pages per context before recycling (default 50) */\n recycleAfter: number;\n /** Context options to apply to all contexts */\n contextOptions: Record<string, unknown>;\n /** Storage state to share across contexts */\n storageState?: string;\n}\n\ninterface PoolEntry {\n context: BrowserContext;\n pageCount: number;\n}\n\nexport class ContextPool {\n private readonly browser: Browser;\n private readonly concurrency: number;\n private readonly recycleAfter: number;\n private readonly contextOptions: Record<string, unknown>;\n private readonly storageState?: string;\n\n /** Contexts currently in use by callers */\n private acquired: Map<BrowserContext, PoolEntry> = new Map();\n\n /** Contexts sitting idle, ready to be acquired */\n private idle: PoolEntry[] = [];\n\n /** Waiters blocked on acquire() when pool is full */\n private waiters: Array<() => void> = [];\n\n /** Whether closeAll() has been called */\n private closed = false;\n\n constructor(browser: Browser, options: ContextPoolOptions) {\n if (!browser) {\n throw new Error('Browser instance is required');\n }\n if (options.concurrency < 1) {\n throw new Error('Concurrency must be >= 1');\n }\n if (options.recycleAfter < 1) {\n throw new Error('recycleAfter must be >= 1');\n }\n\n this.browser = browser;\n this.concurrency = options.concurrency;\n this.recycleAfter = options.recycleAfter;\n this.contextOptions = options.contextOptions;\n this.storageState = options.storageState;\n }\n\n /**\n * Acquire a browser context from the pool.\n * Blocks if the pool is at full capacity until a context is released.\n */\n async acquire(): Promise<BrowserContext> {\n if (this.closed) {\n throw new Error('ContextPool is closed');\n }\n\n // Wait if we are at capacity (all slots occupied by acquired + idle)\n while (this.acquired.size + this.idle.length >= this.concurrency && this.idle.length === 0) {\n await new Promise<void>((resolve) => {\n this.waiters.push(resolve);\n });\n if (this.closed) {\n throw new Error('ContextPool is closed');\n }\n }\n\n // Prefer reusing an idle context\n if (this.idle.length > 0) {\n const entry = this.idle.pop()!;\n this.acquired.set(entry.context, entry);\n return entry.context;\n }\n\n // Create a new context\n const entry = await this.createEntry();\n this.acquired.set(entry.context, entry);\n return entry.context;\n }\n\n /**\n * Release a context back to the pool. Increments the page count\n * and recycles the context if it has exceeded recycleAfter.\n */\n async release(context: BrowserContext): Promise<void> {\n const entry = this.acquired.get(context);\n if (!entry) {\n throw new Error('Context is not owned by this pool or was already released');\n }\n\n this.acquired.delete(context);\n entry.pageCount += 1;\n\n if (this.closed) {\n // Pool is shutting down; close the context\n await context.close();\n return;\n }\n\n if (entry.pageCount >= this.recycleAfter) {\n // Context is worn out -- close it. A fresh one will be created on next acquire.\n await context.close();\n } else {\n // Return to idle pool\n this.idle.push(entry);\n }\n\n // Wake the next waiter, if any\n this.drainWaiters();\n }\n\n /**\n * Close all contexts (both idle and acquired) and reject future operations.\n */\n async closeAll(): Promise<void> {\n this.closed = true;\n\n // Wake all waiters so they get the \"closed\" error\n for (const resolve of this.waiters) {\n resolve();\n }\n this.waiters.length = 0;\n\n // Close idle contexts\n const closeOps: Promise<void>[] = [];\n for (const entry of this.idle) {\n closeOps.push(entry.context.close());\n }\n this.idle.length = 0;\n\n // Close acquired contexts\n for (const [, entry] of this.acquired) {\n closeOps.push(entry.context.close());\n }\n this.acquired.clear();\n\n await Promise.all(closeOps);\n }\n\n /** Current number of active (acquired) contexts */\n get activeCount(): number {\n return this.acquired.size;\n }\n\n /** Current number of available (idle) contexts */\n get availableCount(): number {\n return this.idle.length;\n }\n\n // ---------------------------------------------------------------------------\n // Private\n // ---------------------------------------------------------------------------\n\n private async createEntry(): Promise<PoolEntry> {\n const opts: Record<string, unknown> = { ...this.contextOptions };\n if (this.storageState !== undefined) {\n opts.storageState = this.storageState;\n }\n const context = await this.browser.newContext(opts);\n return { context, pageCount: 0 };\n }\n\n private drainWaiters(): void {\n if (this.waiters.length > 0) {\n const next = this.waiters.shift()!;\n next();\n }\n }\n}\n","/**\n * Checkpoint module -- pause/resume support for large crawls.\n *\n * Persists crawl state to disk so that an interrupted crawl can be\n * resumed from the last checkpoint rather than restarting from scratch.\n *\n * Uses atomic writes (write to .tmp, then rename) to prevent corruption\n * if the process is killed mid-write.\n */\n\nimport { readFile, writeFile, rename, unlink, access } from 'node:fs/promises';\nimport { join } from 'node:path';\nimport { z } from 'zod';\nimport type { CheckpointState } from '../types/artifacts.js';\n\n// ---------------------------------------------------------------------------\n// Constants\n// ---------------------------------------------------------------------------\n\nconst CHECKPOINT_FILENAME = '.checkpoint.json';\nconst CHECKPOINT_TMP_FILENAME = '.checkpoint.json.tmp';\nconst DEFAULT_AUTO_CHECKPOINT_INTERVAL_MS = 30_000;\n\n// ---------------------------------------------------------------------------\n// Zod schema for validation\n// ---------------------------------------------------------------------------\n\nexport const CheckpointStateSchema = z.object({\n version: z.literal(1),\n startedAt: z.string(),\n checkpointedAt: z.string(),\n frontier: z.array(\n z.object({\n url: z.string(),\n depth: z.number(),\n referrer: z.string().optional(),\n }),\n ),\n visited: z.array(z.string()),\n skipped: z.array(z.string()),\n artifactsDir: z.string(),\n pagesVisited: z.number(),\n errors: z.array(\n z.object({\n timestamp: z.string(),\n url: z.string(),\n code: z.string(),\n message: z.string(),\n collector: z\n .enum(['page-scanner', 'form-prober', 'network-logger', 'screenshot-capturer'])\n .optional(),\n status: z\n .enum(['timeout', 'network_error', 'http_error', 'no_response', 'aborted', 'redirect_loop'])\n .optional(),\n httpStatus: z.number().optional(),\n securityReason: z\n .enum([\n 'private_ip',\n 'blocked_protocol',\n 'dns_rebinding',\n 'metadata_endpoint',\n 'redirect_to_private',\n ])\n .optional(),\n }),\n ),\n configHash: z.string(),\n});\n\n// ---------------------------------------------------------------------------\n// writeCheckpoint\n// ---------------------------------------------------------------------------\n\n/**\n * Persist checkpoint state to disk using an atomic write strategy.\n *\n * Writes to a temporary file first, then renames it to the final path.\n * This prevents a half-written file if the process is killed mid-write.\n *\n * Updates `state.checkpointedAt` to the current timestamp before writing.\n */\nexport async function writeCheckpoint(\n state: CheckpointState,\n outputDir: string,\n): Promise<void> {\n if (!outputDir) {\n throw new Error('outputDir must be a non-empty string');\n }\n\n state.checkpointedAt = new Date().toISOString();\n\n const tmpPath = join(outputDir, CHECKPOINT_TMP_FILENAME);\n const finalPath = join(outputDir, CHECKPOINT_FILENAME);\n const json = JSON.stringify(state, null, 2);\n\n await writeFile(tmpPath, json, 'utf-8');\n await rename(tmpPath, finalPath);\n}\n\n// ---------------------------------------------------------------------------\n// readCheckpoint\n// ---------------------------------------------------------------------------\n\n/**\n * Read and validate a checkpoint file from disk.\n *\n * Returns `null` if no checkpoint file exists.\n * Throws if the file exists but contains invalid data.\n */\nexport async function readCheckpoint(\n outputDir: string,\n): Promise<CheckpointState | null> {\n if (!outputDir) {\n throw new Error('outputDir must be a non-empty string');\n }\n\n const filePath = join(outputDir, CHECKPOINT_FILENAME);\n\n let raw: string;\n try {\n raw = await readFile(filePath, 'utf-8');\n } catch (err: unknown) {\n if (isNodeError(err) && err.code === 'ENOENT') {\n return null;\n }\n throw err;\n }\n\n let parsed: unknown;\n try {\n parsed = JSON.parse(raw);\n } catch {\n throw new Error(`Checkpoint file is not valid JSON: ${filePath}`);\n }\n\n const result = CheckpointStateSchema.safeParse(parsed);\n if (!result.success) {\n throw new Error(\n `Invalid checkpoint data in ${filePath}: ${result.error.message}`,\n );\n }\n\n return result.data as CheckpointState;\n}\n\n// ---------------------------------------------------------------------------\n// deleteCheckpoint\n// ---------------------------------------------------------------------------\n\n/**\n * Remove checkpoint files (both the main file and any leftover tmp file).\n * Silently ignores files that do not exist.\n */\nexport async function deleteCheckpoint(outputDir: string): Promise<void> {\n if (!outputDir) {\n throw new Error('outputDir must be a non-empty string');\n }\n\n const filesToRemove = [\n join(outputDir, CHECKPOINT_FILENAME),\n join(outputDir, CHECKPOINT_TMP_FILENAME),\n ];\n\n await Promise.all(\n filesToRemove.map(async (filePath) => {\n try {\n await unlink(filePath);\n } catch (err: unknown) {\n if (isNodeError(err) && err.code === 'ENOENT') {\n return; // File doesn't exist -- nothing to do\n }\n throw err;\n }\n }),\n );\n}\n\n// ---------------------------------------------------------------------------\n// createCheckpointState\n// ---------------------------------------------------------------------------\n\n/**\n * Create an initial empty checkpoint state.\n *\n * Used at the start of a new crawl to initialize the checkpoint\n * before any pages have been visited.\n */\nexport function createCheckpointState(params: {\n configHash: string;\n artifactsDir: string;\n}): CheckpointState {\n if (!params.configHash) {\n throw new Error('configHash must be a non-empty string');\n }\n if (!params.artifactsDir) {\n throw new Error('artifactsDir must be a non-empty string');\n }\n\n const now = new Date().toISOString();\n\n return {\n version: 1,\n startedAt: now,\n checkpointedAt: now,\n frontier: [],\n visited: [],\n skipped: [],\n artifactsDir: params.artifactsDir,\n pagesVisited: 0,\n errors: [],\n configHash: params.configHash,\n };\n}\n\n// ---------------------------------------------------------------------------\n// setupAutoCheckpoint\n// ---------------------------------------------------------------------------\n\n/**\n * Start a periodic auto-checkpoint timer.\n *\n * Calls `writeCheckpoint` every `intervalMs` milliseconds with the\n * current state obtained from `getState()`.\n *\n * Returns an object with a `stop()` method that clears the interval.\n */\nexport function setupAutoCheckpoint(\n getState: () => CheckpointState,\n outputDir: string,\n intervalMs: number = DEFAULT_AUTO_CHECKPOINT_INTERVAL_MS,\n): { stop: () => void } {\n if (!outputDir) {\n throw new Error('outputDir must be a non-empty string');\n }\n if (intervalMs <= 0) {\n throw new Error('intervalMs must be a positive number');\n }\n\n const timer = setInterval(() => {\n const state = getState();\n writeCheckpoint(state, outputDir).catch(() => {\n // Checkpoint write failures are non-fatal per the graceful\n // degradation contract (see artifacts.ts section 11).\n // The crawl continues without checkpointing.\n });\n }, intervalMs);\n\n // Allow the Node.js process to exit even if the timer is still active.\n if (typeof timer === 'object' && 'unref' in timer) {\n timer.unref();\n }\n\n return {\n stop: () => {\n clearInterval(timer);\n },\n };\n}\n\n// ---------------------------------------------------------------------------\n// Helpers\n// ---------------------------------------------------------------------------\n\ninterface NodeError extends Error {\n code?: string;\n}\n\nfunction isNodeError(err: unknown): err is NodeError {\n return err instanceof Error && 'code' in err;\n}\n","/**\n * Progress tracker for crawl operations.\n *\n * Tracks page visits, errors, currently-active URLs, and provides\n * ETA estimates using a rolling window of recent visit timestamps.\n */\n\nconst ROLLING_WINDOW_SIZE = 10;\n\nexport interface ProgressState {\n pagesVisited: number;\n pagesTotal: number;\n currentUrls: string[];\n errorsCount: number;\n startTime: number;\n elapsedMs: number;\n estimatedRemainingMs: number;\n pagesPerSecond: number;\n}\n\nexport class ProgressTracker {\n private pagesVisited = 0;\n private pagesTotal: number;\n private errorsCount = 0;\n private readonly startTime: number;\n private readonly currentUrls: Set<string> = new Set();\n\n /** Rolling window of timestamps (ms) when pages were recorded as visited */\n private readonly visitTimestamps: number[] = [];\n\n constructor(estimatedTotal?: number) {\n this.pagesTotal = estimatedTotal ?? 0;\n this.startTime = Date.now();\n }\n\n /** Record a page visit. */\n recordVisit(url: string): void {\n this.pagesVisited += 1;\n this.visitTimestamps.push(Date.now());\n\n // Keep only the most recent entries in the rolling window\n if (this.visitTimestamps.length > ROLLING_WINDOW_SIZE) {\n this.visitTimestamps.shift();\n }\n }\n\n /** Record an error. */\n recordError(): void {\n this.errorsCount += 1;\n }\n\n /** Mark a URL as currently being crawled. */\n startPage(url: string): void {\n this.currentUrls.add(url);\n }\n\n /** Mark a URL as done crawling. */\n endPage(url: string): void {\n this.currentUrls.delete(url);\n }\n\n /** Update the estimated total page count. */\n updateTotal(total: number): void {\n if (total < 0) {\n throw new Error('Estimated total must be >= 0');\n }\n this.pagesTotal = total;\n }\n\n /** Get the current progress state snapshot. */\n getState(): ProgressState {\n const now = Date.now();\n const elapsedMs = now - this.startTime;\n const pps = this.calculatePagesPerSecond();\n const remaining = this.calculateRemainingMs(pps);\n\n return {\n pagesVisited: this.pagesVisited,\n pagesTotal: this.pagesTotal,\n currentUrls: Array.from(this.currentUrls),\n errorsCount: this.errorsCount,\n startTime: this.startTime,\n elapsedMs,\n estimatedRemainingMs: remaining,\n pagesPerSecond: pps,\n };\n }\n\n /**\n * Format the current progress as a terminal-friendly string.\n *\n * Example: `[12/50] 2.3 p/s | ETA: 16s | Errors: 0 | Crawling: /about, /api/users`\n */\n format(): string {\n const state = this.getState();\n const pps = state.pagesPerSecond.toFixed(1);\n const eta = state.estimatedRemainingMs > 0\n ? `${Math.ceil(state.estimatedRemainingMs / 1000)}s`\n : '--';\n\n const crawling = state.currentUrls.length > 0\n ? state.currentUrls.join(', ')\n : 'idle';\n\n return `[${state.pagesVisited}/${state.pagesTotal}] ${pps} p/s | ETA: ${eta} | Errors: ${state.errorsCount} | Crawling: ${crawling}`;\n }\n\n // ---------------------------------------------------------------------------\n // Private\n // ---------------------------------------------------------------------------\n\n /**\n * Calculate pages per second from the rolling window of visit timestamps.\n * Uses the time span between the oldest and newest entry in the window.\n */\n private calculatePagesPerSecond(): number {\n const timestamps = this.visitTimestamps;\n if (timestamps.length < 2) {\n return 0;\n }\n\n const oldest = timestamps[0];\n const newest = timestamps[timestamps.length - 1];\n const spanMs = newest - oldest;\n\n if (spanMs <= 0) {\n return 0;\n }\n\n // Number of intervals is (count - 1) over the time span\n return ((timestamps.length - 1) / spanMs) * 1000;\n }\n\n /**\n * Estimate remaining milliseconds: (remaining pages) / pagesPerSecond * 1000\n */\n private calculateRemainingMs(pps: number): number {\n if (pps <= 0 || this.pagesTotal <= 0) {\n return 0;\n }\n\n const remaining = this.pagesTotal - this.pagesVisited;\n if (remaining <= 0) {\n return 0;\n }\n\n return (remaining / pps) * 1000;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAWA,SAAS,eAAe;AACxB,SAAS,UAAU,WAAW,cAAc;AAC5C,SAAS,iBAAiB;;;ACH1B,SAAS,oBAAoB;AAC7B,SAAS,qBAAqB;AAY9B,IAAM,oBAAoB;AAAA,EACxB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAKA,IAAM,qBAAsE;AAAA,EAC1E;AAAA,IACE,SAAS;AAAA,IACT,aAAa;AAAA,EACf;AAAA,EACA;AAAA,IACE,SAAS;AAAA,IACT,aAAa;AAAA,EACf;AAAA,EACA;AAAA,IACE,SAAS;AAAA,IACT,aAAa;AAAA,EACf;AACF;AAMA,SAAS,sBAAsE;AAC7E,QAAM,UAA0D,CAAC;AAEjE,aAAW,OAAO,mBAAmB;AACnC,UAAM,UAAU,IAAI,QAAQ,OAAO,KAAK;AAGxC,YAAQ,KAAK;AAAA,MACX,SAAS,IAAI,OAAO,kCAAkC,OAAO,QAAQ,GAAG;AAAA,MACxE,YAAY;AAAA,IACd,CAAC;AAGD,YAAQ,KAAK;AAAA,MACX,SAAS,IAAI,OAAO,qCAAqC,OAAO,eAAe,GAAG;AAAA,MAClF,YAAY;AAAA,IACd,CAAC;AAGD,YAAQ,KAAK;AAAA,MACX,SAAS,IAAI,OAAO,sCAAsC,OAAO,eAAe,GAAG;AAAA,MACnF,YAAY;AAAA,IACd,CAAC;AAAA,EACH;AAEA,SAAO;AACT;AAEA,IAAM,kBAAkB,oBAAoB;AAarC,SAAS,mBAAmB,YAA0C;AAC3E,QAAM,WAAqB,CAAC;AAE5B,MAAI;AACJ,MAAI;AACF,cAAU,aAAa,YAAY,OAAO;AAAA,EAC5C,SAAS,KAAU;AACjB,UAAM,OAAO,KAAK;AAClB,QAAI,SAAS,UAAU;AACrB,YAAM,IAAI,MAAM,0BAA0B,UAAU,EAAE;AAAA,IACxD;AACA,UAAM,IAAI,MAAM,+BAA+B,UAAU,MAAM,KAAK,WAAW,OAAO,GAAG,CAAC,EAAE;AAAA,EAC9F;AAGA,aAAW,EAAE,SAAS,WAAW,KAAK,iBAAiB;AAErD,YAAQ,YAAY;AACpB,QAAI,QAAQ,KAAK,OAAO,GAAG;AACzB,eAAS,KAAK,wBAAwB,UAAU,YAAY;AAAA,IAC9D;AAAA,EACF;AAGA,aAAW,EAAE,SAAS,YAAY,KAAK,oBAAoB;AACzD,YAAQ,YAAY;AACpB,QAAI,QAAQ,KAAK,OAAO,GAAG;AACzB,eAAS,KAAK,WAAW;AAAA,IAC3B;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO,SAAS,WAAW;AAAA,IAC3B;AAAA,EACF;AACF;;;ADxFA,IAAM,sBAAsB;AAAA,EAC1B;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AASA,eAAe,WAAW,UAAoC;AAC5D,MAAI;AACF,UAAM,OAAO,UAAU,UAAU,IAAI;AACrC,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAKA,SAAS,WAAW,KAAsB;AACxC,MAAI;AACF,UAAM,SAAS,IAAI,IAAI,GAAG;AAC1B,WAAO,oBAAoB,KAAK,CAAC,OAAO,GAAG,KAAK,OAAO,QAAQ,CAAC;AAAA,EAClE,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAKA,eAAe,eAAe,SAA0C;AACtE,QAAM,UAAU,MAAM,QAAQ,QAAQ;AACtC,SAAO,QAAQ;AACjB;AAkBA,eAAe,iBACb,SACA,WACe;AACf,QAAM,MAAM,MAAM,SAAS,WAAW,OAAO;AAC7C,QAAM,QAA0B,KAAK,MAAM,GAAG;AAE9C,MAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACvC,UAAM,IAAI,MAAM,+BAA+B,SAAS,EAAE;AAAA,EAC5D;AAGA,MAAI,MAAM,QAAQ,MAAM,OAAO,KAAK,MAAM,QAAQ,SAAS,GAAG;AAC5D,UAAM,QAAQ,WAAW,MAAM,OAAO;AAAA,EACxC;AAGA,MAAI,MAAM,QAAQ,MAAM,OAAO,GAAG;AAChC,eAAW,UAAU,MAAM,SAAS;AAClC,UACE,CAAC,OAAO,UACR,CAAC,MAAM,QAAQ,OAAO,YAAY,KAClC,OAAO,aAAa,WAAW,GAC/B;AACA;AAAA,MACF;AACA,YAAM,QAAQ,OAAO;AACrB,YAAM,QAAQ;AAAA,QACZ,CAAC,SAA4E;AAC3E,cAAI,OAAO,SAAS,WAAW,KAAK,QAAQ;AAC1C,uBAAW,QAAQ,KAAK,OAAO;AAC7B,kBAAI;AACF,6BAAa,QAAQ,KAAK,MAAM,KAAK,KAAK;AAAA,cAC5C,QAAQ;AAAA,cAER;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,QACA,EAAE,QAAQ,OAAO,QAAQ,MAAM;AAAA,MACjC;AAAA,IACF;AAAA,EACF;AACF;AAKA,eAAe,iBACb,SACA,WACe;AACf,QAAM,QAAQ,MAAM,QAAQ,aAAa;AACzC,QAAM,UAAU,WAAW,KAAK,UAAU,OAAO,MAAM,CAAC,GAAG,OAAO;AACpE;AAUA,eAAe,YACb,SACA,aACiB;AACjB,QAAM,MAAM,MAAM,SAAS,aAAa,OAAO;AAC/C,MAAI;AACJ,MAAI;AACF,cAAU,KAAK,MAAM,GAAG;AAAA,EAC1B,QAAQ;AACN,UAAM,IAAI;AAAA,MACR,iCAAiC,WAAW;AAAA,IAC9C;AAAA,EACF;AAEA,MAAI,CAAC,MAAM,QAAQ,OAAO,GAAG;AAC3B,UAAM,IAAI;AAAA,MACR,2CAA2C,WAAW;AAAA,IACxD;AAAA,EACF;AAEA,MAAI,QAAQ,WAAW,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,WAAW,OAAmB;AAC5C,SAAO,QAAQ;AACjB;AAUA,eAAe,cACb,MACA,YACe;AACf,QAAM,eAAe,QAAQ,UAAU;AAGvC,QAAM,aAAa,mBAAmB,YAAY;AAClD,MAAI,CAAC,WAAW,OAAO;AACrB,UAAM,IAAI;AAAA,MACR,kCAAkC,WAAW,SAAS,KAAK,IAAI,CAAC;AAAA,IAClE;AAAA,EACF;AAGA,MAAI,CAAE,MAAM,WAAW,YAAY,GAAI;AACrC,UAAM,IAAI,MAAM,0BAA0B,YAAY,EAAE;AAAA,EAC1D;AAGA,MAAI;AACJ,MAAI;AACF,UAAM,MAAM,OAAO;AAAA,EACrB,SAAS,KAAc;AACrB,UAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,UAAM,IAAI,MAAM,iCAAiC,YAAY,MAAM,OAAO,EAAE;AAAA,EAC9E;AAEA,QAAM,SAAS,IAAI;AACnB,MAAI,OAAO,WAAW,YAAY;AAChC,UAAM,IAAI;AAAA,MACR,gBAAgB,YAAY;AAAA,IAC9B;AAAA,EACF;AAEA,QAAM,OAAO,IAAI;AACnB;AAaA,eAAsB,kBACpB,MAC+C;AAC/C,QAAM,MAAM,KAAK,IAAI;AAGrB,MAAI,WAAW,GAAG,GAAG;AACnB,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,QAAQ,wBAAwB,GAAG;AAAA,IACrC;AAAA,EACF;AAKA,MAAI;AACF,UAAM,WAAW,MAAM,KAAK,SAAS,MAAM;AAEzC,YAAM,UAAU,YAAY,iBAAiB,YAAY;AACzD,UAAI,QAAQ,SAAS,GAAG;AACtB,eAAO,EAAE,gBAAiB,QAAQ,CAAC,EAAU,kBAAkB,KAAK;AAAA,MACtE;AACA,aAAO,EAAE,gBAAgB,KAAK;AAAA,IAChC,CAAC;AAED,QACE,SAAS,mBAAmB,OAC5B,SAAS,mBAAmB,KAC5B;AACA,aAAO;AAAA,QACL,QAAQ;AAAA,QACR,QAAQ,iBAAiB,SAAS,cAAc;AAAA,MAClD;AAAA,IACF;AAAA,EACF,QAAQ;AAAA,EAER;AAEA,SAAO,EAAE,QAAQ,MAAM;AACzB;AAcA,eAAsB,YACpB,MACA,SACA,SACqB;AAErB,MAAI,CAAC,QAAQ,oBAAoB,CAAC,QAAQ,eAAe,CAAC,QAAQ,YAAY;AAC5E,WAAO;AAAA,MACL,SAAS;AAAA,MACT,QAAQ;AAAA,MACR,OAAO;AAAA,MACP,aAAa;AAAA,IACf;AAAA,EACF;AAGA,MAAI,QAAQ,kBAAkB;AAC5B,UAAM,cAAc,MAAM,WAAW,QAAQ,gBAAgB;AAE7D,QAAI,aAAa;AACf,UAAI;AACF,cAAM,iBAAiB,SAAS,QAAQ,gBAAgB;AACxD,cAAM,cAAc,MAAM,eAAe,OAAO;AAEhD,eAAO;AAAA,UACL,SAAS;AAAA,UACT,QAAQ;AAAA,UACR,UAAU,KAAK,IAAI;AAAA,UACnB;AAAA,QACF;AAAA,MACF,SAAS,KAAc;AACrB,cAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,eAAO;AAAA,UACL,SAAS;AAAA,UACT,QAAQ;AAAA,UACR,OAAO,iCAAiC,OAAO;AAAA,UAC/C,aAAa;AAAA,QACf;AAAA,MACF;AAAA,IACF;AAAA,EAIF;AAGA,MAAI,QAAQ,aAAa;AACvB,QAAI;AACF,YAAM,gBAAgB,MAAM,YAAY,SAAS,QAAQ,WAAW;AACpE,YAAM,cAAc,MAAM,eAAe,OAAO;AAEhD,aAAO;AAAA,QACL,SAAS;AAAA,QACT,QAAQ;AAAA,QACR,UAAU,KAAK,IAAI;AAAA,QACnB,aAAa,cAAc,IAAI,cAAc;AAAA,MAC/C;AAAA,IACF,SAAS,KAAc;AACrB,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,aAAO;AAAA,QACL,SAAS;AAAA,QACT,QAAQ;AAAA,QACR,OAAO,6BAA6B,OAAO;AAAA,QAC3C,aAAa;AAAA,MACf;AAAA,IACF;AAAA,EACF;AAGA,MAAI,QAAQ,YAAY;AACtB,QAAI;AACF,YAAM,cAAc,MAAM,QAAQ,UAAU;AAG5C,YAAM,eAAe,MAAM,kBAAkB,IAAI;AACjD,UAAI,aAAa,QAAQ;AACvB,eAAO;AAAA,UACL,SAAS;AAAA,UACT,QAAQ;AAAA,UACR,OAAO,aAAa;AAAA,UACpB,UAAU,KAAK,IAAI;AAAA,UACnB,aAAa,MAAM,eAAe,OAAO;AAAA,QAC3C;AAAA,MACF;AAGA,UAAI,QAAQ,kBAAkB;AAC5B,YAAI;AACF,gBAAM,iBAAiB,SAAS,QAAQ,gBAAgB;AAAA,QAC1D,QAAQ;AAAA,QAER;AAAA,MACF;AAEA,YAAM,cAAc,MAAM,eAAe,OAAO;AAChD,aAAO;AAAA,QACL,SAAS;AAAA,QACT,QAAQ;AAAA,QACR,UAAU,KAAK,IAAI;AAAA,QACnB;AAAA,MACF;AAAA,IACF,SAAS,KAAc;AACrB,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,aAAO;AAAA,QACL,SAAS;AAAA,QACT,QAAQ;AAAA,QACR,OAAO,uBAAuB,OAAO;AAAA,QACrC,UAAU,KAAK,IAAI;AAAA,QACnB,aAAa;AAAA,MACf;AAAA,IACF;AAAA,EACF;AAGA,SAAO;AAAA,IACL,SAAS;AAAA,IACT,QAAQ;AAAA,IACR,OAAO;AAAA,IACP,aAAa;AAAA,EACf;AACF;AAQA,eAAsB,YACpB,MACA,SACA,SACqB;AAErB,QAAM,QAAQ,aAAa;AAE3B,SAAO,YAAY,MAAM,SAAS,OAAO;AAC3C;;;AEnaO,IAAM,cAAN,MAAkB;AAAA,EACN;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGT,WAA2C,oBAAI,IAAI;AAAA;AAAA,EAGnD,OAAoB,CAAC;AAAA;AAAA,EAGrB,UAA6B,CAAC;AAAA;AAAA,EAG9B,SAAS;AAAA,EAEjB,YAAY,SAAkB,SAA6B;AACzD,QAAI,CAAC,SAAS;AACZ,YAAM,IAAI,MAAM,8BAA8B;AAAA,IAChD;AACA,QAAI,QAAQ,cAAc,GAAG;AAC3B,YAAM,IAAI,MAAM,0BAA0B;AAAA,IAC5C;AACA,QAAI,QAAQ,eAAe,GAAG;AAC5B,YAAM,IAAI,MAAM,2BAA2B;AAAA,IAC7C;AAEA,SAAK,UAAU;AACf,SAAK,cAAc,QAAQ;AAC3B,SAAK,eAAe,QAAQ;AAC5B,SAAK,iBAAiB,QAAQ;AAC9B,SAAK,eAAe,QAAQ;AAAA,EAC9B;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,UAAmC;AACvC,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,uBAAuB;AAAA,IACzC;AAGA,WAAO,KAAK,SAAS,OAAO,KAAK,KAAK,UAAU,KAAK,eAAe,KAAK,KAAK,WAAW,GAAG;AAC1F,YAAM,IAAI,QAAc,CAACA,aAAY;AACnC,aAAK,QAAQ,KAAKA,QAAO;AAAA,MAC3B,CAAC;AACD,UAAI,KAAK,QAAQ;AACf,cAAM,IAAI,MAAM,uBAAuB;AAAA,MACzC;AAAA,IACF;AAGA,QAAI,KAAK,KAAK,SAAS,GAAG;AACxB,YAAMC,SAAQ,KAAK,KAAK,IAAI;AAC5B,WAAK,SAAS,IAAIA,OAAM,SAASA,MAAK;AACtC,aAAOA,OAAM;AAAA,IACf;AAGA,UAAM,QAAQ,MAAM,KAAK,YAAY;AACrC,SAAK,SAAS,IAAI,MAAM,SAAS,KAAK;AACtC,WAAO,MAAM;AAAA,EACf;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,QAAQ,SAAwC;AACpD,UAAM,QAAQ,KAAK,SAAS,IAAI,OAAO;AACvC,QAAI,CAAC,OAAO;AACV,YAAM,IAAI,MAAM,2DAA2D;AAAA,IAC7E;AAEA,SAAK,SAAS,OAAO,OAAO;AAC5B,UAAM,aAAa;AAEnB,QAAI,KAAK,QAAQ;AAEf,YAAM,QAAQ,MAAM;AACpB;AAAA,IACF;AAEA,QAAI,MAAM,aAAa,KAAK,cAAc;AAExC,YAAM,QAAQ,MAAM;AAAA,IACtB,OAAO;AAEL,WAAK,KAAK,KAAK,KAAK;AAAA,IACtB;AAGA,SAAK,aAAa;AAAA,EACpB;AAAA;AAAA;AAAA;AAAA,EAKA,MAAM,WAA0B;AAC9B,SAAK,SAAS;AAGd,eAAWD,YAAW,KAAK,SAAS;AAClC,MAAAA,SAAQ;AAAA,IACV;AACA,SAAK,QAAQ,SAAS;AAGtB,UAAM,WAA4B,CAAC;AACnC,eAAW,SAAS,KAAK,MAAM;AAC7B,eAAS,KAAK,MAAM,QAAQ,MAAM,CAAC;AAAA,IACrC;AACA,SAAK,KAAK,SAAS;AAGnB,eAAW,CAAC,EAAE,KAAK,KAAK,KAAK,UAAU;AACrC,eAAS,KAAK,MAAM,QAAQ,MAAM,CAAC;AAAA,IACrC;AACA,SAAK,SAAS,MAAM;AAEpB,UAAM,QAAQ,IAAI,QAAQ;AAAA,EAC5B;AAAA;AAAA,EAGA,IAAI,cAAsB;AACxB,WAAO,KAAK,SAAS;AAAA,EACvB;AAAA;AAAA,EAGA,IAAI,iBAAyB;AAC3B,WAAO,KAAK,KAAK;AAAA,EACnB;AAAA;AAAA;AAAA;AAAA,EAMA,MAAc,cAAkC;AAC9C,UAAM,OAAgC,EAAE,GAAG,KAAK,eAAe;AAC/D,QAAI,KAAK,iBAAiB,QAAW;AACnC,WAAK,eAAe,KAAK;AAAA,IAC3B;AACA,UAAM,UAAU,MAAM,KAAK,QAAQ,WAAW,IAAI;AAClD,WAAO,EAAE,SAAS,WAAW,EAAE;AAAA,EACjC;AAAA,EAEQ,eAAqB;AAC3B,QAAI,KAAK,QAAQ,SAAS,GAAG;AAC3B,YAAM,OAAO,KAAK,QAAQ,MAAM;AAChC,WAAK;AAAA,IACP;AAAA,EACF;AACF;;;AClLA,SAAS,YAAAE,WAAU,aAAAC,YAAW,QAAQ,cAAsB;AAC5D,SAAS,YAAY;AACrB,SAAS,SAAS;AAOlB,IAAM,sBAAsB;AAC5B,IAAM,0BAA0B;AAChC,IAAM,sCAAsC;AAMrC,IAAM,wBAAwB,EAAE,OAAO;AAAA,EAC5C,SAAS,EAAE,QAAQ,CAAC;AAAA,EACpB,WAAW,EAAE,OAAO;AAAA,EACpB,gBAAgB,EAAE,OAAO;AAAA,EACzB,UAAU,EAAE;AAAA,IACV,EAAE,OAAO;AAAA,MACP,KAAK,EAAE,OAAO;AAAA,MACd,OAAO,EAAE,OAAO;AAAA,MAChB,UAAU,EAAE,OAAO,EAAE,SAAS;AAAA,IAChC,CAAC;AAAA,EACH;AAAA,EACA,SAAS,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EAC3B,SAAS,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EAC3B,cAAc,EAAE,OAAO;AAAA,EACvB,cAAc,EAAE,OAAO;AAAA,EACvB,QAAQ,EAAE;AAAA,IACR,EAAE,OAAO;AAAA,MACP,WAAW,EAAE,OAAO;AAAA,MACpB,KAAK,EAAE,OAAO;AAAA,MACd,MAAM,EAAE,OAAO;AAAA,MACf,SAAS,EAAE,OAAO;AAAA,MAClB,WAAW,EACR,KAAK,CAAC,gBAAgB,eAAe,kBAAkB,qBAAqB,CAAC,EAC7E,SAAS;AAAA,MACZ,QAAQ,EACL,KAAK,CAAC,WAAW,iBAAiB,cAAc,eAAe,WAAW,eAAe,CAAC,EAC1F,SAAS;AAAA,MACZ,YAAY,EAAE,OAAO,EAAE,SAAS;AAAA,MAChC,gBAAgB,EACb,KAAK;AAAA,QACJ;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,MACF,CAAC,EACA,SAAS;AAAA,IACd,CAAC;AAAA,EACH;AAAA,EACA,YAAY,EAAE,OAAO;AACvB,CAAC;AAcD,eAAsB,gBACpB,OACA,WACe;AACf,MAAI,CAAC,WAAW;AACd,UAAM,IAAI,MAAM,sCAAsC;AAAA,EACxD;AAEA,QAAM,kBAAiB,oBAAI,KAAK,GAAE,YAAY;AAE9C,QAAM,UAAU,KAAK,WAAW,uBAAuB;AACvD,QAAM,YAAY,KAAK,WAAW,mBAAmB;AACrD,QAAM,OAAO,KAAK,UAAU,OAAO,MAAM,CAAC;AAE1C,QAAMA,WAAU,SAAS,MAAM,OAAO;AACtC,QAAM,OAAO,SAAS,SAAS;AACjC;AAYA,eAAsB,eACpB,WACiC;AACjC,MAAI,CAAC,WAAW;AACd,UAAM,IAAI,MAAM,sCAAsC;AAAA,EACxD;AAEA,QAAM,WAAW,KAAK,WAAW,mBAAmB;AAEpD,MAAI;AACJ,MAAI;AACF,UAAM,MAAMD,UAAS,UAAU,OAAO;AAAA,EACxC,SAAS,KAAc;AACrB,QAAI,YAAY,GAAG,KAAK,IAAI,SAAS,UAAU;AAC7C,aAAO;AAAA,IACT;AACA,UAAM;AAAA,EACR;AAEA,MAAI;AACJ,MAAI;AACF,aAAS,KAAK,MAAM,GAAG;AAAA,EACzB,QAAQ;AACN,UAAM,IAAI,MAAM,sCAAsC,QAAQ,EAAE;AAAA,EAClE;AAEA,QAAM,SAAS,sBAAsB,UAAU,MAAM;AACrD,MAAI,CAAC,OAAO,SAAS;AACnB,UAAM,IAAI;AAAA,MACR,8BAA8B,QAAQ,KAAK,OAAO,MAAM,OAAO;AAAA,IACjE;AAAA,EACF;AAEA,SAAO,OAAO;AAChB;AAUA,eAAsB,iBAAiB,WAAkC;AACvE,MAAI,CAAC,WAAW;AACd,UAAM,IAAI,MAAM,sCAAsC;AAAA,EACxD;AAEA,QAAM,gBAAgB;AAAA,IACpB,KAAK,WAAW,mBAAmB;AAAA,IACnC,KAAK,WAAW,uBAAuB;AAAA,EACzC;AAEA,QAAM,QAAQ;AAAA,IACZ,cAAc,IAAI,OAAO,aAAa;AACpC,UAAI;AACF,cAAM,OAAO,QAAQ;AAAA,MACvB,SAAS,KAAc;AACrB,YAAI,YAAY,GAAG,KAAK,IAAI,SAAS,UAAU;AAC7C;AAAA,QACF;AACA,cAAM;AAAA,MACR;AAAA,IACF,CAAC;AAAA,EACH;AACF;AAYO,SAAS,sBAAsB,QAGlB;AAClB,MAAI,CAAC,OAAO,YAAY;AACtB,UAAM,IAAI,MAAM,uCAAuC;AAAA,EACzD;AACA,MAAI,CAAC,OAAO,cAAc;AACxB,UAAM,IAAI,MAAM,yCAAyC;AAAA,EAC3D;AAEA,QAAM,OAAM,oBAAI,KAAK,GAAE,YAAY;AAEnC,SAAO;AAAA,IACL,SAAS;AAAA,IACT,WAAW;AAAA,IACX,gBAAgB;AAAA,IAChB,UAAU,CAAC;AAAA,IACX,SAAS,CAAC;AAAA,IACV,SAAS,CAAC;AAAA,IACV,cAAc,OAAO;AAAA,IACrB,cAAc;AAAA,IACd,QAAQ,CAAC;AAAA,IACT,YAAY,OAAO;AAAA,EACrB;AACF;AAcO,SAAS,oBACd,UACA,WACA,aAAqB,qCACC;AACtB,MAAI,CAAC,WAAW;AACd,UAAM,IAAI,MAAM,sCAAsC;AAAA,EACxD;AACA,MAAI,cAAc,GAAG;AACnB,UAAM,IAAI,MAAM,sCAAsC;AAAA,EACxD;AAEA,QAAM,QAAQ,YAAY,MAAM;AAC9B,UAAM,QAAQ,SAAS;AACvB,oBAAgB,OAAO,SAAS,EAAE,MAAM,MAAM;AAAA,IAI9C,CAAC;AAAA,EACH,GAAG,UAAU;AAGb,MAAI,OAAO,UAAU,YAAY,WAAW,OAAO;AACjD,UAAM,MAAM;AAAA,EACd;AAEA,SAAO;AAAA,IACL,MAAM,MAAM;AACV,oBAAc,KAAK;AAAA,IACrB;AAAA,EACF;AACF;AAUA,SAAS,YAAY,KAAgC;AACnD,SAAO,eAAe,SAAS,UAAU;AAC3C;;;ACtQA,IAAM,sBAAsB;AAarB,IAAM,kBAAN,MAAsB;AAAA,EACnB,eAAe;AAAA,EACf;AAAA,EACA,cAAc;AAAA,EACL;AAAA,EACA,cAA2B,oBAAI,IAAI;AAAA;AAAA,EAGnC,kBAA4B,CAAC;AAAA,EAE9C,YAAY,gBAAyB;AACnC,SAAK,aAAa,kBAAkB;AACpC,SAAK,YAAY,KAAK,IAAI;AAAA,EAC5B;AAAA;AAAA,EAGA,YAAY,KAAmB;AAC7B,SAAK,gBAAgB;AACrB,SAAK,gBAAgB,KAAK,KAAK,IAAI,CAAC;AAGpC,QAAI,KAAK,gBAAgB,SAAS,qBAAqB;AACrD,WAAK,gBAAgB,MAAM;AAAA,IAC7B;AAAA,EACF;AAAA;AAAA,EAGA,cAAoB;AAClB,SAAK,eAAe;AAAA,EACtB;AAAA;AAAA,EAGA,UAAU,KAAmB;AAC3B,SAAK,YAAY,IAAI,GAAG;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ,KAAmB;AACzB,SAAK,YAAY,OAAO,GAAG;AAAA,EAC7B;AAAA;AAAA,EAGA,YAAY,OAAqB;AAC/B,QAAI,QAAQ,GAAG;AACb,YAAM,IAAI,MAAM,8BAA8B;AAAA,IAChD;AACA,SAAK,aAAa;AAAA,EACpB;AAAA;AAAA,EAGA,WAA0B;AACxB,UAAM,MAAM,KAAK,IAAI;AACrB,UAAM,YAAY,MAAM,KAAK;AAC7B,UAAM,MAAM,KAAK,wBAAwB;AACzC,UAAM,YAAY,KAAK,qBAAqB,GAAG;AAE/C,WAAO;AAAA,MACL,cAAc,KAAK;AAAA,MACnB,YAAY,KAAK;AAAA,MACjB,aAAa,MAAM,KAAK,KAAK,WAAW;AAAA,MACxC,aAAa,KAAK;AAAA,MAClB,WAAW,KAAK;AAAA,MAChB;AAAA,MACA,sBAAsB;AAAA,MACtB,gBAAgB;AAAA,IAClB;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOA,SAAiB;AACf,UAAM,QAAQ,KAAK,SAAS;AAC5B,UAAM,MAAM,MAAM,eAAe,QAAQ,CAAC;AAC1C,UAAM,MAAM,MAAM,uBAAuB,IACrC,GAAG,KAAK,KAAK,MAAM,uBAAuB,GAAI,CAAC,MAC/C;AAEJ,UAAM,WAAW,MAAM,YAAY,SAAS,IACxC,MAAM,YAAY,KAAK,IAAI,IAC3B;AAEJ,WAAO,IAAI,MAAM,YAAY,IAAI,MAAM,UAAU,KAAK,GAAG,eAAe,GAAG,cAAc,MAAM,WAAW,gBAAgB,QAAQ;AAAA,EACpI;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAUQ,0BAAkC;AACxC,UAAM,aAAa,KAAK;AACxB,QAAI,WAAW,SAAS,GAAG;AACzB,aAAO;AAAA,IACT;AAEA,UAAM,SAAS,WAAW,CAAC;AAC3B,UAAM,SAAS,WAAW,WAAW,SAAS,CAAC;AAC/C,UAAM,SAAS,SAAS;AAExB,QAAI,UAAU,GAAG;AACf,aAAO;AAAA,IACT;AAGA,YAAS,WAAW,SAAS,KAAK,SAAU;AAAA,EAC9C;AAAA;AAAA;AAAA;AAAA,EAKQ,qBAAqB,KAAqB;AAChD,QAAI,OAAO,KAAK,KAAK,cAAc,GAAG;AACpC,aAAO;AAAA,IACT;AAEA,UAAM,YAAY,KAAK,aAAa,KAAK;AACzC,QAAI,aAAa,GAAG;AAClB,aAAO;AAAA,IACT;AAEA,WAAQ,YAAY,MAAO;AAAA,EAC7B;AACF;","names":["resolve","entry","readFile","writeFile"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
|
package/package.json
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "playwright-archaeologist",
|
|
3
|
+
"version": "0.1.1",
|
|
4
|
+
"description": "Crawl any running web app and generate a complete behavioral specification — sitemap, forms, APIs, screenshots, and regression baselines",
|
|
5
|
+
"author": "playwright-archaeologist contributors",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"repository": {
|
|
8
|
+
"type": "git",
|
|
9
|
+
"url": "https://github.com/BFAlajid/playwright-archaeologist.git"
|
|
10
|
+
},
|
|
11
|
+
"bugs": {
|
|
12
|
+
"url": "https://github.com/BFAlajid/playwright-archaeologist/issues"
|
|
13
|
+
},
|
|
14
|
+
"homepage": "https://github.com/BFAlajid/playwright-archaeologist#readme",
|
|
15
|
+
"keywords": [
|
|
16
|
+
"playwright",
|
|
17
|
+
"crawler",
|
|
18
|
+
"web-crawler",
|
|
19
|
+
"sitemap",
|
|
20
|
+
"screenshots",
|
|
21
|
+
"openapi",
|
|
22
|
+
"api-discovery",
|
|
23
|
+
"behavioral-testing",
|
|
24
|
+
"regression-testing",
|
|
25
|
+
"spa",
|
|
26
|
+
"forms",
|
|
27
|
+
"flow-graph",
|
|
28
|
+
"web-scraping",
|
|
29
|
+
"qa",
|
|
30
|
+
"testing",
|
|
31
|
+
"audit"
|
|
32
|
+
],
|
|
33
|
+
"bin": {
|
|
34
|
+
"playwright-archaeologist": "./bin/cli.js",
|
|
35
|
+
"pa": "./bin/cli.js"
|
|
36
|
+
},
|
|
37
|
+
"engines": {
|
|
38
|
+
"node": ">=20.0.0"
|
|
39
|
+
},
|
|
40
|
+
"type": "module",
|
|
41
|
+
"main": "./dist/index.js",
|
|
42
|
+
"types": "./dist/index.d.ts",
|
|
43
|
+
"exports": {
|
|
44
|
+
".": {
|
|
45
|
+
"import": "./dist/index.js",
|
|
46
|
+
"types": "./dist/index.d.ts"
|
|
47
|
+
}
|
|
48
|
+
},
|
|
49
|
+
"files": [
|
|
50
|
+
"dist",
|
|
51
|
+
"bin",
|
|
52
|
+
"README.md",
|
|
53
|
+
"LICENSE"
|
|
54
|
+
],
|
|
55
|
+
"scripts": {
|
|
56
|
+
"build": "tsup",
|
|
57
|
+
"dev": "tsup --watch",
|
|
58
|
+
"test": "vitest run",
|
|
59
|
+
"test:watch": "vitest",
|
|
60
|
+
"test:unit": "vitest run test/unit",
|
|
61
|
+
"test:integration": "vitest run test/integration",
|
|
62
|
+
"bench": "vitest bench test/performance/",
|
|
63
|
+
"bench:ci": "vitest bench test/performance/ --reporter=json --outputFile=bench-results.json",
|
|
64
|
+
"lint": "eslint src/",
|
|
65
|
+
"prepublishOnly": "npm run build"
|
|
66
|
+
},
|
|
67
|
+
"dependencies": {
|
|
68
|
+
"playwright": "^1.49.0",
|
|
69
|
+
"commander": "^13.0.0",
|
|
70
|
+
"zod": "^3.24.0",
|
|
71
|
+
"ora": "^8.0.0"
|
|
72
|
+
},
|
|
73
|
+
"devDependencies": {
|
|
74
|
+
"typescript": "^5.7.0",
|
|
75
|
+
"tsup": "^8.3.0",
|
|
76
|
+
"vitest": "^3.0.0",
|
|
77
|
+
"@types/node": "^22.0.0",
|
|
78
|
+
"express": "^4.21.0",
|
|
79
|
+
"@types/express": "^4.17.0",
|
|
80
|
+
"ws": "^8.18.0",
|
|
81
|
+
"@types/ws": "^8.5.0"
|
|
82
|
+
}
|
|
83
|
+
}
|