bluera-knowledge 0.9.42 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
3
  IntelligentCrawler
4
- } from "../chunk-MQE32YY6.js";
4
+ } from "../chunk-ITH6FWQY.js";
5
5
  import {
6
6
  JobService,
7
7
  createDocumentId,
@@ -186,7 +186,8 @@ var BackgroundWorker = class {
186
186
  const crawlOptions = {
187
187
  maxPages: resolvedMaxPages,
188
188
  simple: simple ?? false,
189
- useHeadless: useHeadless ?? false
189
+ useHeadless: useHeadless ?? true
190
+ // Default to headless for reliability
190
191
  };
191
192
  if (crawlInstruction !== void 0) {
192
193
  crawlOptions.crawlInstruction = crawlInstruction;
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/workers/background-worker.ts","../../src/workers/pid-file.ts","../../src/workers/background-worker-cli.ts"],"sourcesContent":["import { createHash } from 'node:crypto';\nimport { IntelligentCrawler, type CrawlProgress } from '../crawl/intelligent-crawler.js';\nimport { IndexService } from '../services/index.service.js';\nimport { JobService } from '../services/job.service.js';\nimport { StoreService } from '../services/store.service.js';\nimport { createStoreId, createDocumentId } from '../types/brands.js';\nimport type { EmbeddingEngine } from '../db/embeddings.js';\nimport type { LanceStore } from '../db/lance.js';\nimport type { Document } from '../types/document.js';\nimport type { Job } from '../types/job.js';\n\n/**\n * Calculate index progress as a percentage, handling division by zero.\n * @param current - Current number of items processed\n * @param total - Total number of items (may be 0)\n * @param scale - Scale factor for progress (default 100 for 0-100%)\n * @returns Progress value, or 0 if total is 0\n */\nexport function calculateIndexProgress(\n current: number,\n total: number,\n scale: number = 100\n): number {\n if (total === 0) return 0;\n return (current / total) * scale;\n}\n\nexport class BackgroundWorker {\n constructor(\n private readonly jobService: JobService,\n private readonly storeService: StoreService,\n private readonly indexService: IndexService,\n private readonly lanceStore: LanceStore,\n private readonly embeddingEngine: EmbeddingEngine\n ) {}\n\n /**\n * Execute a job based on its type\n */\n async executeJob(jobId: string): Promise<void> {\n const job = this.jobService.getJob(jobId);\n\n if (!job) {\n throw new Error(`Job ${jobId} not found`);\n }\n\n try {\n // Update to running status\n this.jobService.updateJob(jobId, {\n status: 'running',\n message: `Starting ${job.type} operation...`,\n progress: 0,\n details: { startedAt: new Date().toISOString() },\n });\n\n // Execute based on job type\n switch (job.type) {\n case 'clone':\n await this.executeCloneJob(job);\n break;\n case 'index':\n await this.executeIndexJob(job);\n break;\n case 'crawl':\n await this.executeCrawlJob(job);\n break;\n default:\n throw new Error(`Unknown job type: ${String(job.type)}`);\n }\n\n // Mark as completed\n this.jobService.updateJob(jobId, {\n status: 'completed',\n progress: 100,\n message: `${job.type} operation completed successfully`,\n details: { completedAt: new Date().toISOString() },\n });\n } catch (error) {\n // Mark as failed\n const errorDetails: Record<string, unknown> = {\n completedAt: new Date().toISOString(),\n };\n if (error instanceof Error && error.stack !== undefined) {\n errorDetails['error'] = error.stack;\n } else {\n errorDetails['error'] = String(error);\n }\n this.jobService.updateJob(jobId, {\n status: 'failed',\n message: error instanceof Error ? error.message : 'Unknown error',\n details: errorDetails,\n });\n throw error;\n }\n }\n\n /**\n * Execute a clone job (git clone + initial indexing)\n */\n private async executeCloneJob(job: Job): Promise<void> {\n const { storeId } = job.details;\n\n if (storeId === undefined || typeof storeId !== 'string') {\n throw new Error('Store ID required for clone job');\n }\n\n // Get the store\n const store = await this.storeService.get(createStoreId(storeId));\n if (!store) {\n throw new Error(`Store ${storeId} not found`);\n }\n\n // Clone is already done by the time the job is created\n // (happens in StoreService.create), so we just need to index\n\n // Update progress - cloning considered done (30%)\n this.jobService.updateJob(job.id, {\n status: 'running',\n message: 'Repository cloned, starting indexing...',\n progress: 30,\n });\n\n // Index the repository with progress updates\n const result = await this.indexService.indexStore(\n store,\n (event: { type: string; current: number; total: number; message: string }) => {\n // Check if job was cancelled\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n throw new Error('Job cancelled by user');\n }\n\n // Indexing is 70% of total progress (30-100%)\n const indexProgress = calculateIndexProgress(event.current, event.total, 70);\n const totalProgress = 30 + indexProgress;\n\n this.jobService.updateJob(job.id, {\n message: `Indexed ${String(event.current)}/${String(event.total)} files`,\n progress: Math.min(99, totalProgress), // Cap at 99 until fully complete\n details: {\n filesProcessed: event.current,\n totalFiles: event.total,\n },\n });\n }\n );\n\n if (!result.success) {\n throw result.error;\n }\n }\n\n /**\n * Execute an index job (re-indexing existing store)\n */\n private async executeIndexJob(job: Job): Promise<void> {\n const { storeId } = job.details;\n\n if (storeId === undefined || typeof storeId !== 'string') {\n throw new Error('Store ID required for index job');\n }\n\n // Get the store\n const store = await this.storeService.getByIdOrName(createStoreId(storeId));\n if (!store) {\n throw new Error(`Store ${storeId} not found`);\n }\n\n // Index with progress updates\n const result = await this.indexService.indexStore(\n store,\n (event: { type: string; current: number; total: number; message: string }) => {\n // Check if job was cancelled\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n throw new Error('Job cancelled by user');\n }\n\n const progress = calculateIndexProgress(event.current, event.total);\n\n this.jobService.updateJob(job.id, {\n message: `Indexed ${String(event.current)}/${String(event.total)} files`,\n progress: Math.min(99, progress), // Cap at 99 until fully complete\n details: {\n filesProcessed: event.current,\n totalFiles: event.total,\n },\n });\n }\n );\n\n if (!result.success) {\n throw result.error;\n }\n }\n\n /**\n * Execute a crawl job (web crawling + indexing)\n */\n private async executeCrawlJob(job: Job): Promise<void> {\n const { storeId, url, crawlInstruction, extractInstruction, maxPages, simple, useHeadless } =\n job.details;\n\n if (storeId === undefined || typeof storeId !== 'string') {\n throw new Error('Store ID required for crawl job');\n }\n if (url === undefined || typeof url !== 'string') {\n throw new Error('URL required for crawl job');\n }\n\n // Get the store\n const store = await this.storeService.get(createStoreId(storeId));\n if (store?.type !== 'web') {\n throw new Error(`Web store ${storeId} not found`);\n }\n\n const resolvedMaxPages = typeof maxPages === 'number' ? maxPages : 50;\n const crawler = new IntelligentCrawler();\n\n // Listen for progress events\n crawler.on('progress', (progress: CrawlProgress) => {\n // Check if job was cancelled - just return early, for-await loop will throw and finally will cleanup\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n return;\n }\n\n // Crawling is 80% of total progress (0-80%)\n const crawlProgress = (progress.pagesVisited / resolvedMaxPages) * 80;\n\n this.jobService.updateJob(job.id, {\n message:\n progress.message ??\n `Crawling page ${String(progress.pagesVisited)}/${String(resolvedMaxPages)}`,\n progress: Math.min(80, crawlProgress),\n details: { pagesCrawled: progress.pagesVisited },\n });\n });\n\n try {\n await this.lanceStore.initialize(store.id);\n const docs: Document[] = [];\n\n // Build crawl options, only including defined values\n const crawlOptions: {\n maxPages: number;\n simple: boolean;\n useHeadless: boolean;\n crawlInstruction?: string;\n extractInstruction?: string;\n } = {\n maxPages: resolvedMaxPages,\n simple: simple ?? false,\n useHeadless: useHeadless ?? false,\n };\n if (crawlInstruction !== undefined) {\n crawlOptions.crawlInstruction = crawlInstruction;\n }\n if (extractInstruction !== undefined) {\n crawlOptions.extractInstruction = extractInstruction;\n }\n\n // Crawl pages using IntelligentCrawler\n for await (const result of crawler.crawl(url, crawlOptions)) {\n // Check cancellation between pages\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n throw new Error('Job cancelled by user');\n }\n\n // Embed and index the content (use extracted if available, otherwise markdown)\n const contentToEmbed = result.extracted ?? result.markdown;\n const vector = await this.embeddingEngine.embed(contentToEmbed);\n\n docs.push({\n id: createDocumentId(`${store.id}-${createHash('md5').update(result.url).digest('hex')}`),\n content: contentToEmbed,\n vector,\n metadata: {\n type: 'web',\n storeId: store.id,\n url: result.url,\n title: result.title,\n extracted: result.extracted !== undefined,\n depth: result.depth,\n indexedAt: new Date(),\n },\n });\n }\n\n // Index all documents (remaining 20%)\n if (docs.length > 0) {\n this.jobService.updateJob(job.id, {\n message: 'Indexing crawled documents...',\n progress: 85,\n });\n\n await this.lanceStore.addDocuments(store.id, docs);\n // Create FTS index for full-text search\n await this.lanceStore.createFtsIndex(store.id);\n }\n\n this.jobService.updateJob(job.id, {\n message: `Crawled and indexed ${String(docs.length)} pages`,\n progress: 100,\n details: { pagesCrawled: docs.length },\n });\n } finally {\n await crawler.stop();\n }\n }\n}\n","import fs from 'fs';\nimport path from 'path';\n\n/**\n * Result of a PID file delete operation.\n * Delete operations are best-effort and should not throw.\n */\nexport interface PidFileResult {\n success: boolean;\n error?: Error;\n}\n\n/**\n * Context for PID file deletion - indicates when the delete is happening.\n * Used for logging/debugging purposes.\n */\nexport type PidFileDeleteContext = 'sigterm' | 'success' | 'failure';\n\n/**\n * Write PID file - CRITICAL operation that must succeed.\n *\n * If the PID file cannot be written, the job cannot be cancelled through\n * the job management system. This is a critical failure and the job\n * should not proceed.\n *\n * @param pidFile - Absolute path to the PID file\n * @param pid - Process ID to write\n * @throws Error if PID file cannot be written\n */\nexport function writePidFile(pidFile: string, pid: number): void {\n try {\n fs.writeFileSync(pidFile, pid.toString(), 'utf-8');\n } catch (error) {\n const message = error instanceof Error ? error.message : String(error);\n throw new Error(\n `CRITICAL: Failed to write PID file ${pidFile}. ` +\n `Job cannot be cancelled without PID file. ` +\n `Original error: ${message}`\n );\n }\n}\n\n/**\n * Delete PID file - best-effort cleanup during shutdown.\n *\n * This operation should NEVER throw. During process shutdown (SIGTERM,\n * job success, job failure), failing to delete a PID file should not\n * prevent the process from exiting cleanly.\n *\n * Stale PID files are cleaned up by JobService.cleanupOldJobs().\n *\n * @param pidFile - Absolute path to the PID file\n * @param _context - Context indicating when the delete is happening (for future logging)\n * @returns Result indicating success or failure with error details\n */\nexport function deletePidFile(pidFile: string, _context: PidFileDeleteContext): PidFileResult {\n try {\n fs.unlinkSync(pidFile);\n return { success: true };\n } catch (error) {\n // ENOENT = file doesn't exist - that's success (nothing to delete)\n if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {\n return { success: true };\n }\n // Any other error = failure (permission denied, etc.)\n return {\n success: false,\n error: error instanceof Error ? error : new Error(String(error)),\n };\n }\n}\n\n/**\n * Build the path to a PID file for a given job.\n *\n * @param jobsDir - Directory where job files are stored\n * @param jobId - Job identifier\n * @returns Absolute path to the PID file\n */\nexport function buildPidFilePath(jobsDir: string, jobId: string): string {\n return path.join(jobsDir, `${jobId}.pid`);\n}\n","#!/usr/bin/env node\nimport { BackgroundWorker } from './background-worker.js';\nimport { writePidFile, deletePidFile, buildPidFilePath } from './pid-file.js';\nimport { createServices } from '../services/index.js';\nimport { JobService } from '../services/job.service.js';\n\n/**\n * Background worker CLI entry point\n *\n * Usage: background-worker-cli <job-id>\n *\n * This process runs detached from the parent and executes a single job.\n */\n\nasync function main(): Promise<void> {\n const jobId = process.argv[2];\n const dataDir = process.env['BLUERA_DATA_DIR'];\n\n if (jobId === undefined || jobId === '') {\n console.error('Error: Job ID required');\n console.error('Usage: background-worker-cli <job-id>');\n process.exit(1);\n }\n\n // Initialize services\n const jobService = new JobService(dataDir);\n const services = await createServices(undefined, dataDir);\n\n // Write PID file for job cancellation - CRITICAL: must succeed or job cannot be cancelled\n const pidFile = buildPidFilePath(\n jobService['jobsDir'], // Access private field for PID path\n jobId\n );\n\n try {\n writePidFile(pidFile, process.pid);\n } catch (error) {\n // CRITICAL: Cannot proceed without PID file - job would be uncancellable\n console.error(error instanceof Error ? error.message : String(error));\n process.exit(1);\n }\n\n // Handle SIGTERM for graceful shutdown\n process.on('SIGTERM', () => {\n console.log(`[${jobId}] Received SIGTERM, cancelling job...`);\n jobService.updateJob(jobId, {\n status: 'cancelled',\n message: 'Job cancelled by user',\n });\n\n // Clean up PID file (best-effort - don't block shutdown)\n const deleteResult = deletePidFile(pidFile, 'sigterm');\n if (!deleteResult.success && deleteResult.error !== undefined) {\n console.error(\n `Warning: Could not remove PID file during SIGTERM: ${deleteResult.error.message}`\n );\n }\n\n process.exit(0);\n });\n\n // Create worker and execute job\n const worker = new BackgroundWorker(\n jobService,\n services.store,\n services.index,\n services.lance,\n services.embeddings\n );\n\n try {\n await worker.executeJob(jobId);\n\n // Clean up PID file on success (best-effort - don't change exit code)\n const successCleanup = deletePidFile(pidFile, 'success');\n if (!successCleanup.success && successCleanup.error !== undefined) {\n console.error(\n `Warning: Could not remove PID file after success: ${successCleanup.error.message}`\n );\n }\n\n console.log(`[${jobId}] Job completed successfully`);\n process.exit(0);\n } catch (error) {\n // Job service already updated with failure status in BackgroundWorker\n console.error(`[${jobId}] Job failed:`, error);\n\n // Clean up PID file on failure (best-effort - exit code reflects job failure)\n const failureCleanup = deletePidFile(pidFile, 'failure');\n if (!failureCleanup.success && failureCleanup.error !== undefined) {\n console.error(\n `Warning: Could not remove PID file after failure: ${failureCleanup.error.message}`\n );\n }\n\n process.exit(1);\n }\n}\n\nmain().catch((error: unknown) => {\n console.error('Fatal error in background worker:', error);\n process.exit(1);\n});\n"],"mappings":";;;;;;;;;;;;;AAAA,SAAS,kBAAkB;AAkBpB,SAAS,uBACd,SACA,OACA,QAAgB,KACR;AACR,MAAI,UAAU,EAAG,QAAO;AACxB,SAAQ,UAAU,QAAS;AAC7B;AAEO,IAAM,mBAAN,MAAuB;AAAA,EAC5B,YACmB,YACA,cACA,cACA,YACA,iBACjB;AALiB;AACA;AACA;AACA;AACA;AAAA,EAChB;AAAA;AAAA;AAAA;AAAA,EAKH,MAAM,WAAW,OAA8B;AAC7C,UAAM,MAAM,KAAK,WAAW,OAAO,KAAK;AAExC,QAAI,CAAC,KAAK;AACR,YAAM,IAAI,MAAM,OAAO,KAAK,YAAY;AAAA,IAC1C;AAEA,QAAI;AAEF,WAAK,WAAW,UAAU,OAAO;AAAA,QAC/B,QAAQ;AAAA,QACR,SAAS,YAAY,IAAI,IAAI;AAAA,QAC7B,UAAU;AAAA,QACV,SAAS,EAAE,YAAW,oBAAI,KAAK,GAAE,YAAY,EAAE;AAAA,MACjD,CAAC;AAGD,cAAQ,IAAI,MAAM;AAAA,QAChB,KAAK;AACH,gBAAM,KAAK,gBAAgB,GAAG;AAC9B;AAAA,QACF,KAAK;AACH,gBAAM,KAAK,gBAAgB,GAAG;AAC9B;AAAA,QACF,KAAK;AACH,gBAAM,KAAK,gBAAgB,GAAG;AAC9B;AAAA,QACF;AACE,gBAAM,IAAI,MAAM,qBAAqB,OAAO,IAAI,IAAI,CAAC,EAAE;AAAA,MAC3D;AAGA,WAAK,WAAW,UAAU,OAAO;AAAA,QAC/B,QAAQ;AAAA,QACR,UAAU;AAAA,QACV,SAAS,GAAG,IAAI,IAAI;AAAA,QACpB,SAAS,EAAE,cAAa,oBAAI,KAAK,GAAE,YAAY,EAAE;AAAA,MACnD,CAAC;AAAA,IACH,SAAS,OAAO;AAEd,YAAM,eAAwC;AAAA,QAC5C,cAAa,oBAAI,KAAK,GAAE,YAAY;AAAA,MACtC;AACA,UAAI,iBAAiB,SAAS,MAAM,UAAU,QAAW;AACvD,qBAAa,OAAO,IAAI,MAAM;AAAA,MAChC,OAAO;AACL,qBAAa,OAAO,IAAI,OAAO,KAAK;AAAA,MACtC;AACA,WAAK,WAAW,UAAU,OAAO;AAAA,QAC/B,QAAQ;AAAA,QACR,SAAS,iBAAiB,QAAQ,MAAM,UAAU;AAAA,QAClD,SAAS;AAAA,MACX,CAAC;AACD,YAAM;AAAA,IACR;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,MAAc,gBAAgB,KAAyB;AACrD,UAAM,EAAE,QAAQ,IAAI,IAAI;AAExB,QAAI,YAAY,UAAa,OAAO,YAAY,UAAU;AACxD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AAGA,UAAM,QAAQ,MAAM,KAAK,aAAa,IAAI,cAAc,OAAO,CAAC;AAChE,QAAI,CAAC,OAAO;AACV,YAAM,IAAI,MAAM,SAAS,OAAO,YAAY;AAAA,IAC9C;AAMA,SAAK,WAAW,UAAU,IAAI,IAAI;AAAA,MAChC,QAAQ;AAAA,MACR,SAAS;AAAA,MACT,UAAU;AAAA,IACZ,CAAC;AAGD,UAAM,SAAS,MAAM,KAAK,aAAa;AAAA,MACrC;AAAA,MACA,CAAC,UAA6E;AAE5E,cAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,YAAI,YAAY,WAAW,aAAa;AACtC,gBAAM,IAAI,MAAM,uBAAuB;AAAA,QACzC;AAGA,cAAM,gBAAgB,uBAAuB,MAAM,SAAS,MAAM,OAAO,EAAE;AAC3E,cAAM,gBAAgB,KAAK;AAE3B,aAAK,WAAW,UAAU,IAAI,IAAI;AAAA,UAChC,SAAS,WAAW,OAAO,MAAM,OAAO,CAAC,IAAI,OAAO,MAAM,KAAK,CAAC;AAAA,UAChE,UAAU,KAAK,IAAI,IAAI,aAAa;AAAA;AAAA,UACpC,SAAS;AAAA,YACP,gBAAgB,MAAM;AAAA,YACtB,YAAY,MAAM;AAAA,UACpB;AAAA,QACF,CAAC;AAAA,MACH;AAAA,IACF;AAEA,QAAI,CAAC,OAAO,SAAS;AACnB,YAAM,OAAO;AAAA,IACf;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,MAAc,gBAAgB,KAAyB;AACrD,UAAM,EAAE,QAAQ,IAAI,IAAI;AAExB,QAAI,YAAY,UAAa,OAAO,YAAY,UAAU;AACxD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AAGA,UAAM,QAAQ,MAAM,KAAK,aAAa,cAAc,cAAc,OAAO,CAAC;AAC1E,QAAI,CAAC,OAAO;AACV,YAAM,IAAI,MAAM,SAAS,OAAO,YAAY;AAAA,IAC9C;AAGA,UAAM,SAAS,MAAM,KAAK,aAAa;AAAA,MACrC;AAAA,MACA,CAAC,UAA6E;AAE5E,cAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,YAAI,YAAY,WAAW,aAAa;AACtC,gBAAM,IAAI,MAAM,uBAAuB;AAAA,QACzC;AAEA,cAAM,WAAW,uBAAuB,MAAM,SAAS,MAAM,KAAK;AAElE,aAAK,WAAW,UAAU,IAAI,IAAI;AAAA,UAChC,SAAS,WAAW,OAAO,MAAM,OAAO,CAAC,IAAI,OAAO,MAAM,KAAK,CAAC;AAAA,UAChE,UAAU,KAAK,IAAI,IAAI,QAAQ;AAAA;AAAA,UAC/B,SAAS;AAAA,YACP,gBAAgB,MAAM;AAAA,YACtB,YAAY,MAAM;AAAA,UACpB;AAAA,QACF,CAAC;AAAA,MACH;AAAA,IACF;AAEA,QAAI,CAAC,OAAO,SAAS;AACnB,YAAM,OAAO;AAAA,IACf;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,MAAc,gBAAgB,KAAyB;AACrD,UAAM,EAAE,SAAS,KAAK,kBAAkB,oBAAoB,UAAU,QAAQ,YAAY,IACxF,IAAI;AAEN,QAAI,YAAY,UAAa,OAAO,YAAY,UAAU;AACxD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AACA,QAAI,QAAQ,UAAa,OAAO,QAAQ,UAAU;AAChD,YAAM,IAAI,MAAM,4BAA4B;AAAA,IAC9C;AAGA,UAAM,QAAQ,MAAM,KAAK,aAAa,IAAI,cAAc,OAAO,CAAC;AAChE,QAAI,OAAO,SAAS,OAAO;AACzB,YAAM,IAAI,MAAM,aAAa,OAAO,YAAY;AAAA,IAClD;AAEA,UAAM,mBAAmB,OAAO,aAAa,WAAW,WAAW;AACnE,UAAM,UAAU,IAAI,mBAAmB;AAGvC,YAAQ,GAAG,YAAY,CAAC,aAA4B;AAElD,YAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,UAAI,YAAY,WAAW,aAAa;AACtC;AAAA,MACF;AAGA,YAAM,gBAAiB,SAAS,eAAe,mBAAoB;AAEnE,WAAK,WAAW,UAAU,IAAI,IAAI;AAAA,QAChC,SACE,SAAS,WACT,iBAAiB,OAAO,SAAS,YAAY,CAAC,IAAI,OAAO,gBAAgB,CAAC;AAAA,QAC5E,UAAU,KAAK,IAAI,IAAI,aAAa;AAAA,QACpC,SAAS,EAAE,cAAc,SAAS,aAAa;AAAA,MACjD,CAAC;AAAA,IACH,CAAC;AAED,QAAI;AACF,YAAM,KAAK,WAAW,WAAW,MAAM,EAAE;AACzC,YAAM,OAAmB,CAAC;AAG1B,YAAM,eAMF;AAAA,QACF,UAAU;AAAA,QACV,QAAQ,UAAU;AAAA,QAClB,aAAa,eAAe;AAAA,MAC9B;AACA,UAAI,qBAAqB,QAAW;AAClC,qBAAa,mBAAmB;AAAA,MAClC;AACA,UAAI,uBAAuB,QAAW;AACpC,qBAAa,qBAAqB;AAAA,MACpC;AAGA,uBAAiB,UAAU,QAAQ,MAAM,KAAK,YAAY,GAAG;AAE3D,cAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,YAAI,YAAY,WAAW,aAAa;AACtC,gBAAM,IAAI,MAAM,uBAAuB;AAAA,QACzC;AAGA,cAAM,iBAAiB,OAAO,aAAa,OAAO;AAClD,cAAM,SAAS,MAAM,KAAK,gBAAgB,MAAM,cAAc;AAE9D,aAAK,KAAK;AAAA,UACR,IAAI,iBAAiB,GAAG,MAAM,EAAE,IAAI,WAAW,KAAK,EAAE,OAAO,OAAO,GAAG,EAAE,OAAO,KAAK,CAAC,EAAE;AAAA,UACxF,SAAS;AAAA,UACT;AAAA,UACA,UAAU;AAAA,YACR,MAAM;AAAA,YACN,SAAS,MAAM;AAAA,YACf,KAAK,OAAO;AAAA,YACZ,OAAO,OAAO;AAAA,YACd,WAAW,OAAO,cAAc;AAAA,YAChC,OAAO,OAAO;AAAA,YACd,WAAW,oBAAI,KAAK;AAAA,UACtB;AAAA,QACF,CAAC;AAAA,MACH;AAGA,UAAI,KAAK,SAAS,GAAG;AACnB,aAAK,WAAW,UAAU,IAAI,IAAI;AAAA,UAChC,SAAS;AAAA,UACT,UAAU;AAAA,QACZ,CAAC;AAED,cAAM,KAAK,WAAW,aAAa,MAAM,IAAI,IAAI;AAEjD,cAAM,KAAK,WAAW,eAAe,MAAM,EAAE;AAAA,MAC/C;AAEA,WAAK,WAAW,UAAU,IAAI,IAAI;AAAA,QAChC,SAAS,uBAAuB,OAAO,KAAK,MAAM,CAAC;AAAA,QACnD,UAAU;AAAA,QACV,SAAS,EAAE,cAAc,KAAK,OAAO;AAAA,MACvC,CAAC;AAAA,IACH,UAAE;AACA,YAAM,QAAQ,KAAK;AAAA,IACrB;AAAA,EACF;AACF;;;ACvTA,OAAO,QAAQ;AACf,OAAO,UAAU;AA4BV,SAAS,aAAa,SAAiB,KAAmB;AAC/D,MAAI;AACF,OAAG,cAAc,SAAS,IAAI,SAAS,GAAG,OAAO;AAAA,EACnD,SAAS,OAAO;AACd,UAAM,UAAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AACrE,UAAM,IAAI;AAAA,MACR,sCAAsC,OAAO,+DAExB,OAAO;AAAA,IAC9B;AAAA,EACF;AACF;AAeO,SAAS,cAAc,SAAiB,UAA+C;AAC5F,MAAI;AACF,OAAG,WAAW,OAAO;AACrB,WAAO,EAAE,SAAS,KAAK;AAAA,EACzB,SAAS,OAAO;AAEd,QAAI,iBAAiB,SAAS,UAAU,SAAS,MAAM,SAAS,UAAU;AACxE,aAAO,EAAE,SAAS,KAAK;AAAA,IACzB;AAEA,WAAO;AAAA,MACL,SAAS;AAAA,MACT,OAAO,iBAAiB,QAAQ,QAAQ,IAAI,MAAM,OAAO,KAAK,CAAC;AAAA,IACjE;AAAA,EACF;AACF;AASO,SAAS,iBAAiB,SAAiB,OAAuB;AACvE,SAAO,KAAK,KAAK,SAAS,GAAG,KAAK,MAAM;AAC1C;;;ACnEA,eAAe,OAAsB;AACnC,QAAM,QAAQ,QAAQ,KAAK,CAAC;AAC5B,QAAM,UAAU,QAAQ,IAAI,iBAAiB;AAE7C,MAAI,UAAU,UAAa,UAAU,IAAI;AACvC,YAAQ,MAAM,wBAAwB;AACtC,YAAQ,MAAM,uCAAuC;AACrD,YAAQ,KAAK,CAAC;AAAA,EAChB;AAGA,QAAM,aAAa,IAAI,WAAW,OAAO;AACzC,QAAM,WAAW,MAAM,eAAe,QAAW,OAAO;AAGxD,QAAM,UAAU;AAAA,IACd,WAAW,SAAS;AAAA;AAAA,IACpB;AAAA,EACF;AAEA,MAAI;AACF,iBAAa,SAAS,QAAQ,GAAG;AAAA,EACnC,SAAS,OAAO;AAEd,YAAQ,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AACpE,YAAQ,KAAK,CAAC;AAAA,EAChB;AAGA,UAAQ,GAAG,WAAW,MAAM;AAC1B,YAAQ,IAAI,IAAI,KAAK,uCAAuC;AAC5D,eAAW,UAAU,OAAO;AAAA,MAC1B,QAAQ;AAAA,MACR,SAAS;AAAA,IACX,CAAC;AAGD,UAAM,eAAe,cAAc,SAAS,SAAS;AACrD,QAAI,CAAC,aAAa,WAAW,aAAa,UAAU,QAAW;AAC7D,cAAQ;AAAA,QACN,sDAAsD,aAAa,MAAM,OAAO;AAAA,MAClF;AAAA,IACF;AAEA,YAAQ,KAAK,CAAC;AAAA,EAChB,CAAC;AAGD,QAAM,SAAS,IAAI;AAAA,IACjB;AAAA,IACA,SAAS;AAAA,IACT,SAAS;AAAA,IACT,SAAS;AAAA,IACT,SAAS;AAAA,EACX;AAEA,MAAI;AACF,UAAM,OAAO,WAAW,KAAK;AAG7B,UAAM,iBAAiB,cAAc,SAAS,SAAS;AACvD,QAAI,CAAC,eAAe,WAAW,eAAe,UAAU,QAAW;AACjE,cAAQ;AAAA,QACN,qDAAqD,eAAe,MAAM,OAAO;AAAA,MACnF;AAAA,IACF;AAEA,YAAQ,IAAI,IAAI,KAAK,8BAA8B;AACnD,YAAQ,KAAK,CAAC;AAAA,EAChB,SAAS,OAAO;AAEd,YAAQ,MAAM,IAAI,KAAK,iBAAiB,KAAK;AAG7C,UAAM,iBAAiB,cAAc,SAAS,SAAS;AACvD,QAAI,CAAC,eAAe,WAAW,eAAe,UAAU,QAAW;AACjE,cAAQ;AAAA,QACN,qDAAqD,eAAe,MAAM,OAAO;AAAA,MACnF;AAAA,IACF;AAEA,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;AAEA,KAAK,EAAE,MAAM,CAAC,UAAmB;AAC/B,UAAQ,MAAM,qCAAqC,KAAK;AACxD,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
1
+ {"version":3,"sources":["../../src/workers/background-worker.ts","../../src/workers/pid-file.ts","../../src/workers/background-worker-cli.ts"],"sourcesContent":["import { createHash } from 'node:crypto';\nimport { IntelligentCrawler, type CrawlProgress } from '../crawl/intelligent-crawler.js';\nimport { IndexService } from '../services/index.service.js';\nimport { JobService } from '../services/job.service.js';\nimport { StoreService } from '../services/store.service.js';\nimport { createStoreId, createDocumentId } from '../types/brands.js';\nimport type { EmbeddingEngine } from '../db/embeddings.js';\nimport type { LanceStore } from '../db/lance.js';\nimport type { Document } from '../types/document.js';\nimport type { Job } from '../types/job.js';\n\n/**\n * Calculate index progress as a percentage, handling division by zero.\n * @param current - Current number of items processed\n * @param total - Total number of items (may be 0)\n * @param scale - Scale factor for progress (default 100 for 0-100%)\n * @returns Progress value, or 0 if total is 0\n */\nexport function calculateIndexProgress(\n current: number,\n total: number,\n scale: number = 100\n): number {\n if (total === 0) return 0;\n return (current / total) * scale;\n}\n\nexport class BackgroundWorker {\n constructor(\n private readonly jobService: JobService,\n private readonly storeService: StoreService,\n private readonly indexService: IndexService,\n private readonly lanceStore: LanceStore,\n private readonly embeddingEngine: EmbeddingEngine\n ) {}\n\n /**\n * Execute a job based on its type\n */\n async executeJob(jobId: string): Promise<void> {\n const job = this.jobService.getJob(jobId);\n\n if (!job) {\n throw new Error(`Job ${jobId} not found`);\n }\n\n try {\n // Update to running status\n this.jobService.updateJob(jobId, {\n status: 'running',\n message: `Starting ${job.type} operation...`,\n progress: 0,\n details: { startedAt: new Date().toISOString() },\n });\n\n // Execute based on job type\n switch (job.type) {\n case 'clone':\n await this.executeCloneJob(job);\n break;\n case 'index':\n await this.executeIndexJob(job);\n break;\n case 'crawl':\n await this.executeCrawlJob(job);\n break;\n default:\n throw new Error(`Unknown job type: ${String(job.type)}`);\n }\n\n // Mark as completed\n this.jobService.updateJob(jobId, {\n status: 'completed',\n progress: 100,\n message: `${job.type} operation completed successfully`,\n details: { completedAt: new Date().toISOString() },\n });\n } catch (error) {\n // Mark as failed\n const errorDetails: Record<string, unknown> = {\n completedAt: new Date().toISOString(),\n };\n if (error instanceof Error && error.stack !== undefined) {\n errorDetails['error'] = error.stack;\n } else {\n errorDetails['error'] = String(error);\n }\n this.jobService.updateJob(jobId, {\n status: 'failed',\n message: error instanceof Error ? error.message : 'Unknown error',\n details: errorDetails,\n });\n throw error;\n }\n }\n\n /**\n * Execute a clone job (git clone + initial indexing)\n */\n private async executeCloneJob(job: Job): Promise<void> {\n const { storeId } = job.details;\n\n if (storeId === undefined || typeof storeId !== 'string') {\n throw new Error('Store ID required for clone job');\n }\n\n // Get the store\n const store = await this.storeService.get(createStoreId(storeId));\n if (!store) {\n throw new Error(`Store ${storeId} not found`);\n }\n\n // Clone is already done by the time the job is created\n // (happens in StoreService.create), so we just need to index\n\n // Update progress - cloning considered done (30%)\n this.jobService.updateJob(job.id, {\n status: 'running',\n message: 'Repository cloned, starting indexing...',\n progress: 30,\n });\n\n // Index the repository with progress updates\n const result = await this.indexService.indexStore(\n store,\n (event: { type: string; current: number; total: number; message: string }) => {\n // Check if job was cancelled\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n throw new Error('Job cancelled by user');\n }\n\n // Indexing is 70% of total progress (30-100%)\n const indexProgress = calculateIndexProgress(event.current, event.total, 70);\n const totalProgress = 30 + indexProgress;\n\n this.jobService.updateJob(job.id, {\n message: `Indexed ${String(event.current)}/${String(event.total)} files`,\n progress: Math.min(99, totalProgress), // Cap at 99 until fully complete\n details: {\n filesProcessed: event.current,\n totalFiles: event.total,\n },\n });\n }\n );\n\n if (!result.success) {\n throw result.error;\n }\n }\n\n /**\n * Execute an index job (re-indexing existing store)\n */\n private async executeIndexJob(job: Job): Promise<void> {\n const { storeId } = job.details;\n\n if (storeId === undefined || typeof storeId !== 'string') {\n throw new Error('Store ID required for index job');\n }\n\n // Get the store\n const store = await this.storeService.getByIdOrName(createStoreId(storeId));\n if (!store) {\n throw new Error(`Store ${storeId} not found`);\n }\n\n // Index with progress updates\n const result = await this.indexService.indexStore(\n store,\n (event: { type: string; current: number; total: number; message: string }) => {\n // Check if job was cancelled\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n throw new Error('Job cancelled by user');\n }\n\n const progress = calculateIndexProgress(event.current, event.total);\n\n this.jobService.updateJob(job.id, {\n message: `Indexed ${String(event.current)}/${String(event.total)} files`,\n progress: Math.min(99, progress), // Cap at 99 until fully complete\n details: {\n filesProcessed: event.current,\n totalFiles: event.total,\n },\n });\n }\n );\n\n if (!result.success) {\n throw result.error;\n }\n }\n\n /**\n * Execute a crawl job (web crawling + indexing)\n */\n private async executeCrawlJob(job: Job): Promise<void> {\n const { storeId, url, crawlInstruction, extractInstruction, maxPages, simple, useHeadless } =\n job.details;\n\n if (storeId === undefined || typeof storeId !== 'string') {\n throw new Error('Store ID required for crawl job');\n }\n if (url === undefined || typeof url !== 'string') {\n throw new Error('URL required for crawl job');\n }\n\n // Get the store\n const store = await this.storeService.get(createStoreId(storeId));\n if (store?.type !== 'web') {\n throw new Error(`Web store ${storeId} not found`);\n }\n\n const resolvedMaxPages = typeof maxPages === 'number' ? maxPages : 50;\n const crawler = new IntelligentCrawler();\n\n // Listen for progress events\n crawler.on('progress', (progress: CrawlProgress) => {\n // Check if job was cancelled - just return early, for-await loop will throw and finally will cleanup\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n return;\n }\n\n // Crawling is 80% of total progress (0-80%)\n const crawlProgress = (progress.pagesVisited / resolvedMaxPages) * 80;\n\n this.jobService.updateJob(job.id, {\n message:\n progress.message ??\n `Crawling page ${String(progress.pagesVisited)}/${String(resolvedMaxPages)}`,\n progress: Math.min(80, crawlProgress),\n details: { pagesCrawled: progress.pagesVisited },\n });\n });\n\n try {\n await this.lanceStore.initialize(store.id);\n const docs: Document[] = [];\n\n // Build crawl options, only including defined values\n const crawlOptions: {\n maxPages: number;\n simple: boolean;\n useHeadless: boolean;\n crawlInstruction?: string;\n extractInstruction?: string;\n } = {\n maxPages: resolvedMaxPages,\n simple: simple ?? false,\n useHeadless: useHeadless ?? true, // Default to headless for reliability\n };\n if (crawlInstruction !== undefined) {\n crawlOptions.crawlInstruction = crawlInstruction;\n }\n if (extractInstruction !== undefined) {\n crawlOptions.extractInstruction = extractInstruction;\n }\n\n // Crawl pages using IntelligentCrawler\n for await (const result of crawler.crawl(url, crawlOptions)) {\n // Check cancellation between pages\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n throw new Error('Job cancelled by user');\n }\n\n // Embed and index the content (use extracted if available, otherwise markdown)\n const contentToEmbed = result.extracted ?? result.markdown;\n const vector = await this.embeddingEngine.embed(contentToEmbed);\n\n docs.push({\n id: createDocumentId(`${store.id}-${createHash('md5').update(result.url).digest('hex')}`),\n content: contentToEmbed,\n vector,\n metadata: {\n type: 'web',\n storeId: store.id,\n url: result.url,\n title: result.title,\n extracted: result.extracted !== undefined,\n depth: result.depth,\n indexedAt: new Date(),\n },\n });\n }\n\n // Index all documents (remaining 20%)\n if (docs.length > 0) {\n this.jobService.updateJob(job.id, {\n message: 'Indexing crawled documents...',\n progress: 85,\n });\n\n await this.lanceStore.addDocuments(store.id, docs);\n // Create FTS index for full-text search\n await this.lanceStore.createFtsIndex(store.id);\n }\n\n this.jobService.updateJob(job.id, {\n message: `Crawled and indexed ${String(docs.length)} pages`,\n progress: 100,\n details: { pagesCrawled: docs.length },\n });\n } finally {\n await crawler.stop();\n }\n }\n}\n","import fs from 'fs';\nimport path from 'path';\n\n/**\n * Result of a PID file delete operation.\n * Delete operations are best-effort and should not throw.\n */\nexport interface PidFileResult {\n success: boolean;\n error?: Error;\n}\n\n/**\n * Context for PID file deletion - indicates when the delete is happening.\n * Used for logging/debugging purposes.\n */\nexport type PidFileDeleteContext = 'sigterm' | 'success' | 'failure';\n\n/**\n * Write PID file - CRITICAL operation that must succeed.\n *\n * If the PID file cannot be written, the job cannot be cancelled through\n * the job management system. This is a critical failure and the job\n * should not proceed.\n *\n * @param pidFile - Absolute path to the PID file\n * @param pid - Process ID to write\n * @throws Error if PID file cannot be written\n */\nexport function writePidFile(pidFile: string, pid: number): void {\n try {\n fs.writeFileSync(pidFile, pid.toString(), 'utf-8');\n } catch (error) {\n const message = error instanceof Error ? error.message : String(error);\n throw new Error(\n `CRITICAL: Failed to write PID file ${pidFile}. ` +\n `Job cannot be cancelled without PID file. ` +\n `Original error: ${message}`\n );\n }\n}\n\n/**\n * Delete PID file - best-effort cleanup during shutdown.\n *\n * This operation should NEVER throw. During process shutdown (SIGTERM,\n * job success, job failure), failing to delete a PID file should not\n * prevent the process from exiting cleanly.\n *\n * Stale PID files are cleaned up by JobService.cleanupOldJobs().\n *\n * @param pidFile - Absolute path to the PID file\n * @param _context - Context indicating when the delete is happening (for future logging)\n * @returns Result indicating success or failure with error details\n */\nexport function deletePidFile(pidFile: string, _context: PidFileDeleteContext): PidFileResult {\n try {\n fs.unlinkSync(pidFile);\n return { success: true };\n } catch (error) {\n // ENOENT = file doesn't exist - that's success (nothing to delete)\n if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {\n return { success: true };\n }\n // Any other error = failure (permission denied, etc.)\n return {\n success: false,\n error: error instanceof Error ? error : new Error(String(error)),\n };\n }\n}\n\n/**\n * Build the path to a PID file for a given job.\n *\n * @param jobsDir - Directory where job files are stored\n * @param jobId - Job identifier\n * @returns Absolute path to the PID file\n */\nexport function buildPidFilePath(jobsDir: string, jobId: string): string {\n return path.join(jobsDir, `${jobId}.pid`);\n}\n","#!/usr/bin/env node\nimport { BackgroundWorker } from './background-worker.js';\nimport { writePidFile, deletePidFile, buildPidFilePath } from './pid-file.js';\nimport { createServices } from '../services/index.js';\nimport { JobService } from '../services/job.service.js';\n\n/**\n * Background worker CLI entry point\n *\n * Usage: background-worker-cli <job-id>\n *\n * This process runs detached from the parent and executes a single job.\n */\n\nasync function main(): Promise<void> {\n const jobId = process.argv[2];\n const dataDir = process.env['BLUERA_DATA_DIR'];\n\n if (jobId === undefined || jobId === '') {\n console.error('Error: Job ID required');\n console.error('Usage: background-worker-cli <job-id>');\n process.exit(1);\n }\n\n // Initialize services\n const jobService = new JobService(dataDir);\n const services = await createServices(undefined, dataDir);\n\n // Write PID file for job cancellation - CRITICAL: must succeed or job cannot be cancelled\n const pidFile = buildPidFilePath(\n jobService['jobsDir'], // Access private field for PID path\n jobId\n );\n\n try {\n writePidFile(pidFile, process.pid);\n } catch (error) {\n // CRITICAL: Cannot proceed without PID file - job would be uncancellable\n console.error(error instanceof Error ? error.message : String(error));\n process.exit(1);\n }\n\n // Handle SIGTERM for graceful shutdown\n process.on('SIGTERM', () => {\n console.log(`[${jobId}] Received SIGTERM, cancelling job...`);\n jobService.updateJob(jobId, {\n status: 'cancelled',\n message: 'Job cancelled by user',\n });\n\n // Clean up PID file (best-effort - don't block shutdown)\n const deleteResult = deletePidFile(pidFile, 'sigterm');\n if (!deleteResult.success && deleteResult.error !== undefined) {\n console.error(\n `Warning: Could not remove PID file during SIGTERM: ${deleteResult.error.message}`\n );\n }\n\n process.exit(0);\n });\n\n // Create worker and execute job\n const worker = new BackgroundWorker(\n jobService,\n services.store,\n services.index,\n services.lance,\n services.embeddings\n );\n\n try {\n await worker.executeJob(jobId);\n\n // Clean up PID file on success (best-effort - don't change exit code)\n const successCleanup = deletePidFile(pidFile, 'success');\n if (!successCleanup.success && successCleanup.error !== undefined) {\n console.error(\n `Warning: Could not remove PID file after success: ${successCleanup.error.message}`\n );\n }\n\n console.log(`[${jobId}] Job completed successfully`);\n process.exit(0);\n } catch (error) {\n // Job service already updated with failure status in BackgroundWorker\n console.error(`[${jobId}] Job failed:`, error);\n\n // Clean up PID file on failure (best-effort - exit code reflects job failure)\n const failureCleanup = deletePidFile(pidFile, 'failure');\n if (!failureCleanup.success && failureCleanup.error !== undefined) {\n console.error(\n `Warning: Could not remove PID file after failure: ${failureCleanup.error.message}`\n );\n }\n\n process.exit(1);\n }\n}\n\nmain().catch((error: unknown) => {\n console.error('Fatal error in background worker:', error);\n process.exit(1);\n});\n"],"mappings":";;;;;;;;;;;;;AAAA,SAAS,kBAAkB;AAkBpB,SAAS,uBACd,SACA,OACA,QAAgB,KACR;AACR,MAAI,UAAU,EAAG,QAAO;AACxB,SAAQ,UAAU,QAAS;AAC7B;AAEO,IAAM,mBAAN,MAAuB;AAAA,EAC5B,YACmB,YACA,cACA,cACA,YACA,iBACjB;AALiB;AACA;AACA;AACA;AACA;AAAA,EAChB;AAAA;AAAA;AAAA;AAAA,EAKH,MAAM,WAAW,OAA8B;AAC7C,UAAM,MAAM,KAAK,WAAW,OAAO,KAAK;AAExC,QAAI,CAAC,KAAK;AACR,YAAM,IAAI,MAAM,OAAO,KAAK,YAAY;AAAA,IAC1C;AAEA,QAAI;AAEF,WAAK,WAAW,UAAU,OAAO;AAAA,QAC/B,QAAQ;AAAA,QACR,SAAS,YAAY,IAAI,IAAI;AAAA,QAC7B,UAAU;AAAA,QACV,SAAS,EAAE,YAAW,oBAAI,KAAK,GAAE,YAAY,EAAE;AAAA,MACjD,CAAC;AAGD,cAAQ,IAAI,MAAM;AAAA,QAChB,KAAK;AACH,gBAAM,KAAK,gBAAgB,GAAG;AAC9B;AAAA,QACF,KAAK;AACH,gBAAM,KAAK,gBAAgB,GAAG;AAC9B;AAAA,QACF,KAAK;AACH,gBAAM,KAAK,gBAAgB,GAAG;AAC9B;AAAA,QACF;AACE,gBAAM,IAAI,MAAM,qBAAqB,OAAO,IAAI,IAAI,CAAC,EAAE;AAAA,MAC3D;AAGA,WAAK,WAAW,UAAU,OAAO;AAAA,QAC/B,QAAQ;AAAA,QACR,UAAU;AAAA,QACV,SAAS,GAAG,IAAI,IAAI;AAAA,QACpB,SAAS,EAAE,cAAa,oBAAI,KAAK,GAAE,YAAY,EAAE;AAAA,MACnD,CAAC;AAAA,IACH,SAAS,OAAO;AAEd,YAAM,eAAwC;AAAA,QAC5C,cAAa,oBAAI,KAAK,GAAE,YAAY;AAAA,MACtC;AACA,UAAI,iBAAiB,SAAS,MAAM,UAAU,QAAW;AACvD,qBAAa,OAAO,IAAI,MAAM;AAAA,MAChC,OAAO;AACL,qBAAa,OAAO,IAAI,OAAO,KAAK;AAAA,MACtC;AACA,WAAK,WAAW,UAAU,OAAO;AAAA,QAC/B,QAAQ;AAAA,QACR,SAAS,iBAAiB,QAAQ,MAAM,UAAU;AAAA,QAClD,SAAS;AAAA,MACX,CAAC;AACD,YAAM;AAAA,IACR;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,MAAc,gBAAgB,KAAyB;AACrD,UAAM,EAAE,QAAQ,IAAI,IAAI;AAExB,QAAI,YAAY,UAAa,OAAO,YAAY,UAAU;AACxD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AAGA,UAAM,QAAQ,MAAM,KAAK,aAAa,IAAI,cAAc,OAAO,CAAC;AAChE,QAAI,CAAC,OAAO;AACV,YAAM,IAAI,MAAM,SAAS,OAAO,YAAY;AAAA,IAC9C;AAMA,SAAK,WAAW,UAAU,IAAI,IAAI;AAAA,MAChC,QAAQ;AAAA,MACR,SAAS;AAAA,MACT,UAAU;AAAA,IACZ,CAAC;AAGD,UAAM,SAAS,MAAM,KAAK,aAAa;AAAA,MACrC;AAAA,MACA,CAAC,UAA6E;AAE5E,cAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,YAAI,YAAY,WAAW,aAAa;AACtC,gBAAM,IAAI,MAAM,uBAAuB;AAAA,QACzC;AAGA,cAAM,gBAAgB,uBAAuB,MAAM,SAAS,MAAM,OAAO,EAAE;AAC3E,cAAM,gBAAgB,KAAK;AAE3B,aAAK,WAAW,UAAU,IAAI,IAAI;AAAA,UAChC,SAAS,WAAW,OAAO,MAAM,OAAO,CAAC,IAAI,OAAO,MAAM,KAAK,CAAC;AAAA,UAChE,UAAU,KAAK,IAAI,IAAI,aAAa;AAAA;AAAA,UACpC,SAAS;AAAA,YACP,gBAAgB,MAAM;AAAA,YACtB,YAAY,MAAM;AAAA,UACpB;AAAA,QACF,CAAC;AAAA,MACH;AAAA,IACF;AAEA,QAAI,CAAC,OAAO,SAAS;AACnB,YAAM,OAAO;AAAA,IACf;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,MAAc,gBAAgB,KAAyB;AACrD,UAAM,EAAE,QAAQ,IAAI,IAAI;AAExB,QAAI,YAAY,UAAa,OAAO,YAAY,UAAU;AACxD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AAGA,UAAM,QAAQ,MAAM,KAAK,aAAa,cAAc,cAAc,OAAO,CAAC;AAC1E,QAAI,CAAC,OAAO;AACV,YAAM,IAAI,MAAM,SAAS,OAAO,YAAY;AAAA,IAC9C;AAGA,UAAM,SAAS,MAAM,KAAK,aAAa;AAAA,MACrC;AAAA,MACA,CAAC,UAA6E;AAE5E,cAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,YAAI,YAAY,WAAW,aAAa;AACtC,gBAAM,IAAI,MAAM,uBAAuB;AAAA,QACzC;AAEA,cAAM,WAAW,uBAAuB,MAAM,SAAS,MAAM,KAAK;AAElE,aAAK,WAAW,UAAU,IAAI,IAAI;AAAA,UAChC,SAAS,WAAW,OAAO,MAAM,OAAO,CAAC,IAAI,OAAO,MAAM,KAAK,CAAC;AAAA,UAChE,UAAU,KAAK,IAAI,IAAI,QAAQ;AAAA;AAAA,UAC/B,SAAS;AAAA,YACP,gBAAgB,MAAM;AAAA,YACtB,YAAY,MAAM;AAAA,UACpB;AAAA,QACF,CAAC;AAAA,MACH;AAAA,IACF;AAEA,QAAI,CAAC,OAAO,SAAS;AACnB,YAAM,OAAO;AAAA,IACf;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,MAAc,gBAAgB,KAAyB;AACrD,UAAM,EAAE,SAAS,KAAK,kBAAkB,oBAAoB,UAAU,QAAQ,YAAY,IACxF,IAAI;AAEN,QAAI,YAAY,UAAa,OAAO,YAAY,UAAU;AACxD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AACA,QAAI,QAAQ,UAAa,OAAO,QAAQ,UAAU;AAChD,YAAM,IAAI,MAAM,4BAA4B;AAAA,IAC9C;AAGA,UAAM,QAAQ,MAAM,KAAK,aAAa,IAAI,cAAc,OAAO,CAAC;AAChE,QAAI,OAAO,SAAS,OAAO;AACzB,YAAM,IAAI,MAAM,aAAa,OAAO,YAAY;AAAA,IAClD;AAEA,UAAM,mBAAmB,OAAO,aAAa,WAAW,WAAW;AACnE,UAAM,UAAU,IAAI,mBAAmB;AAGvC,YAAQ,GAAG,YAAY,CAAC,aAA4B;AAElD,YAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,UAAI,YAAY,WAAW,aAAa;AACtC;AAAA,MACF;AAGA,YAAM,gBAAiB,SAAS,eAAe,mBAAoB;AAEnE,WAAK,WAAW,UAAU,IAAI,IAAI;AAAA,QAChC,SACE,SAAS,WACT,iBAAiB,OAAO,SAAS,YAAY,CAAC,IAAI,OAAO,gBAAgB,CAAC;AAAA,QAC5E,UAAU,KAAK,IAAI,IAAI,aAAa;AAAA,QACpC,SAAS,EAAE,cAAc,SAAS,aAAa;AAAA,MACjD,CAAC;AAAA,IACH,CAAC;AAED,QAAI;AACF,YAAM,KAAK,WAAW,WAAW,MAAM,EAAE;AACzC,YAAM,OAAmB,CAAC;AAG1B,YAAM,eAMF;AAAA,QACF,UAAU;AAAA,QACV,QAAQ,UAAU;AAAA,QAClB,aAAa,eAAe;AAAA;AAAA,MAC9B;AACA,UAAI,qBAAqB,QAAW;AAClC,qBAAa,mBAAmB;AAAA,MAClC;AACA,UAAI,uBAAuB,QAAW;AACpC,qBAAa,qBAAqB;AAAA,MACpC;AAGA,uBAAiB,UAAU,QAAQ,MAAM,KAAK,YAAY,GAAG;AAE3D,cAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,YAAI,YAAY,WAAW,aAAa;AACtC,gBAAM,IAAI,MAAM,uBAAuB;AAAA,QACzC;AAGA,cAAM,iBAAiB,OAAO,aAAa,OAAO;AAClD,cAAM,SAAS,MAAM,KAAK,gBAAgB,MAAM,cAAc;AAE9D,aAAK,KAAK;AAAA,UACR,IAAI,iBAAiB,GAAG,MAAM,EAAE,IAAI,WAAW,KAAK,EAAE,OAAO,OAAO,GAAG,EAAE,OAAO,KAAK,CAAC,EAAE;AAAA,UACxF,SAAS;AAAA,UACT;AAAA,UACA,UAAU;AAAA,YACR,MAAM;AAAA,YACN,SAAS,MAAM;AAAA,YACf,KAAK,OAAO;AAAA,YACZ,OAAO,OAAO;AAAA,YACd,WAAW,OAAO,cAAc;AAAA,YAChC,OAAO,OAAO;AAAA,YACd,WAAW,oBAAI,KAAK;AAAA,UACtB;AAAA,QACF,CAAC;AAAA,MACH;AAGA,UAAI,KAAK,SAAS,GAAG;AACnB,aAAK,WAAW,UAAU,IAAI,IAAI;AAAA,UAChC,SAAS;AAAA,UACT,UAAU;AAAA,QACZ,CAAC;AAED,cAAM,KAAK,WAAW,aAAa,MAAM,IAAI,IAAI;AAEjD,cAAM,KAAK,WAAW,eAAe,MAAM,EAAE;AAAA,MAC/C;AAEA,WAAK,WAAW,UAAU,IAAI,IAAI;AAAA,QAChC,SAAS,uBAAuB,OAAO,KAAK,MAAM,CAAC;AAAA,QACnD,UAAU;AAAA,QACV,SAAS,EAAE,cAAc,KAAK,OAAO;AAAA,MACvC,CAAC;AAAA,IACH,UAAE;AACA,YAAM,QAAQ,KAAK;AAAA,IACrB;AAAA,EACF;AACF;;;ACvTA,OAAO,QAAQ;AACf,OAAO,UAAU;AA4BV,SAAS,aAAa,SAAiB,KAAmB;AAC/D,MAAI;AACF,OAAG,cAAc,SAAS,IAAI,SAAS,GAAG,OAAO;AAAA,EACnD,SAAS,OAAO;AACd,UAAM,UAAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AACrE,UAAM,IAAI;AAAA,MACR,sCAAsC,OAAO,+DAExB,OAAO;AAAA,IAC9B;AAAA,EACF;AACF;AAeO,SAAS,cAAc,SAAiB,UAA+C;AAC5F,MAAI;AACF,OAAG,WAAW,OAAO;AACrB,WAAO,EAAE,SAAS,KAAK;AAAA,EACzB,SAAS,OAAO;AAEd,QAAI,iBAAiB,SAAS,UAAU,SAAS,MAAM,SAAS,UAAU;AACxE,aAAO,EAAE,SAAS,KAAK;AAAA,IACzB;AAEA,WAAO;AAAA,MACL,SAAS;AAAA,MACT,OAAO,iBAAiB,QAAQ,QAAQ,IAAI,MAAM,OAAO,KAAK,CAAC;AAAA,IACjE;AAAA,EACF;AACF;AASO,SAAS,iBAAiB,SAAiB,OAAuB;AACvE,SAAO,KAAK,KAAK,SAAS,GAAG,KAAK,MAAM;AAC1C;;;ACnEA,eAAe,OAAsB;AACnC,QAAM,QAAQ,QAAQ,KAAK,CAAC;AAC5B,QAAM,UAAU,QAAQ,IAAI,iBAAiB;AAE7C,MAAI,UAAU,UAAa,UAAU,IAAI;AACvC,YAAQ,MAAM,wBAAwB;AACtC,YAAQ,MAAM,uCAAuC;AACrD,YAAQ,KAAK,CAAC;AAAA,EAChB;AAGA,QAAM,aAAa,IAAI,WAAW,OAAO;AACzC,QAAM,WAAW,MAAM,eAAe,QAAW,OAAO;AAGxD,QAAM,UAAU;AAAA,IACd,WAAW,SAAS;AAAA;AAAA,IACpB;AAAA,EACF;AAEA,MAAI;AACF,iBAAa,SAAS,QAAQ,GAAG;AAAA,EACnC,SAAS,OAAO;AAEd,YAAQ,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AACpE,YAAQ,KAAK,CAAC;AAAA,EAChB;AAGA,UAAQ,GAAG,WAAW,MAAM;AAC1B,YAAQ,IAAI,IAAI,KAAK,uCAAuC;AAC5D,eAAW,UAAU,OAAO;AAAA,MAC1B,QAAQ;AAAA,MACR,SAAS;AAAA,IACX,CAAC;AAGD,UAAM,eAAe,cAAc,SAAS,SAAS;AACrD,QAAI,CAAC,aAAa,WAAW,aAAa,UAAU,QAAW;AAC7D,cAAQ;AAAA,QACN,sDAAsD,aAAa,MAAM,OAAO;AAAA,MAClF;AAAA,IACF;AAEA,YAAQ,KAAK,CAAC;AAAA,EAChB,CAAC;AAGD,QAAM,SAAS,IAAI;AAAA,IACjB;AAAA,IACA,SAAS;AAAA,IACT,SAAS;AAAA,IACT,SAAS;AAAA,IACT,SAAS;AAAA,EACX;AAEA,MAAI;AACF,UAAM,OAAO,WAAW,KAAK;AAG7B,UAAM,iBAAiB,cAAc,SAAS,SAAS;AACvD,QAAI,CAAC,eAAe,WAAW,eAAe,UAAU,QAAW;AACjE,cAAQ;AAAA,QACN,qDAAqD,eAAe,MAAM,OAAO;AAAA,MACnF;AAAA,IACF;AAEA,YAAQ,IAAI,IAAI,KAAK,8BAA8B;AACnD,YAAQ,KAAK,CAAC;AAAA,EAChB,SAAS,OAAO;AAEd,YAAQ,MAAM,IAAI,KAAK,iBAAiB,KAAK;AAG7C,UAAM,iBAAiB,cAAc,SAAS,SAAS;AACvD,QAAI,CAAC,eAAe,WAAW,eAAe,UAAU,QAAW;AACjE,cAAQ;AAAA,QACN,qDAAqD,eAAe,MAAM,OAAO;AAAA,MACnF;AAAA,IACF;AAEA,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;AAEA,KAAK,EAAE,MAAM,CAAC,UAAmB;AAC/B,UAAQ,MAAM,qCAAqC,KAAK;AACxD,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
@@ -13,6 +13,29 @@ GREEN='\033[0;32m'
13
13
  YELLOW='\033[1;33m'
14
14
  NC='\033[0m' # No Color
15
15
 
16
+ # =====================
17
+ # Helper Functions
18
+ # =====================
19
+
20
+ # Install Playwright browser (called after crawl4ai is confirmed installed)
21
+ install_playwright_browser() {
22
+ # Check if Playwright Chromium is already installed by testing if browser can be launched
23
+ if python3 -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); b = p.chromium.launch(); b.close(); p.stop()" 2>/dev/null; then
24
+ echo -e "${GREEN}[bluera-knowledge] Playwright Chromium ready ✓${NC}"
25
+ return 0
26
+ fi
27
+
28
+ echo -e "${YELLOW}[bluera-knowledge] Installing Playwright browser (one-time setup)...${NC}"
29
+ if python3 -m playwright install chromium 2>/dev/null; then
30
+ echo -e "${GREEN}[bluera-knowledge] Playwright Chromium installed ✓${NC}"
31
+ return 0
32
+ else
33
+ echo -e "${YELLOW}[bluera-knowledge] Playwright browser install failed.${NC}"
34
+ echo -e "${YELLOW}Manual fix: python3 -m playwright install chromium${NC}"
35
+ return 1
36
+ fi
37
+ }
38
+
16
39
  # =====================
17
40
  # Node.js Dependencies
18
41
  # =====================
@@ -60,6 +83,8 @@ if python3 -c "import crawl4ai" 2>/dev/null; then
60
83
  # Already installed - get version
61
84
  crawl4ai_version=$(python3 -c "import crawl4ai; print(crawl4ai.__version__)" 2>/dev/null || echo "unknown")
62
85
  echo -e "${GREEN}[bluera-knowledge] crawl4ai ${crawl4ai_version} is installed ✓${NC}"
86
+ # Ensure Playwright browser is installed for headless crawling
87
+ install_playwright_browser
63
88
  exit 0
64
89
  fi
65
90
 
@@ -87,12 +112,16 @@ if command -v pip3 &> /dev/null || command -v pip &> /dev/null; then
87
112
  echo -e "${GREEN}[bluera-knowledge] Successfully installed crawl4ai ✓${NC}"
88
113
  crawl4ai_version=$(python3 -c "import crawl4ai; print(crawl4ai.__version__)" 2>/dev/null || echo "installed")
89
114
  echo -e "${GREEN}[bluera-knowledge] crawl4ai ${crawl4ai_version} ready${NC}"
115
+ # Install Playwright browser for headless crawling
116
+ install_playwright_browser
90
117
  else
91
118
  # Fallback: try without --break-system-packages for older Python
92
119
  if $PIP_CMD install --quiet --user crawl4ai 2>/dev/null; then
93
120
  echo -e "${GREEN}[bluera-knowledge] Successfully installed crawl4ai ✓${NC}"
94
121
  crawl4ai_version=$(python3 -c "import crawl4ai; print(crawl4ai.__version__)" 2>/dev/null || echo "installed")
95
122
  echo -e "${GREEN}[bluera-knowledge] crawl4ai ${crawl4ai_version} ready${NC}"
123
+ # Install Playwright browser for headless crawling
124
+ install_playwright_browser
96
125
  else
97
126
  echo -e "${RED}[bluera-knowledge] Auto-installation failed${NC}"
98
127
  echo ""
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "bluera-knowledge",
3
- "version": "0.9.42",
3
+ "version": "0.10.0",
4
4
  "description": "CLI tool for managing knowledge stores with semantic search",
5
5
  "type": "module",
6
6
  "bin": {
@@ -29,10 +29,15 @@ async def fetch_headless(url: str):
29
29
  if not result.success:
30
30
  raise Exception(f"Crawl failed: {result.error_message}")
31
31
 
32
+ # Combine internal and external links - let TypeScript filter by domain
33
+ all_links = []
34
+ if isinstance(result.links, dict):
35
+ all_links = result.links.get("internal", []) + result.links.get("external", [])
36
+
32
37
  return {
33
38
  "html": result.html or '',
34
39
  "markdown": result.markdown or result.cleaned_html or '',
35
- "links": result.links.get("internal", []) if isinstance(result.links, dict) else []
40
+ "links": all_links
36
41
  }
37
42
 
38
43
  def is_exported(node: ast.AST) -> bool:
@@ -132,7 +132,7 @@ describe('crawl command execution', () => {
132
132
  expect(mockCrawler.crawl).toHaveBeenCalledWith('https://example.com', {
133
133
  crawlInstruction: 'all documentation pages',
134
134
  maxPages: 50,
135
- useHeadless: false,
135
+ useHeadless: true,
136
136
  });
137
137
  expect(mockServices.embeddings.embed).toHaveBeenCalledTimes(2);
138
138
  expect(mockServices.lance.addDocuments).toHaveBeenCalledWith(
@@ -162,6 +162,46 @@ describe('crawl command execution', () => {
162
162
  expect(processExitSpy).not.toHaveBeenCalled();
163
163
  });
164
164
 
165
+ it('uses axios-only mode when --fast flag is provided', async () => {
166
+ const mockStore: WebStore = {
167
+ id: createStoreId('store-1'),
168
+ name: 'test-store',
169
+ type: 'web',
170
+ url: 'https://example.com',
171
+ depth: 2,
172
+ createdAt: new Date(),
173
+ updatedAt: new Date(),
174
+ };
175
+
176
+ mockServices.store.getByIdOrName.mockResolvedValue(mockStore);
177
+ mockServices.lance.initialize.mockResolvedValue(undefined);
178
+ mockServices.embeddings.embed.mockResolvedValue([0.1, 0.2, 0.3]);
179
+ mockServices.lance.addDocuments.mockResolvedValue(undefined);
180
+
181
+ mockCrawler.crawl.mockReturnValue(
182
+ (async function* () {
183
+ yield {
184
+ url: 'https://example.com/page1',
185
+ title: 'Page 1',
186
+ markdown: '# Page 1',
187
+ depth: 0,
188
+ };
189
+ })()
190
+ );
191
+
192
+ const command = createCrawlCommand(getOptions);
193
+ command.parseOptions(['--fast', '--max-pages', '25']);
194
+ const actionHandler = command._actionHandler;
195
+
196
+ await actionHandler(['https://example.com', 'test-store']);
197
+
198
+ expect(mockCrawler.crawl).toHaveBeenCalledWith('https://example.com', {
199
+ maxPages: 25,
200
+ useHeadless: false, // --fast disables headless
201
+ });
202
+ expect(mockCrawler.stop).toHaveBeenCalled();
203
+ });
204
+
165
205
  it('successfully crawls in simple mode', async () => {
166
206
  const mockStore: WebStore = {
167
207
  id: createStoreId('store-1'),
@@ -198,7 +238,7 @@ describe('crawl command execution', () => {
198
238
  expect(mockCrawler.crawl).toHaveBeenCalledWith('https://example.com', {
199
239
  maxPages: 25,
200
240
  simple: true,
201
- useHeadless: false,
241
+ useHeadless: true,
202
242
  });
203
243
  expect(mockCrawler.stop).toHaveBeenCalled();
204
244
  });
@@ -295,7 +335,7 @@ describe('crawl command execution', () => {
295
335
  crawlInstruction: 'all Getting Started pages',
296
336
  extractInstruction: 'code examples',
297
337
  maxPages: 100,
298
- useHeadless: false,
338
+ useHeadless: true,
299
339
  });
300
340
  });
301
341
  });
@@ -25,7 +25,7 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
25
25
  )
26
26
  .option('--simple', 'Use simple BFS mode instead of intelligent crawling')
27
27
  .option('--max-pages <number>', 'Maximum number of pages to crawl', '50')
28
- .option('--headless', 'Use headless browser for JavaScript-rendered sites')
28
+ .option('--fast', 'Use fast axios-only mode (may fail on JavaScript-heavy sites)')
29
29
  .action(
30
30
  async (
31
31
  url: string,
@@ -35,7 +35,7 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
35
35
  extract?: string;
36
36
  simple?: boolean;
37
37
  maxPages?: string;
38
- headless?: boolean;
38
+ fast?: boolean;
39
39
  }
40
40
  ) => {
41
41
  const globalOpts = getOptions();
@@ -125,7 +125,7 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
125
125
  ...(cmdOptions.extract !== undefined && { extractInstruction: cmdOptions.extract }),
126
126
  maxPages,
127
127
  ...(cmdOptions.simple !== undefined && { simple: cmdOptions.simple }),
128
- useHeadless: cmdOptions.headless ?? false,
128
+ useHeadless: !(cmdOptions.fast ?? false), // Default true (headless), --fast disables
129
129
  })) {
130
130
  // Use extracted content if available, otherwise markdown
131
131
  const contentToProcess = result.extracted ?? result.markdown;
@@ -81,7 +81,11 @@ describe('ClaudeClient', () => {
81
81
 
82
82
  describe('determineCrawlUrls', () => {
83
83
  it('should successfully parse valid crawl strategy response', async () => {
84
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all docs');
84
+ const promise = client.determineCrawlUrls(
85
+ 'https://example.com',
86
+ '<html>test</html>',
87
+ 'Find all docs'
88
+ );
85
89
 
86
90
  // Simulate successful response
87
91
  setTimeout(() => {
@@ -103,7 +107,11 @@ describe('ClaudeClient', () => {
103
107
  });
104
108
 
105
109
  it('should call spawn with correct arguments for determineCrawlUrls', async () => {
106
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all docs');
110
+ const promise = client.determineCrawlUrls(
111
+ 'https://example.com',
112
+ '<html>test</html>',
113
+ 'Find all docs'
114
+ );
107
115
 
108
116
  setTimeout(() => {
109
117
  mockProcess.stdout.emit(
@@ -136,7 +144,11 @@ describe('ClaudeClient', () => {
136
144
  });
137
145
 
138
146
  it('should write prompt to stdin', async () => {
139
- const promise = client.determineCrawlUrls('<html><body>Test</body></html>', 'Find tutorials');
147
+ const promise = client.determineCrawlUrls(
148
+ 'https://example.com',
149
+ '<html><body>Test</body></html>',
150
+ 'Find tutorials'
151
+ );
140
152
 
141
153
  setTimeout(() => {
142
154
  mockProcess.stdout.emit(
@@ -160,7 +172,11 @@ describe('ClaudeClient', () => {
160
172
  });
161
173
 
162
174
  it('should reject when response has no urls array', async () => {
163
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
175
+ const promise = client.determineCrawlUrls(
176
+ 'https://example.com',
177
+ '<html>test</html>',
178
+ 'Find all'
179
+ );
164
180
 
165
181
  setTimeout(() => {
166
182
  mockProcess.stdout.emit(
@@ -178,7 +194,11 @@ describe('ClaudeClient', () => {
178
194
  });
179
195
 
180
196
  it('should reject when response has empty urls array', async () => {
181
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
197
+ const promise = client.determineCrawlUrls(
198
+ 'https://example.com',
199
+ '<html>test</html>',
200
+ 'Find all'
201
+ );
182
202
 
183
203
  setTimeout(() => {
184
204
  mockProcess.stdout.emit(
@@ -197,7 +217,11 @@ describe('ClaudeClient', () => {
197
217
  });
198
218
 
199
219
  it('should reject when response has no reasoning', async () => {
200
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
220
+ const promise = client.determineCrawlUrls(
221
+ 'https://example.com',
222
+ '<html>test</html>',
223
+ 'Find all'
224
+ );
201
225
 
202
226
  setTimeout(() => {
203
227
  mockProcess.stdout.emit(
@@ -215,7 +239,11 @@ describe('ClaudeClient', () => {
215
239
  });
216
240
 
217
241
  it('should reject when urls contains non-string values', async () => {
218
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
242
+ const promise = client.determineCrawlUrls(
243
+ 'https://example.com',
244
+ '<html>test</html>',
245
+ 'Find all'
246
+ );
219
247
 
220
248
  setTimeout(() => {
221
249
  mockProcess.stdout.emit(
@@ -234,7 +262,11 @@ describe('ClaudeClient', () => {
234
262
  });
235
263
 
236
264
  it('should reject when response is not valid JSON', async () => {
237
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
265
+ const promise = client.determineCrawlUrls(
266
+ 'https://example.com',
267
+ '<html>test</html>',
268
+ 'Find all'
269
+ );
238
270
 
239
271
  setTimeout(() => {
240
272
  mockProcess.stdout.emit('data', Buffer.from('Not valid JSON'));
@@ -245,7 +277,11 @@ describe('ClaudeClient', () => {
245
277
  });
246
278
 
247
279
  it('should reject when response is null', async () => {
248
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
280
+ const promise = client.determineCrawlUrls(
281
+ 'https://example.com',
282
+ '<html>test</html>',
283
+ 'Find all'
284
+ );
249
285
 
250
286
  setTimeout(() => {
251
287
  mockProcess.stdout.emit('data', Buffer.from('null'));
@@ -257,7 +293,7 @@ describe('ClaudeClient', () => {
257
293
 
258
294
  it('should truncate HTML longer than 50000 characters', async () => {
259
295
  const longHtml = '<html>' + 'a'.repeat(60000) + '</html>';
260
- const promise = client.determineCrawlUrls(longHtml, 'Find all');
296
+ const promise = client.determineCrawlUrls('https://example.com', longHtml, 'Find all');
261
297
 
262
298
  setTimeout(() => {
263
299
  mockProcess.stdout.emit(
@@ -281,7 +317,7 @@ describe('ClaudeClient', () => {
281
317
 
282
318
  it('should not truncate HTML shorter than 50000 characters', async () => {
283
319
  const shortHtml = '<html><body>Short content</body></html>';
284
- const promise = client.determineCrawlUrls(shortHtml, 'Find all');
320
+ const promise = client.determineCrawlUrls('https://example.com', shortHtml, 'Find all');
285
321
 
286
322
  setTimeout(() => {
287
323
  mockProcess.stdout.emit(
@@ -302,6 +338,33 @@ describe('ClaudeClient', () => {
302
338
  expect(writtenPrompt).toContain(shortHtml);
303
339
  expect(writtenPrompt).not.toContain('[... HTML truncated ...]');
304
340
  });
341
+
342
+ it('should include seedUrl in prompt for relative URL resolution', async () => {
343
+ const promise = client.determineCrawlUrls(
344
+ 'https://code.claude.com/docs',
345
+ '<html><a href="/docs/en/hooks">Hooks</a></html>',
346
+ 'Find all docs'
347
+ );
348
+
349
+ setTimeout(() => {
350
+ mockProcess.stdout.emit(
351
+ 'data',
352
+ Buffer.from(
353
+ JSON.stringify({
354
+ urls: ['https://code.claude.com/docs/en/hooks'],
355
+ reasoning: 'Found hooks documentation',
356
+ })
357
+ )
358
+ );
359
+ mockProcess.emit('close', 0);
360
+ }, 10);
361
+
362
+ await promise;
363
+
364
+ const writtenPrompt = vi.mocked(mockProcess.stdin.write).mock.calls[0]?.[0] as string;
365
+ expect(writtenPrompt).toContain('Base URL: https://code.claude.com/docs');
366
+ expect(writtenPrompt).toContain('resolve them against the Base URL');
367
+ });
305
368
  });
306
369
 
307
370
  describe('extractContent', () => {
@@ -386,7 +449,11 @@ describe('ClaudeClient', () => {
386
449
 
387
450
  describe('Subprocess Management', () => {
388
451
  it('should handle process spawn errors', async () => {
389
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
452
+ const promise = client.determineCrawlUrls(
453
+ 'https://example.com',
454
+ '<html>test</html>',
455
+ 'Find all'
456
+ );
390
457
 
391
458
  setTimeout(() => {
392
459
  mockProcess.emit('error', new Error('spawn ENOENT'));
@@ -410,7 +477,11 @@ describe('ClaudeClient', () => {
410
477
  });
411
478
 
412
479
  it('should collect stderr data', async () => {
413
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
480
+ const promise = client.determineCrawlUrls(
481
+ 'https://example.com',
482
+ '<html>test</html>',
483
+ 'Find all'
484
+ );
414
485
 
415
486
  setTimeout(() => {
416
487
  mockProcess.stderr.emit('data', Buffer.from('Error message 1\n'));
@@ -466,14 +537,22 @@ describe('ClaudeClient', () => {
466
537
 
467
538
  describe('Timeout Handling', () => {
468
539
  it('should timeout after configured timeout period', async () => {
469
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
540
+ const promise = client.determineCrawlUrls(
541
+ 'https://example.com',
542
+ '<html>test</html>',
543
+ 'Find all'
544
+ );
470
545
 
471
546
  // Don't emit close event - let it timeout
472
547
  await expect(promise).rejects.toThrow('timed out after 100ms');
473
548
  });
474
549
 
475
550
  it('should kill process on timeout', async () => {
476
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
551
+ const promise = client.determineCrawlUrls(
552
+ 'https://example.com',
553
+ '<html>test</html>',
554
+ 'Find all'
555
+ );
477
556
 
478
557
  await expect(promise).rejects.toThrow('timed out');
479
558
  expect(mockProcess.kill).toHaveBeenCalledWith('SIGTERM');
@@ -540,7 +619,11 @@ describe('ClaudeClient', () => {
540
619
 
541
620
  describe('JSON Parsing', () => {
542
621
  it('should handle malformed JSON in response', async () => {
543
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
622
+ const promise = client.determineCrawlUrls(
623
+ 'https://example.com',
624
+ '<html>test</html>',
625
+ 'Find all'
626
+ );
544
627
 
545
628
  setTimeout(() => {
546
629
  mockProcess.stdout.emit('data', Buffer.from('{ invalid json }'));
@@ -551,7 +634,11 @@ describe('ClaudeClient', () => {
551
634
  });
552
635
 
553
636
  it('should handle incomplete JSON in response', async () => {
554
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
637
+ const promise = client.determineCrawlUrls(
638
+ 'https://example.com',
639
+ '<html>test</html>',
640
+ 'Find all'
641
+ );
555
642
 
556
643
  setTimeout(() => {
557
644
  mockProcess.stdout.emit('data', Buffer.from('{"urls": ["https://example.com"'));
@@ -562,7 +649,11 @@ describe('ClaudeClient', () => {
562
649
  });
563
650
 
564
651
  it('should handle JSON with extra whitespace', async () => {
565
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
652
+ const promise = client.determineCrawlUrls(
653
+ 'https://example.com',
654
+ '<html>test</html>',
655
+ 'Find all'
656
+ );
566
657
 
567
658
  setTimeout(() => {
568
659
  mockProcess.stdout.emit(
@@ -582,7 +673,11 @@ describe('ClaudeClient', () => {
582
673
  });
583
674
 
584
675
  it('should handle JSON arrays as invalid for determineCrawlUrls', async () => {
585
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
676
+ const promise = client.determineCrawlUrls(
677
+ 'https://example.com',
678
+ '<html>test</html>',
679
+ 'Find all'
680
+ );
586
681
 
587
682
  setTimeout(() => {
588
683
  mockProcess.stdout.emit('data', Buffer.from('[]'));
@@ -593,7 +688,11 @@ describe('ClaudeClient', () => {
593
688
  });
594
689
 
595
690
  it('should handle JSON primitives as invalid for determineCrawlUrls', async () => {
596
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
691
+ const promise = client.determineCrawlUrls(
692
+ 'https://example.com',
693
+ '<html>test</html>',
694
+ 'Find all'
695
+ );
597
696
 
598
697
  setTimeout(() => {
599
698
  mockProcess.stdout.emit('data', Buffer.from('"string response"'));
@@ -606,7 +705,11 @@ describe('ClaudeClient', () => {
606
705
 
607
706
  describe('Response Validation', () => {
608
707
  it('should validate urls is an array', async () => {
609
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
708
+ const promise = client.determineCrawlUrls(
709
+ 'https://example.com',
710
+ '<html>test</html>',
711
+ 'Find all'
712
+ );
610
713
 
611
714
  setTimeout(() => {
612
715
  mockProcess.stdout.emit(
@@ -625,7 +728,11 @@ describe('ClaudeClient', () => {
625
728
  });
626
729
 
627
730
  it('should validate reasoning is a string', async () => {
628
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
731
+ const promise = client.determineCrawlUrls(
732
+ 'https://example.com',
733
+ '<html>test</html>',
734
+ 'Find all'
735
+ );
629
736
 
630
737
  setTimeout(() => {
631
738
  mockProcess.stdout.emit(
@@ -644,7 +751,11 @@ describe('ClaudeClient', () => {
644
751
  });
645
752
 
646
753
  it('should accept valid response with multiple URLs', async () => {
647
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
754
+ const promise = client.determineCrawlUrls(
755
+ 'https://example.com',
756
+ '<html>test</html>',
757
+ 'Find all'
758
+ );
648
759
 
649
760
  setTimeout(() => {
650
761
  mockProcess.stdout.emit(
@@ -671,7 +782,11 @@ describe('ClaudeClient', () => {
671
782
 
672
783
  describe('Error Messages', () => {
673
784
  it('should wrap errors with context for determineCrawlUrls', async () => {
674
- const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
785
+ const promise = client.determineCrawlUrls(
786
+ 'https://example.com',
787
+ '<html>test</html>',
788
+ 'Find all'
789
+ );
675
790
 
676
791
  setTimeout(() => {
677
792
  mockProcess.emit('close', 1);