bluera-knowledge 0.9.42 → 0.10.0
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- package/.claude-plugin/plugin.json +2 -14
- package/.mcp.json +12 -0
- package/CHANGELOG.md +93 -0
- package/README.md +41 -40
- package/dist/{chunk-MQE32YY6.js → chunk-ITH6FWQY.js} +19 -4
- package/dist/chunk-ITH6FWQY.js.map +1 -0
- package/dist/index.js +4 -3
- package/dist/index.js.map +1 -1
- package/dist/workers/background-worker-cli.js +3 -2
- package/dist/workers/background-worker-cli.js.map +1 -1
- package/hooks/check-dependencies.sh +29 -0
- package/package.json +1 -1
- package/python/crawl_worker.py +6 -1
- package/src/cli/commands/crawl.test.ts +43 -3
- package/src/cli/commands/crawl.ts +3 -3
- package/src/crawl/claude-client.test.ts +139 -24
- package/src/crawl/claude-client.ts +11 -2
- package/src/crawl/intelligent-crawler.test.ts +65 -0
- package/src/crawl/intelligent-crawler.ts +14 -2
- package/src/workers/background-worker.ts +1 -1
- package/dist/chunk-MQE32YY6.js.map +0 -1
package/dist/workers/background-worker-cli.js
CHANGED

@@ -1,7 +1,7 @@
 #!/usr/bin/env node
 import {
   IntelligentCrawler
-} from "../chunk-MQE32YY6.js";
+} from "../chunk-ITH6FWQY.js";
 import {
   JobService,
   createDocumentId,
@@ -186,7 +186,8 @@ var BackgroundWorker = class {
     const crawlOptions = {
       maxPages: resolvedMaxPages,
       simple: simple ?? false,
-      useHeadless: useHeadless ?? false
+      useHeadless: useHeadless ?? true
+      // Default to headless for reliability
     };
     if (crawlInstruction !== void 0) {
       crawlOptions.crawlInstruction = crawlInstruction;
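
The compiled change above is a one-character default flip, but the nullish-coalescing operator makes its scope precise. A standalone sketch (not package code; the option shape is assumed) of how the fallback behaves for callers:

// Sketch of the ?? fallback in the hunk above.
function resolveHeadless(useHeadless?: boolean): boolean {
  return useHeadless ?? true; // only undefined/null picks up the new default
}

resolveHeadless(undefined); // true  - jobs that omit the option now crawl headless
resolveHeadless(false);     // false - an explicit opt-out is still honored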

package/dist/workers/background-worker-cli.js.map
CHANGED

@@ -1 +1 @@
-{"version":3,"sources":["../../src/workers/background-worker.ts","../../src/workers/pid-file.ts","../../src/workers/background-worker-cli.ts"],...}
+{"version":3,"sources":["../../src/workers/background-worker.ts","../../src/workers/pid-file.ts","../../src/workers/background-worker-cli.ts"],...}
(Both sides are single-line minified sourcemaps; the full JSON is omitted here. The only source-level change in the embedded sourcesContent is the crawl default, useHeadless: useHeadless ?? false becoming useHeadless: useHeadless ?? true with the comment "Default to headless for reliability"; the mappings were regenerated accordingly.)
package/hooks/check-dependencies.sh
CHANGED

@@ -13,6 +13,29 @@ GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 NC='\033[0m' # No Color
 
+# =====================
+# Helper Functions
+# =====================
+
+# Install Playwright browser (called after crawl4ai is confirmed installed)
+install_playwright_browser() {
+  # Check if Playwright Chromium is already installed by testing if browser can be launched
+  if python3 -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); b = p.chromium.launch(); b.close(); p.stop()" 2>/dev/null; then
+    echo -e "${GREEN}[bluera-knowledge] Playwright Chromium ready ✓${NC}"
+    return 0
+  fi
+
+  echo -e "${YELLOW}[bluera-knowledge] Installing Playwright browser (one-time setup)...${NC}"
+  if python3 -m playwright install chromium 2>/dev/null; then
+    echo -e "${GREEN}[bluera-knowledge] Playwright Chromium installed ✓${NC}"
+    return 0
+  else
+    echo -e "${YELLOW}[bluera-knowledge] Playwright browser install failed.${NC}"
+    echo -e "${YELLOW}Manual fix: python3 -m playwright install chromium${NC}"
+    return 1
+  fi
+}
+
 # =====================
 # Node.js Dependencies
 # =====================

@@ -60,6 +83,8 @@ if python3 -c "import crawl4ai" 2>/dev/null; then
   # Already installed - get version
   crawl4ai_version=$(python3 -c "import crawl4ai; print(crawl4ai.__version__)" 2>/dev/null || echo "unknown")
   echo -e "${GREEN}[bluera-knowledge] crawl4ai ${crawl4ai_version} is installed ✓${NC}"
+  # Ensure Playwright browser is installed for headless crawling
+  install_playwright_browser
   exit 0
 fi
 
@@ -87,12 +112,16 @@ if command -v pip3 &> /dev/null || command -v pip &> /dev/null; then
   echo -e "${GREEN}[bluera-knowledge] Successfully installed crawl4ai ✓${NC}"
   crawl4ai_version=$(python3 -c "import crawl4ai; print(crawl4ai.__version__)" 2>/dev/null || echo "installed")
   echo -e "${GREEN}[bluera-knowledge] crawl4ai ${crawl4ai_version} ready${NC}"
+  # Install Playwright browser for headless crawling
+  install_playwright_browser
 else
   # Fallback: try without --break-system-packages for older Python
   if $PIP_CMD install --quiet --user crawl4ai 2>/dev/null; then
     echo -e "${GREEN}[bluera-knowledge] Successfully installed crawl4ai ✓${NC}"
     crawl4ai_version=$(python3 -c "import crawl4ai; print(crawl4ai.__version__)" 2>/dev/null || echo "installed")
     echo -e "${GREEN}[bluera-knowledge] crawl4ai ${crawl4ai_version} ready${NC}"
+    # Install Playwright browser for headless crawling
+    install_playwright_browser
   else
     echo -e "${RED}[bluera-knowledge] Auto-installation failed${NC}"
     echo ""
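
Note that the new helper probes readiness by actually launching Chromium rather than merely importing playwright, so it also catches a missing browser binary (pip installing crawl4ai does not download browsers). Only when that probe fails does it run the one-time python3 -m playwright install chromium, and a failed install degrades to a warning that prints the manual fix.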
package/package.json
CHANGED
package/python/crawl_worker.py
CHANGED

@@ -29,10 +29,15 @@ async def fetch_headless(url: str):
     if not result.success:
         raise Exception(f"Crawl failed: {result.error_message}")
 
+    # Combine internal and external links - let TypeScript filter by domain
+    all_links = []
+    if isinstance(result.links, dict):
+        all_links = result.links.get("internal", []) + result.links.get("external", [])
+
     return {
         "html": result.html or '',
         "markdown": result.markdown or result.cleaned_html or '',
-        "links":
+        "links": all_links
     }
 
 def is_exported(node: ast.AST) -> bool:
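
The worker now returns internal and external links together and, per the new comment, leaves domain filtering to the TypeScript side. A hypothetical sketch of such a filter (the real logic lives in the crawler; the function name and placement here are assumptions, not package code):

// Hypothetical same-domain filter on the TypeScript side.
function filterSameDomain(links: string[], seedUrl: string): string[] {
  const seedHost = new URL(seedUrl).hostname;
  return links.filter((link) => {
    try {
      // Resolve relative links against the seed before comparing hosts.
      return new URL(link, seedUrl).hostname === seedHost;
    } catch {
      return false; // drop malformed URLs
    }
  });
}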
package/src/cli/commands/crawl.test.ts
CHANGED

@@ -132,7 +132,7 @@ describe('crawl command execution', () => {
     expect(mockCrawler.crawl).toHaveBeenCalledWith('https://example.com', {
       crawlInstruction: 'all documentation pages',
       maxPages: 50,
-      useHeadless: false,
+      useHeadless: true,
     });
     expect(mockServices.embeddings.embed).toHaveBeenCalledTimes(2);
     expect(mockServices.lance.addDocuments).toHaveBeenCalledWith(
@@ -162,6 +162,46 @@ describe('crawl command execution', () => {
     expect(processExitSpy).not.toHaveBeenCalled();
   });
 
+  it('uses axios-only mode when --fast flag is provided', async () => {
+    const mockStore: WebStore = {
+      id: createStoreId('store-1'),
+      name: 'test-store',
+      type: 'web',
+      url: 'https://example.com',
+      depth: 2,
+      createdAt: new Date(),
+      updatedAt: new Date(),
+    };
+
+    mockServices.store.getByIdOrName.mockResolvedValue(mockStore);
+    mockServices.lance.initialize.mockResolvedValue(undefined);
+    mockServices.embeddings.embed.mockResolvedValue([0.1, 0.2, 0.3]);
+    mockServices.lance.addDocuments.mockResolvedValue(undefined);
+
+    mockCrawler.crawl.mockReturnValue(
+      (async function* () {
+        yield {
+          url: 'https://example.com/page1',
+          title: 'Page 1',
+          markdown: '# Page 1',
+          depth: 0,
+        };
+      })()
+    );
+
+    const command = createCrawlCommand(getOptions);
+    command.parseOptions(['--fast', '--max-pages', '25']);
+    const actionHandler = command._actionHandler;
+
+    await actionHandler(['https://example.com', 'test-store']);
+
+    expect(mockCrawler.crawl).toHaveBeenCalledWith('https://example.com', {
+      maxPages: 25,
+      useHeadless: false, // --fast disables headless
+    });
+    expect(mockCrawler.stop).toHaveBeenCalled();
+  });
+
   it('successfully crawls in simple mode', async () => {
     const mockStore: WebStore = {
       id: createStoreId('store-1'),
@@ -198,7 +238,7 @@ describe('crawl command execution', () => {
     expect(mockCrawler.crawl).toHaveBeenCalledWith('https://example.com', {
       maxPages: 25,
       simple: true,
-      useHeadless: false,
+      useHeadless: true,
     });
     expect(mockCrawler.stop).toHaveBeenCalled();
   });
@@ -295,7 +335,7 @@ describe('crawl command execution', () => {
       crawlInstruction: 'all Getting Started pages',
       extractInstruction: 'code examples',
       maxPages: 100,
-      useHeadless: false,
+      useHeadless: true,
     });
   });
 });
package/src/cli/commands/crawl.ts
CHANGED

@@ -25,7 +25,7 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
     )
     .option('--simple', 'Use simple BFS mode instead of intelligent crawling')
     .option('--max-pages <number>', 'Maximum number of pages to crawl', '50')
-    .option('--
+    .option('--fast', 'Use fast axios-only mode (may fail on JavaScript-heavy sites)')
     .action(
       async (
         url: string,
@@ -35,7 +35,7 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
         extract?: string;
         simple?: boolean;
         maxPages?: string;
-
+        fast?: boolean;
       }
     ) => {
       const globalOpts = getOptions();
@@ -125,7 +125,7 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
         ...(cmdOptions.extract !== undefined && { extractInstruction: cmdOptions.extract }),
         maxPages,
         ...(cmdOptions.simple !== undefined && { simple: cmdOptions.simple }),
-        useHeadless: cmdOptions.
+        useHeadless: !(cmdOptions.fast ?? false), // Default true (headless), --fast disables
       })) {
         // Use extracted content if available, otherwise markdown
         const contentToProcess = result.extracted ?? result.markdown;
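
So after this release the CLI is headless-first: omitting --fast yields useHeadless: true, and --fast is the only opt-out. A minimal sketch of that mapping (option shape taken from the hunk above; not package code):

// --fast inverts into useHeadless; absence of the flag means headless.
function toUseHeadless(cmdOptions: { fast?: boolean }): boolean {
  return !(cmdOptions.fast ?? false);
}

toUseHeadless({});             // true  - default: headless browser crawl
toUseHeadless({ fast: true }); // false - fast axios-only fetching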
package/src/crawl/claude-client.test.ts
CHANGED

@@ -81,7 +81,11 @@ describe('ClaudeClient', () => {
 
   describe('determineCrawlUrls', () => {
     it('should successfully parse valid crawl strategy response', async () => {
-      const promise = client.determineCrawlUrls('<html>test</html>', 'Find all docs');
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all docs'
+      );
 
       // Simulate successful response
       setTimeout(() => {
@@ -103,7 +107,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should call spawn with correct arguments for determineCrawlUrls', async () => {
-      const promise = client.determineCrawlUrls('<html>test</html>', 'Find all docs');
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all docs'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -136,7 +144,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should write prompt to stdin', async () => {
-      const promise = client.determineCrawlUrls('<html><body>Test</body></html>', 'Find tutorials');
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html><body>Test</body></html>',
+        'Find tutorials'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -160,7 +172,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when response has no urls array', async () => {
-      const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -178,7 +194,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when response has empty urls array', async () => {
-      const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -197,7 +217,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when response has no reasoning', async () => {
-      const promise = client.determineCrawlUrls('<html>test</html>', 'Find all');
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -215,7 +239,11 @@ describe('ClaudeClient', () => {
|
|
|
215
239
|
});
|
|
216
240
|
|
|
217
241
|
it('should reject when urls contains non-string values', async () => {
|
|
218
|
-
const promise = client.determineCrawlUrls(
|
|
242
|
+
const promise = client.determineCrawlUrls(
|
|
243
|
+
'https://example.com',
|
|
244
|
+
'<html>test</html>',
|
|
245
|
+
'Find all'
|
|
246
|
+
);
|
|
219
247
|
|
|
220
248
|
setTimeout(() => {
|
|
221
249
|
mockProcess.stdout.emit(
|
|
@@ -234,7 +262,11 @@ describe('ClaudeClient', () => {
|
|
|
234
262
|
});
|
|
235
263
|
|
|
236
264
|
it('should reject when response is not valid JSON', async () => {
|
|
237
|
-
const promise = client.determineCrawlUrls(
|
|
265
|
+
const promise = client.determineCrawlUrls(
|
|
266
|
+
'https://example.com',
|
|
267
|
+
'<html>test</html>',
|
|
268
|
+
'Find all'
|
|
269
|
+
);
|
|
238
270
|
|
|
239
271
|
setTimeout(() => {
|
|
240
272
|
mockProcess.stdout.emit('data', Buffer.from('Not valid JSON'));
|
|
@@ -245,7 +277,11 @@ describe('ClaudeClient', () => {
|
|
|
245
277
|
});
|
|
246
278
|
|
|
247
279
|
it('should reject when response is null', async () => {
|
|
248
|
-
const promise = client.determineCrawlUrls(
|
|
280
|
+
const promise = client.determineCrawlUrls(
|
|
281
|
+
'https://example.com',
|
|
282
|
+
'<html>test</html>',
|
|
283
|
+
'Find all'
|
|
284
|
+
);
|
|
249
285
|
|
|
250
286
|
setTimeout(() => {
|
|
251
287
|
mockProcess.stdout.emit('data', Buffer.from('null'));
|
|
@@ -257,7 +293,7 @@ describe('ClaudeClient', () => {
|
|
|
257
293
|
|
|
258
294
|
it('should truncate HTML longer than 50000 characters', async () => {
|
|
259
295
|
const longHtml = '<html>' + 'a'.repeat(60000) + '</html>';
|
|
260
|
-
const promise = client.determineCrawlUrls(longHtml, 'Find all');
|
|
296
|
+
const promise = client.determineCrawlUrls('https://example.com', longHtml, 'Find all');
|
|
261
297
|
|
|
262
298
|
setTimeout(() => {
|
|
263
299
|
mockProcess.stdout.emit(
|
|
@@ -281,7 +317,7 @@ describe('ClaudeClient', () => {
|
|
|
281
317
|
|
|
282
318
|
it('should not truncate HTML shorter than 50000 characters', async () => {
|
|
283
319
|
const shortHtml = '<html><body>Short content</body></html>';
|
|
284
|
-
const promise = client.determineCrawlUrls(shortHtml, 'Find all');
|
|
320
|
+
const promise = client.determineCrawlUrls('https://example.com', shortHtml, 'Find all');
|
|
285
321
|
|
|
286
322
|
setTimeout(() => {
|
|
287
323
|
mockProcess.stdout.emit(
|
|
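The two truncation tests above pin the cap at 50000 characters and the marker text at '[... HTML truncated ...]'. A sketch of the behaviour they assert, assuming a simple slice-based cutoff (where exactly the marker is appended is our guess):

const MAX_HTML_CHARS = 50_000; // limit asserted by the tests

// Assumed implementation: slice and append the marker the tests look for.
function truncateHtml(html: string): string {
  if (html.length <= MAX_HTML_CHARS) return html;
  return html.slice(0, MAX_HTML_CHARS) + '\n[... HTML truncated ...]';
}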
@@ -302,6 +338,33 @@ describe('ClaudeClient', () => {
       expect(writtenPrompt).toContain(shortHtml);
       expect(writtenPrompt).not.toContain('[... HTML truncated ...]');
     });
+
+    it('should include seedUrl in prompt for relative URL resolution', async () => {
+      const promise = client.determineCrawlUrls(
+        'https://code.claude.com/docs',
+        '<html><a href="/docs/en/hooks">Hooks</a></html>',
+        'Find all docs'
+      );
+
+      setTimeout(() => {
+        mockProcess.stdout.emit(
+          'data',
+          Buffer.from(
+            JSON.stringify({
+              urls: ['https://code.claude.com/docs/en/hooks'],
+              reasoning: 'Found hooks documentation',
+            })
+          )
+        );
+        mockProcess.emit('close', 0);
+      }, 10);
+
+      await promise;
+
+      const writtenPrompt = vi.mocked(mockProcess.stdin.write).mock.calls[0]?.[0] as string;
+      expect(writtenPrompt).toContain('Base URL: https://code.claude.com/docs');
+      expect(writtenPrompt).toContain('resolve them against the Base URL');
+    });
   });

   describe('extractContent', () => {
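This new test is the point of threading seedUrl through: an href like /docs/en/hooks is only crawlable once resolved against the page it came from. The standard WHATWG URL constructor performs exactly the resolution the prompt asks the model to do:

// Resolving a relative href against the Base URL from the prompt:
const base = 'https://code.claude.com/docs';
const resolved = new URL('/docs/en/hooks', base).href;
console.log(resolved); // 'https://code.claude.com/docs/en/hooks'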
@@ -386,7 +449,11 @@ describe('ClaudeClient', () => {

   describe('Subprocess Management', () => {
     it('should handle process spawn errors', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );

       setTimeout(() => {
         mockProcess.emit('error', new Error('spawn ENOENT'));

@@ -410,7 +477,11 @@ describe('ClaudeClient', () => {
     });

     it('should collect stderr data', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );

       setTimeout(() => {
         mockProcess.stderr.emit('data', Buffer.from('Error message 1\n'));

@@ -466,14 +537,22 @@ describe('ClaudeClient', () => {

   describe('Timeout Handling', () => {
     it('should timeout after configured timeout period', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );

       // Don't emit close event - let it timeout
       await expect(promise).rejects.toThrow('timed out after 100ms');
     });

     it('should kill process on timeout', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );

       await expect(promise).rejects.toThrow('timed out');
       expect(mockProcess.kill).toHaveBeenCalledWith('SIGTERM');
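The timeout tests assert two behaviours: the rejection message names the configured period, and the child process is killed with SIGTERM. A self-contained sketch of that pattern; the wiring and names are ours, only those two assertions come from the package:

import { spawn } from 'node:child_process';

// Run a command, collect stdout, and enforce a hard timeout.
function runWithTimeout(cmd: string, args: string[], timeoutMs: number): Promise<string> {
  return new Promise((resolve, reject) => {
    const child = spawn(cmd, args);
    let stdout = '';
    const timer = setTimeout(() => {
      child.kill('SIGTERM'); // the tests assert SIGTERM on timeout
      reject(new Error(`claude invocation timed out after ${timeoutMs}ms`));
    }, timeoutMs);
    child.stdout.on('data', (chunk: Buffer) => { stdout += chunk.toString(); });
    child.on('error', (err) => { clearTimeout(timer); reject(err); });
    child.on('close', (code) => {
      clearTimeout(timer);
      if (code === 0) resolve(stdout);
      else reject(new Error(`process exited with code ${code}`));
    });
  });
}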
@@ -540,7 +619,11 @@ describe('ClaudeClient', () => {

   describe('JSON Parsing', () => {
     it('should handle malformed JSON in response', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );

       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('{ invalid json }'));

@@ -551,7 +634,11 @@ describe('ClaudeClient', () => {
     });

     it('should handle incomplete JSON in response', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );

       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('{"urls": ["https://example.com"'));

@@ -562,7 +649,11 @@ describe('ClaudeClient', () => {
     });

     it('should handle JSON with extra whitespace', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );

       setTimeout(() => {
         mockProcess.stdout.emit(

@@ -582,7 +673,11 @@ describe('ClaudeClient', () => {
     });

     it('should handle JSON arrays as invalid for determineCrawlUrls', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );

       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('[]'));

@@ -593,7 +688,11 @@ describe('ClaudeClient', () => {
     });

     it('should handle JSON primitives as invalid for determineCrawlUrls', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );

       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('"string response"'));
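Taken together, the JSON Parsing tests feed the client raw stdout such as '{ invalid json }', an unterminated object, 'null', '[]', and '"string response"', and expect a rejection for each. A guard of roughly this shape handles all of those cases (a sketch, not the package's actual parser):

// Parse model stdout defensively; reject anything that is not a JSON object.
function parseStrategyJson(raw: string): Record<string, unknown> {
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw.trim()); // trim tolerates the extra-whitespace case
  } catch {
    throw new Error('Claude response was not valid JSON');
  }
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new Error('Claude response was not a JSON object');
  }
  return parsed as Record<string, unknown>;
}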
@@ -606,7 +705,11 @@ describe('ClaudeClient', () => {

   describe('Response Validation', () => {
     it('should validate urls is an array', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );

       setTimeout(() => {
         mockProcess.stdout.emit(

@@ -625,7 +728,11 @@ describe('ClaudeClient', () => {
     });

     it('should validate reasoning is a string', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );

       setTimeout(() => {
         mockProcess.stdout.emit(

@@ -644,7 +751,11 @@ describe('ClaudeClient', () => {
     });

     it('should accept valid response with multiple URLs', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );

       setTimeout(() => {
         mockProcess.stdout.emit(
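The Response Validation tests spell out the accepted shape: urls must be a non-empty array of strings and reasoning must be a string. A type guard capturing those rules (a sketch; the function name is ours):

// Accepts exactly the shape the validation tests require.
function isCrawlStrategy(
  value: Record<string, unknown>
): value is { urls: string[]; reasoning: string } {
  return (
    Array.isArray(value['urls']) &&
    value['urls'].length > 0 &&
    value['urls'].every((u): u is string => typeof u === 'string') &&
    typeof value['reasoning'] === 'string'
  );
}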
@@ -671,7 +782,11 @@ describe('ClaudeClient', () => {

   describe('Error Messages', () => {
     it('should wrap errors with context for determineCrawlUrls', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );

       setTimeout(() => {
         mockProcess.emit('close', 1);