bluera-knowledge 0.9.43 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/CHANGELOG.md +75 -0
- package/README.md +114 -42
- package/commands/sync.md +96 -0
- package/dist/{chunk-MQE32YY6.js → chunk-6U45VP5Z.js} +42 -6
- package/dist/chunk-6U45VP5Z.js.map +1 -0
- package/dist/{chunk-CUHYSPRV.js → chunk-DP5XBPQV.js} +372 -2
- package/dist/chunk-DP5XBPQV.js.map +1 -0
- package/dist/{chunk-DWAIT2OD.js → chunk-UE4ZIJYA.js} +74 -5
- package/dist/{chunk-DWAIT2OD.js.map → chunk-UE4ZIJYA.js.map} +1 -1
- package/dist/index.js +216 -7
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +2 -2
- package/dist/workers/background-worker-cli.js +4 -3
- package/dist/workers/background-worker-cli.js.map +1 -1
- package/hooks/check-dependencies.sh +29 -0
- package/package.json +1 -1
- package/python/crawl_worker.py +6 -1
- package/src/cli/commands/crawl.test.ts +43 -3
- package/src/cli/commands/crawl.ts +3 -3
- package/src/cli/commands/sync.test.ts +54 -0
- package/src/cli/commands/sync.ts +264 -0
- package/src/cli/index.ts +1 -0
- package/src/crawl/claude-client.test.ts +195 -24
- package/src/crawl/claude-client.ts +38 -3
- package/src/crawl/intelligent-crawler.test.ts +65 -0
- package/src/crawl/intelligent-crawler.ts +14 -2
- package/src/index.ts +2 -0
- package/src/mcp/commands/index.ts +2 -0
- package/src/mcp/commands/sync.commands.test.ts +283 -0
- package/src/mcp/commands/sync.commands.ts +233 -0
- package/src/services/gitignore.service.test.ts +157 -0
- package/src/services/gitignore.service.ts +132 -0
- package/src/services/store-definition.service.test.ts +440 -0
- package/src/services/store-definition.service.ts +198 -0
- package/src/services/store.service.test.ts +279 -1
- package/src/services/store.service.ts +101 -4
- package/src/types/index.ts +18 -0
- package/src/types/store-definition.test.ts +492 -0
- package/src/types/store-definition.ts +129 -0
- package/src/workers/background-worker.ts +1 -1
- package/dist/chunk-CUHYSPRV.js.map +0 -1
- package/dist/chunk-MQE32YY6.js.map +0 -1
package/dist/mcp/server.js
CHANGED
@@ -1,13 +1,13 @@
 #!/usr/bin/env node
 import {
   IntelligentCrawler
-} from "../chunk-MQE32YY6.js";
+} from "../chunk-6U45VP5Z.js";
 import {
   JobService,
   createDocumentId,
   createServices,
   createStoreId
-} from "../chunk-DWAIT2OD.js";
+} from "../chunk-UE4ZIJYA.js";
 import "../chunk-6FHWC36B.js";
 
 // src/workers/background-worker.ts
package/dist/workers/background-worker-cli.js
CHANGED
@@ -186,7 +186,8 @@ var BackgroundWorker = class {
       const crawlOptions = {
         maxPages: resolvedMaxPages,
         simple: simple ?? false,
-        useHeadless: useHeadless ?? false
+        useHeadless: useHeadless ?? true
+        // Default to headless for reliability
       };
       if (crawlInstruction !== void 0) {
         crawlOptions.crawlInstruction = crawlInstruction;
package/dist/workers/background-worker-cli.js.map
CHANGED
@@ -1 +1 @@
Regenerated source map (single-line JSON; sources: ../../src/workers/background-worker.ts, ../../src/workers/pid-file.ts, ../../src/workers/background-worker-cli.ts). Within the embedded sourcesContent, the only change is the crawl option default, with a matching shift in the mappings string:
-        useHeadless: useHeadless ?? false,
+        useHeadless: useHeadless ?? true, // Default to headless for reliability
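The behavioral change in these dist files is concentrated in that one line: crawl jobs now default to headless-browser fetching unless the caller explicitly passes useHeadless: false. A minimal sketch of the nullish-coalescing default involved (hypothetical helper, not part of the package):

// Illustrative only: `??` substitutes the default just for undefined/null,
// so an explicit `false` from the caller is still honored.
function resolveUseHeadless(useHeadless?: boolean): boolean {
  return useHeadless ?? true; // 0.9.43 used `?? false`
}

resolveUseHeadless();      // true  -> headless crawl (new default)
resolveUseHeadless(false); // false -> caller explicitly opted out
resolveUseHeadless(true);  // true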
package/hooks/check-dependencies.sh
CHANGED
@@ -13,6 +13,29 @@ GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 NC='\033[0m' # No Color
 
+# =====================
+# Helper Functions
+# =====================
+
+# Install Playwright browser (called after crawl4ai is confirmed installed)
+install_playwright_browser() {
+  # Check if Playwright Chromium is already installed by testing if browser can be launched
+  if python3 -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); b = p.chromium.launch(); b.close(); p.stop()" 2>/dev/null; then
+    echo -e "${GREEN}[bluera-knowledge] Playwright Chromium ready ✓${NC}"
+    return 0
+  fi
+
+  echo -e "${YELLOW}[bluera-knowledge] Installing Playwright browser (one-time setup)...${NC}"
+  if python3 -m playwright install chromium 2>/dev/null; then
+    echo -e "${GREEN}[bluera-knowledge] Playwright Chromium installed ✓${NC}"
+    return 0
+  else
+    echo -e "${YELLOW}[bluera-knowledge] Playwright browser install failed.${NC}"
+    echo -e "${YELLOW}Manual fix: python3 -m playwright install chromium${NC}"
+    return 1
+  fi
+}
+
 # =====================
 # Node.js Dependencies
 # =====================
@@ -60,6 +83,8 @@ if python3 -c "import crawl4ai" 2>/dev/null; then
   # Already installed - get version
   crawl4ai_version=$(python3 -c "import crawl4ai; print(crawl4ai.__version__)" 2>/dev/null || echo "unknown")
   echo -e "${GREEN}[bluera-knowledge] crawl4ai ${crawl4ai_version} is installed ✓${NC}"
+  # Ensure Playwright browser is installed for headless crawling
+  install_playwright_browser
   exit 0
 fi
 
@@ -87,12 +112,16 @@ if command -v pip3 &> /dev/null || command -v pip &> /dev/null; then
     echo -e "${GREEN}[bluera-knowledge] Successfully installed crawl4ai ✓${NC}"
     crawl4ai_version=$(python3 -c "import crawl4ai; print(crawl4ai.__version__)" 2>/dev/null || echo "installed")
     echo -e "${GREEN}[bluera-knowledge] crawl4ai ${crawl4ai_version} ready${NC}"
+    # Install Playwright browser for headless crawling
+    install_playwright_browser
   else
     # Fallback: try without --break-system-packages for older Python
    if $PIP_CMD install --quiet --user crawl4ai 2>/dev/null; then
       echo -e "${GREEN}[bluera-knowledge] Successfully installed crawl4ai ✓${NC}"
       crawl4ai_version=$(python3 -c "import crawl4ai; print(crawl4ai.__version__)" 2>/dev/null || echo "installed")
       echo -e "${GREEN}[bluera-knowledge] crawl4ai ${crawl4ai_version} ready${NC}"
+      # Install Playwright browser for headless crawling
+      install_playwright_browser
     else
       echo -e "${RED}[bluera-knowledge] Auto-installation failed${NC}"
       echo ""
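The hook verifies the Chromium bundle by actually launching it through the Python Playwright binding rather than checking for files on disk. For contrast, a sketch of the same readiness probe on the Node side, using the playwright npm package (which this plugin does not itself depend on):

// Sketch only: equivalent launch-based readiness check via Node Playwright.
import { chromium } from 'playwright';

async function chromiumReady(): Promise<boolean> {
  try {
    const browser = await chromium.launch(); // throws if the browser bundle is missing
    await browser.close();
    return true;
  } catch {
    return false; // recover with: npx playwright install chromium
  }
}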
package/package.json
CHANGED
package/python/crawl_worker.py
CHANGED
@@ -29,10 +29,15 @@ async def fetch_headless(url: str):
     if not result.success:
         raise Exception(f"Crawl failed: {result.error_message}")
 
+    # Combine internal and external links - let TypeScript filter by domain
+    all_links = []
+    if isinstance(result.links, dict):
+        all_links = result.links.get("internal", []) + result.links.get("external", [])
+
     return {
         "html": result.html or '',
         "markdown": result.markdown or result.cleaned_html or '',
-        "links":
+        "links": all_links
     }
 
 def is_exported(node: ast.AST) -> bool:
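The Python worker now returns internal and external links together and defers domain filtering to the TypeScript crawler. A hedged sketch of what such a same-domain filter looks like (illustrative only; the real filtering lives in IntelligentCrawler and is not shown in this diff):

// Keep only links whose hostname matches the page being crawled.
function sameDomainLinks(pageUrl: string, links: string[]): string[] {
  const host = new URL(pageUrl).hostname;
  return links.filter((link) => {
    try {
      return new URL(link, pageUrl).hostname === host;
    } catch {
      return false; // skip malformed hrefs
    }
  });
}

sameDomainLinks('https://example.com/docs', [
  'https://example.com/docs/api', // kept
  'https://other.org/page',       // dropped (external)
]);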
package/src/cli/commands/crawl.test.ts
CHANGED
@@ -132,7 +132,7 @@ describe('crawl command execution', () => {
     expect(mockCrawler.crawl).toHaveBeenCalledWith('https://example.com', {
       crawlInstruction: 'all documentation pages',
       maxPages: 50,
-      useHeadless:
+      useHeadless: true,
     });
     expect(mockServices.embeddings.embed).toHaveBeenCalledTimes(2);
     expect(mockServices.lance.addDocuments).toHaveBeenCalledWith(
@@ -162,6 +162,46 @@ describe('crawl command execution', () => {
     expect(processExitSpy).not.toHaveBeenCalled();
   });
 
+  it('uses axios-only mode when --fast flag is provided', async () => {
+    const mockStore: WebStore = {
+      id: createStoreId('store-1'),
+      name: 'test-store',
+      type: 'web',
+      url: 'https://example.com',
+      depth: 2,
+      createdAt: new Date(),
+      updatedAt: new Date(),
+    };
+
+    mockServices.store.getByIdOrName.mockResolvedValue(mockStore);
+    mockServices.lance.initialize.mockResolvedValue(undefined);
+    mockServices.embeddings.embed.mockResolvedValue([0.1, 0.2, 0.3]);
+    mockServices.lance.addDocuments.mockResolvedValue(undefined);
+
+    mockCrawler.crawl.mockReturnValue(
+      (async function* () {
+        yield {
+          url: 'https://example.com/page1',
+          title: 'Page 1',
+          markdown: '# Page 1',
+          depth: 0,
+        };
+      })()
+    );
+
+    const command = createCrawlCommand(getOptions);
+    command.parseOptions(['--fast', '--max-pages', '25']);
+    const actionHandler = command._actionHandler;
+
+    await actionHandler(['https://example.com', 'test-store']);
+
+    expect(mockCrawler.crawl).toHaveBeenCalledWith('https://example.com', {
+      maxPages: 25,
+      useHeadless: false, // --fast disables headless
+    });
+    expect(mockCrawler.stop).toHaveBeenCalled();
+  });
+
   it('successfully crawls in simple mode', async () => {
     const mockStore: WebStore = {
       id: createStoreId('store-1'),
@@ -198,7 +238,7 @@ describe('crawl command execution', () => {
     expect(mockCrawler.crawl).toHaveBeenCalledWith('https://example.com', {
       maxPages: 25,
       simple: true,
-      useHeadless:
+      useHeadless: true,
     });
     expect(mockCrawler.stop).toHaveBeenCalled();
   });
@@ -295,7 +335,7 @@ describe('crawl command execution', () => {
       crawlInstruction: 'all Getting Started pages',
       extractInstruction: 'code examples',
       maxPages: 100,
-      useHeadless:
+      useHeadless: true,
     });
   });
 });
package/src/cli/commands/crawl.ts
CHANGED
@@ -25,7 +25,7 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
     )
     .option('--simple', 'Use simple BFS mode instead of intelligent crawling')
     .option('--max-pages <number>', 'Maximum number of pages to crawl', '50')
-    .option('--
+    .option('--fast', 'Use fast axios-only mode (may fail on JavaScript-heavy sites)')
     .action(
       async (
         url: string,
@@ -35,7 +35,7 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
           extract?: string;
           simple?: boolean;
           maxPages?: string;
-
+          fast?: boolean;
         }
       ) => {
         const globalOpts = getOptions();
@@ -125,7 +125,7 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
           ...(cmdOptions.extract !== undefined && { extractInstruction: cmdOptions.extract }),
           maxPages,
           ...(cmdOptions.simple !== undefined && { simple: cmdOptions.simple }),
-          useHeadless: cmdOptions.
+          useHeadless: !(cmdOptions.fast ?? false), // Default true (headless), --fast disables
         })) {
           // Use extracted content if available, otherwise markdown
           const contentToProcess = result.extracted ?? result.markdown;
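Note the inversion: the CLI exposes the positive-sounding --fast flag and derives useHeadless from it, so omitting the flag yields the new headless default. In isolation (assuming commander's usual boolean-flag parsing, where an absent flag leaves the option undefined):

// Assumed parsing: `--fast` => fast === true; flag absent => undefined.
const toUseHeadless = (fast?: boolean): boolean => !(fast ?? false);

toUseHeadless(undefined); // true  -> default headless crawl
toUseHeadless(true);      // false -> --fast switches to axios-only fetching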
package/src/cli/commands/sync.test.ts
ADDED
@@ -0,0 +1,54 @@
+import { describe, it, expect } from 'vitest';
+import { createSyncCommand } from './sync.js';
+import type { GlobalOptions } from '../program.js';
+
+describe('createSyncCommand', () => {
+  function createTestOptions(): GlobalOptions {
+    return {
+      dataDir: '/tmp/test-data',
+      projectRoot: '/tmp/test-project',
+      format: 'table',
+    };
+  }
+
+  it('creates sync command with correct name and description', () => {
+    const cmd = createSyncCommand(createTestOptions);
+    expect(cmd.name()).toBe('sync');
+    expect(cmd.description()).toContain('Sync');
+  });
+
+  it('has --dry-run option', () => {
+    const cmd = createSyncCommand(createTestOptions);
+    const options = cmd.options;
+    const dryRunOpt = options.find((o) => o.long === '--dry-run');
+    expect(dryRunOpt).toBeDefined();
+  });
+
+  it('has --prune option', () => {
+    const cmd = createSyncCommand(createTestOptions);
+    const options = cmd.options;
+    const pruneOpt = options.find((o) => o.long === '--prune');
+    expect(pruneOpt).toBeDefined();
+  });
+
+  it('has --reindex option', () => {
+    const cmd = createSyncCommand(createTestOptions);
+    const options = cmd.options;
+    const reindexOpt = options.find((o) => o.long === '--reindex');
+    expect(reindexOpt).toBeDefined();
+  });
+
+  it('has correct option descriptions', () => {
+    const cmd = createSyncCommand(createTestOptions);
+    const options = cmd.options;
+
+    const dryRunOpt = options.find((o) => o.long === '--dry-run');
+    expect(dryRunOpt?.description).toContain('without making changes');
+
+    const pruneOpt = options.find((o) => o.long === '--prune');
+    expect(pruneOpt?.description).toContain('Remove');
+
+    const reindexOpt = options.find((o) => o.long === '--reindex');
+    expect(reindexOpt?.description).toContain('index');
+  });
+});
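For context: with --format json, the sync command prints a SyncResult object (the interface is defined in sync.ts below). A hypothetical dry-run result as a TypeScript literal, assuming a project with one already-synced store, one new definition, and one orphan; all store names are invented:

// Hypothetical `sync --dry-run --prune` result (shape from the SyncResult
// interface in sync.ts below; store names are invented).
const exampleDryRun = {
  created: [],
  skipped: ['project-docs'],         // already exists, left untouched
  failed: [],
  orphans: ['old-scratch-store'],    // present locally but not in definitions
  pruned: [],                        // nothing is deleted in a dry run
  dryRun: true,
  wouldCreate: ['api-reference'],    // would be created on a real run
  wouldPrune: ['old-scratch-store'], // populated because --prune was passed
};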
package/src/cli/commands/sync.ts
ADDED
@@ -0,0 +1,264 @@
+import { Command } from 'commander';
+import { createServices, destroyServices } from '../../services/index.js';
+import { StoreDefinitionService } from '../../services/store-definition.service.js';
+import {
+  isFileStoreDefinition,
+  isRepoStoreDefinition,
+  isWebStoreDefinition,
+} from '../../types/store-definition.js';
+import type { StoreService } from '../../services/store.service.js';
+import type { StoreDefinition } from '../../types/store-definition.js';
+import type { GlobalOptions } from '../program.js';
+
+interface SyncResult {
+  created: string[];
+  skipped: string[];
+  failed: Array<{ name: string; error: string }>;
+  orphans: string[];
+  pruned: string[];
+  dryRun: boolean;
+  wouldCreate: string[];
+  wouldPrune: string[];
+}
+
+/**
+ * Create a store from a definition
+ */
+async function createStoreFromDefinition(
+  def: StoreDefinition,
+  defService: StoreDefinitionService,
+  storeService: StoreService
+): Promise<{ success: true } | { success: false; error: string }> {
+  try {
+    if (isFileStoreDefinition(def)) {
+      const resolvedPath = defService.resolvePath(def.path);
+      const createResult = await storeService.create(
+        {
+          name: def.name,
+          type: 'file',
+          path: resolvedPath,
+          description: def.description,
+          tags: def.tags,
+        },
+        { skipDefinitionSync: true }
+      );
+      if (!createResult.success) {
+        return { success: false, error: createResult.error.message };
+      }
+      return { success: true };
+    }
+
+    if (isRepoStoreDefinition(def)) {
+      const createResult = await storeService.create(
+        {
+          name: def.name,
+          type: 'repo',
+          url: def.url,
+          branch: def.branch,
+          depth: def.depth,
+          description: def.description,
+          tags: def.tags,
+        },
+        { skipDefinitionSync: true }
+      );
+      if (!createResult.success) {
+        return { success: false, error: createResult.error.message };
+      }
+      return { success: true };
+    }
+
+    if (isWebStoreDefinition(def)) {
+      const createResult = await storeService.create(
+        {
+          name: def.name,
+          type: 'web',
+          url: def.url,
+          depth: def.depth,
+          description: def.description,
+          tags: def.tags,
+        },
+        { skipDefinitionSync: true }
+      );
+      if (!createResult.success) {
+        return { success: false, error: createResult.error.message };
+      }
+      return { success: true };
+    }
+
+    return { success: false, error: 'Unknown store definition type' };
+  } catch (error) {
+    return {
+      success: false,
+      error: error instanceof Error ? error.message : String(error),
+    };
+  }
+}
+
+export function createSyncCommand(getOptions: () => GlobalOptions): Command {
+  const sync = new Command('sync').description(
+    'Sync stores from definitions config (bootstrap on fresh clone)'
+  );
+
+  sync
+    .option('--dry-run', 'Show what would happen without making changes')
+    .option('--prune', 'Remove stores not in definitions')
+    .option('--reindex', 'Re-index existing stores after sync')
+    .action(async (options: { dryRun?: boolean; prune?: boolean; reindex?: boolean }) => {
+      const globalOpts = getOptions();
+      const projectRoot = globalOpts.projectRoot ?? process.cwd();
+
+      const defService = new StoreDefinitionService(projectRoot);
+      const services = await createServices(globalOpts.config, globalOpts.dataDir, projectRoot);
+
+      try {
+        const config = await defService.load();
+        const existingStores = await services.store.list();
+        const existingNames = new Set(existingStores.map((s) => s.name));
+        const definedNames = new Set(config.stores.map((d) => d.name));
+
+        const result: SyncResult = {
+          created: [],
+          skipped: [],
+          failed: [],
+          orphans: [],
+          pruned: [],
+          dryRun: options.dryRun === true,
+          wouldCreate: [],
+          wouldPrune: [],
+        };
+
+        // Process each definition
+        for (const def of config.stores) {
+          if (existingNames.has(def.name)) {
+            result.skipped.push(def.name);
+            continue;
+          }
+
+          if (options.dryRun === true) {
+            result.wouldCreate.push(def.name);
+            continue;
+          }
+
+          const createResult = await createStoreFromDefinition(def, defService, services.store);
+          if (createResult.success) {
+            result.created.push(def.name);
+          } else {
+            result.failed.push({ name: def.name, error: createResult.error });
+          }
+        }
+
+        // Find orphans
+        for (const store of existingStores) {
+          if (!definedNames.has(store.name)) {
+            result.orphans.push(store.name);
+          }
+        }
+
+        // Prune orphans if requested
+        if (options.prune === true && result.orphans.length > 0) {
+          if (options.dryRun === true) {
+            result.wouldPrune = [...result.orphans];
+          } else {
+            for (const orphanName of result.orphans) {
+              const store = await services.store.getByName(orphanName);
+              if (store !== undefined) {
+                const deleteResult = await services.store.delete(store.id, {
+                  skipDefinitionSync: true,
+                });
+                if (deleteResult.success) {
+                  result.pruned.push(orphanName);
+                }
+              }
+            }
+          }
+        }
+
+        // Output result
+        if (globalOpts.format === 'json') {
+          console.log(JSON.stringify(result, null, 2));
+        } else {
+          printHumanReadable(result, globalOpts.quiet === true);
+        }
+      } finally {
+        await destroyServices(services);
+      }
+    });
+
+  return sync;
+}
+
+function printHumanReadable(result: SyncResult, quiet: boolean): void {
+  if (quiet) {
+    // Just print created/pruned store names
+    for (const name of result.created) {
+      console.log(`created: ${name}`);
+    }
+    for (const name of result.pruned) {
+      console.log(`pruned: ${name}`);
+    }
+    for (const name of result.wouldCreate) {
+      console.log(`would create: ${name}`);
+    }
+    for (const name of result.wouldPrune) {
+      console.log(`would prune: ${name}`);
+    }
+    return;
+  }
+
+  if (result.dryRun) {
+    console.log('\n[DRY RUN] No changes made.\n');
+  } else {
+    console.log('\nSync completed.\n');
+  }
+
+  if (result.created.length > 0) {
+    console.log(`Created (${String(result.created.length)}):`);
+    for (const name of result.created) {
+      console.log(`  + ${name}`);
+    }
+  }
+
+  if (result.wouldCreate.length > 0) {
+    console.log(`Would create (${String(result.wouldCreate.length)}):`);
+    for (const name of result.wouldCreate) {
+      console.log(`  + ${name}`);
+    }
+  }
+
+  if (result.skipped.length > 0) {
+    console.log(`Skipped (already exist) (${String(result.skipped.length)}):`);
+    for (const name of result.skipped) {
+      console.log(`  - ${name}`);
+    }
+  }
+
+  if (result.failed.length > 0) {
+    console.log(`Failed (${String(result.failed.length)}):`);
+    for (const { name, error } of result.failed) {
+      console.log(`  ! ${name}: ${error}`);
+    }
+  }
+
+  if (result.orphans.length > 0) {
+    console.log(`Orphans (not in definitions) (${String(result.orphans.length)}):`);
+    for (const name of result.orphans) {
+      console.log(`  ? ${name}`);
+    }
+  }
+
+  if (result.pruned.length > 0) {
+    console.log(`Pruned (${String(result.pruned.length)}):`);
+    for (const name of result.pruned) {
+      console.log(`  x ${name}`);
+    }
+  }
+
+  if (result.wouldPrune.length > 0) {
+    console.log(`Would prune (${String(result.wouldPrune.length)}):`);
+    for (const name of result.wouldPrune) {
+      console.log(`  x ${name}`);
+    }
+  }
+
+  console.log('');
+}
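For context: a minimal sketch of invoking the new command programmatically, assuming commander's parseAsync with { from: 'user' } and the GlobalOptions shape used in sync.test.ts above; this is not the package's actual CLI entry point, and the paths are placeholders:

import { createSyncCommand } from './commands/sync.js';
import type { GlobalOptions } from './program.js';

// GlobalOptions values mirror the test fixture above; paths are placeholders.
const getOptions = (): GlobalOptions => ({
  dataDir: '/tmp/test-data',
  projectRoot: '/tmp/test-project',
  format: 'json',
});

const sync = createSyncCommand(getOptions);

// Preview the sync without creating or pruning any stores.
await sync.parseAsync(['--dry-run'], { from: 'user' });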
package/src/cli/index.ts
CHANGED
@@ -5,3 +5,4 @@ export { createIndexCommand } from './commands/index-cmd.js';
 export { createServeCommand } from './commands/serve.js';
 export { createCrawlCommand } from './commands/crawl.js';
 export { createSetupCommand } from './commands/setup.js';
+export { createSyncCommand } from './commands/sync.js';