bluera-knowledge 0.9.36 → 0.9.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/dist/{chunk-Z2KKVH45.js → chunk-36IFANFI.js} +2 -2
- package/dist/{chunk-WFNPNAAP.js → chunk-XJFV7AJW.js} +61 -30
- package/dist/chunk-XJFV7AJW.js.map +1 -0
- package/dist/{chunk-DC7CGSGT.js → chunk-ZAWIPEYX.js} +2 -2
- package/dist/index.js +4 -3
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +2 -2
- package/dist/workers/background-worker-cli.js +3 -2
- package/dist/workers/background-worker-cli.js.map +1 -1
- package/package.json +1 -1
- package/plugin.json +1 -1
- package/src/cli/commands/crawl.test.ts +1 -0
- package/src/cli/commands/crawl.ts +2 -0
- package/src/cli/commands/index-cmd.test.ts +1 -0
- package/src/db/lance.ts +15 -21
- package/src/services/index.service.test.ts +1 -0
- package/src/services/index.service.ts +2 -0
- package/src/services/search.service.test.ts +209 -0
- package/src/services/search.service.ts +77 -19
- package/src/workers/background-worker.test.ts +1 -0
- package/src/workers/background-worker.ts +2 -0
- package/tests/integration/search-quality.test.ts +5 -3
- package/dist/chunk-WFNPNAAP.js.map +0 -1
- /package/dist/{chunk-Z2KKVH45.js.map → chunk-36IFANFI.js.map} +0 -0
- /package/dist/{chunk-DC7CGSGT.js.map → chunk-ZAWIPEYX.js.map} +0 -0
package/dist/mcp/server.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import {
|
|
3
3
|
IntelligentCrawler
|
|
4
|
-
} from "../chunk-
|
|
4
|
+
} from "../chunk-ZAWIPEYX.js";
|
|
5
5
|
import {
|
|
6
6
|
JobService,
|
|
7
7
|
createDocumentId,
|
|
8
8
|
createServices,
|
|
9
9
|
createStoreId
|
|
10
|
-
} from "../chunk-
|
|
10
|
+
} from "../chunk-XJFV7AJW.js";
|
|
11
11
|
import "../chunk-6FHWC36B.js";
|
|
12
12
|
|
|
13
13
|
// src/workers/background-worker.ts
|
|
@@ -222,6 +222,7 @@ var BackgroundWorker = class {
|
|
|
222
222
|
progress: 85
|
|
223
223
|
});
|
|
224
224
|
await this.lanceStore.addDocuments(store.id, docs);
|
|
225
|
+
await this.lanceStore.createFtsIndex(store.id);
|
|
225
226
|
}
|
|
226
227
|
this.jobService.updateJob(job.id, {
|
|
227
228
|
message: `Crawled and indexed ${String(docs.length)} pages`,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/workers/background-worker.ts","../../src/workers/pid-file.ts","../../src/workers/background-worker-cli.ts"],"sourcesContent":["import { createHash } from 'node:crypto';\nimport { IntelligentCrawler, type CrawlProgress } from '../crawl/intelligent-crawler.js';\nimport { IndexService } from '../services/index.service.js';\nimport { JobService } from '../services/job.service.js';\nimport { StoreService } from '../services/store.service.js';\nimport { createStoreId, createDocumentId } from '../types/brands.js';\nimport type { EmbeddingEngine } from '../db/embeddings.js';\nimport type { LanceStore } from '../db/lance.js';\nimport type { Document } from '../types/document.js';\nimport type { Job } from '../types/job.js';\n\n/**\n * Calculate index progress as a percentage, handling division by zero.\n * @param current - Current number of items processed\n * @param total - Total number of items (may be 0)\n * @param scale - Scale factor for progress (default 100 for 0-100%)\n * @returns Progress value, or 0 if total is 0\n */\nexport function calculateIndexProgress(\n current: number,\n total: number,\n scale: number = 100\n): number {\n if (total === 0) return 0;\n return (current / total) * scale;\n}\n\nexport class BackgroundWorker {\n constructor(\n private readonly jobService: JobService,\n private readonly storeService: StoreService,\n private readonly indexService: IndexService,\n private readonly lanceStore: LanceStore,\n private readonly embeddingEngine: EmbeddingEngine\n ) {}\n\n /**\n * Execute a job based on its type\n */\n async executeJob(jobId: string): Promise<void> {\n const job = this.jobService.getJob(jobId);\n\n if (!job) {\n throw new Error(`Job ${jobId} not found`);\n }\n\n try {\n // Update to running status\n this.jobService.updateJob(jobId, {\n status: 'running',\n message: `Starting ${job.type} operation...`,\n progress: 0,\n details: { startedAt: new Date().toISOString() },\n });\n\n // Execute based on job type\n switch (job.type) {\n case 'clone':\n await this.executeCloneJob(job);\n break;\n case 'index':\n await this.executeIndexJob(job);\n break;\n case 'crawl':\n await this.executeCrawlJob(job);\n break;\n default:\n throw new Error(`Unknown job type: ${String(job.type)}`);\n }\n\n // Mark as completed\n this.jobService.updateJob(jobId, {\n status: 'completed',\n progress: 100,\n message: `${job.type} operation completed successfully`,\n details: { completedAt: new Date().toISOString() },\n });\n } catch (error) {\n // Mark as failed\n const errorDetails: Record<string, unknown> = {\n completedAt: new Date().toISOString(),\n };\n if (error instanceof Error && error.stack !== undefined) {\n errorDetails['error'] = error.stack;\n } else {\n errorDetails['error'] = String(error);\n }\n this.jobService.updateJob(jobId, {\n status: 'failed',\n message: error instanceof Error ? error.message : 'Unknown error',\n details: errorDetails,\n });\n throw error;\n }\n }\n\n /**\n * Execute a clone job (git clone + initial indexing)\n */\n private async executeCloneJob(job: Job): Promise<void> {\n const { storeId } = job.details;\n\n if (storeId === undefined || typeof storeId !== 'string') {\n throw new Error('Store ID required for clone job');\n }\n\n // Get the store\n const store = await this.storeService.get(createStoreId(storeId));\n if (!store) {\n throw new Error(`Store ${storeId} not found`);\n }\n\n // Clone is already done by the time the job is created\n // (happens in StoreService.create), so we just need to index\n\n // Update progress - cloning considered done (30%)\n this.jobService.updateJob(job.id, {\n status: 'running',\n message: 'Repository cloned, starting indexing...',\n progress: 30,\n });\n\n // Index the repository with progress updates\n const result = await this.indexService.indexStore(\n store,\n (event: { type: string; current: number; total: number; message: string }) => {\n // Check if job was cancelled\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n throw new Error('Job cancelled by user');\n }\n\n // Indexing is 70% of total progress (30-100%)\n const indexProgress = calculateIndexProgress(event.current, event.total, 70);\n const totalProgress = 30 + indexProgress;\n\n this.jobService.updateJob(job.id, {\n message: `Indexed ${String(event.current)}/${String(event.total)} files`,\n progress: Math.min(99, totalProgress), // Cap at 99 until fully complete\n details: {\n filesProcessed: event.current,\n totalFiles: event.total,\n },\n });\n }\n );\n\n if (!result.success) {\n throw result.error;\n }\n }\n\n /**\n * Execute an index job (re-indexing existing store)\n */\n private async executeIndexJob(job: Job): Promise<void> {\n const { storeId } = job.details;\n\n if (storeId === undefined || typeof storeId !== 'string') {\n throw new Error('Store ID required for index job');\n }\n\n // Get the store\n const store = await this.storeService.getByIdOrName(createStoreId(storeId));\n if (!store) {\n throw new Error(`Store ${storeId} not found`);\n }\n\n // Index with progress updates\n const result = await this.indexService.indexStore(\n store,\n (event: { type: string; current: number; total: number; message: string }) => {\n // Check if job was cancelled\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n throw new Error('Job cancelled by user');\n }\n\n const progress = calculateIndexProgress(event.current, event.total);\n\n this.jobService.updateJob(job.id, {\n message: `Indexed ${String(event.current)}/${String(event.total)} files`,\n progress: Math.min(99, progress), // Cap at 99 until fully complete\n details: {\n filesProcessed: event.current,\n totalFiles: event.total,\n },\n });\n }\n );\n\n if (!result.success) {\n throw result.error;\n }\n }\n\n /**\n * Execute a crawl job (web crawling + indexing)\n */\n private async executeCrawlJob(job: Job): Promise<void> {\n const { storeId, url, crawlInstruction, extractInstruction, maxPages, simple, useHeadless } =\n job.details;\n\n if (storeId === undefined || typeof storeId !== 'string') {\n throw new Error('Store ID required for crawl job');\n }\n if (url === undefined || typeof url !== 'string') {\n throw new Error('URL required for crawl job');\n }\n\n // Get the store\n const store = await this.storeService.get(createStoreId(storeId));\n if (store?.type !== 'web') {\n throw new Error(`Web store ${storeId} not found`);\n }\n\n const resolvedMaxPages = typeof maxPages === 'number' ? maxPages : 50;\n const crawler = new IntelligentCrawler();\n\n // Listen for progress events\n crawler.on('progress', (progress: CrawlProgress) => {\n // Check if job was cancelled - just return early, for-await loop will throw and finally will cleanup\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n return;\n }\n\n // Crawling is 80% of total progress (0-80%)\n const crawlProgress = (progress.pagesVisited / resolvedMaxPages) * 80;\n\n this.jobService.updateJob(job.id, {\n message:\n progress.message ??\n `Crawling page ${String(progress.pagesVisited)}/${String(resolvedMaxPages)}`,\n progress: Math.min(80, crawlProgress),\n details: { pagesCrawled: progress.pagesVisited },\n });\n });\n\n try {\n await this.lanceStore.initialize(store.id);\n const docs: Document[] = [];\n\n // Build crawl options, only including defined values\n const crawlOptions: {\n maxPages: number;\n simple: boolean;\n useHeadless: boolean;\n crawlInstruction?: string;\n extractInstruction?: string;\n } = {\n maxPages: resolvedMaxPages,\n simple: simple ?? false,\n useHeadless: useHeadless ?? false,\n };\n if (crawlInstruction !== undefined) {\n crawlOptions.crawlInstruction = crawlInstruction;\n }\n if (extractInstruction !== undefined) {\n crawlOptions.extractInstruction = extractInstruction;\n }\n\n // Crawl pages using IntelligentCrawler\n for await (const result of crawler.crawl(url, crawlOptions)) {\n // Check cancellation between pages\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n throw new Error('Job cancelled by user');\n }\n\n // Embed and index the content (use extracted if available, otherwise markdown)\n const contentToEmbed = result.extracted ?? result.markdown;\n const vector = await this.embeddingEngine.embed(contentToEmbed);\n\n docs.push({\n id: createDocumentId(`${store.id}-${createHash('md5').update(result.url).digest('hex')}`),\n content: contentToEmbed,\n vector,\n metadata: {\n type: 'web',\n storeId: store.id,\n url: result.url,\n title: result.title,\n extracted: result.extracted !== undefined,\n depth: result.depth,\n indexedAt: new Date(),\n },\n });\n }\n\n // Index all documents (remaining 20%)\n if (docs.length > 0) {\n this.jobService.updateJob(job.id, {\n message: 'Indexing crawled documents...',\n progress: 85,\n });\n\n await this.lanceStore.addDocuments(store.id, docs);\n }\n\n this.jobService.updateJob(job.id, {\n message: `Crawled and indexed ${String(docs.length)} pages`,\n progress: 100,\n details: { pagesCrawled: docs.length },\n });\n } finally {\n await crawler.stop();\n }\n }\n}\n","import fs from 'fs';\nimport path from 'path';\n\n/**\n * Result of a PID file delete operation.\n * Delete operations are best-effort and should not throw.\n */\nexport interface PidFileResult {\n success: boolean;\n error?: Error;\n}\n\n/**\n * Context for PID file deletion - indicates when the delete is happening.\n * Used for logging/debugging purposes.\n */\nexport type PidFileDeleteContext = 'sigterm' | 'success' | 'failure';\n\n/**\n * Write PID file - CRITICAL operation that must succeed.\n *\n * If the PID file cannot be written, the job cannot be cancelled through\n * the job management system. This is a critical failure and the job\n * should not proceed.\n *\n * @param pidFile - Absolute path to the PID file\n * @param pid - Process ID to write\n * @throws Error if PID file cannot be written\n */\nexport function writePidFile(pidFile: string, pid: number): void {\n try {\n fs.writeFileSync(pidFile, pid.toString(), 'utf-8');\n } catch (error) {\n const message = error instanceof Error ? error.message : String(error);\n throw new Error(\n `CRITICAL: Failed to write PID file ${pidFile}. ` +\n `Job cannot be cancelled without PID file. ` +\n `Original error: ${message}`\n );\n }\n}\n\n/**\n * Delete PID file - best-effort cleanup during shutdown.\n *\n * This operation should NEVER throw. During process shutdown (SIGTERM,\n * job success, job failure), failing to delete a PID file should not\n * prevent the process from exiting cleanly.\n *\n * Stale PID files are cleaned up by JobService.cleanupOldJobs().\n *\n * @param pidFile - Absolute path to the PID file\n * @param _context - Context indicating when the delete is happening (for future logging)\n * @returns Result indicating success or failure with error details\n */\nexport function deletePidFile(pidFile: string, _context: PidFileDeleteContext): PidFileResult {\n try {\n fs.unlinkSync(pidFile);\n return { success: true };\n } catch (error) {\n // ENOENT = file doesn't exist - that's success (nothing to delete)\n if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {\n return { success: true };\n }\n // Any other error = failure (permission denied, etc.)\n return {\n success: false,\n error: error instanceof Error ? error : new Error(String(error)),\n };\n }\n}\n\n/**\n * Build the path to a PID file for a given job.\n *\n * @param jobsDir - Directory where job files are stored\n * @param jobId - Job identifier\n * @returns Absolute path to the PID file\n */\nexport function buildPidFilePath(jobsDir: string, jobId: string): string {\n return path.join(jobsDir, `${jobId}.pid`);\n}\n","#!/usr/bin/env node\nimport { BackgroundWorker } from './background-worker.js';\nimport { writePidFile, deletePidFile, buildPidFilePath } from './pid-file.js';\nimport { createServices } from '../services/index.js';\nimport { JobService } from '../services/job.service.js';\n\n/**\n * Background worker CLI entry point\n *\n * Usage: background-worker-cli <job-id>\n *\n * This process runs detached from the parent and executes a single job.\n */\n\nasync function main(): Promise<void> {\n const jobId = process.argv[2];\n const dataDir = process.env['BLUERA_DATA_DIR'];\n\n if (jobId === undefined || jobId === '') {\n console.error('Error: Job ID required');\n console.error('Usage: background-worker-cli <job-id>');\n process.exit(1);\n }\n\n // Initialize services\n const jobService = new JobService(dataDir);\n const services = await createServices(undefined, dataDir);\n\n // Write PID file for job cancellation - CRITICAL: must succeed or job cannot be cancelled\n const pidFile = buildPidFilePath(\n jobService['jobsDir'], // Access private field for PID path\n jobId\n );\n\n try {\n writePidFile(pidFile, process.pid);\n } catch (error) {\n // CRITICAL: Cannot proceed without PID file - job would be uncancellable\n console.error(error instanceof Error ? error.message : String(error));\n process.exit(1);\n }\n\n // Handle SIGTERM for graceful shutdown\n process.on('SIGTERM', () => {\n console.log(`[${jobId}] Received SIGTERM, cancelling job...`);\n jobService.updateJob(jobId, {\n status: 'cancelled',\n message: 'Job cancelled by user',\n });\n\n // Clean up PID file (best-effort - don't block shutdown)\n const deleteResult = deletePidFile(pidFile, 'sigterm');\n if (!deleteResult.success && deleteResult.error !== undefined) {\n console.error(\n `Warning: Could not remove PID file during SIGTERM: ${deleteResult.error.message}`\n );\n }\n\n process.exit(0);\n });\n\n // Create worker and execute job\n const worker = new BackgroundWorker(\n jobService,\n services.store,\n services.index,\n services.lance,\n services.embeddings\n );\n\n try {\n await worker.executeJob(jobId);\n\n // Clean up PID file on success (best-effort - don't change exit code)\n const successCleanup = deletePidFile(pidFile, 'success');\n if (!successCleanup.success && successCleanup.error !== undefined) {\n console.error(\n `Warning: Could not remove PID file after success: ${successCleanup.error.message}`\n );\n }\n\n console.log(`[${jobId}] Job completed successfully`);\n process.exit(0);\n } catch (error) {\n // Job service already updated with failure status in BackgroundWorker\n console.error(`[${jobId}] Job failed:`, error);\n\n // Clean up PID file on failure (best-effort - exit code reflects job failure)\n const failureCleanup = deletePidFile(pidFile, 'failure');\n if (!failureCleanup.success && failureCleanup.error !== undefined) {\n console.error(\n `Warning: Could not remove PID file after failure: ${failureCleanup.error.message}`\n );\n }\n\n process.exit(1);\n }\n}\n\nmain().catch((error: unknown) => {\n console.error('Fatal error in background worker:', error);\n process.exit(1);\n});\n"],"mappings":";;;;;;;;;;;;;AAAA,SAAS,kBAAkB;AAkBpB,SAAS,uBACd,SACA,OACA,QAAgB,KACR;AACR,MAAI,UAAU,EAAG,QAAO;AACxB,SAAQ,UAAU,QAAS;AAC7B;AAEO,IAAM,mBAAN,MAAuB;AAAA,EAC5B,YACmB,YACA,cACA,cACA,YACA,iBACjB;AALiB;AACA;AACA;AACA;AACA;AAAA,EAChB;AAAA;AAAA;AAAA;AAAA,EAKH,MAAM,WAAW,OAA8B;AAC7C,UAAM,MAAM,KAAK,WAAW,OAAO,KAAK;AAExC,QAAI,CAAC,KAAK;AACR,YAAM,IAAI,MAAM,OAAO,KAAK,YAAY;AAAA,IAC1C;AAEA,QAAI;AAEF,WAAK,WAAW,UAAU,OAAO;AAAA,QAC/B,QAAQ;AAAA,QACR,SAAS,YAAY,IAAI,IAAI;AAAA,QAC7B,UAAU;AAAA,QACV,SAAS,EAAE,YAAW,oBAAI,KAAK,GAAE,YAAY,EAAE;AAAA,MACjD,CAAC;AAGD,cAAQ,IAAI,MAAM;AAAA,QAChB,KAAK;AACH,gBAAM,KAAK,gBAAgB,GAAG;AAC9B;AAAA,QACF,KAAK;AACH,gBAAM,KAAK,gBAAgB,GAAG;AAC9B;AAAA,QACF,KAAK;AACH,gBAAM,KAAK,gBAAgB,GAAG;AAC9B;AAAA,QACF;AACE,gBAAM,IAAI,MAAM,qBAAqB,OAAO,IAAI,IAAI,CAAC,EAAE;AAAA,MAC3D;AAGA,WAAK,WAAW,UAAU,OAAO;AAAA,QAC/B,QAAQ;AAAA,QACR,UAAU;AAAA,QACV,SAAS,GAAG,IAAI,IAAI;AAAA,QACpB,SAAS,EAAE,cAAa,oBAAI,KAAK,GAAE,YAAY,EAAE;AAAA,MACnD,CAAC;AAAA,IACH,SAAS,OAAO;AAEd,YAAM,eAAwC;AAAA,QAC5C,cAAa,oBAAI,KAAK,GAAE,YAAY;AAAA,MACtC;AACA,UAAI,iBAAiB,SAAS,MAAM,UAAU,QAAW;AACvD,qBAAa,OAAO,IAAI,MAAM;AAAA,MAChC,OAAO;AACL,qBAAa,OAAO,IAAI,OAAO,KAAK;AAAA,MACtC;AACA,WAAK,WAAW,UAAU,OAAO;AAAA,QAC/B,QAAQ;AAAA,QACR,SAAS,iBAAiB,QAAQ,MAAM,UAAU;AAAA,QAClD,SAAS;AAAA,MACX,CAAC;AACD,YAAM;AAAA,IACR;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,MAAc,gBAAgB,KAAyB;AACrD,UAAM,EAAE,QAAQ,IAAI,IAAI;AAExB,QAAI,YAAY,UAAa,OAAO,YAAY,UAAU;AACxD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AAGA,UAAM,QAAQ,MAAM,KAAK,aAAa,IAAI,cAAc,OAAO,CAAC;AAChE,QAAI,CAAC,OAAO;AACV,YAAM,IAAI,MAAM,SAAS,OAAO,YAAY;AAAA,IAC9C;AAMA,SAAK,WAAW,UAAU,IAAI,IAAI;AAAA,MAChC,QAAQ;AAAA,MACR,SAAS;AAAA,MACT,UAAU;AAAA,IACZ,CAAC;AAGD,UAAM,SAAS,MAAM,KAAK,aAAa;AAAA,MACrC;AAAA,MACA,CAAC,UAA6E;AAE5E,cAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,YAAI,YAAY,WAAW,aAAa;AACtC,gBAAM,IAAI,MAAM,uBAAuB;AAAA,QACzC;AAGA,cAAM,gBAAgB,uBAAuB,MAAM,SAAS,MAAM,OAAO,EAAE;AAC3E,cAAM,gBAAgB,KAAK;AAE3B,aAAK,WAAW,UAAU,IAAI,IAAI;AAAA,UAChC,SAAS,WAAW,OAAO,MAAM,OAAO,CAAC,IAAI,OAAO,MAAM,KAAK,CAAC;AAAA,UAChE,UAAU,KAAK,IAAI,IAAI,aAAa;AAAA;AAAA,UACpC,SAAS;AAAA,YACP,gBAAgB,MAAM;AAAA,YACtB,YAAY,MAAM;AAAA,UACpB;AAAA,QACF,CAAC;AAAA,MACH;AAAA,IACF;AAEA,QAAI,CAAC,OAAO,SAAS;AACnB,YAAM,OAAO;AAAA,IACf;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,MAAc,gBAAgB,KAAyB;AACrD,UAAM,EAAE,QAAQ,IAAI,IAAI;AAExB,QAAI,YAAY,UAAa,OAAO,YAAY,UAAU;AACxD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AAGA,UAAM,QAAQ,MAAM,KAAK,aAAa,cAAc,cAAc,OAAO,CAAC;AAC1E,QAAI,CAAC,OAAO;AACV,YAAM,IAAI,MAAM,SAAS,OAAO,YAAY;AAAA,IAC9C;AAGA,UAAM,SAAS,MAAM,KAAK,aAAa;AAAA,MACrC;AAAA,MACA,CAAC,UAA6E;AAE5E,cAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,YAAI,YAAY,WAAW,aAAa;AACtC,gBAAM,IAAI,MAAM,uBAAuB;AAAA,QACzC;AAEA,cAAM,WAAW,uBAAuB,MAAM,SAAS,MAAM,KAAK;AAElE,aAAK,WAAW,UAAU,IAAI,IAAI;AAAA,UAChC,SAAS,WAAW,OAAO,MAAM,OAAO,CAAC,IAAI,OAAO,MAAM,KAAK,CAAC;AAAA,UAChE,UAAU,KAAK,IAAI,IAAI,QAAQ;AAAA;AAAA,UAC/B,SAAS;AAAA,YACP,gBAAgB,MAAM;AAAA,YACtB,YAAY,MAAM;AAAA,UACpB;AAAA,QACF,CAAC;AAAA,MACH;AAAA,IACF;AAEA,QAAI,CAAC,OAAO,SAAS;AACnB,YAAM,OAAO;AAAA,IACf;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,MAAc,gBAAgB,KAAyB;AACrD,UAAM,EAAE,SAAS,KAAK,kBAAkB,oBAAoB,UAAU,QAAQ,YAAY,IACxF,IAAI;AAEN,QAAI,YAAY,UAAa,OAAO,YAAY,UAAU;AACxD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AACA,QAAI,QAAQ,UAAa,OAAO,QAAQ,UAAU;AAChD,YAAM,IAAI,MAAM,4BAA4B;AAAA,IAC9C;AAGA,UAAM,QAAQ,MAAM,KAAK,aAAa,IAAI,cAAc,OAAO,CAAC;AAChE,QAAI,OAAO,SAAS,OAAO;AACzB,YAAM,IAAI,MAAM,aAAa,OAAO,YAAY;AAAA,IAClD;AAEA,UAAM,mBAAmB,OAAO,aAAa,WAAW,WAAW;AACnE,UAAM,UAAU,IAAI,mBAAmB;AAGvC,YAAQ,GAAG,YAAY,CAAC,aAA4B;AAElD,YAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,UAAI,YAAY,WAAW,aAAa;AACtC;AAAA,MACF;AAGA,YAAM,gBAAiB,SAAS,eAAe,mBAAoB;AAEnE,WAAK,WAAW,UAAU,IAAI,IAAI;AAAA,QAChC,SACE,SAAS,WACT,iBAAiB,OAAO,SAAS,YAAY,CAAC,IAAI,OAAO,gBAAgB,CAAC;AAAA,QAC5E,UAAU,KAAK,IAAI,IAAI,aAAa;AAAA,QACpC,SAAS,EAAE,cAAc,SAAS,aAAa;AAAA,MACjD,CAAC;AAAA,IACH,CAAC;AAED,QAAI;AACF,YAAM,KAAK,WAAW,WAAW,MAAM,EAAE;AACzC,YAAM,OAAmB,CAAC;AAG1B,YAAM,eAMF;AAAA,QACF,UAAU;AAAA,QACV,QAAQ,UAAU;AAAA,QAClB,aAAa,eAAe;AAAA,MAC9B;AACA,UAAI,qBAAqB,QAAW;AAClC,qBAAa,mBAAmB;AAAA,MAClC;AACA,UAAI,uBAAuB,QAAW;AACpC,qBAAa,qBAAqB;AAAA,MACpC;AAGA,uBAAiB,UAAU,QAAQ,MAAM,KAAK,YAAY,GAAG;AAE3D,cAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,YAAI,YAAY,WAAW,aAAa;AACtC,gBAAM,IAAI,MAAM,uBAAuB;AAAA,QACzC;AAGA,cAAM,iBAAiB,OAAO,aAAa,OAAO;AAClD,cAAM,SAAS,MAAM,KAAK,gBAAgB,MAAM,cAAc;AAE9D,aAAK,KAAK;AAAA,UACR,IAAI,iBAAiB,GAAG,MAAM,EAAE,IAAI,WAAW,KAAK,EAAE,OAAO,OAAO,GAAG,EAAE,OAAO,KAAK,CAAC,EAAE;AAAA,UACxF,SAAS;AAAA,UACT;AAAA,UACA,UAAU;AAAA,YACR,MAAM;AAAA,YACN,SAAS,MAAM;AAAA,YACf,KAAK,OAAO;AAAA,YACZ,OAAO,OAAO;AAAA,YACd,WAAW,OAAO,cAAc;AAAA,YAChC,OAAO,OAAO;AAAA,YACd,WAAW,oBAAI,KAAK;AAAA,UACtB;AAAA,QACF,CAAC;AAAA,MACH;AAGA,UAAI,KAAK,SAAS,GAAG;AACnB,aAAK,WAAW,UAAU,IAAI,IAAI;AAAA,UAChC,SAAS;AAAA,UACT,UAAU;AAAA,QACZ,CAAC;AAED,cAAM,KAAK,WAAW,aAAa,MAAM,IAAI,IAAI;AAAA,MACnD;AAEA,WAAK,WAAW,UAAU,IAAI,IAAI;AAAA,QAChC,SAAS,uBAAuB,OAAO,KAAK,MAAM,CAAC;AAAA,QACnD,UAAU;AAAA,QACV,SAAS,EAAE,cAAc,KAAK,OAAO;AAAA,MACvC,CAAC;AAAA,IACH,UAAE;AACA,YAAM,QAAQ,KAAK;AAAA,IACrB;AAAA,EACF;AACF;;;ACrTA,OAAO,QAAQ;AACf,OAAO,UAAU;AA4BV,SAAS,aAAa,SAAiB,KAAmB;AAC/D,MAAI;AACF,OAAG,cAAc,SAAS,IAAI,SAAS,GAAG,OAAO;AAAA,EACnD,SAAS,OAAO;AACd,UAAM,UAAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AACrE,UAAM,IAAI;AAAA,MACR,sCAAsC,OAAO,+DAExB,OAAO;AAAA,IAC9B;AAAA,EACF;AACF;AAeO,SAAS,cAAc,SAAiB,UAA+C;AAC5F,MAAI;AACF,OAAG,WAAW,OAAO;AACrB,WAAO,EAAE,SAAS,KAAK;AAAA,EACzB,SAAS,OAAO;AAEd,QAAI,iBAAiB,SAAS,UAAU,SAAS,MAAM,SAAS,UAAU;AACxE,aAAO,EAAE,SAAS,KAAK;AAAA,IACzB;AAEA,WAAO;AAAA,MACL,SAAS;AAAA,MACT,OAAO,iBAAiB,QAAQ,QAAQ,IAAI,MAAM,OAAO,KAAK,CAAC;AAAA,IACjE;AAAA,EACF;AACF;AASO,SAAS,iBAAiB,SAAiB,OAAuB;AACvE,SAAO,KAAK,KAAK,SAAS,GAAG,KAAK,MAAM;AAC1C;;;ACnEA,eAAe,OAAsB;AACnC,QAAM,QAAQ,QAAQ,KAAK,CAAC;AAC5B,QAAM,UAAU,QAAQ,IAAI,iBAAiB;AAE7C,MAAI,UAAU,UAAa,UAAU,IAAI;AACvC,YAAQ,MAAM,wBAAwB;AACtC,YAAQ,MAAM,uCAAuC;AACrD,YAAQ,KAAK,CAAC;AAAA,EAChB;AAGA,QAAM,aAAa,IAAI,WAAW,OAAO;AACzC,QAAM,WAAW,MAAM,eAAe,QAAW,OAAO;AAGxD,QAAM,UAAU;AAAA,IACd,WAAW,SAAS;AAAA;AAAA,IACpB;AAAA,EACF;AAEA,MAAI;AACF,iBAAa,SAAS,QAAQ,GAAG;AAAA,EACnC,SAAS,OAAO;AAEd,YAAQ,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AACpE,YAAQ,KAAK,CAAC;AAAA,EAChB;AAGA,UAAQ,GAAG,WAAW,MAAM;AAC1B,YAAQ,IAAI,IAAI,KAAK,uCAAuC;AAC5D,eAAW,UAAU,OAAO;AAAA,MAC1B,QAAQ;AAAA,MACR,SAAS;AAAA,IACX,CAAC;AAGD,UAAM,eAAe,cAAc,SAAS,SAAS;AACrD,QAAI,CAAC,aAAa,WAAW,aAAa,UAAU,QAAW;AAC7D,cAAQ;AAAA,QACN,sDAAsD,aAAa,MAAM,OAAO;AAAA,MAClF;AAAA,IACF;AAEA,YAAQ,KAAK,CAAC;AAAA,EAChB,CAAC;AAGD,QAAM,SAAS,IAAI;AAAA,IACjB;AAAA,IACA,SAAS;AAAA,IACT,SAAS;AAAA,IACT,SAAS;AAAA,IACT,SAAS;AAAA,EACX;AAEA,MAAI;AACF,UAAM,OAAO,WAAW,KAAK;AAG7B,UAAM,iBAAiB,cAAc,SAAS,SAAS;AACvD,QAAI,CAAC,eAAe,WAAW,eAAe,UAAU,QAAW;AACjE,cAAQ;AAAA,QACN,qDAAqD,eAAe,MAAM,OAAO;AAAA,MACnF;AAAA,IACF;AAEA,YAAQ,IAAI,IAAI,KAAK,8BAA8B;AACnD,YAAQ,KAAK,CAAC;AAAA,EAChB,SAAS,OAAO;AAEd,YAAQ,MAAM,IAAI,KAAK,iBAAiB,KAAK;AAG7C,UAAM,iBAAiB,cAAc,SAAS,SAAS;AACvD,QAAI,CAAC,eAAe,WAAW,eAAe,UAAU,QAAW;AACjE,cAAQ;AAAA,QACN,qDAAqD,eAAe,MAAM,OAAO;AAAA,MACnF;AAAA,IACF;AAEA,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;AAEA,KAAK,EAAE,MAAM,CAAC,UAAmB;AAC/B,UAAQ,MAAM,qCAAqC,KAAK;AACxD,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../../src/workers/background-worker.ts","../../src/workers/pid-file.ts","../../src/workers/background-worker-cli.ts"],"sourcesContent":["import { createHash } from 'node:crypto';\nimport { IntelligentCrawler, type CrawlProgress } from '../crawl/intelligent-crawler.js';\nimport { IndexService } from '../services/index.service.js';\nimport { JobService } from '../services/job.service.js';\nimport { StoreService } from '../services/store.service.js';\nimport { createStoreId, createDocumentId } from '../types/brands.js';\nimport type { EmbeddingEngine } from '../db/embeddings.js';\nimport type { LanceStore } from '../db/lance.js';\nimport type { Document } from '../types/document.js';\nimport type { Job } from '../types/job.js';\n\n/**\n * Calculate index progress as a percentage, handling division by zero.\n * @param current - Current number of items processed\n * @param total - Total number of items (may be 0)\n * @param scale - Scale factor for progress (default 100 for 0-100%)\n * @returns Progress value, or 0 if total is 0\n */\nexport function calculateIndexProgress(\n current: number,\n total: number,\n scale: number = 100\n): number {\n if (total === 0) return 0;\n return (current / total) * scale;\n}\n\nexport class BackgroundWorker {\n constructor(\n private readonly jobService: JobService,\n private readonly storeService: StoreService,\n private readonly indexService: IndexService,\n private readonly lanceStore: LanceStore,\n private readonly embeddingEngine: EmbeddingEngine\n ) {}\n\n /**\n * Execute a job based on its type\n */\n async executeJob(jobId: string): Promise<void> {\n const job = this.jobService.getJob(jobId);\n\n if (!job) {\n throw new Error(`Job ${jobId} not found`);\n }\n\n try {\n // Update to running status\n this.jobService.updateJob(jobId, {\n status: 'running',\n message: `Starting ${job.type} operation...`,\n progress: 0,\n details: { startedAt: new Date().toISOString() },\n });\n\n // Execute based on job type\n switch (job.type) {\n case 'clone':\n await this.executeCloneJob(job);\n break;\n case 'index':\n await this.executeIndexJob(job);\n break;\n case 'crawl':\n await this.executeCrawlJob(job);\n break;\n default:\n throw new Error(`Unknown job type: ${String(job.type)}`);\n }\n\n // Mark as completed\n this.jobService.updateJob(jobId, {\n status: 'completed',\n progress: 100,\n message: `${job.type} operation completed successfully`,\n details: { completedAt: new Date().toISOString() },\n });\n } catch (error) {\n // Mark as failed\n const errorDetails: Record<string, unknown> = {\n completedAt: new Date().toISOString(),\n };\n if (error instanceof Error && error.stack !== undefined) {\n errorDetails['error'] = error.stack;\n } else {\n errorDetails['error'] = String(error);\n }\n this.jobService.updateJob(jobId, {\n status: 'failed',\n message: error instanceof Error ? error.message : 'Unknown error',\n details: errorDetails,\n });\n throw error;\n }\n }\n\n /**\n * Execute a clone job (git clone + initial indexing)\n */\n private async executeCloneJob(job: Job): Promise<void> {\n const { storeId } = job.details;\n\n if (storeId === undefined || typeof storeId !== 'string') {\n throw new Error('Store ID required for clone job');\n }\n\n // Get the store\n const store = await this.storeService.get(createStoreId(storeId));\n if (!store) {\n throw new Error(`Store ${storeId} not found`);\n }\n\n // Clone is already done by the time the job is created\n // (happens in StoreService.create), so we just need to index\n\n // Update progress - cloning considered done (30%)\n this.jobService.updateJob(job.id, {\n status: 'running',\n message: 'Repository cloned, starting indexing...',\n progress: 30,\n });\n\n // Index the repository with progress updates\n const result = await this.indexService.indexStore(\n store,\n (event: { type: string; current: number; total: number; message: string }) => {\n // Check if job was cancelled\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n throw new Error('Job cancelled by user');\n }\n\n // Indexing is 70% of total progress (30-100%)\n const indexProgress = calculateIndexProgress(event.current, event.total, 70);\n const totalProgress = 30 + indexProgress;\n\n this.jobService.updateJob(job.id, {\n message: `Indexed ${String(event.current)}/${String(event.total)} files`,\n progress: Math.min(99, totalProgress), // Cap at 99 until fully complete\n details: {\n filesProcessed: event.current,\n totalFiles: event.total,\n },\n });\n }\n );\n\n if (!result.success) {\n throw result.error;\n }\n }\n\n /**\n * Execute an index job (re-indexing existing store)\n */\n private async executeIndexJob(job: Job): Promise<void> {\n const { storeId } = job.details;\n\n if (storeId === undefined || typeof storeId !== 'string') {\n throw new Error('Store ID required for index job');\n }\n\n // Get the store\n const store = await this.storeService.getByIdOrName(createStoreId(storeId));\n if (!store) {\n throw new Error(`Store ${storeId} not found`);\n }\n\n // Index with progress updates\n const result = await this.indexService.indexStore(\n store,\n (event: { type: string; current: number; total: number; message: string }) => {\n // Check if job was cancelled\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n throw new Error('Job cancelled by user');\n }\n\n const progress = calculateIndexProgress(event.current, event.total);\n\n this.jobService.updateJob(job.id, {\n message: `Indexed ${String(event.current)}/${String(event.total)} files`,\n progress: Math.min(99, progress), // Cap at 99 until fully complete\n details: {\n filesProcessed: event.current,\n totalFiles: event.total,\n },\n });\n }\n );\n\n if (!result.success) {\n throw result.error;\n }\n }\n\n /**\n * Execute a crawl job (web crawling + indexing)\n */\n private async executeCrawlJob(job: Job): Promise<void> {\n const { storeId, url, crawlInstruction, extractInstruction, maxPages, simple, useHeadless } =\n job.details;\n\n if (storeId === undefined || typeof storeId !== 'string') {\n throw new Error('Store ID required for crawl job');\n }\n if (url === undefined || typeof url !== 'string') {\n throw new Error('URL required for crawl job');\n }\n\n // Get the store\n const store = await this.storeService.get(createStoreId(storeId));\n if (store?.type !== 'web') {\n throw new Error(`Web store ${storeId} not found`);\n }\n\n const resolvedMaxPages = typeof maxPages === 'number' ? maxPages : 50;\n const crawler = new IntelligentCrawler();\n\n // Listen for progress events\n crawler.on('progress', (progress: CrawlProgress) => {\n // Check if job was cancelled - just return early, for-await loop will throw and finally will cleanup\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n return;\n }\n\n // Crawling is 80% of total progress (0-80%)\n const crawlProgress = (progress.pagesVisited / resolvedMaxPages) * 80;\n\n this.jobService.updateJob(job.id, {\n message:\n progress.message ??\n `Crawling page ${String(progress.pagesVisited)}/${String(resolvedMaxPages)}`,\n progress: Math.min(80, crawlProgress),\n details: { pagesCrawled: progress.pagesVisited },\n });\n });\n\n try {\n await this.lanceStore.initialize(store.id);\n const docs: Document[] = [];\n\n // Build crawl options, only including defined values\n const crawlOptions: {\n maxPages: number;\n simple: boolean;\n useHeadless: boolean;\n crawlInstruction?: string;\n extractInstruction?: string;\n } = {\n maxPages: resolvedMaxPages,\n simple: simple ?? false,\n useHeadless: useHeadless ?? false,\n };\n if (crawlInstruction !== undefined) {\n crawlOptions.crawlInstruction = crawlInstruction;\n }\n if (extractInstruction !== undefined) {\n crawlOptions.extractInstruction = extractInstruction;\n }\n\n // Crawl pages using IntelligentCrawler\n for await (const result of crawler.crawl(url, crawlOptions)) {\n // Check cancellation between pages\n const currentJob = this.jobService.getJob(job.id);\n if (currentJob?.status === 'cancelled') {\n throw new Error('Job cancelled by user');\n }\n\n // Embed and index the content (use extracted if available, otherwise markdown)\n const contentToEmbed = result.extracted ?? result.markdown;\n const vector = await this.embeddingEngine.embed(contentToEmbed);\n\n docs.push({\n id: createDocumentId(`${store.id}-${createHash('md5').update(result.url).digest('hex')}`),\n content: contentToEmbed,\n vector,\n metadata: {\n type: 'web',\n storeId: store.id,\n url: result.url,\n title: result.title,\n extracted: result.extracted !== undefined,\n depth: result.depth,\n indexedAt: new Date(),\n },\n });\n }\n\n // Index all documents (remaining 20%)\n if (docs.length > 0) {\n this.jobService.updateJob(job.id, {\n message: 'Indexing crawled documents...',\n progress: 85,\n });\n\n await this.lanceStore.addDocuments(store.id, docs);\n // Create FTS index for full-text search\n await this.lanceStore.createFtsIndex(store.id);\n }\n\n this.jobService.updateJob(job.id, {\n message: `Crawled and indexed ${String(docs.length)} pages`,\n progress: 100,\n details: { pagesCrawled: docs.length },\n });\n } finally {\n await crawler.stop();\n }\n }\n}\n","import fs from 'fs';\nimport path from 'path';\n\n/**\n * Result of a PID file delete operation.\n * Delete operations are best-effort and should not throw.\n */\nexport interface PidFileResult {\n success: boolean;\n error?: Error;\n}\n\n/**\n * Context for PID file deletion - indicates when the delete is happening.\n * Used for logging/debugging purposes.\n */\nexport type PidFileDeleteContext = 'sigterm' | 'success' | 'failure';\n\n/**\n * Write PID file - CRITICAL operation that must succeed.\n *\n * If the PID file cannot be written, the job cannot be cancelled through\n * the job management system. This is a critical failure and the job\n * should not proceed.\n *\n * @param pidFile - Absolute path to the PID file\n * @param pid - Process ID to write\n * @throws Error if PID file cannot be written\n */\nexport function writePidFile(pidFile: string, pid: number): void {\n try {\n fs.writeFileSync(pidFile, pid.toString(), 'utf-8');\n } catch (error) {\n const message = error instanceof Error ? error.message : String(error);\n throw new Error(\n `CRITICAL: Failed to write PID file ${pidFile}. ` +\n `Job cannot be cancelled without PID file. ` +\n `Original error: ${message}`\n );\n }\n}\n\n/**\n * Delete PID file - best-effort cleanup during shutdown.\n *\n * This operation should NEVER throw. During process shutdown (SIGTERM,\n * job success, job failure), failing to delete a PID file should not\n * prevent the process from exiting cleanly.\n *\n * Stale PID files are cleaned up by JobService.cleanupOldJobs().\n *\n * @param pidFile - Absolute path to the PID file\n * @param _context - Context indicating when the delete is happening (for future logging)\n * @returns Result indicating success or failure with error details\n */\nexport function deletePidFile(pidFile: string, _context: PidFileDeleteContext): PidFileResult {\n try {\n fs.unlinkSync(pidFile);\n return { success: true };\n } catch (error) {\n // ENOENT = file doesn't exist - that's success (nothing to delete)\n if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {\n return { success: true };\n }\n // Any other error = failure (permission denied, etc.)\n return {\n success: false,\n error: error instanceof Error ? error : new Error(String(error)),\n };\n }\n}\n\n/**\n * Build the path to a PID file for a given job.\n *\n * @param jobsDir - Directory where job files are stored\n * @param jobId - Job identifier\n * @returns Absolute path to the PID file\n */\nexport function buildPidFilePath(jobsDir: string, jobId: string): string {\n return path.join(jobsDir, `${jobId}.pid`);\n}\n","#!/usr/bin/env node\nimport { BackgroundWorker } from './background-worker.js';\nimport { writePidFile, deletePidFile, buildPidFilePath } from './pid-file.js';\nimport { createServices } from '../services/index.js';\nimport { JobService } from '../services/job.service.js';\n\n/**\n * Background worker CLI entry point\n *\n * Usage: background-worker-cli <job-id>\n *\n * This process runs detached from the parent and executes a single job.\n */\n\nasync function main(): Promise<void> {\n const jobId = process.argv[2];\n const dataDir = process.env['BLUERA_DATA_DIR'];\n\n if (jobId === undefined || jobId === '') {\n console.error('Error: Job ID required');\n console.error('Usage: background-worker-cli <job-id>');\n process.exit(1);\n }\n\n // Initialize services\n const jobService = new JobService(dataDir);\n const services = await createServices(undefined, dataDir);\n\n // Write PID file for job cancellation - CRITICAL: must succeed or job cannot be cancelled\n const pidFile = buildPidFilePath(\n jobService['jobsDir'], // Access private field for PID path\n jobId\n );\n\n try {\n writePidFile(pidFile, process.pid);\n } catch (error) {\n // CRITICAL: Cannot proceed without PID file - job would be uncancellable\n console.error(error instanceof Error ? error.message : String(error));\n process.exit(1);\n }\n\n // Handle SIGTERM for graceful shutdown\n process.on('SIGTERM', () => {\n console.log(`[${jobId}] Received SIGTERM, cancelling job...`);\n jobService.updateJob(jobId, {\n status: 'cancelled',\n message: 'Job cancelled by user',\n });\n\n // Clean up PID file (best-effort - don't block shutdown)\n const deleteResult = deletePidFile(pidFile, 'sigterm');\n if (!deleteResult.success && deleteResult.error !== undefined) {\n console.error(\n `Warning: Could not remove PID file during SIGTERM: ${deleteResult.error.message}`\n );\n }\n\n process.exit(0);\n });\n\n // Create worker and execute job\n const worker = new BackgroundWorker(\n jobService,\n services.store,\n services.index,\n services.lance,\n services.embeddings\n );\n\n try {\n await worker.executeJob(jobId);\n\n // Clean up PID file on success (best-effort - don't change exit code)\n const successCleanup = deletePidFile(pidFile, 'success');\n if (!successCleanup.success && successCleanup.error !== undefined) {\n console.error(\n `Warning: Could not remove PID file after success: ${successCleanup.error.message}`\n );\n }\n\n console.log(`[${jobId}] Job completed successfully`);\n process.exit(0);\n } catch (error) {\n // Job service already updated with failure status in BackgroundWorker\n console.error(`[${jobId}] Job failed:`, error);\n\n // Clean up PID file on failure (best-effort - exit code reflects job failure)\n const failureCleanup = deletePidFile(pidFile, 'failure');\n if (!failureCleanup.success && failureCleanup.error !== undefined) {\n console.error(\n `Warning: Could not remove PID file after failure: ${failureCleanup.error.message}`\n );\n }\n\n process.exit(1);\n }\n}\n\nmain().catch((error: unknown) => {\n console.error('Fatal error in background worker:', error);\n process.exit(1);\n});\n"],"mappings":";;;;;;;;;;;;;AAAA,SAAS,kBAAkB;AAkBpB,SAAS,uBACd,SACA,OACA,QAAgB,KACR;AACR,MAAI,UAAU,EAAG,QAAO;AACxB,SAAQ,UAAU,QAAS;AAC7B;AAEO,IAAM,mBAAN,MAAuB;AAAA,EAC5B,YACmB,YACA,cACA,cACA,YACA,iBACjB;AALiB;AACA;AACA;AACA;AACA;AAAA,EAChB;AAAA;AAAA;AAAA;AAAA,EAKH,MAAM,WAAW,OAA8B;AAC7C,UAAM,MAAM,KAAK,WAAW,OAAO,KAAK;AAExC,QAAI,CAAC,KAAK;AACR,YAAM,IAAI,MAAM,OAAO,KAAK,YAAY;AAAA,IAC1C;AAEA,QAAI;AAEF,WAAK,WAAW,UAAU,OAAO;AAAA,QAC/B,QAAQ;AAAA,QACR,SAAS,YAAY,IAAI,IAAI;AAAA,QAC7B,UAAU;AAAA,QACV,SAAS,EAAE,YAAW,oBAAI,KAAK,GAAE,YAAY,EAAE;AAAA,MACjD,CAAC;AAGD,cAAQ,IAAI,MAAM;AAAA,QAChB,KAAK;AACH,gBAAM,KAAK,gBAAgB,GAAG;AAC9B;AAAA,QACF,KAAK;AACH,gBAAM,KAAK,gBAAgB,GAAG;AAC9B;AAAA,QACF,KAAK;AACH,gBAAM,KAAK,gBAAgB,GAAG;AAC9B;AAAA,QACF;AACE,gBAAM,IAAI,MAAM,qBAAqB,OAAO,IAAI,IAAI,CAAC,EAAE;AAAA,MAC3D;AAGA,WAAK,WAAW,UAAU,OAAO;AAAA,QAC/B,QAAQ;AAAA,QACR,UAAU;AAAA,QACV,SAAS,GAAG,IAAI,IAAI;AAAA,QACpB,SAAS,EAAE,cAAa,oBAAI,KAAK,GAAE,YAAY,EAAE;AAAA,MACnD,CAAC;AAAA,IACH,SAAS,OAAO;AAEd,YAAM,eAAwC;AAAA,QAC5C,cAAa,oBAAI,KAAK,GAAE,YAAY;AAAA,MACtC;AACA,UAAI,iBAAiB,SAAS,MAAM,UAAU,QAAW;AACvD,qBAAa,OAAO,IAAI,MAAM;AAAA,MAChC,OAAO;AACL,qBAAa,OAAO,IAAI,OAAO,KAAK;AAAA,MACtC;AACA,WAAK,WAAW,UAAU,OAAO;AAAA,QAC/B,QAAQ;AAAA,QACR,SAAS,iBAAiB,QAAQ,MAAM,UAAU;AAAA,QAClD,SAAS;AAAA,MACX,CAAC;AACD,YAAM;AAAA,IACR;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,MAAc,gBAAgB,KAAyB;AACrD,UAAM,EAAE,QAAQ,IAAI,IAAI;AAExB,QAAI,YAAY,UAAa,OAAO,YAAY,UAAU;AACxD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AAGA,UAAM,QAAQ,MAAM,KAAK,aAAa,IAAI,cAAc,OAAO,CAAC;AAChE,QAAI,CAAC,OAAO;AACV,YAAM,IAAI,MAAM,SAAS,OAAO,YAAY;AAAA,IAC9C;AAMA,SAAK,WAAW,UAAU,IAAI,IAAI;AAAA,MAChC,QAAQ;AAAA,MACR,SAAS;AAAA,MACT,UAAU;AAAA,IACZ,CAAC;AAGD,UAAM,SAAS,MAAM,KAAK,aAAa;AAAA,MACrC;AAAA,MACA,CAAC,UAA6E;AAE5E,cAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,YAAI,YAAY,WAAW,aAAa;AACtC,gBAAM,IAAI,MAAM,uBAAuB;AAAA,QACzC;AAGA,cAAM,gBAAgB,uBAAuB,MAAM,SAAS,MAAM,OAAO,EAAE;AAC3E,cAAM,gBAAgB,KAAK;AAE3B,aAAK,WAAW,UAAU,IAAI,IAAI;AAAA,UAChC,SAAS,WAAW,OAAO,MAAM,OAAO,CAAC,IAAI,OAAO,MAAM,KAAK,CAAC;AAAA,UAChE,UAAU,KAAK,IAAI,IAAI,aAAa;AAAA;AAAA,UACpC,SAAS;AAAA,YACP,gBAAgB,MAAM;AAAA,YACtB,YAAY,MAAM;AAAA,UACpB;AAAA,QACF,CAAC;AAAA,MACH;AAAA,IACF;AAEA,QAAI,CAAC,OAAO,SAAS;AACnB,YAAM,OAAO;AAAA,IACf;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,MAAc,gBAAgB,KAAyB;AACrD,UAAM,EAAE,QAAQ,IAAI,IAAI;AAExB,QAAI,YAAY,UAAa,OAAO,YAAY,UAAU;AACxD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AAGA,UAAM,QAAQ,MAAM,KAAK,aAAa,cAAc,cAAc,OAAO,CAAC;AAC1E,QAAI,CAAC,OAAO;AACV,YAAM,IAAI,MAAM,SAAS,OAAO,YAAY;AAAA,IAC9C;AAGA,UAAM,SAAS,MAAM,KAAK,aAAa;AAAA,MACrC;AAAA,MACA,CAAC,UAA6E;AAE5E,cAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,YAAI,YAAY,WAAW,aAAa;AACtC,gBAAM,IAAI,MAAM,uBAAuB;AAAA,QACzC;AAEA,cAAM,WAAW,uBAAuB,MAAM,SAAS,MAAM,KAAK;AAElE,aAAK,WAAW,UAAU,IAAI,IAAI;AAAA,UAChC,SAAS,WAAW,OAAO,MAAM,OAAO,CAAC,IAAI,OAAO,MAAM,KAAK,CAAC;AAAA,UAChE,UAAU,KAAK,IAAI,IAAI,QAAQ;AAAA;AAAA,UAC/B,SAAS;AAAA,YACP,gBAAgB,MAAM;AAAA,YACtB,YAAY,MAAM;AAAA,UACpB;AAAA,QACF,CAAC;AAAA,MACH;AAAA,IACF;AAEA,QAAI,CAAC,OAAO,SAAS;AACnB,YAAM,OAAO;AAAA,IACf;AAAA,EACF;AAAA;AAAA;AAAA;AAAA,EAKA,MAAc,gBAAgB,KAAyB;AACrD,UAAM,EAAE,SAAS,KAAK,kBAAkB,oBAAoB,UAAU,QAAQ,YAAY,IACxF,IAAI;AAEN,QAAI,YAAY,UAAa,OAAO,YAAY,UAAU;AACxD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IACnD;AACA,QAAI,QAAQ,UAAa,OAAO,QAAQ,UAAU;AAChD,YAAM,IAAI,MAAM,4BAA4B;AAAA,IAC9C;AAGA,UAAM,QAAQ,MAAM,KAAK,aAAa,IAAI,cAAc,OAAO,CAAC;AAChE,QAAI,OAAO,SAAS,OAAO;AACzB,YAAM,IAAI,MAAM,aAAa,OAAO,YAAY;AAAA,IAClD;AAEA,UAAM,mBAAmB,OAAO,aAAa,WAAW,WAAW;AACnE,UAAM,UAAU,IAAI,mBAAmB;AAGvC,YAAQ,GAAG,YAAY,CAAC,aAA4B;AAElD,YAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,UAAI,YAAY,WAAW,aAAa;AACtC;AAAA,MACF;AAGA,YAAM,gBAAiB,SAAS,eAAe,mBAAoB;AAEnE,WAAK,WAAW,UAAU,IAAI,IAAI;AAAA,QAChC,SACE,SAAS,WACT,iBAAiB,OAAO,SAAS,YAAY,CAAC,IAAI,OAAO,gBAAgB,CAAC;AAAA,QAC5E,UAAU,KAAK,IAAI,IAAI,aAAa;AAAA,QACpC,SAAS,EAAE,cAAc,SAAS,aAAa;AAAA,MACjD,CAAC;AAAA,IACH,CAAC;AAED,QAAI;AACF,YAAM,KAAK,WAAW,WAAW,MAAM,EAAE;AACzC,YAAM,OAAmB,CAAC;AAG1B,YAAM,eAMF;AAAA,QACF,UAAU;AAAA,QACV,QAAQ,UAAU;AAAA,QAClB,aAAa,eAAe;AAAA,MAC9B;AACA,UAAI,qBAAqB,QAAW;AAClC,qBAAa,mBAAmB;AAAA,MAClC;AACA,UAAI,uBAAuB,QAAW;AACpC,qBAAa,qBAAqB;AAAA,MACpC;AAGA,uBAAiB,UAAU,QAAQ,MAAM,KAAK,YAAY,GAAG;AAE3D,cAAM,aAAa,KAAK,WAAW,OAAO,IAAI,EAAE;AAChD,YAAI,YAAY,WAAW,aAAa;AACtC,gBAAM,IAAI,MAAM,uBAAuB;AAAA,QACzC;AAGA,cAAM,iBAAiB,OAAO,aAAa,OAAO;AAClD,cAAM,SAAS,MAAM,KAAK,gBAAgB,MAAM,cAAc;AAE9D,aAAK,KAAK;AAAA,UACR,IAAI,iBAAiB,GAAG,MAAM,EAAE,IAAI,WAAW,KAAK,EAAE,OAAO,OAAO,GAAG,EAAE,OAAO,KAAK,CAAC,EAAE;AAAA,UACxF,SAAS;AAAA,UACT;AAAA,UACA,UAAU;AAAA,YACR,MAAM;AAAA,YACN,SAAS,MAAM;AAAA,YACf,KAAK,OAAO;AAAA,YACZ,OAAO,OAAO;AAAA,YACd,WAAW,OAAO,cAAc;AAAA,YAChC,OAAO,OAAO;AAAA,YACd,WAAW,oBAAI,KAAK;AAAA,UACtB;AAAA,QACF,CAAC;AAAA,MACH;AAGA,UAAI,KAAK,SAAS,GAAG;AACnB,aAAK,WAAW,UAAU,IAAI,IAAI;AAAA,UAChC,SAAS;AAAA,UACT,UAAU;AAAA,QACZ,CAAC;AAED,cAAM,KAAK,WAAW,aAAa,MAAM,IAAI,IAAI;AAEjD,cAAM,KAAK,WAAW,eAAe,MAAM,EAAE;AAAA,MAC/C;AAEA,WAAK,WAAW,UAAU,IAAI,IAAI;AAAA,QAChC,SAAS,uBAAuB,OAAO,KAAK,MAAM,CAAC;AAAA,QACnD,UAAU;AAAA,QACV,SAAS,EAAE,cAAc,KAAK,OAAO;AAAA,MACvC,CAAC;AAAA,IACH,UAAE;AACA,YAAM,QAAQ,KAAK;AAAA,IACrB;AAAA,EACF;AACF;;;ACvTA,OAAO,QAAQ;AACf,OAAO,UAAU;AA4BV,SAAS,aAAa,SAAiB,KAAmB;AAC/D,MAAI;AACF,OAAG,cAAc,SAAS,IAAI,SAAS,GAAG,OAAO;AAAA,EACnD,SAAS,OAAO;AACd,UAAM,UAAU,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AACrE,UAAM,IAAI;AAAA,MACR,sCAAsC,OAAO,+DAExB,OAAO;AAAA,IAC9B;AAAA,EACF;AACF;AAeO,SAAS,cAAc,SAAiB,UAA+C;AAC5F,MAAI;AACF,OAAG,WAAW,OAAO;AACrB,WAAO,EAAE,SAAS,KAAK;AAAA,EACzB,SAAS,OAAO;AAEd,QAAI,iBAAiB,SAAS,UAAU,SAAS,MAAM,SAAS,UAAU;AACxE,aAAO,EAAE,SAAS,KAAK;AAAA,IACzB;AAEA,WAAO;AAAA,MACL,SAAS;AAAA,MACT,OAAO,iBAAiB,QAAQ,QAAQ,IAAI,MAAM,OAAO,KAAK,CAAC;AAAA,IACjE;AAAA,EACF;AACF;AASO,SAAS,iBAAiB,SAAiB,OAAuB;AACvE,SAAO,KAAK,KAAK,SAAS,GAAG,KAAK,MAAM;AAC1C;;;ACnEA,eAAe,OAAsB;AACnC,QAAM,QAAQ,QAAQ,KAAK,CAAC;AAC5B,QAAM,UAAU,QAAQ,IAAI,iBAAiB;AAE7C,MAAI,UAAU,UAAa,UAAU,IAAI;AACvC,YAAQ,MAAM,wBAAwB;AACtC,YAAQ,MAAM,uCAAuC;AACrD,YAAQ,KAAK,CAAC;AAAA,EAChB;AAGA,QAAM,aAAa,IAAI,WAAW,OAAO;AACzC,QAAM,WAAW,MAAM,eAAe,QAAW,OAAO;AAGxD,QAAM,UAAU;AAAA,IACd,WAAW,SAAS;AAAA;AAAA,IACpB;AAAA,EACF;AAEA,MAAI;AACF,iBAAa,SAAS,QAAQ,GAAG;AAAA,EACnC,SAAS,OAAO;AAEd,YAAQ,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AACpE,YAAQ,KAAK,CAAC;AAAA,EAChB;AAGA,UAAQ,GAAG,WAAW,MAAM;AAC1B,YAAQ,IAAI,IAAI,KAAK,uCAAuC;AAC5D,eAAW,UAAU,OAAO;AAAA,MAC1B,QAAQ;AAAA,MACR,SAAS;AAAA,IACX,CAAC;AAGD,UAAM,eAAe,cAAc,SAAS,SAAS;AACrD,QAAI,CAAC,aAAa,WAAW,aAAa,UAAU,QAAW;AAC7D,cAAQ;AAAA,QACN,sDAAsD,aAAa,MAAM,OAAO;AAAA,MAClF;AAAA,IACF;AAEA,YAAQ,KAAK,CAAC;AAAA,EAChB,CAAC;AAGD,QAAM,SAAS,IAAI;AAAA,IACjB;AAAA,IACA,SAAS;AAAA,IACT,SAAS;AAAA,IACT,SAAS;AAAA,IACT,SAAS;AAAA,EACX;AAEA,MAAI;AACF,UAAM,OAAO,WAAW,KAAK;AAG7B,UAAM,iBAAiB,cAAc,SAAS,SAAS;AACvD,QAAI,CAAC,eAAe,WAAW,eAAe,UAAU,QAAW;AACjE,cAAQ;AAAA,QACN,qDAAqD,eAAe,MAAM,OAAO;AAAA,MACnF;AAAA,IACF;AAEA,YAAQ,IAAI,IAAI,KAAK,8BAA8B;AACnD,YAAQ,KAAK,CAAC;AAAA,EAChB,SAAS,OAAO;AAEd,YAAQ,MAAM,IAAI,KAAK,iBAAiB,KAAK;AAG7C,UAAM,iBAAiB,cAAc,SAAS,SAAS;AACvD,QAAI,CAAC,eAAe,WAAW,eAAe,UAAU,QAAW;AACjE,cAAQ;AAAA,QACN,qDAAqD,eAAe,MAAM,OAAO;AAAA,MACnF;AAAA,IACF;AAEA,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;AAEA,KAAK,EAAE,MAAM,CAAC,UAAmB;AAC/B,UAAQ,MAAM,qCAAqC,KAAK;AACxD,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
|
package/package.json
CHANGED
package/plugin.json
CHANGED
|
@@ -172,6 +172,8 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
|
|
|
172
172
|
spinner.text = 'Indexing documents...';
|
|
173
173
|
}
|
|
174
174
|
await services.lance.addDocuments(store.id, docs);
|
|
175
|
+
// Create FTS index for full-text search
|
|
176
|
+
await services.lance.createFtsIndex(store.id);
|
|
175
177
|
}
|
|
176
178
|
|
|
177
179
|
const crawlResult = {
|
package/src/db/lance.ts
CHANGED
|
@@ -74,33 +74,27 @@ export class LanceStore {
|
|
|
74
74
|
storeId: StoreId,
|
|
75
75
|
vector: number[],
|
|
76
76
|
limit: number,
|
|
77
|
-
threshold
|
|
77
|
+
// threshold is kept for API compatibility but filtering is done after normalization
|
|
78
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
79
|
+
_threshold?: number
|
|
78
80
|
): Promise<
|
|
79
81
|
Array<{ id: DocumentId; content: string; score: number; metadata: DocumentMetadata }>
|
|
80
82
|
> {
|
|
81
83
|
const table = await this.getTable(storeId);
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
if (threshold !== undefined) {
|
|
85
|
-
query = query.distanceType('cosine');
|
|
86
|
-
}
|
|
84
|
+
const query = table.vectorSearch(vector).limit(limit).distanceType('cosine');
|
|
87
85
|
|
|
88
86
|
// eslint-disable-next-line @typescript-eslint/consistent-type-assertions
|
|
89
87
|
const results = (await query.toArray()) as SearchHit[];
|
|
90
88
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
score: 1 - r._distance,
|
|
101
|
-
// eslint-disable-next-line @typescript-eslint/consistent-type-assertions
|
|
102
|
-
metadata: JSON.parse(r.metadata) as DocumentMetadata,
|
|
103
|
-
}));
|
|
89
|
+
// Return all results - threshold filtering is applied after score normalization
|
|
90
|
+
// in search.service.ts to match displayed scores
|
|
91
|
+
return results.map((r) => ({
|
|
92
|
+
id: createDocumentId(r.id),
|
|
93
|
+
content: r.content,
|
|
94
|
+
score: 1 - r._distance,
|
|
95
|
+
// eslint-disable-next-line @typescript-eslint/consistent-type-assertions
|
|
96
|
+
metadata: JSON.parse(r.metadata) as DocumentMetadata,
|
|
97
|
+
}));
|
|
104
98
|
}
|
|
105
99
|
|
|
106
100
|
async createFtsIndex(storeId: StoreId): Promise<void> {
|
|
@@ -125,13 +119,13 @@ export class LanceStore {
|
|
|
125
119
|
id: string;
|
|
126
120
|
content: string;
|
|
127
121
|
metadata: string;
|
|
128
|
-
|
|
122
|
+
_score: number;
|
|
129
123
|
}>;
|
|
130
124
|
|
|
131
125
|
return results.map((r) => ({
|
|
132
126
|
id: createDocumentId(r.id),
|
|
133
127
|
content: r.content,
|
|
134
|
-
score: r.
|
|
128
|
+
score: r._score,
|
|
135
129
|
// eslint-disable-next-line @typescript-eslint/consistent-type-assertions
|
|
136
130
|
metadata: JSON.parse(r.metadata) as DocumentMetadata,
|
|
137
131
|
}));
|
|
@@ -1094,6 +1094,7 @@ describe('IndexService - Error Handling Edge Cases', () => {
|
|
|
1094
1094
|
// Create a mock that throws a string instead of Error
|
|
1095
1095
|
const mockLanceStore = {
|
|
1096
1096
|
addDocuments: vi.fn().mockRejectedValue('string error'),
|
|
1097
|
+
createFtsIndex: vi.fn().mockResolvedValue(undefined),
|
|
1097
1098
|
} as unknown as LanceStore;
|
|
1098
1099
|
|
|
1099
1100
|
const indexService = new IndexService(mockLanceStore, embeddingEngine);
|
|
@@ -196,6 +196,8 @@ export class IndexService {
|
|
|
196
196
|
|
|
197
197
|
if (documents.length > 0) {
|
|
198
198
|
await this.lanceStore.addDocuments(store.id, documents);
|
|
199
|
+
// Create FTS index for full-text search
|
|
200
|
+
await this.lanceStore.createFtsIndex(store.id);
|
|
199
201
|
}
|
|
200
202
|
|
|
201
203
|
// Build and save code graph if service is available and we have source files
|
|
@@ -1783,3 +1783,212 @@ describe('SearchService - Code Graph Integration', () => {
|
|
|
1783
1783
|
expect(results.results[0]?.full?.relatedCode?.length).toBe(10);
|
|
1784
1784
|
});
|
|
1785
1785
|
});
|
|
1786
|
+
|
|
1787
|
+
describe('SearchService - Threshold Filtering', () => {
|
|
1788
|
+
let mockLanceStore: LanceStore;
|
|
1789
|
+
let mockEmbeddingEngine: EmbeddingEngine;
|
|
1790
|
+
let searchService: SearchService;
|
|
1791
|
+
const storeId = createStoreId('test-store');
|
|
1792
|
+
|
|
1793
|
+
beforeEach(() => {
|
|
1794
|
+
mockLanceStore = {
|
|
1795
|
+
search: vi.fn(),
|
|
1796
|
+
fullTextSearch: vi.fn(),
|
|
1797
|
+
} as unknown as LanceStore;
|
|
1798
|
+
|
|
1799
|
+
mockEmbeddingEngine = {
|
|
1800
|
+
embed: vi.fn().mockResolvedValue([0.1, 0.2, 0.3]),
|
|
1801
|
+
} as unknown as EmbeddingEngine;
|
|
1802
|
+
|
|
1803
|
+
searchService = new SearchService(mockLanceStore, mockEmbeddingEngine);
|
|
1804
|
+
});
|
|
1805
|
+
|
|
1806
|
+
it('applies threshold to normalized scores, not raw scores', async () => {
|
|
1807
|
+
// Setup: 3 results with different raw scores
|
|
1808
|
+
// In hybrid mode with RRF, ranks matter more than raw scores
|
|
1809
|
+
// doc1 appears in both vector and FTS -> highest RRF score -> normalized to 1.0
|
|
1810
|
+
// doc2 appears only in vector -> middle RRF score -> normalized to ~0.5
|
|
1811
|
+
// doc3 appears only in vector, lowest rank -> lowest RRF score -> normalized to 0.0
|
|
1812
|
+
vi.mocked(mockLanceStore.search).mockResolvedValue([
|
|
1813
|
+
{
|
|
1814
|
+
id: createDocumentId('doc1'),
|
|
1815
|
+
score: 0.9,
|
|
1816
|
+
content: 'result 1',
|
|
1817
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1818
|
+
},
|
|
1819
|
+
{
|
|
1820
|
+
id: createDocumentId('doc2'),
|
|
1821
|
+
score: 0.7,
|
|
1822
|
+
content: 'result 2',
|
|
1823
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1824
|
+
},
|
|
1825
|
+
{
|
|
1826
|
+
id: createDocumentId('doc3'),
|
|
1827
|
+
score: 0.5,
|
|
1828
|
+
content: 'result 3',
|
|
1829
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1830
|
+
},
|
|
1831
|
+
]);
|
|
1832
|
+
// Add doc1 and doc2 to FTS results so they both have good RRF scores
|
|
1833
|
+
vi.mocked(mockLanceStore.fullTextSearch).mockResolvedValue([
|
|
1834
|
+
{
|
|
1835
|
+
id: createDocumentId('doc1'),
|
|
1836
|
+
score: 0.9,
|
|
1837
|
+
content: 'result 1',
|
|
1838
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1839
|
+
},
|
|
1840
|
+
{
|
|
1841
|
+
id: createDocumentId('doc2'),
|
|
1842
|
+
score: 0.7,
|
|
1843
|
+
content: 'result 2',
|
|
1844
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1845
|
+
},
|
|
1846
|
+
]);
|
|
1847
|
+
|
|
1848
|
+
// With threshold 0.4, doc1 (1.0) and doc2 (~0.47) should pass
|
|
1849
|
+
// doc3 (0.0) should be filtered out
|
|
1850
|
+
const results = await searchService.search({
|
|
1851
|
+
query: 'test query',
|
|
1852
|
+
stores: [storeId],
|
|
1853
|
+
mode: 'hybrid',
|
|
1854
|
+
limit: 10,
|
|
1855
|
+
threshold: 0.4,
|
|
1856
|
+
});
|
|
1857
|
+
|
|
1858
|
+
// Should return 2 results: scores >= 0.4 (normalized)
|
|
1859
|
+
expect(results.results.length).toBe(2);
|
|
1860
|
+
expect(results.results[0]?.id).toBe(createDocumentId('doc1'));
|
|
1861
|
+
expect(results.results[1]?.id).toBe(createDocumentId('doc2'));
|
|
1862
|
+
|
|
1863
|
+
// Verify normalized scores
|
|
1864
|
+
expect(results.results[0]?.score).toBe(1.0);
|
|
1865
|
+
expect(results.results[1]?.score).toBeGreaterThanOrEqual(0.4);
|
|
1866
|
+
|
|
1867
|
+
// Verify doc3 was filtered out (its normalized score is 0.0)
|
|
1868
|
+
expect(results.results.find((r) => r.id === createDocumentId('doc3'))).toBeUndefined();
|
|
1869
|
+
});
|
|
1870
|
+
|
|
1871
|
+
it('returns all results when threshold is 0', async () => {
|
|
1872
|
+
vi.mocked(mockLanceStore.search).mockResolvedValue([
|
|
1873
|
+
{
|
|
1874
|
+
id: createDocumentId('doc1'),
|
|
1875
|
+
score: 0.9,
|
|
1876
|
+
content: 'result 1',
|
|
1877
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1878
|
+
},
|
|
1879
|
+
{
|
|
1880
|
+
id: createDocumentId('doc2'),
|
|
1881
|
+
score: 0.1,
|
|
1882
|
+
content: 'result 2',
|
|
1883
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1884
|
+
},
|
|
1885
|
+
]);
|
|
1886
|
+
vi.mocked(mockLanceStore.fullTextSearch).mockResolvedValue([]);
|
|
1887
|
+
|
|
1888
|
+
const results = await searchService.search({
|
|
1889
|
+
query: 'test query',
|
|
1890
|
+
stores: [storeId],
|
|
1891
|
+
mode: 'hybrid',
|
|
1892
|
+
limit: 10,
|
|
1893
|
+
threshold: 0,
|
|
1894
|
+
});
|
|
1895
|
+
|
|
1896
|
+
// All results should be returned (scores >= 0)
|
|
1897
|
+
expect(results.results.length).toBe(2);
|
|
1898
|
+
});
|
|
1899
|
+
|
|
1900
|
+
it('returns no results when threshold is higher than all scores', async () => {
|
|
1901
|
+
vi.mocked(mockLanceStore.search).mockResolvedValue([
|
|
1902
|
+
{
|
|
1903
|
+
id: createDocumentId('doc1'),
|
|
1904
|
+
score: 0.9,
|
|
1905
|
+
content: 'result 1',
|
|
1906
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1907
|
+
},
|
|
1908
|
+
{
|
|
1909
|
+
id: createDocumentId('doc2'),
|
|
1910
|
+
score: 0.8,
|
|
1911
|
+
content: 'result 2',
|
|
1912
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1913
|
+
},
|
|
1914
|
+
]);
|
|
1915
|
+
vi.mocked(mockLanceStore.fullTextSearch).mockResolvedValue([]);
|
|
1916
|
+
|
|
1917
|
+
// Threshold > 1.0 means no results pass
|
|
1918
|
+
const results = await searchService.search({
|
|
1919
|
+
query: 'test query',
|
|
1920
|
+
stores: [storeId],
|
|
1921
|
+
mode: 'hybrid',
|
|
1922
|
+
limit: 10,
|
|
1923
|
+
threshold: 1.1,
|
|
1924
|
+
});
|
|
1925
|
+
|
|
1926
|
+
expect(results.results.length).toBe(0);
|
|
1927
|
+
});
|
|
1928
|
+
|
|
1929
|
+
it('applies threshold in vector mode after score calculation', async () => {
|
|
1930
|
+
vi.mocked(mockLanceStore.search).mockResolvedValue([
|
|
1931
|
+
{
|
|
1932
|
+
id: createDocumentId('doc1'),
|
|
1933
|
+
score: 0.9,
|
|
1934
|
+
content: 'result 1',
|
|
1935
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1936
|
+
},
|
|
1937
|
+
{
|
|
1938
|
+
id: createDocumentId('doc2'),
|
|
1939
|
+
score: 0.3,
|
|
1940
|
+
content: 'result 2',
|
|
1941
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1942
|
+
},
|
|
1943
|
+
]);
|
|
1944
|
+
|
|
1945
|
+
const results = await searchService.search({
|
|
1946
|
+
query: 'test query',
|
|
1947
|
+
stores: [storeId],
|
|
1948
|
+
mode: 'vector',
|
|
1949
|
+
limit: 10,
|
|
1950
|
+
threshold: 0.5,
|
|
1951
|
+
});
|
|
1952
|
+
|
|
1953
|
+
// Only doc1 should pass (normalized score 1.0 >= 0.5)
|
|
1954
|
+
// doc2 has normalized score 0.0 which is < 0.5
|
|
1955
|
+
expect(results.results.length).toBe(1);
|
|
1956
|
+
expect(results.results[0]?.id).toBe(createDocumentId('doc1'));
|
|
1957
|
+
});
|
|
1958
|
+
|
|
1959
|
+
it('maintains correct result count metadata after threshold filtering', async () => {
|
|
1960
|
+
vi.mocked(mockLanceStore.search).mockResolvedValue([
|
|
1961
|
+
{
|
|
1962
|
+
id: createDocumentId('doc1'),
|
|
1963
|
+
score: 0.9,
|
|
1964
|
+
content: 'result 1',
|
|
1965
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1966
|
+
},
|
|
1967
|
+
{
|
|
1968
|
+
id: createDocumentId('doc2'),
|
|
1969
|
+
score: 0.5,
|
|
1970
|
+
content: 'result 2',
|
|
1971
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1972
|
+
},
|
|
1973
|
+
{
|
|
1974
|
+
id: createDocumentId('doc3'),
|
|
1975
|
+
score: 0.1,
|
|
1976
|
+
content: 'result 3',
|
|
1977
|
+
metadata: { type: 'file' as const, storeId, indexedAt: new Date() },
|
|
1978
|
+
},
|
|
1979
|
+
]);
|
|
1980
|
+
vi.mocked(mockLanceStore.fullTextSearch).mockResolvedValue([]);
|
|
1981
|
+
|
|
1982
|
+
const results = await searchService.search({
|
|
1983
|
+
query: 'test query',
|
|
1984
|
+
stores: [storeId],
|
|
1985
|
+
mode: 'hybrid',
|
|
1986
|
+
limit: 10,
|
|
1987
|
+
threshold: 0.5,
|
|
1988
|
+
});
|
|
1989
|
+
|
|
1990
|
+
// Check response metadata
|
|
1991
|
+
expect(results.totalResults).toBe(results.results.length);
|
|
1992
|
+
expect(results.query).toBe('test query');
|
|
1993
|
+
});
|
|
1994
|
+
});
|
|
@@ -370,6 +370,48 @@ export class SearchService {
|
|
|
370
370
|
return queryTerms.filter((term) => lowerContent.includes(term)).length;
|
|
371
371
|
}
|
|
372
372
|
|
|
373
|
+
/**
|
|
374
|
+
* Normalize scores to 0-1 range and optionally filter by threshold.
|
|
375
|
+
* This ensures threshold values match displayed scores (UX consistency).
|
|
376
|
+
*
|
|
377
|
+
* Edge case handling:
|
|
378
|
+
* - If there's only 1 result or all results have the same score, normalization
|
|
379
|
+
* would make them all 1.0. In this case, we keep the raw scores to allow
|
|
380
|
+
* threshold filtering to work meaningfully on absolute quality.
|
|
381
|
+
*/
|
|
382
|
+
private normalizeAndFilterScores(results: SearchResult[], threshold?: number): SearchResult[] {
|
|
383
|
+
if (results.length === 0) return [];
|
|
384
|
+
|
|
385
|
+
// Sort by score descending
|
|
386
|
+
const sorted = [...results].sort((a, b) => b.score - a.score);
|
|
387
|
+
|
|
388
|
+
// Get score range for normalization
|
|
389
|
+
const first = sorted[0];
|
|
390
|
+
const last = sorted[sorted.length - 1];
|
|
391
|
+
if (first === undefined || last === undefined) return [];
|
|
392
|
+
|
|
393
|
+
const maxScore = first.score;
|
|
394
|
+
const minScore = last.score;
|
|
395
|
+
const range = maxScore - minScore;
|
|
396
|
+
|
|
397
|
+
// Only normalize when there's meaningful score variation
|
|
398
|
+
// If all scores are the same (range = 0), keep raw scores for threshold filtering
|
|
399
|
+
const normalized =
|
|
400
|
+
range > 0
|
|
401
|
+
? sorted.map((r) => ({
|
|
402
|
+
...r,
|
|
403
|
+
score: Math.round(((r.score - minScore) / range) * 1000000) / 1000000,
|
|
404
|
+
}))
|
|
405
|
+
: sorted; // Keep raw scores when no variation (allows threshold to filter by quality)
|
|
406
|
+
|
|
407
|
+
// Apply threshold filter on scores
|
|
408
|
+
if (threshold !== undefined) {
|
|
409
|
+
return normalized.filter((r) => r.score >= threshold);
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
return normalized;
|
|
413
|
+
}
|
|
414
|
+
|
|
373
415
|
private async vectorSearch(
|
|
374
416
|
query: string,
|
|
375
417
|
stores: readonly StoreId[],
|
|
@@ -391,7 +433,9 @@ export class SearchService {
|
|
|
391
433
|
);
|
|
392
434
|
}
|
|
393
435
|
|
|
394
|
-
|
|
436
|
+
// Normalize scores and apply threshold filter
|
|
437
|
+
const normalized = this.normalizeAndFilterScores(results, threshold);
|
|
438
|
+
return normalized.slice(0, limit);
|
|
395
439
|
}
|
|
396
440
|
|
|
397
441
|
private async ftsSearch(
|
|
@@ -425,9 +469,9 @@ export class SearchService {
|
|
|
425
469
|
// Classify query intents for context-aware ranking (supports multiple intents)
|
|
426
470
|
const intents = classifyQueryIntents(query);
|
|
427
471
|
|
|
428
|
-
// Get both result sets
|
|
472
|
+
// Get both result sets (don't pass threshold - apply after RRF normalization)
|
|
429
473
|
const [vectorResults, ftsResults] = await Promise.all([
|
|
430
|
-
this.vectorSearch(query, stores, limit * 2
|
|
474
|
+
this.vectorSearch(query, stores, limit * 2),
|
|
431
475
|
this.ftsSearch(query, stores, limit * 2),
|
|
432
476
|
]);
|
|
433
477
|
|
|
@@ -534,34 +578,48 @@ export class SearchService {
|
|
|
534
578
|
const sorted = rrfScores.sort((a, b) => b.score - a.score).slice(0, limit);
|
|
535
579
|
|
|
536
580
|
// Normalize scores to 0-1 range for better interpretability
|
|
581
|
+
let normalizedResults: SearchResult[];
|
|
582
|
+
|
|
537
583
|
if (sorted.length > 0) {
|
|
538
584
|
const first = sorted[0];
|
|
539
585
|
const last = sorted[sorted.length - 1];
|
|
540
586
|
if (first === undefined || last === undefined) {
|
|
541
|
-
|
|
587
|
+
normalizedResults = sorted.map((r) => ({
|
|
542
588
|
...r.result,
|
|
543
589
|
score: r.score,
|
|
544
590
|
rankingMetadata: r.metadata,
|
|
545
591
|
}));
|
|
592
|
+
} else {
|
|
593
|
+
const maxScore = first.score;
|
|
594
|
+
const minScore = last.score;
|
|
595
|
+
const range = maxScore - minScore;
|
|
596
|
+
|
|
597
|
+
if (range > 0) {
|
|
598
|
+
// Round to avoid floating point precision issues in threshold comparisons
|
|
599
|
+
normalizedResults = sorted.map((r) => ({
|
|
600
|
+
...r.result,
|
|
601
|
+
score: Math.round(((r.score - minScore) / range) * 1000000) / 1000000,
|
|
602
|
+
rankingMetadata: r.metadata,
|
|
603
|
+
}));
|
|
604
|
+
} else {
|
|
605
|
+
// All same score - keep raw scores (allows threshold to filter by quality)
|
|
606
|
+
normalizedResults = sorted.map((r) => ({
|
|
607
|
+
...r.result,
|
|
608
|
+
score: r.score,
|
|
609
|
+
rankingMetadata: r.metadata,
|
|
610
|
+
}));
|
|
611
|
+
}
|
|
546
612
|
}
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
613
|
+
} else {
|
|
614
|
+
normalizedResults = [];
|
|
615
|
+
}
|
|
550
616
|
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
score: (r.score - minScore) / range,
|
|
555
|
-
rankingMetadata: r.metadata,
|
|
556
|
-
}));
|
|
557
|
-
}
|
|
617
|
+
// Apply threshold filter on normalized scores (UX consistency)
|
|
618
|
+
if (threshold !== undefined) {
|
|
619
|
+
return normalizedResults.filter((r) => r.score >= threshold);
|
|
558
620
|
}
|
|
559
621
|
|
|
560
|
-
return
|
|
561
|
-
...r.result,
|
|
562
|
-
score: r.score,
|
|
563
|
-
rankingMetadata: r.metadata,
|
|
564
|
-
}));
|
|
622
|
+
return normalizedResults;
|
|
565
623
|
}
|
|
566
624
|
|
|
567
625
|
async searchAllStores(query: SearchQuery, storeIds: StoreId[]): Promise<SearchResponse> {
|
|
@@ -27,6 +27,7 @@ describe('BackgroundWorker', () => {
|
|
|
27
27
|
lanceStore = {
|
|
28
28
|
initialize: vi.fn().mockResolvedValue(undefined),
|
|
29
29
|
addDocuments: vi.fn().mockResolvedValue(undefined),
|
|
30
|
+
createFtsIndex: vi.fn().mockResolvedValue(undefined),
|
|
30
31
|
} as unknown as LanceStore;
|
|
31
32
|
embeddingEngine = {
|
|
32
33
|
embed: vi.fn().mockResolvedValue(new Array(384).fill(0)),
|