bluera-knowledge 0.20.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/README.md +21 -0
- package/dist/{chunk-OZIVRLZE.js → chunk-BZQ7KWEE.js} +67 -5
- package/dist/chunk-BZQ7KWEE.js.map +1 -0
- package/dist/{chunk-HXBIIMYL.js → chunk-H25AEF47.js} +42 -1
- package/dist/chunk-H25AEF47.js.map +1 -0
- package/dist/{chunk-26MBEEKM.js → chunk-VNHZ534Q.js} +2 -2
- package/dist/{chunk-PZE2MO7H.js → chunk-ZR23KJPJ.js} +206 -38
- package/dist/chunk-ZR23KJPJ.js.map +1 -0
- package/dist/index.js +13 -6
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.d.ts +44 -3
- package/dist/mcp/server.js +3 -3
- package/dist/{watch.service-NXRWLJG6.js → watch.service-THP6X5ZZ.js} +2 -2
- package/dist/workers/background-worker-cli.js +3 -3
- package/package.json +2 -2
- package/dist/chunk-HXBIIMYL.js.map +0 -1
- package/dist/chunk-OZIVRLZE.js.map +0 -1
- package/dist/chunk-PZE2MO7H.js.map +0 -1
- /package/dist/{chunk-26MBEEKM.js.map → chunk-VNHZ534Q.js.map} +0 -0
- /package/dist/{watch.service-NXRWLJG6.js.map → watch.service-THP6X5ZZ.js.map} +0 -0

@@ -0,0 +1 @@
+
{"version":3,"sources":["../src/services/watch.service.ts","../src/utils/ignore-patterns.ts","../src/utils/model-validation.ts"],"sourcesContent":["import { watch, type FSWatcher } from 'chokidar';\nimport { normalizeGlobPatterns } from '../utils/ignore-patterns.js';\nimport { validateStoreModelCompatibility } from '../utils/model-validation.js';\nimport type { IndexService } from './index.service.js';\nimport type { EmbeddingEngine } from '../db/embeddings.js';\nimport type { LanceStore } from '../db/lance.js';\nimport type { FileStore, RepoStore } from '../types/store.js';\n\nexport interface WatchServiceOptions {\n ignorePatterns?: readonly string[];\n /** Current embedding model ID for compatibility validation */\n currentModelId?: string;\n}\n\nexport class WatchService {\n private readonly watchers: Map<string, FSWatcher> = new Map();\n private readonly pendingTimeouts: Map<string, NodeJS.Timeout> = new Map();\n private readonly indexService: IndexService;\n private readonly lanceStore: LanceStore;\n private readonly embeddings: EmbeddingEngine;\n private readonly ignorePatterns: readonly string[];\n private readonly currentModelId: string | undefined;\n\n constructor(\n indexService: IndexService,\n lanceStore: LanceStore,\n embeddings: EmbeddingEngine,\n options: WatchServiceOptions = {}\n ) {\n this.indexService = indexService;\n this.lanceStore = lanceStore;\n this.embeddings = embeddings;\n // Use shared utility to normalize patterns to glob format with defaults\n this.ignorePatterns = normalizeGlobPatterns(options.ignorePatterns ?? []);\n this.currentModelId = options.currentModelId;\n }\n\n async watch(\n store: FileStore | RepoStore,\n debounceMs: number,\n onReindex: (() => void) | undefined,\n onError: (error: Error) => void\n ): Promise<void> {\n if (this.watchers.has(store.id)) {\n return Promise.resolve(); // Already watching\n }\n\n let timeout: NodeJS.Timeout | null = null;\n\n const watcher = watch(store.path, {\n ignored: [...this.ignorePatterns],\n persistent: true,\n ignoreInitial: true,\n });\n\n const reindexHandler = (): void => {\n if (timeout) clearTimeout(timeout);\n timeout = setTimeout(() => {\n this.pendingTimeouts.delete(store.id);\n void (async (): Promise<void> => {\n try {\n // Validate model compatibility before incremental indexing\n // If currentModelId is set, check that store was indexed with same model\n if (this.currentModelId !== undefined) {\n validateStoreModelCompatibility(store, { currentModelId: this.currentModelId });\n }\n\n this.lanceStore.setDimensions(await this.embeddings.ensureDimensions());\n await this.lanceStore.initialize(store.id);\n\n // Try incremental indexing first if available, fall back to full indexing\n let useFullReindex = true;\n if (typeof this.indexService.indexStoreIncremental === 'function') {\n const incrementalResult = await this.indexService.indexStoreIncremental(store);\n if (incrementalResult.success) {\n useFullReindex = false;\n }\n }\n\n if (useFullReindex) {\n const fullResult = await this.indexService.indexStore(store);\n if (!fullResult.success) {\n onError(fullResult.error);\n return;\n }\n }\n\n onReindex?.();\n } catch (e) {\n const error = e instanceof Error ? e : new Error(String(e));\n onError(error);\n }\n })();\n }, debounceMs);\n this.pendingTimeouts.set(store.id, timeout);\n };\n\n watcher.on('all', reindexHandler);\n\n watcher.on('error', (e) => {\n const error = e instanceof Error ? 
e : new Error(String(e));\n onError(error);\n });\n\n this.watchers.set(store.id, watcher);\n return Promise.resolve();\n }\n\n async unwatch(storeId: string): Promise<void> {\n // Clear any pending timeout to prevent timer leak\n const pendingTimeout = this.pendingTimeouts.get(storeId);\n if (pendingTimeout) {\n clearTimeout(pendingTimeout);\n this.pendingTimeouts.delete(storeId);\n }\n\n const watcher = this.watchers.get(storeId);\n if (watcher) {\n await watcher.close();\n this.watchers.delete(storeId);\n }\n }\n\n async unwatchAll(): Promise<void> {\n for (const [id] of this.watchers) {\n await this.unwatch(id);\n }\n }\n}\n","/**\n * Unified ignore pattern handling for consistent behavior across IndexService and WatchService.\n *\n * Pattern normalization ensures the same config patterns work identically whether used\n * for fs.readdir scanning (IndexService) or chokidar watching (WatchService).\n */\n\n/** Default directories to always ignore */\nexport const DEFAULT_IGNORE_DIRS = ['node_modules', '.git', '.bluera', 'dist', 'build'] as const;\n\n/**\n * Normalize patterns to standard glob format for chokidar and micromatch.\n *\n * Transformations:\n * - 'node_modules' → '** /node_modules/**' (directory anywhere in tree)\n * - 'node_modules/**' → '** /node_modules/**' (explicit directory pattern)\n * - '*.min.js' → '**\\/*.min.js' (extension pattern anywhere)\n * - '** /foo/**' → unchanged (already in glob format)\n *\n * @param patterns - User-provided patterns from config\n * @param includeDefaults - Whether to include DEFAULT_IGNORE_DIRS (default: true)\n */\nexport function normalizeGlobPatterns(\n patterns: readonly string[],\n includeDefaults = true\n): string[] {\n const result: string[] = [];\n\n // Add defaults first\n if (includeDefaults) {\n for (const dir of DEFAULT_IGNORE_DIRS) {\n result.push(`**/${dir}/**`);\n }\n }\n\n // Process user patterns\n for (const pattern of patterns) {\n if (pattern.startsWith('**/') && pattern.endsWith('/**')) {\n // Already in glob format\n result.push(pattern);\n } else if (pattern.endsWith('/**')) {\n // Directory pattern: 'foo/**' → '**/foo/**'\n result.push(`**/${pattern}`);\n } else if (pattern.startsWith('*.')) {\n // Extension pattern: '*.min.js' → '**/*.min.js'\n result.push(`**/${pattern}`);\n } else if (!pattern.includes('/') && !pattern.includes('*')) {\n // Simple directory name: 'node_modules' → '**/node_modules/**'\n result.push(`**/${pattern}/**`);\n } else {\n // Keep as-is (might be a specific path pattern)\n result.push(pattern);\n }\n }\n\n return result;\n}\n\n/**\n * Parsed patterns optimized for fs.readdir scanning.\n */\nexport interface ScanningPatterns {\n /** Directory names to skip during traversal (e.g., 'node_modules', '.git') */\n dirs: Set<string>;\n /** Predicate functions to test if a filename should be ignored (e.g., for '*.min.js') */\n fileMatchers: Array<(filename: string) => boolean>;\n}\n\n/**\n * Parse patterns into structures optimized for fs.readdir filtering.\n *\n * This is more efficient than glob matching for directory traversal since\n * it allows early termination when encountering ignored directories.\n *\n * @param patterns - User-provided patterns from config\n * @param includeDefaults - Whether to include DEFAULT_IGNORE_DIRS (default: true)\n */\nexport function parseIgnorePatternsForScanning(\n patterns: readonly string[],\n includeDefaults = true\n): ScanningPatterns {\n const dirs = new Set<string>();\n const fileMatchers: Array<(filename: string) => boolean> = [];\n\n // Add defaults 
first\n if (includeDefaults) {\n for (const dir of DEFAULT_IGNORE_DIRS) {\n dirs.add(dir);\n }\n }\n\n // Process user patterns\n for (const pattern of patterns) {\n if (pattern.startsWith('**/') && pattern.endsWith('/**')) {\n // Glob format: '**/node_modules/**' → extract 'node_modules'\n const inner = pattern.slice(3, -3);\n if (!inner.includes('/') && !inner.includes('*')) {\n dirs.add(inner);\n }\n } else if (pattern.endsWith('/**')) {\n // Directory pattern: 'node_modules/**' → 'node_modules'\n dirs.add(pattern.slice(0, -3));\n } else if (pattern.startsWith('*.')) {\n // Extension pattern: '*.min.js' → matches files ending with '.min.js'\n const ext = pattern.slice(1); // Remove leading '*'\n fileMatchers.push((filename) => filename.endsWith(ext));\n } else if (!pattern.includes('/') && !pattern.includes('*')) {\n // Simple directory name: 'node_modules' → treat as directory\n dirs.add(pattern);\n }\n // Note: Complex patterns like 'src/**/*.test.ts' are not supported for scanning\n // They would require full glob matching which defeats the purpose of fast scanning\n }\n\n return { dirs, fileMatchers };\n}\n","/**\n * Model Compatibility Validation\n *\n * Guards against searching or incrementally indexing stores with mismatched embedding models.\n * Ensures stores are only queried with the same model they were indexed with.\n */\n\nimport type { Store } from '../types/store.js';\n\nexport interface ModelValidationContext {\n currentModelId: string;\n}\n\n/**\n * Validates that a store's embedding model matches the current configuration.\n *\n * @throws Error if store was indexed with a different model or has no model tracking\n */\nexport function validateStoreModelCompatibility(store: Store, ctx: ModelValidationContext): void {\n // Stores without modelId (schemaVersion < 2) must be reindexed\n if (!store.modelId) {\n throw new Error(\n `Store \"${store.name}\" has no model tracking (schema v1). ` +\n `Reindex required: /bluera-knowledge:index ${store.name}`\n );\n }\n\n if (store.modelId !== ctx.currentModelId) {\n throw new Error(\n `Model mismatch: Store \"${store.name}\" was indexed with \"${store.modelId}\", ` +\n `but current config uses \"${ctx.currentModelId}\". 
` +\n `Reindex required: /bluera-knowledge:index ${store.name}`\n );\n }\n}\n\n/**\n * Check if a store's model matches the current configuration without throwing.\n *\n * @returns Object with compatibility status and details\n */\nexport function checkStoreModelCompatibility(\n store: Store,\n ctx: ModelValidationContext\n): {\n compatible: boolean;\n modelId: string | undefined;\n reason?: string;\n} {\n if (!store.modelId) {\n return {\n compatible: false,\n modelId: undefined,\n reason: 'Store has no model tracking (schema v1)',\n };\n }\n\n if (store.modelId !== ctx.currentModelId) {\n return {\n compatible: false,\n modelId: store.modelId,\n reason: `Indexed with different model: ${store.modelId}`,\n };\n }\n\n return {\n compatible: true,\n modelId: store.modelId,\n };\n}\n"],"mappings":";AAAA,SAAS,aAA6B;;;ACQ/B,IAAM,sBAAsB,CAAC,gBAAgB,QAAQ,WAAW,QAAQ,OAAO;AAc/E,SAAS,sBACd,UACA,kBAAkB,MACR;AACV,QAAM,SAAmB,CAAC;AAG1B,MAAI,iBAAiB;AACnB,eAAW,OAAO,qBAAqB;AACrC,aAAO,KAAK,MAAM,GAAG,KAAK;AAAA,IAC5B;AAAA,EACF;AAGA,aAAW,WAAW,UAAU;AAC9B,QAAI,QAAQ,WAAW,KAAK,KAAK,QAAQ,SAAS,KAAK,GAAG;AAExD,aAAO,KAAK,OAAO;AAAA,IACrB,WAAW,QAAQ,SAAS,KAAK,GAAG;AAElC,aAAO,KAAK,MAAM,OAAO,EAAE;AAAA,IAC7B,WAAW,QAAQ,WAAW,IAAI,GAAG;AAEnC,aAAO,KAAK,MAAM,OAAO,EAAE;AAAA,IAC7B,WAAW,CAAC,QAAQ,SAAS,GAAG,KAAK,CAAC,QAAQ,SAAS,GAAG,GAAG;AAE3D,aAAO,KAAK,MAAM,OAAO,KAAK;AAAA,IAChC,OAAO;AAEL,aAAO,KAAK,OAAO;AAAA,IACrB;AAAA,EACF;AAEA,SAAO;AACT;AAqBO,SAAS,+BACd,UACA,kBAAkB,MACA;AAClB,QAAM,OAAO,oBAAI,IAAY;AAC7B,QAAM,eAAqD,CAAC;AAG5D,MAAI,iBAAiB;AACnB,eAAW,OAAO,qBAAqB;AACrC,WAAK,IAAI,GAAG;AAAA,IACd;AAAA,EACF;AAGA,aAAW,WAAW,UAAU;AAC9B,QAAI,QAAQ,WAAW,KAAK,KAAK,QAAQ,SAAS,KAAK,GAAG;AAExD,YAAM,QAAQ,QAAQ,MAAM,GAAG,EAAE;AACjC,UAAI,CAAC,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,SAAS,GAAG,GAAG;AAChD,aAAK,IAAI,KAAK;AAAA,MAChB;AAAA,IACF,WAAW,QAAQ,SAAS,KAAK,GAAG;AAElC,WAAK,IAAI,QAAQ,MAAM,GAAG,EAAE,CAAC;AAAA,IAC/B,WAAW,QAAQ,WAAW,IAAI,GAAG;AAEnC,YAAM,MAAM,QAAQ,MAAM,CAAC;AAC3B,mBAAa,KAAK,CAAC,aAAa,SAAS,SAAS,GAAG,CAAC;AAAA,IACxD,WAAW,CAAC,QAAQ,SAAS,GAAG,KAAK,CAAC,QAAQ,SAAS,GAAG,GAAG;AAE3D,WAAK,IAAI,OAAO;AAAA,IAClB;AAAA,EAGF;AAEA,SAAO,EAAE,MAAM,aAAa;AAC9B;;;ACjGO,SAAS,gCAAgC,OAAc,KAAmC;AAE/F,MAAI,CAAC,MAAM,SAAS;AAClB,UAAM,IAAI;AAAA,MACR,UAAU,MAAM,IAAI,kFAC2B,MAAM,IAAI;AAAA,IAC3D;AAAA,EACF;AAEA,MAAI,MAAM,YAAY,IAAI,gBAAgB;AACxC,UAAM,IAAI;AAAA,MACR,0BAA0B,MAAM,IAAI,uBAAuB,MAAM,OAAO,+BAC1C,IAAI,cAAc,gDACD,MAAM,IAAI;AAAA,IAC3D;AAAA,EACF;AACF;AAOO,SAAS,6BACd,OACA,KAKA;AACA,MAAI,CAAC,MAAM,SAAS;AAClB,WAAO;AAAA,MACL,YAAY;AAAA,MACZ,SAAS;AAAA,MACT,QAAQ;AAAA,IACV;AAAA,EACF;AAEA,MAAI,MAAM,YAAY,IAAI,gBAAgB;AACxC,WAAO;AAAA,MACL,YAAY;AAAA,MACZ,SAAS,MAAM;AAAA,MACf,QAAQ,iCAAiC,MAAM,OAAO;AAAA,IACxD;AAAA,EACF;AAEA,SAAO;AAAA,IACL,YAAY;AAAA,IACZ,SAAS,MAAM;AAAA,EACjB;AACF;;;AFvDO,IAAM,eAAN,MAAmB;AAAA,EACP,WAAmC,oBAAI,IAAI;AAAA,EAC3C,kBAA+C,oBAAI,IAAI;AAAA,EACvD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EAEjB,YACE,cACA,YACA,YACA,UAA+B,CAAC,GAChC;AACA,SAAK,eAAe;AACpB,SAAK,aAAa;AAClB,SAAK,aAAa;AAElB,SAAK,iBAAiB,sBAAsB,QAAQ,kBAAkB,CAAC,CAAC;AACxE,SAAK,iBAAiB,QAAQ;AAAA,EAChC;AAAA,EAEA,MAAM,MACJ,OACA,YACA,WACA,SACe;AACf,QAAI,KAAK,SAAS,IAAI,MAAM,EAAE,GAAG;AAC/B,aAAO,QAAQ,QAAQ;AAAA,IACzB;AAEA,QAAI,UAAiC;AAErC,UAAM,UAAU,MAAM,MAAM,MAAM;AAAA,MAChC,SAAS,CAAC,GAAG,KAAK,cAAc;AAAA,MAChC,YAAY;AAAA,MACZ,eAAe;AAAA,IACjB,CAAC;AAED,UAAM,iBAAiB,MAAY;AACjC,UAAI,QAAS,cAAa,OAAO;AACjC,gBAAU,WAAW,MAAM;AACzB,aAAK,gBAAgB,OAAO,MAAM,EAAE;AACpC,cAAM,YAA2B;AAC/B,cAAI;AAGF,gBAAI,KAAK,mBAAmB,QAAW;AACrC,8CAAgC,OAAO,EAAE,gBAAgB,KAAK,eAAe,CAAC;AAAA,YAChF;AAEA,iBAAK,WAAW,cAAc,MAAM,KAAK,WAAW,iBAAiB,CAAC;AACtE,kBAAM,KAAK,WAAW,WAA
W,MAAM,EAAE;AAGzC,gBAAI,iBAAiB;AACrB,gBAAI,OAAO,KAAK,aAAa,0BAA0B,YAAY;AACjE,oBAAM,oBAAoB,MAAM,KAAK,aAAa,sBAAsB,KAAK;AAC7E,kBAAI,kBAAkB,SAAS;AAC7B,iCAAiB;AAAA,cACnB;AAAA,YACF;AAEA,gBAAI,gBAAgB;AAClB,oBAAM,aAAa,MAAM,KAAK,aAAa,WAAW,KAAK;AAC3D,kBAAI,CAAC,WAAW,SAAS;AACvB,wBAAQ,WAAW,KAAK;AACxB;AAAA,cACF;AAAA,YACF;AAEA,wBAAY;AAAA,UACd,SAAS,GAAG;AACV,kBAAM,QAAQ,aAAa,QAAQ,IAAI,IAAI,MAAM,OAAO,CAAC,CAAC;AAC1D,oBAAQ,KAAK;AAAA,UACf;AAAA,QACF,GAAG;AAAA,MACL,GAAG,UAAU;AACb,WAAK,gBAAgB,IAAI,MAAM,IAAI,OAAO;AAAA,IAC5C;AAEA,YAAQ,GAAG,OAAO,cAAc;AAEhC,YAAQ,GAAG,SAAS,CAAC,MAAM;AACzB,YAAM,QAAQ,aAAa,QAAQ,IAAI,IAAI,MAAM,OAAO,CAAC,CAAC;AAC1D,cAAQ,KAAK;AAAA,IACf,CAAC;AAED,SAAK,SAAS,IAAI,MAAM,IAAI,OAAO;AACnC,WAAO,QAAQ,QAAQ;AAAA,EACzB;AAAA,EAEA,MAAM,QAAQ,SAAgC;AAE5C,UAAM,iBAAiB,KAAK,gBAAgB,IAAI,OAAO;AACvD,QAAI,gBAAgB;AAClB,mBAAa,cAAc;AAC3B,WAAK,gBAAgB,OAAO,OAAO;AAAA,IACrC;AAEA,UAAM,UAAU,KAAK,SAAS,IAAI,OAAO;AACzC,QAAI,SAAS;AACX,YAAM,QAAQ,MAAM;AACpB,WAAK,SAAS,OAAO,OAAO;AAAA,IAC9B;AAAA,EACF;AAAA,EAEA,MAAM,aAA4B;AAChC,eAAW,CAAC,EAAE,KAAK,KAAK,UAAU;AAChC,YAAM,KAAK,QAAQ,EAAE;AAAA,IACvB;AAAA,EACF;AACF;","names":[]}
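
The source map added above embeds the new shared utilities (src/utils/ignore-patterns.ts and src/utils/model-validation.ts) that, per their own doc comments, unify ignore-pattern handling across IndexService and WatchService. A minimal sketch of how `normalizeGlobPatterns` is expected to behave, based on the embedded source; the import path is hypothetical since the utility is internal to the package:

```ts
// Illustrative call only; the import path is hypothetical (the utility is internal).
import { normalizeGlobPatterns } from 'bluera-knowledge/dist/utils/ignore-patterns';

// Defaults (node_modules, .git, .bluera, dist, build) are added first, then user
// patterns are normalized to glob form:
const patterns = normalizeGlobPatterns(['coverage', '*.min.js', 'docs/**']);
// Expected to yield, in order:
//   '**/node_modules/**', '**/.git/**', '**/.bluera/**', '**/dist/**', '**/build/**',
//   '**/coverage/**',  // bare directory name -> directory anywhere in the tree
//   '**/*.min.js',     // extension pattern -> anywhere in the tree
//   '**/docs/**'       // 'docs/**' -> prefixed with '**/'
```
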
@@ -2,7 +2,7 @@ import {
   createLogger,
   summarizePayload,
   truncateForLog
-} from "./chunk-
+} from "./chunk-ZR23KJPJ.js";
 
 // src/crawl/intelligent-crawler.ts
 import { EventEmitter } from "events";

@@ -916,4 +916,4 @@ export {
   getCrawlStrategy,
   IntelligentCrawler
 };
-//# sourceMappingURL=chunk-
+//# sourceMappingURL=chunk-VNHZ534Q.js.map

@@ -4,7 +4,7 @@ import {
 } from "./chunk-CLIMKLTW.js";
 import {
   parseIgnorePatternsForScanning
-} from "./chunk-
+} from "./chunk-H25AEF47.js";
 import {
   __require
 } from "./chunk-DGUM43GV.js";

@@ -2063,12 +2063,12 @@ var DEFAULT_CONFIG = {
   version: 1,
   dataDir: ".bluera/bluera-knowledge/data",
   embedding: {
-    model: "Xenova/
+    model: "Xenova/bge-small-en-v1.5",
     batchSize: 32,
     dtype: "fp32",
     pooling: "mean",
     normalize: true,
-    queryPrefix: "",
+    queryPrefix: "Represent this sentence for searching relevant passages: ",
     docPrefix: "",
     maxInFlightBatches: 1
   },

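The default model switches to `Xenova/bge-small-en-v1.5` (the previous value is truncated in this view) and `queryPrefix` picks up the BGE retrieval instruction, which BGE-style models apply to queries only, not to indexed passages. A sketch of how such a prefix pair is conventionally applied before embedding; this is not the package's EmbeddingEngine, whose internals are outside this diff:

```ts
// Sketch only: conventional use of queryPrefix/docPrefix with a BGE-style model.
// embed() stands in for the actual transformers.js feature-extraction call.
interface PrefixConfig {
  queryPrefix: string;
  docPrefix: string;
}

const prefixes: PrefixConfig = {
  queryPrefix: 'Represent this sentence for searching relevant passages: ',
  docPrefix: '',
};

async function embed(text: string): Promise<number[]> {
  // placeholder for pipeline('feature-extraction', 'Xenova/bge-small-en-v1.5')(text, ...)
  return [];
}

// Queries carry the retrieval instruction; documents are embedded verbatim.
const embedQuery = (query: string) => embed(prefixes.queryPrefix + query);
const embedDocument = (doc: string) => embed(prefixes.docPrefix + doc);
```
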
@@ -3755,15 +3755,13 @@ function detectContentType(results) {
 }
 var SearchService = class {
   lanceStore;
-  embeddingEngine;
   codeUnitService;
   codeGraphService;
   graphCache;
   searchConfig;
   unsubscribeCacheInvalidation;
-  constructor(lanceStore,
+  constructor(lanceStore, codeGraphService, searchConfig) {
     this.lanceStore = lanceStore;
-    this.embeddingEngine = embeddingEngine;
     this.codeUnitService = new CodeUnitService();
     this.codeGraphService = codeGraphService;
     this.graphCache = /* @__PURE__ */ new Map();

@@ -3978,17 +3976,17 @@ var SearchService = class {
   /**
    * Fetch raw vector search results without normalization.
    * Returns results with raw cosine similarity scores [0-1].
+   * Uses LanceDB's embedding function for query embedding,
+   * ensuring consistent query/document embedding through a single code path.
    */
   async vectorSearchRaw(query, stores, limit) {
-    const queryVector = await this.embeddingEngine.embedQuery(query);
     const results = [];
     for (const storeId of stores) {
-      const hits = await this.lanceStore.
+      const hits = await this.lanceStore.searchText(storeId, query, limit);
       results.push(
         ...hits.map((r) => ({
           id: r.id,
           score: r.score,
-          // Raw cosine similarity (1 - distance)
           content: r.content,
           metadata: r.metadata
         }))

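The two hunks above remove the EmbeddingEngine dependency from SearchService: query embedding now happens inside LanceStore (see the `searchText` addition further down in this chunk). A simplified TypeScript sketch of the reshaped call path; the types here are illustrative, not the package's declarations:

```ts
// Simplified sketch of the new search path; the real types live in the package.
interface SearchHit {
  id: string;
  content: string;
  score: number; // 1 - cosine distance, i.e. raw cosine similarity
  metadata: unknown;
}

interface LanceStoreLike {
  // Embeds the query internally via the registered embedding function.
  searchText(storeId: string, query: string, limit: number): Promise<SearchHit[]>;
}

// SearchService.vectorSearchRaw no longer calls embeddingEngine.embedQuery();
// it delegates per-store search (and query embedding) to the store layer.
async function vectorSearchRaw(
  lanceStore: LanceStoreLike,
  query: string,
  stores: string[],
  limit: number
): Promise<SearchHit[]> {
  const results: SearchHit[] = [];
  for (const storeId of stores) {
    const hits = await lanceStore.searchText(storeId, query, limit);
    results.push(...hits);
  }
  return results;
}
```
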
@@ -4822,6 +4820,9 @@ function extractRepoName(url) {
   return name;
 }
 
+// src/types/store.ts
+var CURRENT_SCHEMA_VERSION = 2;
+
 // src/services/store.service.ts
 async function fileExists4(path4) {
   try {

@@ -4836,12 +4837,21 @@ var StoreService = class {
   definitionService;
   gitignoreService;
   projectRoot;
+  embeddingModelId;
   registry = { stores: [] };
   constructor(dataDir, options) {
     this.dataDir = dataDir;
-    this.definitionService = options
-    this.gitignoreService = options
-    this.projectRoot = options
+    this.definitionService = options.definitionService ?? void 0;
+    this.gitignoreService = options.gitignoreService ?? void 0;
+    this.projectRoot = options.projectRoot ?? void 0;
+    this.embeddingModelId = options.embeddingModelId;
+  }
+  /**
+   * Get the current embedding model ID used for new stores.
+   * Used by model compatibility validation.
+   */
+  getCurrentModelId() {
+    return this.embeddingModelId;
   }
   async initialize() {
     await mkdir5(this.dataDir, { recursive: true });

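Stores created by 0.21.0 now carry a schema version and the embedding model they were indexed with, and the watch/search paths validate that provenance before reusing an index. The validator below is reproduced from the src/utils/model-validation.ts source embedded in the source map earlier; the Store shape and the example values are simplified and illustrative:

```ts
// Store shape simplified for illustration; schemaVersion/modelId are the new fields.
interface Store {
  name: string;
  schemaVersion?: number; // CURRENT_SCHEMA_VERSION === 2 in 0.21.0
  modelId?: string;       // embedding model the store was indexed with
}

// Reproduced from src/utils/model-validation.ts (embedded in the source map above).
function validateStoreModelCompatibility(store: Store, ctx: { currentModelId: string }): void {
  if (!store.modelId) {
    throw new Error(
      `Store "${store.name}" has no model tracking (schema v1). ` +
        `Reindex required: /bluera-knowledge:index ${store.name}`
    );
  }
  if (store.modelId !== ctx.currentModelId) {
    throw new Error(
      `Model mismatch: Store "${store.name}" was indexed with "${store.modelId}", ` +
        `but current config uses "${ctx.currentModelId}". ` +
        `Reindex required: /bluera-knowledge:index ${store.name}`
    );
  }
}

// Example: a store stamped by the new StoreService passes; a pre-0.21.0 store throws.
const stamped: Store = { name: 'docs', schemaVersion: 2, modelId: 'Xenova/bge-small-en-v1.5' };
validateStoreModelCompatibility(stamped, { currentModelId: 'Xenova/bge-small-en-v1.5' }); // ok

const legacy: Store = { name: 'old-docs' };
// validateStoreModelCompatibility(legacy, { currentModelId: 'Xenova/bge-small-en-v1.5' });
// -> Error: Store "old-docs" has no model tracking (schema v1). Reindex required: ...
```
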
@@ -4980,7 +4990,9 @@ var StoreService = class {
           tags: input.tags,
           status: "ready",
           createdAt: now,
-          updatedAt: now
+          updatedAt: now,
+          schemaVersion: CURRENT_SCHEMA_VERSION,
+          modelId: this.embeddingModelId
         };
         break;
       }

@@ -5025,7 +5037,9 @@ var StoreService = class {
           tags: input.tags,
           status: "ready",
           createdAt: now,
-          updatedAt: now
+          updatedAt: now,
+          schemaVersion: CURRENT_SCHEMA_VERSION,
+          modelId: this.embeddingModelId
         };
         break;
       }

@@ -5046,7 +5060,9 @@ var StoreService = class {
           tags: input.tags,
           status: "ready",
           createdAt: now,
-          updatedAt: now
+          updatedAt: now,
+          schemaVersion: CURRENT_SCHEMA_VERSION,
+          modelId: this.embeddingModelId
         };
         break;
       default: {

@@ -5425,12 +5441,12 @@ import { join as join11 } from "path";
 import { pipeline, env } from "@huggingface/transformers";
 env.cacheDir = join11(homedir2(), ".cache", "huggingface-transformers");
 var DEFAULT_EMBEDDING_CONFIG = {
-  model: "Xenova/
+  model: "Xenova/bge-small-en-v1.5",
   batchSize: 32,
   dtype: "fp32",
   pooling: "mean",
   normalize: true,
-  queryPrefix: "",
+  queryPrefix: "Represent this sentence for searching relevant passages: ",
   docPrefix: "",
   maxInFlightBatches: 1
 };

@@ -5654,6 +5670,88 @@ var EmbeddingEngine = class {
 
 // src/db/lance.ts
 import * as lancedb from "@lancedb/lancedb";
+import { LanceSchema } from "@lancedb/lancedb/embedding";
+import { Utf8 } from "apache-arrow";
+
+// src/db/lance-embedding-function.ts
+import { TextEmbeddingFunction, getRegistry } from "@lancedb/lancedb/embedding";
+import { Float32 } from "apache-arrow";
+var HuggingFaceEmbeddingFunction = class extends TextEmbeddingFunction {
+  engine;
+  embeddingConfig;
+  _ndims = null;
+  constructor(optionsRaw) {
+    super();
+    const options = this.resolveVariables(optionsRaw ?? {});
+    this.embeddingConfig = {
+      model: options.model ?? "Xenova/bge-small-en-v1.5",
+      batchSize: options.batchSize ?? 32,
+      dtype: options.dtype ?? "fp32",
+      pooling: options.pooling ?? "mean",
+      normalize: options.normalize ?? true,
+      queryPrefix: options.queryPrefix ?? "",
+      docPrefix: options.docPrefix ?? "",
+      maxInFlightBatches: 1
+      // Single-threaded for LanceDB integration
+    };
+    this.engine = new EmbeddingEngine(this.embeddingConfig);
+  }
+  /**
+   * Initialize the embedding model. Called by LanceDB before embeddings are computed.
+   */
+  async init() {
+    this._ndims = await this.engine.ensureDimensions();
+  }
+  /**
+   * Return embedding dimensions. Must call init() first.
+   */
+  ndims() {
+    if (this._ndims === null) {
+      throw new Error("HuggingFaceEmbeddingFunction not initialized. Call init() first.");
+    }
+    return this._ndims;
+  }
+  /**
+   * Return embedding data type (always Float32 for our models).
+   */
+  embeddingDataType() {
+    return new Float32();
+  }
+  /**
+   * Generate embeddings for a batch of texts (documents).
+   * Called during table.add() operations.
+   */
+  async generateEmbeddings(texts) {
+    return this.engine.embedBatch(texts);
+  }
+  /**
+   * Compute embedding for a single query.
+   * Called during table.search(query) operations.
+   */
+  async computeQueryEmbeddings(data) {
+    const embedding = await this.engine.embedQuery(data);
+    return Array.from(embedding);
+  }
+  /**
+   * Get the model ID for provenance tracking.
+   */
+  getModelId() {
+    return this.embeddingConfig.model;
+  }
+  /**
+   * Get the full embedding config.
+   */
+  getConfig() {
+    return this.embeddingConfig;
+  }
+  /**
+   * Dispose the underlying engine to free resources.
+   */
+  async dispose() {
+    await this.engine.dispose();
+  }
+};
+getRegistry().register("HuggingFaceEmbeddingFunction")(HuggingFaceEmbeddingFunction);
 
 // src/types/document.ts
 import { z as z5 } from "zod";

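The added HuggingFaceEmbeddingFunction bridges the package's transformers.js EmbeddingEngine into LanceDB's embedding-function registry, so LanceDB can embed both inserted documents and incoming queries through one code path. A usage sketch, assuming the class is in scope (it is module-local in this bundle and not exported); the schema construction mirrors the LanceStore.initialize hunk further down:

```ts
import { LanceSchema } from '@lancedb/lancedb/embedding';
import { Utf8 } from 'apache-arrow';

// HuggingFaceEmbeddingFunction is module-local to this bundle; declared here only so
// the sketch stands alone.
declare const HuggingFaceEmbeddingFunction: new (opts: { model?: string }) => {
  init(): Promise<void>;
  ndims(): number;
  sourceField(): any;
  vectorField(): any;
};

const fn = new HuggingFaceEmbeddingFunction({ model: 'Xenova/bge-small-en-v1.5' });
await fn.init(); // loads the model and resolves dimensions (384 for bge-small-en-v1.5)

// One function drives both sides of the table schema, so rows added to `content`
// and incoming text queries are embedded through the same engine.
const schema = LanceSchema({
  id: new Utf8(),
  content: fn.sourceField(), // source text column, embedded on insert
  vector: fn.vectorField(),  // vector column populated from `content`
  metadata: new Utf8(),
});
```
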
@@ -5671,15 +5769,51 @@ var DocumentMetadataSchema = z5.object({
 }).loose();
 
 // src/db/lance.ts
+function isSearchHit(value) {
+  if (typeof value !== "object" || value === null) return false;
+  return "id" in value && "content" in value && "metadata" in value && "_distance" in value && typeof value.id === "string" && typeof value.content === "string" && typeof value.metadata === "string" && typeof value._distance === "number";
+}
+function parseDocumentMetadata(jsonStr) {
+  const parsed = DocumentMetadataSchema.parse(JSON.parse(jsonStr));
+  return {
+    ...parsed,
+    storeId: createStoreId(parsed.storeId)
+  };
+}
 var LanceStore = class {
   connection = null;
   tables = /* @__PURE__ */ new Map();
   dataDir;
   // eslint-disable-next-line @typescript-eslint/prefer-readonly -- set via setDimensions()
   _dimensions = null;
+  embeddingFunction = null;
   constructor(dataDir) {
     this.dataDir = dataDir;
   }
+  /**
+   * Set the embedding function for auto-embedding queries.
+   * Must be called before initialize() for new tables.
+   * The embedding function is initialized and its dimensions are used for schema creation.
+   */
+  async setEmbeddingFunction(config) {
+    this.embeddingFunction = new HuggingFaceEmbeddingFunction({
+      model: config.model,
+      batchSize: config.batchSize,
+      dtype: config.dtype,
+      pooling: config.pooling,
+      normalize: config.normalize,
+      queryPrefix: config.queryPrefix,
+      docPrefix: config.docPrefix
+    });
+    await this.embeddingFunction.init();
+    this._dimensions = this.embeddingFunction.ndims();
+  }
+  /**
+   * Check if embedding function is available for auto-embedding queries.
+   */
+  hasEmbeddingFunction() {
+    return this.embeddingFunction !== null;
+  }
   /**
    * Set the embedding dimensions. Must be called before initialize().
    * This allows dimensions to be derived from the embedding model at runtime.

@@ -5690,22 +5824,35 @@ var LanceStore = class {
   }
   async initialize(storeId) {
     if (this._dimensions === null) {
-      throw new Error(
+      throw new Error(
+        "Dimensions not set. Call setDimensions() or setEmbeddingFunction() before initialize()."
+      );
     }
     this.connection ??= await lancedb.connect(this.dataDir);
     const tableName = this.getTableName(storeId);
     const tableNames = await this.connection.tableNames();
     if (!tableNames.includes(tableName)) {
-
-      {
-        id:
-        content:
-        vector:
-        metadata:
-      }
-
-
-
+      if (this.embeddingFunction !== null) {
+        const schema = LanceSchema({
+          id: new Utf8(),
+          content: this.embeddingFunction.sourceField(),
+          vector: this.embeddingFunction.vectorField(),
+          metadata: new Utf8()
+        });
+        const table = await this.connection.createEmptyTable(tableName, schema);
+        this.tables.set(tableName, table);
+      } else {
+        const table = await this.connection.createTable(tableName, [
+          {
+            id: "__init__",
+            content: "",
+            vector: new Array(this._dimensions).fill(0),
+            metadata: "{}"
+          }
+        ]);
+        await table.delete('id = "__init__"');
+        this.tables.set(tableName, table);
+      }
     } else {
       const table = await this.connection.openTable(tableName);
       this.tables.set(tableName, table);

@@ -5749,6 +5896,29 @@ var LanceStore = class {
       };
     });
   }
+  /**
+   * Search using a text query with automatic embedding.
+   * Requires setEmbeddingFunction() to have been called.
+   * Uses the embedding function to compute query embeddings consistently with document embeddings.
+   */
+  async searchText(storeId, query, limit) {
+    if (this.embeddingFunction === null) {
+      throw new Error(
+        "Embedding function not set. Call setEmbeddingFunction() before searchText()."
+      );
+    }
+    const queryEmbedding = await this.embeddingFunction.computeQueryEmbeddings(query);
+    const table = await this.getTable(storeId);
+    const searchQuery = table.vectorSearch(queryEmbedding).limit(limit).distanceType("cosine");
+    const rawResults = await searchQuery.toArray();
+    const results = rawResults.filter(isSearchHit);
+    return results.map((r) => ({
+      id: createDocumentId(r.id),
+      content: r.content,
+      score: 1 - r._distance,
+      metadata: parseDocumentMetadata(r.metadata)
+    }));
+  }
   async createFtsIndex(storeId) {
     const table = await this.getTable(storeId);
     await table.createIndex("content", {

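Taken together with setEmbeddingFunction and the schema changes above, the query side now goes through searchText. An end-to-end sketch, assuming a LanceStore instance; the data directory, store id, and query are illustrative, and types are simplified relative to the package's own declarations:

```ts
// Wiring sketch; the lance instance is declared here only so the sketch stands alone.
declare const lance: {
  setEmbeddingFunction(config: Record<string, unknown>): Promise<void>;
  initialize(storeId: string): Promise<void>;
  searchText(
    storeId: string,
    query: string,
    limit: number
  ): Promise<Array<{ id: string; content: string; score: number; metadata: unknown }>>;
};

await lance.setEmbeddingFunction({
  model: 'Xenova/bge-small-en-v1.5',
  batchSize: 32,
  dtype: 'fp32',
  pooling: 'mean',
  normalize: true,
  queryPrefix: 'Represent this sentence for searching relevant passages: ',
  docPrefix: '',
});                                  // initializes the model and records dimensions
await lance.initialize('my-store');  // creates the table via LanceSchema when missing

// The store embeds the query itself; score is returned as 1 - cosine distance.
const hits = await lance.searchText('my-store', 'how is incremental indexing triggered?', 10);
console.log(hits.map((h) => ({ id: h.id, score: h.score })));
```
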
@@ -5865,12 +6035,7 @@ var LazyServiceContainer = class {
   get search() {
     if (this._search === null) {
       logger4.debug("Lazy-initializing SearchService");
-      this._search = new SearchService(
-        this.lance,
-        this.embeddings,
-        this.codeGraph,
-        this.appConfig.search
-      );
+      this._search = new SearchService(this.lance, this.codeGraph, this.appConfig.search);
     }
     return this._search;
   }

@@ -5929,7 +6094,8 @@ async function createLazyServices(configPath, dataDir, projectRoot) {
   const storeOptions = {
     definitionService,
     gitignoreService,
-    projectRoot: resolvedProjectRoot
+    projectRoot: resolvedProjectRoot,
+    embeddingModelId: appConfig.embedding.model
   };
   const store = new StoreService(resolvedDataDir, storeOptions);
   await store.initialize();

@@ -5950,19 +6116,21 @@ async function createServices(configPath, dataDir, projectRoot) {
   const lance = new LanceStore(resolvedDataDir);
   const embeddings = new EmbeddingEngine(appConfig.embedding);
   await embeddings.initialize();
+  await lance.setEmbeddingFunction(appConfig.embedding);
   const resolvedProjectRoot = config.resolveProjectRoot();
   const definitionService = new StoreDefinitionService(resolvedProjectRoot);
   const gitignoreService = new GitignoreService(resolvedProjectRoot);
   const storeOptions = {
     definitionService,
     gitignoreService,
-    projectRoot: resolvedProjectRoot
+    projectRoot: resolvedProjectRoot,
+    embeddingModelId: appConfig.embedding.model
   };
   const store = new StoreService(resolvedDataDir, storeOptions);
   await store.initialize();
   const codeGraph = new CodeGraphService(resolvedDataDir, pythonBridge);
   const manifest = new ManifestService(resolvedDataDir);
-  const search = new SearchService(lance,
+  const search = new SearchService(lance, codeGraph, appConfig.search);
   const index = new IndexService(lance, embeddings, {
     codeGraphService: codeGraph,
     manifestService: manifest,

@@ -6054,4 +6222,4 @@ export {
   createServices,
   destroyServices
 };
-//# sourceMappingURL=chunk-
+//# sourceMappingURL=chunk-ZR23KJPJ.js.map