bluera-knowledge 0.20.0 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
+ {"version":3,"sources":["../src/services/watch.service.ts","../src/utils/ignore-patterns.ts","../src/utils/model-validation.ts"],"sourcesContent":["import { watch, type FSWatcher } from 'chokidar';\nimport { normalizeGlobPatterns } from '../utils/ignore-patterns.js';\nimport { validateStoreModelCompatibility } from '../utils/model-validation.js';\nimport type { IndexService } from './index.service.js';\nimport type { EmbeddingEngine } from '../db/embeddings.js';\nimport type { LanceStore } from '../db/lance.js';\nimport type { FileStore, RepoStore } from '../types/store.js';\n\nexport interface WatchServiceOptions {\n ignorePatterns?: readonly string[];\n /** Current embedding model ID for compatibility validation */\n currentModelId?: string;\n}\n\nexport class WatchService {\n private readonly watchers: Map<string, FSWatcher> = new Map();\n private readonly pendingTimeouts: Map<string, NodeJS.Timeout> = new Map();\n private readonly indexService: IndexService;\n private readonly lanceStore: LanceStore;\n private readonly embeddings: EmbeddingEngine;\n private readonly ignorePatterns: readonly string[];\n private readonly currentModelId: string | undefined;\n\n constructor(\n indexService: IndexService,\n lanceStore: LanceStore,\n embeddings: EmbeddingEngine,\n options: WatchServiceOptions = {}\n ) {\n this.indexService = indexService;\n this.lanceStore = lanceStore;\n this.embeddings = embeddings;\n // Use shared utility to normalize patterns to glob format with defaults\n this.ignorePatterns = normalizeGlobPatterns(options.ignorePatterns ?? []);\n this.currentModelId = options.currentModelId;\n }\n\n async watch(\n store: FileStore | RepoStore,\n debounceMs: number,\n onReindex: (() => void) | undefined,\n onError: (error: Error) => void\n ): Promise<void> {\n if (this.watchers.has(store.id)) {\n return Promise.resolve(); // Already watching\n }\n\n let timeout: NodeJS.Timeout | null = null;\n\n const watcher = watch(store.path, {\n ignored: [...this.ignorePatterns],\n persistent: true,\n ignoreInitial: true,\n });\n\n const reindexHandler = (): void => {\n if (timeout) clearTimeout(timeout);\n timeout = setTimeout(() => {\n this.pendingTimeouts.delete(store.id);\n void (async (): Promise<void> => {\n try {\n // Validate model compatibility before incremental indexing\n // If currentModelId is set, check that store was indexed with same model\n if (this.currentModelId !== undefined) {\n validateStoreModelCompatibility(store, { currentModelId: this.currentModelId });\n }\n\n this.lanceStore.setDimensions(await this.embeddings.ensureDimensions());\n await this.lanceStore.initialize(store.id);\n\n // Try incremental indexing first if available, fall back to full indexing\n let useFullReindex = true;\n if (typeof this.indexService.indexStoreIncremental === 'function') {\n const incrementalResult = await this.indexService.indexStoreIncremental(store);\n if (incrementalResult.success) {\n useFullReindex = false;\n }\n }\n\n if (useFullReindex) {\n const fullResult = await this.indexService.indexStore(store);\n if (!fullResult.success) {\n onError(fullResult.error);\n return;\n }\n }\n\n onReindex?.();\n } catch (e) {\n const error = e instanceof Error ? e : new Error(String(e));\n onError(error);\n }\n })();\n }, debounceMs);\n this.pendingTimeouts.set(store.id, timeout);\n };\n\n watcher.on('all', reindexHandler);\n\n watcher.on('error', (e) => {\n const error = e instanceof Error ? 
e : new Error(String(e));\n onError(error);\n });\n\n this.watchers.set(store.id, watcher);\n return Promise.resolve();\n }\n\n async unwatch(storeId: string): Promise<void> {\n // Clear any pending timeout to prevent timer leak\n const pendingTimeout = this.pendingTimeouts.get(storeId);\n if (pendingTimeout) {\n clearTimeout(pendingTimeout);\n this.pendingTimeouts.delete(storeId);\n }\n\n const watcher = this.watchers.get(storeId);\n if (watcher) {\n await watcher.close();\n this.watchers.delete(storeId);\n }\n }\n\n async unwatchAll(): Promise<void> {\n for (const [id] of this.watchers) {\n await this.unwatch(id);\n }\n }\n}\n","/**\n * Unified ignore pattern handling for consistent behavior across IndexService and WatchService.\n *\n * Pattern normalization ensures the same config patterns work identically whether used\n * for fs.readdir scanning (IndexService) or chokidar watching (WatchService).\n */\n\n/** Default directories to always ignore */\nexport const DEFAULT_IGNORE_DIRS = ['node_modules', '.git', '.bluera', 'dist', 'build'] as const;\n\n/**\n * Normalize patterns to standard glob format for chokidar and micromatch.\n *\n * Transformations:\n * - 'node_modules' → '** /node_modules/**' (directory anywhere in tree)\n * - 'node_modules/**' → '** /node_modules/**' (explicit directory pattern)\n * - '*.min.js' → '**\\/*.min.js' (extension pattern anywhere)\n * - '** /foo/**' → unchanged (already in glob format)\n *\n * @param patterns - User-provided patterns from config\n * @param includeDefaults - Whether to include DEFAULT_IGNORE_DIRS (default: true)\n */\nexport function normalizeGlobPatterns(\n patterns: readonly string[],\n includeDefaults = true\n): string[] {\n const result: string[] = [];\n\n // Add defaults first\n if (includeDefaults) {\n for (const dir of DEFAULT_IGNORE_DIRS) {\n result.push(`**/${dir}/**`);\n }\n }\n\n // Process user patterns\n for (const pattern of patterns) {\n if (pattern.startsWith('**/') && pattern.endsWith('/**')) {\n // Already in glob format\n result.push(pattern);\n } else if (pattern.endsWith('/**')) {\n // Directory pattern: 'foo/**' → '**/foo/**'\n result.push(`**/${pattern}`);\n } else if (pattern.startsWith('*.')) {\n // Extension pattern: '*.min.js' → '**/*.min.js'\n result.push(`**/${pattern}`);\n } else if (!pattern.includes('/') && !pattern.includes('*')) {\n // Simple directory name: 'node_modules' → '**/node_modules/**'\n result.push(`**/${pattern}/**`);\n } else {\n // Keep as-is (might be a specific path pattern)\n result.push(pattern);\n }\n }\n\n return result;\n}\n\n/**\n * Parsed patterns optimized for fs.readdir scanning.\n */\nexport interface ScanningPatterns {\n /** Directory names to skip during traversal (e.g., 'node_modules', '.git') */\n dirs: Set<string>;\n /** Predicate functions to test if a filename should be ignored (e.g., for '*.min.js') */\n fileMatchers: Array<(filename: string) => boolean>;\n}\n\n/**\n * Parse patterns into structures optimized for fs.readdir filtering.\n *\n * This is more efficient than glob matching for directory traversal since\n * it allows early termination when encountering ignored directories.\n *\n * @param patterns - User-provided patterns from config\n * @param includeDefaults - Whether to include DEFAULT_IGNORE_DIRS (default: true)\n */\nexport function parseIgnorePatternsForScanning(\n patterns: readonly string[],\n includeDefaults = true\n): ScanningPatterns {\n const dirs = new Set<string>();\n const fileMatchers: Array<(filename: string) => boolean> = [];\n\n // Add defaults 
first\n if (includeDefaults) {\n for (const dir of DEFAULT_IGNORE_DIRS) {\n dirs.add(dir);\n }\n }\n\n // Process user patterns\n for (const pattern of patterns) {\n if (pattern.startsWith('**/') && pattern.endsWith('/**')) {\n // Glob format: '**/node_modules/**' → extract 'node_modules'\n const inner = pattern.slice(3, -3);\n if (!inner.includes('/') && !inner.includes('*')) {\n dirs.add(inner);\n }\n } else if (pattern.endsWith('/**')) {\n // Directory pattern: 'node_modules/**' → 'node_modules'\n dirs.add(pattern.slice(0, -3));\n } else if (pattern.startsWith('*.')) {\n // Extension pattern: '*.min.js' → matches files ending with '.min.js'\n const ext = pattern.slice(1); // Remove leading '*'\n fileMatchers.push((filename) => filename.endsWith(ext));\n } else if (!pattern.includes('/') && !pattern.includes('*')) {\n // Simple directory name: 'node_modules' → treat as directory\n dirs.add(pattern);\n }\n // Note: Complex patterns like 'src/**/*.test.ts' are not supported for scanning\n // They would require full glob matching which defeats the purpose of fast scanning\n }\n\n return { dirs, fileMatchers };\n}\n","/**\n * Model Compatibility Validation\n *\n * Guards against searching or incrementally indexing stores with mismatched embedding models.\n * Ensures stores are only queried with the same model they were indexed with.\n */\n\nimport type { Store } from '../types/store.js';\n\nexport interface ModelValidationContext {\n currentModelId: string;\n}\n\n/**\n * Validates that a store's embedding model matches the current configuration.\n *\n * @throws Error if store was indexed with a different model or has no model tracking\n */\nexport function validateStoreModelCompatibility(store: Store, ctx: ModelValidationContext): void {\n // Stores without modelId (schemaVersion < 2) must be reindexed\n if (!store.modelId) {\n throw new Error(\n `Store \"${store.name}\" has no model tracking (schema v1). ` +\n `Reindex required: /bluera-knowledge:index ${store.name}`\n );\n }\n\n if (store.modelId !== ctx.currentModelId) {\n throw new Error(\n `Model mismatch: Store \"${store.name}\" was indexed with \"${store.modelId}\", ` +\n `but current config uses \"${ctx.currentModelId}\". 
` +\n `Reindex required: /bluera-knowledge:index ${store.name}`\n );\n }\n}\n\n/**\n * Check if a store's model matches the current configuration without throwing.\n *\n * @returns Object with compatibility status and details\n */\nexport function checkStoreModelCompatibility(\n store: Store,\n ctx: ModelValidationContext\n): {\n compatible: boolean;\n modelId: string | undefined;\n reason?: string;\n} {\n if (!store.modelId) {\n return {\n compatible: false,\n modelId: undefined,\n reason: 'Store has no model tracking (schema v1)',\n };\n }\n\n if (store.modelId !== ctx.currentModelId) {\n return {\n compatible: false,\n modelId: store.modelId,\n reason: `Indexed with different model: ${store.modelId}`,\n };\n }\n\n return {\n compatible: true,\n modelId: store.modelId,\n };\n}\n"],"mappings":";AAAA,SAAS,aAA6B;;;ACQ/B,IAAM,sBAAsB,CAAC,gBAAgB,QAAQ,WAAW,QAAQ,OAAO;AAc/E,SAAS,sBACd,UACA,kBAAkB,MACR;AACV,QAAM,SAAmB,CAAC;AAG1B,MAAI,iBAAiB;AACnB,eAAW,OAAO,qBAAqB;AACrC,aAAO,KAAK,MAAM,GAAG,KAAK;AAAA,IAC5B;AAAA,EACF;AAGA,aAAW,WAAW,UAAU;AAC9B,QAAI,QAAQ,WAAW,KAAK,KAAK,QAAQ,SAAS,KAAK,GAAG;AAExD,aAAO,KAAK,OAAO;AAAA,IACrB,WAAW,QAAQ,SAAS,KAAK,GAAG;AAElC,aAAO,KAAK,MAAM,OAAO,EAAE;AAAA,IAC7B,WAAW,QAAQ,WAAW,IAAI,GAAG;AAEnC,aAAO,KAAK,MAAM,OAAO,EAAE;AAAA,IAC7B,WAAW,CAAC,QAAQ,SAAS,GAAG,KAAK,CAAC,QAAQ,SAAS,GAAG,GAAG;AAE3D,aAAO,KAAK,MAAM,OAAO,KAAK;AAAA,IAChC,OAAO;AAEL,aAAO,KAAK,OAAO;AAAA,IACrB;AAAA,EACF;AAEA,SAAO;AACT;AAqBO,SAAS,+BACd,UACA,kBAAkB,MACA;AAClB,QAAM,OAAO,oBAAI,IAAY;AAC7B,QAAM,eAAqD,CAAC;AAG5D,MAAI,iBAAiB;AACnB,eAAW,OAAO,qBAAqB;AACrC,WAAK,IAAI,GAAG;AAAA,IACd;AAAA,EACF;AAGA,aAAW,WAAW,UAAU;AAC9B,QAAI,QAAQ,WAAW,KAAK,KAAK,QAAQ,SAAS,KAAK,GAAG;AAExD,YAAM,QAAQ,QAAQ,MAAM,GAAG,EAAE;AACjC,UAAI,CAAC,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,SAAS,GAAG,GAAG;AAChD,aAAK,IAAI,KAAK;AAAA,MAChB;AAAA,IACF,WAAW,QAAQ,SAAS,KAAK,GAAG;AAElC,WAAK,IAAI,QAAQ,MAAM,GAAG,EAAE,CAAC;AAAA,IAC/B,WAAW,QAAQ,WAAW,IAAI,GAAG;AAEnC,YAAM,MAAM,QAAQ,MAAM,CAAC;AAC3B,mBAAa,KAAK,CAAC,aAAa,SAAS,SAAS,GAAG,CAAC;AAAA,IACxD,WAAW,CAAC,QAAQ,SAAS,GAAG,KAAK,CAAC,QAAQ,SAAS,GAAG,GAAG;AAE3D,WAAK,IAAI,OAAO;AAAA,IAClB;AAAA,EAGF;AAEA,SAAO,EAAE,MAAM,aAAa;AAC9B;;;ACjGO,SAAS,gCAAgC,OAAc,KAAmC;AAE/F,MAAI,CAAC,MAAM,SAAS;AAClB,UAAM,IAAI;AAAA,MACR,UAAU,MAAM,IAAI,kFAC2B,MAAM,IAAI;AAAA,IAC3D;AAAA,EACF;AAEA,MAAI,MAAM,YAAY,IAAI,gBAAgB;AACxC,UAAM,IAAI;AAAA,MACR,0BAA0B,MAAM,IAAI,uBAAuB,MAAM,OAAO,+BAC1C,IAAI,cAAc,gDACD,MAAM,IAAI;AAAA,IAC3D;AAAA,EACF;AACF;AAOO,SAAS,6BACd,OACA,KAKA;AACA,MAAI,CAAC,MAAM,SAAS;AAClB,WAAO;AAAA,MACL,YAAY;AAAA,MACZ,SAAS;AAAA,MACT,QAAQ;AAAA,IACV;AAAA,EACF;AAEA,MAAI,MAAM,YAAY,IAAI,gBAAgB;AACxC,WAAO;AAAA,MACL,YAAY;AAAA,MACZ,SAAS,MAAM;AAAA,MACf,QAAQ,iCAAiC,MAAM,OAAO;AAAA,IACxD;AAAA,EACF;AAEA,SAAO;AAAA,IACL,YAAY;AAAA,IACZ,SAAS,MAAM;AAAA,EACjB;AACF;;;AFvDO,IAAM,eAAN,MAAmB;AAAA,EACP,WAAmC,oBAAI,IAAI;AAAA,EAC3C,kBAA+C,oBAAI,IAAI;AAAA,EACvD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EAEjB,YACE,cACA,YACA,YACA,UAA+B,CAAC,GAChC;AACA,SAAK,eAAe;AACpB,SAAK,aAAa;AAClB,SAAK,aAAa;AAElB,SAAK,iBAAiB,sBAAsB,QAAQ,kBAAkB,CAAC,CAAC;AACxE,SAAK,iBAAiB,QAAQ;AAAA,EAChC;AAAA,EAEA,MAAM,MACJ,OACA,YACA,WACA,SACe;AACf,QAAI,KAAK,SAAS,IAAI,MAAM,EAAE,GAAG;AAC/B,aAAO,QAAQ,QAAQ;AAAA,IACzB;AAEA,QAAI,UAAiC;AAErC,UAAM,UAAU,MAAM,MAAM,MAAM;AAAA,MAChC,SAAS,CAAC,GAAG,KAAK,cAAc;AAAA,MAChC,YAAY;AAAA,MACZ,eAAe;AAAA,IACjB,CAAC;AAED,UAAM,iBAAiB,MAAY;AACjC,UAAI,QAAS,cAAa,OAAO;AACjC,gBAAU,WAAW,MAAM;AACzB,aAAK,gBAAgB,OAAO,MAAM,EAAE;AACpC,cAAM,YAA2B;AAC/B,cAAI;AAGF,gBAAI,KAAK,mBAAmB,QAAW;AACrC,8CAAgC,OAAO,EAAE,gBAAgB,KAAK,eAAe,CAAC;AAAA,YAChF;AAEA,iBAAK,WAAW,cAAc,MAAM,KAAK,WAAW,iBAAiB,CAAC;AACtE,kBAAM,KAAK,WAAW,WAA
W,MAAM,EAAE;AAGzC,gBAAI,iBAAiB;AACrB,gBAAI,OAAO,KAAK,aAAa,0BAA0B,YAAY;AACjE,oBAAM,oBAAoB,MAAM,KAAK,aAAa,sBAAsB,KAAK;AAC7E,kBAAI,kBAAkB,SAAS;AAC7B,iCAAiB;AAAA,cACnB;AAAA,YACF;AAEA,gBAAI,gBAAgB;AAClB,oBAAM,aAAa,MAAM,KAAK,aAAa,WAAW,KAAK;AAC3D,kBAAI,CAAC,WAAW,SAAS;AACvB,wBAAQ,WAAW,KAAK;AACxB;AAAA,cACF;AAAA,YACF;AAEA,wBAAY;AAAA,UACd,SAAS,GAAG;AACV,kBAAM,QAAQ,aAAa,QAAQ,IAAI,IAAI,MAAM,OAAO,CAAC,CAAC;AAC1D,oBAAQ,KAAK;AAAA,UACf;AAAA,QACF,GAAG;AAAA,MACL,GAAG,UAAU;AACb,WAAK,gBAAgB,IAAI,MAAM,IAAI,OAAO;AAAA,IAC5C;AAEA,YAAQ,GAAG,OAAO,cAAc;AAEhC,YAAQ,GAAG,SAAS,CAAC,MAAM;AACzB,YAAM,QAAQ,aAAa,QAAQ,IAAI,IAAI,MAAM,OAAO,CAAC,CAAC;AAC1D,cAAQ,KAAK;AAAA,IACf,CAAC;AAED,SAAK,SAAS,IAAI,MAAM,IAAI,OAAO;AACnC,WAAO,QAAQ,QAAQ;AAAA,EACzB;AAAA,EAEA,MAAM,QAAQ,SAAgC;AAE5C,UAAM,iBAAiB,KAAK,gBAAgB,IAAI,OAAO;AACvD,QAAI,gBAAgB;AAClB,mBAAa,cAAc;AAC3B,WAAK,gBAAgB,OAAO,OAAO;AAAA,IACrC;AAEA,UAAM,UAAU,KAAK,SAAS,IAAI,OAAO;AACzC,QAAI,SAAS;AACX,YAAM,QAAQ,MAAM;AACpB,WAAK,SAAS,OAAO,OAAO;AAAA,IAC9B;AAAA,EACF;AAAA,EAEA,MAAM,aAA4B;AAChC,eAAW,CAAC,EAAE,KAAK,KAAK,UAAU;AAChC,YAAM,KAAK,QAAQ,EAAE;AAAA,IACvB;AAAA,EACF;AACF;","names":[]}
@@ -2,7 +2,7 @@ import {
  createLogger,
  summarizePayload,
  truncateForLog
- } from "./chunk-PZE2MO7H.js";
+ } from "./chunk-ZR23KJPJ.js";
 
  // src/crawl/intelligent-crawler.ts
  import { EventEmitter } from "events";
@@ -916,4 +916,4 @@ export {
  getCrawlStrategy,
  IntelligentCrawler
  };
- //# sourceMappingURL=chunk-26MBEEKM.js.map
+ //# sourceMappingURL=chunk-VNHZ534Q.js.map
@@ -4,7 +4,7 @@ import {
  } from "./chunk-CLIMKLTW.js";
  import {
  parseIgnorePatternsForScanning
- } from "./chunk-HXBIIMYL.js";
+ } from "./chunk-H25AEF47.js";
  import {
  __require
  } from "./chunk-DGUM43GV.js";
@@ -2063,12 +2063,12 @@ var DEFAULT_CONFIG = {
  version: 1,
  dataDir: ".bluera/bluera-knowledge/data",
  embedding: {
- model: "Xenova/all-MiniLM-L6-v2",
+ model: "Xenova/bge-small-en-v1.5",
  batchSize: 32,
  dtype: "fp32",
  pooling: "mean",
  normalize: true,
- queryPrefix: "",
+ queryPrefix: "Represent this sentence for searching relevant passages: ",
  docPrefix: "",
  maxInFlightBatches: 1
  },
@@ -3755,15 +3755,13 @@ function detectContentType(results) {
  }
  var SearchService = class {
  lanceStore;
- embeddingEngine;
  codeUnitService;
  codeGraphService;
  graphCache;
  searchConfig;
  unsubscribeCacheInvalidation;
- constructor(lanceStore, embeddingEngine, codeGraphService, searchConfig) {
+ constructor(lanceStore, codeGraphService, searchConfig) {
  this.lanceStore = lanceStore;
- this.embeddingEngine = embeddingEngine;
  this.codeUnitService = new CodeUnitService();
  this.codeGraphService = codeGraphService;
  this.graphCache = /* @__PURE__ */ new Map();
@@ -3978,17 +3976,17 @@ var SearchService = class {
  /**
  * Fetch raw vector search results without normalization.
  * Returns results with raw cosine similarity scores [0-1].
+ * Uses LanceDB's embedding function for query embedding,
+ * ensuring consistent query/document embedding through a single code path.
  */
  async vectorSearchRaw(query, stores, limit) {
- const queryVector = await this.embeddingEngine.embedQuery(query);
  const results = [];
  for (const storeId of stores) {
- const hits = await this.lanceStore.search(storeId, queryVector, limit);
+ const hits = await this.lanceStore.searchText(storeId, query, limit);
  results.push(
  ...hits.map((r) => ({
  id: r.id,
  score: r.score,
- // Raw cosine similarity (1 - distance)
  content: r.content,
  metadata: r.metadata
  }))
@@ -4822,6 +4820,9 @@ function extractRepoName(url) {
  return name;
  }
 
+ // src/types/store.ts
+ var CURRENT_SCHEMA_VERSION = 2;
+
  // src/services/store.service.ts
  async function fileExists4(path4) {
  try {
@@ -4836,12 +4837,21 @@ var StoreService = class {
  definitionService;
  gitignoreService;
  projectRoot;
+ embeddingModelId;
  registry = { stores: [] };
  constructor(dataDir, options) {
  this.dataDir = dataDir;
- this.definitionService = options?.definitionService ?? void 0;
- this.gitignoreService = options?.gitignoreService ?? void 0;
- this.projectRoot = options?.projectRoot ?? void 0;
+ this.definitionService = options.definitionService ?? void 0;
+ this.gitignoreService = options.gitignoreService ?? void 0;
+ this.projectRoot = options.projectRoot ?? void 0;
+ this.embeddingModelId = options.embeddingModelId;
+ }
+ /**
+ * Get the current embedding model ID used for new stores.
+ * Used by model compatibility validation.
+ */
+ getCurrentModelId() {
+ return this.embeddingModelId;
  }
  async initialize() {
  await mkdir5(this.dataDir, { recursive: true });
@@ -4980,7 +4990,9 @@ var StoreService = class {
  tags: input.tags,
  status: "ready",
  createdAt: now,
- updatedAt: now
+ updatedAt: now,
+ schemaVersion: CURRENT_SCHEMA_VERSION,
+ modelId: this.embeddingModelId
  };
  break;
  }
@@ -5025,7 +5037,9 @@ var StoreService = class {
  tags: input.tags,
  status: "ready",
  createdAt: now,
- updatedAt: now
+ updatedAt: now,
+ schemaVersion: CURRENT_SCHEMA_VERSION,
+ modelId: this.embeddingModelId
  };
  break;
  }
@@ -5046,7 +5060,9 @@ var StoreService = class {
  tags: input.tags,
  status: "ready",
  createdAt: now,
- updatedAt: now
+ updatedAt: now,
+ schemaVersion: CURRENT_SCHEMA_VERSION,
+ modelId: this.embeddingModelId
  };
  break;
  default: {
@@ -5425,12 +5441,12 @@ import { join as join11 } from "path";
  import { pipeline, env } from "@huggingface/transformers";
  env.cacheDir = join11(homedir2(), ".cache", "huggingface-transformers");
  var DEFAULT_EMBEDDING_CONFIG = {
- model: "Xenova/all-MiniLM-L6-v2",
+ model: "Xenova/bge-small-en-v1.5",
  batchSize: 32,
  dtype: "fp32",
  pooling: "mean",
  normalize: true,
- queryPrefix: "",
+ queryPrefix: "Represent this sentence for searching relevant passages: ",
  docPrefix: "",
  maxInFlightBatches: 1
  };
@@ -5654,6 +5670,88 @@ var EmbeddingEngine = class {
 
  // src/db/lance.ts
  import * as lancedb from "@lancedb/lancedb";
+ import { LanceSchema } from "@lancedb/lancedb/embedding";
+ import { Utf8 } from "apache-arrow";
+
+ // src/db/lance-embedding-function.ts
+ import { TextEmbeddingFunction, getRegistry } from "@lancedb/lancedb/embedding";
+ import { Float32 } from "apache-arrow";
+ var HuggingFaceEmbeddingFunction = class extends TextEmbeddingFunction {
+ engine;
+ embeddingConfig;
+ _ndims = null;
+ constructor(optionsRaw) {
+ super();
+ const options = this.resolveVariables(optionsRaw ?? {});
+ this.embeddingConfig = {
+ model: options.model ?? "Xenova/bge-small-en-v1.5",
+ batchSize: options.batchSize ?? 32,
+ dtype: options.dtype ?? "fp32",
+ pooling: options.pooling ?? "mean",
+ normalize: options.normalize ?? true,
+ queryPrefix: options.queryPrefix ?? "",
+ docPrefix: options.docPrefix ?? "",
+ maxInFlightBatches: 1
+ // Single-threaded for LanceDB integration
+ };
+ this.engine = new EmbeddingEngine(this.embeddingConfig);
+ }
+ /**
+ * Initialize the embedding model. Called by LanceDB before embeddings are computed.
+ */
+ async init() {
+ this._ndims = await this.engine.ensureDimensions();
+ }
+ /**
+ * Return embedding dimensions. Must call init() first.
+ */
+ ndims() {
+ if (this._ndims === null) {
+ throw new Error("HuggingFaceEmbeddingFunction not initialized. Call init() first.");
+ }
+ return this._ndims;
+ }
+ /**
+ * Return embedding data type (always Float32 for our models).
+ */
+ embeddingDataType() {
+ return new Float32();
+ }
+ /**
+ * Generate embeddings for a batch of texts (documents).
+ * Called during table.add() operations.
+ */
+ async generateEmbeddings(texts) {
+ return this.engine.embedBatch(texts);
+ }
+ /**
+ * Compute embedding for a single query.
+ * Called during table.search(query) operations.
+ */
+ async computeQueryEmbeddings(data) {
+ const embedding = await this.engine.embedQuery(data);
+ return Array.from(embedding);
+ }
+ /**
+ * Get the model ID for provenance tracking.
+ */
+ getModelId() {
+ return this.embeddingConfig.model;
+ }
+ /**
+ * Get the full embedding config.
+ */
+ getConfig() {
+ return this.embeddingConfig;
+ }
+ /**
+ * Dispose the underlying engine to free resources.
+ */
+ async dispose() {
+ await this.engine.dispose();
+ }
+ };
+ getRegistry().register("HuggingFaceEmbeddingFunction")(HuggingFaceEmbeddingFunction);
 
  // src/types/document.ts
  import { z as z5 } from "zod";
@@ -5671,15 +5769,51 @@ var DocumentMetadataSchema = z5.object({
  }).loose();
 
  // src/db/lance.ts
+ function isSearchHit(value) {
+ if (typeof value !== "object" || value === null) return false;
+ return "id" in value && "content" in value && "metadata" in value && "_distance" in value && typeof value.id === "string" && typeof value.content === "string" && typeof value.metadata === "string" && typeof value._distance === "number";
+ }
+ function parseDocumentMetadata(jsonStr) {
+ const parsed = DocumentMetadataSchema.parse(JSON.parse(jsonStr));
+ return {
+ ...parsed,
+ storeId: createStoreId(parsed.storeId)
+ };
+ }
  var LanceStore = class {
  connection = null;
  tables = /* @__PURE__ */ new Map();
  dataDir;
  // eslint-disable-next-line @typescript-eslint/prefer-readonly -- set via setDimensions()
  _dimensions = null;
+ embeddingFunction = null;
  constructor(dataDir) {
  this.dataDir = dataDir;
  }
+ /**
+ * Set the embedding function for auto-embedding queries.
+ * Must be called before initialize() for new tables.
+ * The embedding function is initialized and its dimensions are used for schema creation.
+ */
+ async setEmbeddingFunction(config) {
+ this.embeddingFunction = new HuggingFaceEmbeddingFunction({
+ model: config.model,
+ batchSize: config.batchSize,
+ dtype: config.dtype,
+ pooling: config.pooling,
+ normalize: config.normalize,
+ queryPrefix: config.queryPrefix,
+ docPrefix: config.docPrefix
+ });
+ await this.embeddingFunction.init();
+ this._dimensions = this.embeddingFunction.ndims();
+ }
+ /**
+ * Check if embedding function is available for auto-embedding queries.
+ */
+ hasEmbeddingFunction() {
+ return this.embeddingFunction !== null;
+ }
  /**
  * Set the embedding dimensions. Must be called before initialize().
  * This allows dimensions to be derived from the embedding model at runtime.
@@ -5690,22 +5824,35 @@ var LanceStore = class {
  }
  async initialize(storeId) {
  if (this._dimensions === null) {
- throw new Error("Dimensions not set. Call setDimensions() before initialize().");
+ throw new Error(
+ "Dimensions not set. Call setDimensions() or setEmbeddingFunction() before initialize()."
+ );
  }
  this.connection ??= await lancedb.connect(this.dataDir);
  const tableName = this.getTableName(storeId);
  const tableNames = await this.connection.tableNames();
  if (!tableNames.includes(tableName)) {
- const table = await this.connection.createTable(tableName, [
- {
- id: "__init__",
- content: "",
- vector: new Array(this._dimensions).fill(0),
- metadata: "{}"
- }
- ]);
- await table.delete('id = "__init__"');
- this.tables.set(tableName, table);
+ if (this.embeddingFunction !== null) {
+ const schema = LanceSchema({
+ id: new Utf8(),
+ content: this.embeddingFunction.sourceField(),
+ vector: this.embeddingFunction.vectorField(),
+ metadata: new Utf8()
+ });
+ const table = await this.connection.createEmptyTable(tableName, schema);
+ this.tables.set(tableName, table);
+ } else {
+ const table = await this.connection.createTable(tableName, [
+ {
+ id: "__init__",
+ content: "",
+ vector: new Array(this._dimensions).fill(0),
+ metadata: "{}"
+ }
+ ]);
+ await table.delete('id = "__init__"');
+ this.tables.set(tableName, table);
+ }
  } else {
  const table = await this.connection.openTable(tableName);
  this.tables.set(tableName, table);
@@ -5749,6 +5896,29 @@ var LanceStore = class {
  };
  });
  }
+ /**
+ * Search using a text query with automatic embedding.
+ * Requires setEmbeddingFunction() to have been called.
+ * Uses the embedding function to compute query embeddings consistently with document embeddings.
+ */
+ async searchText(storeId, query, limit) {
+ if (this.embeddingFunction === null) {
+ throw new Error(
+ "Embedding function not set. Call setEmbeddingFunction() before searchText()."
+ );
+ }
+ const queryEmbedding = await this.embeddingFunction.computeQueryEmbeddings(query);
+ const table = await this.getTable(storeId);
+ const searchQuery = table.vectorSearch(queryEmbedding).limit(limit).distanceType("cosine");
+ const rawResults = await searchQuery.toArray();
+ const results = rawResults.filter(isSearchHit);
+ return results.map((r) => ({
+ id: createDocumentId(r.id),
+ content: r.content,
+ score: 1 - r._distance,
+ metadata: parseDocumentMetadata(r.metadata)
+ }));
+ }
  async createFtsIndex(storeId) {
  const table = await this.getTable(storeId);
  await table.createIndex("content", {
@@ -5865,12 +6035,7 @@ var LazyServiceContainer = class {
  get search() {
  if (this._search === null) {
  logger4.debug("Lazy-initializing SearchService");
- this._search = new SearchService(
- this.lance,
- this.embeddings,
- this.codeGraph,
- this.appConfig.search
- );
+ this._search = new SearchService(this.lance, this.codeGraph, this.appConfig.search);
  }
  return this._search;
  }
@@ -5929,7 +6094,8 @@ async function createLazyServices(configPath, dataDir, projectRoot) {
  const storeOptions = {
  definitionService,
  gitignoreService,
- projectRoot: resolvedProjectRoot
+ projectRoot: resolvedProjectRoot,
+ embeddingModelId: appConfig.embedding.model
  };
  const store = new StoreService(resolvedDataDir, storeOptions);
  await store.initialize();
@@ -5950,19 +6116,21 @@ async function createServices(configPath, dataDir, projectRoot) {
  const lance = new LanceStore(resolvedDataDir);
  const embeddings = new EmbeddingEngine(appConfig.embedding);
  await embeddings.initialize();
+ await lance.setEmbeddingFunction(appConfig.embedding);
  const resolvedProjectRoot = config.resolveProjectRoot();
  const definitionService = new StoreDefinitionService(resolvedProjectRoot);
  const gitignoreService = new GitignoreService(resolvedProjectRoot);
  const storeOptions = {
  definitionService,
  gitignoreService,
- projectRoot: resolvedProjectRoot
+ projectRoot: resolvedProjectRoot,
+ embeddingModelId: appConfig.embedding.model
  };
  const store = new StoreService(resolvedDataDir, storeOptions);
  await store.initialize();
  const codeGraph = new CodeGraphService(resolvedDataDir, pythonBridge);
  const manifest = new ManifestService(resolvedDataDir);
- const search = new SearchService(lance, embeddings, codeGraph, appConfig.search);
+ const search = new SearchService(lance, codeGraph, appConfig.search);
  const index = new IndexService(lance, embeddings, {
  codeGraphService: codeGraph,
  manifestService: manifest,
@@ -6054,4 +6222,4 @@ export {
  createServices,
  destroyServices
  };
- //# sourceMappingURL=chunk-PZE2MO7H.js.map
+ //# sourceMappingURL=chunk-ZR23KJPJ.js.map