bluera-knowledge 0.19.7 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +28 -0
- package/README.md +21 -0
- package/dist/{chunk-6BC5OG4M.js → chunk-BZQ7KWEE.js} +67 -5
- package/dist/chunk-BZQ7KWEE.js.map +1 -0
- package/dist/{chunk-HXBIIMYL.js → chunk-H25AEF47.js} +42 -1
- package/dist/chunk-H25AEF47.js.map +1 -0
- package/dist/{chunk-TWX7MN5L.js → chunk-VNHZ534Q.js} +2 -2
- package/dist/{chunk-JPJI3VMA.js → chunk-ZR23KJPJ.js} +345 -69
- package/dist/chunk-ZR23KJPJ.js.map +1 -0
- package/dist/index.js +14 -7
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.d.ts +96 -10
- package/dist/mcp/server.js +3 -3
- package/dist/{watch.service-NXRWLJG6.js → watch.service-THP6X5ZZ.js} +2 -2
- package/dist/workers/background-worker-cli.js +4 -4
- package/dist/workers/background-worker-cli.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-6BC5OG4M.js.map +0 -1
- package/dist/chunk-HXBIIMYL.js.map +0 -1
- package/dist/chunk-JPJI3VMA.js.map +0 -1
- /package/dist/{chunk-TWX7MN5L.js.map → chunk-VNHZ534Q.js.map} +0 -0
- /package/dist/{watch.service-NXRWLJG6.js.map → watch.service-THP6X5ZZ.js.map} +0 -0
|
@@ -4,7 +4,7 @@ import {
|
|
|
4
4
|
} from "./chunk-CLIMKLTW.js";
|
|
5
5
|
import {
|
|
6
6
|
parseIgnorePatternsForScanning
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-H25AEF47.js";
|
|
8
8
|
import {
|
|
9
9
|
__require
|
|
10
10
|
} from "./chunk-DGUM43GV.js";
|
|
@@ -2063,8 +2063,14 @@ var DEFAULT_CONFIG = {
|
|
|
2063
2063
|
version: 1,
|
|
2064
2064
|
dataDir: ".bluera/bluera-knowledge/data",
|
|
2065
2065
|
embedding: {
|
|
2066
|
-
model: "Xenova/
|
|
2067
|
-
batchSize: 32
|
|
2066
|
+
model: "Xenova/bge-small-en-v1.5",
|
|
2067
|
+
batchSize: 32,
|
|
2068
|
+
dtype: "fp32",
|
|
2069
|
+
pooling: "mean",
|
|
2070
|
+
normalize: true,
|
|
2071
|
+
queryPrefix: "Represent this sentence for searching relevant passages: ",
|
|
2072
|
+
docPrefix: "",
|
|
2073
|
+
maxInFlightBatches: 1
|
|
2068
2074
|
},
|
|
2069
2075
|
indexing: {
|
|
2070
2076
|
concurrency: 4,
|
|
@@ -3749,15 +3755,13 @@ function detectContentType(results) {
|
|
|
3749
3755
|
}
|
|
3750
3756
|
var SearchService = class {
|
|
3751
3757
|
lanceStore;
|
|
3752
|
-
embeddingEngine;
|
|
3753
3758
|
codeUnitService;
|
|
3754
3759
|
codeGraphService;
|
|
3755
3760
|
graphCache;
|
|
3756
3761
|
searchConfig;
|
|
3757
3762
|
unsubscribeCacheInvalidation;
|
|
3758
|
-
constructor(lanceStore,
|
|
3763
|
+
constructor(lanceStore, codeGraphService, searchConfig) {
|
|
3759
3764
|
this.lanceStore = lanceStore;
|
|
3760
|
-
this.embeddingEngine = embeddingEngine;
|
|
3761
3765
|
this.codeUnitService = new CodeUnitService();
|
|
3762
3766
|
this.codeGraphService = codeGraphService;
|
|
3763
3767
|
this.graphCache = /* @__PURE__ */ new Map();
|
|
@@ -3972,17 +3976,17 @@ var SearchService = class {
|
|
|
3972
3976
|
/**
|
|
3973
3977
|
* Fetch raw vector search results without normalization.
|
|
3974
3978
|
* Returns results with raw cosine similarity scores [0-1].
|
|
3979
|
+
* Uses LanceDB's embedding function for query embedding,
|
|
3980
|
+
* ensuring consistent query/document embedding through a single code path.
|
|
3975
3981
|
*/
|
|
3976
3982
|
async vectorSearchRaw(query, stores, limit) {
|
|
3977
|
-
const queryVector = await this.embeddingEngine.embed(query);
|
|
3978
3983
|
const results = [];
|
|
3979
3984
|
for (const storeId of stores) {
|
|
3980
|
-
const hits = await this.lanceStore.
|
|
3985
|
+
const hits = await this.lanceStore.searchText(storeId, query, limit);
|
|
3981
3986
|
results.push(
|
|
3982
3987
|
...hits.map((r) => ({
|
|
3983
3988
|
id: r.id,
|
|
3984
3989
|
score: r.score,
|
|
3985
|
-
// Raw cosine similarity (1 - distance)
|
|
3986
3990
|
content: r.content,
|
|
3987
3991
|
metadata: r.metadata
|
|
3988
3992
|
}))
|
|
@@ -4816,6 +4820,9 @@ function extractRepoName(url) {
|
|
|
4816
4820
|
return name;
|
|
4817
4821
|
}
|
|
4818
4822
|
|
|
4823
|
+
// src/types/store.ts
|
|
4824
|
+
var CURRENT_SCHEMA_VERSION = 2;
|
|
4825
|
+
|
|
4819
4826
|
// src/services/store.service.ts
|
|
4820
4827
|
async function fileExists4(path4) {
|
|
4821
4828
|
try {
|
|
@@ -4830,12 +4837,21 @@ var StoreService = class {
|
|
|
4830
4837
|
definitionService;
|
|
4831
4838
|
gitignoreService;
|
|
4832
4839
|
projectRoot;
|
|
4840
|
+
embeddingModelId;
|
|
4833
4841
|
registry = { stores: [] };
|
|
4834
4842
|
constructor(dataDir, options) {
|
|
4835
4843
|
this.dataDir = dataDir;
|
|
4836
|
-
this.definitionService = options
|
|
4837
|
-
this.gitignoreService = options
|
|
4838
|
-
this.projectRoot = options
|
|
4844
|
+
this.definitionService = options.definitionService ?? void 0;
|
|
4845
|
+
this.gitignoreService = options.gitignoreService ?? void 0;
|
|
4846
|
+
this.projectRoot = options.projectRoot ?? void 0;
|
|
4847
|
+
this.embeddingModelId = options.embeddingModelId;
|
|
4848
|
+
}
|
|
4849
|
+
/**
|
|
4850
|
+
* Get the current embedding model ID used for new stores.
|
|
4851
|
+
* Used by model compatibility validation.
|
|
4852
|
+
*/
|
|
4853
|
+
getCurrentModelId() {
|
|
4854
|
+
return this.embeddingModelId;
|
|
4839
4855
|
}
|
|
4840
4856
|
async initialize() {
|
|
4841
4857
|
await mkdir5(this.dataDir, { recursive: true });
|
|
@@ -4974,7 +4990,9 @@ var StoreService = class {
|
|
|
4974
4990
|
tags: input.tags,
|
|
4975
4991
|
status: "ready",
|
|
4976
4992
|
createdAt: now,
|
|
4977
|
-
updatedAt: now
|
|
4993
|
+
updatedAt: now,
|
|
4994
|
+
schemaVersion: CURRENT_SCHEMA_VERSION,
|
|
4995
|
+
modelId: this.embeddingModelId
|
|
4978
4996
|
};
|
|
4979
4997
|
break;
|
|
4980
4998
|
}
|
|
@@ -5019,7 +5037,9 @@ var StoreService = class {
|
|
|
5019
5037
|
tags: input.tags,
|
|
5020
5038
|
status: "ready",
|
|
5021
5039
|
createdAt: now,
|
|
5022
|
-
updatedAt: now
|
|
5040
|
+
updatedAt: now,
|
|
5041
|
+
schemaVersion: CURRENT_SCHEMA_VERSION,
|
|
5042
|
+
modelId: this.embeddingModelId
|
|
5023
5043
|
};
|
|
5024
5044
|
break;
|
|
5025
5045
|
}
|
|
@@ -5040,7 +5060,9 @@ var StoreService = class {
|
|
|
5040
5060
|
tags: input.tags,
|
|
5041
5061
|
status: "ready",
|
|
5042
5062
|
createdAt: now,
|
|
5043
|
-
updatedAt: now
|
|
5063
|
+
updatedAt: now,
|
|
5064
|
+
schemaVersion: CURRENT_SCHEMA_VERSION,
|
|
5065
|
+
modelId: this.embeddingModelId
|
|
5044
5066
|
};
|
|
5045
5067
|
break;
|
|
5046
5068
|
default: {
|
|
@@ -5418,6 +5440,16 @@ import { homedir as homedir2 } from "os";
|
|
|
5418
5440
|
import { join as join11 } from "path";
|
|
5419
5441
|
import { pipeline, env } from "@huggingface/transformers";
|
|
5420
5442
|
env.cacheDir = join11(homedir2(), ".cache", "huggingface-transformers");
|
|
5443
|
+
var DEFAULT_EMBEDDING_CONFIG = {
|
|
5444
|
+
model: "Xenova/bge-small-en-v1.5",
|
|
5445
|
+
batchSize: 32,
|
|
5446
|
+
dtype: "fp32",
|
|
5447
|
+
pooling: "mean",
|
|
5448
|
+
normalize: true,
|
|
5449
|
+
queryPrefix: "Represent this sentence for searching relevant passages: ",
|
|
5450
|
+
docPrefix: "",
|
|
5451
|
+
maxInFlightBatches: 1
|
|
5452
|
+
};
|
|
5421
5453
|
var EmbeddingEngine = class {
|
|
5422
5454
|
extractor = null;
|
|
5423
5455
|
initPromise = null;
|
|
@@ -5425,11 +5457,9 @@ var EmbeddingEngine = class {
|
|
|
5425
5457
|
_dimensions = null;
|
|
5426
5458
|
// eslint-disable-next-line @typescript-eslint/prefer-readonly -- mutated in dispose()
|
|
5427
5459
|
disposed = false;
|
|
5428
|
-
|
|
5429
|
-
|
|
5430
|
-
|
|
5431
|
-
this.modelName = modelName;
|
|
5432
|
-
this.batchSize = batchSize;
|
|
5460
|
+
config;
|
|
5461
|
+
constructor(config = DEFAULT_EMBEDDING_CONFIG) {
|
|
5462
|
+
this.config = config;
|
|
5433
5463
|
}
|
|
5434
5464
|
/**
|
|
5435
5465
|
* Guard against use-after-dispose
|
|
@@ -5448,8 +5478,8 @@ var EmbeddingEngine = class {
|
|
|
5448
5478
|
if (this.extractor !== null) return;
|
|
5449
5479
|
this.initPromise ??= (async () => {
|
|
5450
5480
|
try {
|
|
5451
|
-
this.extractor = await pipeline("feature-extraction", this.
|
|
5452
|
-
dtype:
|
|
5481
|
+
this.extractor = await pipeline("feature-extraction", this.config.model, {
|
|
5482
|
+
dtype: this.config.dtype
|
|
5453
5483
|
});
|
|
5454
5484
|
} catch (error) {
|
|
5455
5485
|
this.initPromise = null;
|
|
@@ -5458,7 +5488,22 @@ var EmbeddingEngine = class {
|
|
|
5458
5488
|
})();
|
|
5459
5489
|
await this.initPromise;
|
|
5460
5490
|
}
|
|
5461
|
-
|
|
5491
|
+
/**
|
|
5492
|
+
* Embed a search query. Applies queryPrefix for asymmetric models.
|
|
5493
|
+
*/
|
|
5494
|
+
async embedQuery(text) {
|
|
5495
|
+
return this.embedText(this.config.queryPrefix + text);
|
|
5496
|
+
}
|
|
5497
|
+
/**
|
|
5498
|
+
* Embed a document for indexing. Applies docPrefix for asymmetric models.
|
|
5499
|
+
*/
|
|
5500
|
+
async embedDocument(text) {
|
|
5501
|
+
return this.embedText(this.config.docPrefix + text);
|
|
5502
|
+
}
|
|
5503
|
+
/**
|
|
5504
|
+
* Internal: embed text without prefix.
|
|
5505
|
+
*/
|
|
5506
|
+
async embedText(text) {
|
|
5462
5507
|
this.assertNotDisposed();
|
|
5463
5508
|
if (this.extractor === null) {
|
|
5464
5509
|
await this.initialize();
|
|
@@ -5467,13 +5512,17 @@ var EmbeddingEngine = class {
|
|
|
5467
5512
|
throw new Error("Failed to initialize embedding model");
|
|
5468
5513
|
}
|
|
5469
5514
|
const output = await this.extractor(text, {
|
|
5470
|
-
pooling:
|
|
5471
|
-
normalize:
|
|
5515
|
+
pooling: this.config.pooling,
|
|
5516
|
+
normalize: this.config.normalize
|
|
5472
5517
|
});
|
|
5473
|
-
const
|
|
5474
|
-
this._dimensions ??=
|
|
5475
|
-
return
|
|
5518
|
+
const dim = output.dims[output.dims.length - 1] ?? 0;
|
|
5519
|
+
this._dimensions ??= dim;
|
|
5520
|
+
return Float32Array.from(output.data);
|
|
5476
5521
|
}
|
|
5522
|
+
/**
|
|
5523
|
+
* Embed a batch of documents with optional parallelism.
|
|
5524
|
+
* When maxInFlightBatches > 1, processes multiple batches concurrently.
|
|
5525
|
+
*/
|
|
5477
5526
|
async embedBatch(texts) {
|
|
5478
5527
|
this.assertNotDisposed();
|
|
5479
5528
|
if (this.extractor === null) {
|
|
@@ -5482,26 +5531,79 @@ var EmbeddingEngine = class {
|
|
|
5482
5531
|
if (this.extractor === null) {
|
|
5483
5532
|
throw new Error("Failed to initialize embedding model");
|
|
5484
5533
|
}
|
|
5534
|
+
const batches = [];
|
|
5535
|
+
for (let i = 0; i < texts.length; i += this.config.batchSize) {
|
|
5536
|
+
batches.push(texts.slice(i, i + this.config.batchSize));
|
|
5537
|
+
}
|
|
5538
|
+
if (batches.length === 0) {
|
|
5539
|
+
return [];
|
|
5540
|
+
}
|
|
5541
|
+
if (this.config.maxInFlightBatches <= 1) {
|
|
5542
|
+
return this.embedBatchesSequential(batches);
|
|
5543
|
+
} else {
|
|
5544
|
+
return this.embedBatchesConcurrent(batches);
|
|
5545
|
+
}
|
|
5546
|
+
}
|
|
5547
|
+
/**
|
|
5548
|
+
* Process batches sequentially (original behavior).
|
|
5549
|
+
*/
|
|
5550
|
+
async embedBatchesSequential(batches) {
|
|
5485
5551
|
const results = [];
|
|
5486
|
-
for (let i = 0; i <
|
|
5487
|
-
const batch =
|
|
5488
|
-
|
|
5489
|
-
|
|
5490
|
-
|
|
5491
|
-
|
|
5492
|
-
const dim = output.dims[output.dims.length - 1] ?? 0;
|
|
5493
|
-
for (let b = 0; b < batch.length; b++) {
|
|
5494
|
-
const start = b * dim;
|
|
5495
|
-
const end = start + dim;
|
|
5496
|
-
results.push(Array.from(output.data.slice(start, end), (v) => Number(v)));
|
|
5497
|
-
}
|
|
5498
|
-
this._dimensions ??= dim;
|
|
5499
|
-
if (i + this.batchSize < texts.length) {
|
|
5552
|
+
for (let i = 0; i < batches.length; i++) {
|
|
5553
|
+
const batch = batches[i];
|
|
5554
|
+
if (batch === void 0) continue;
|
|
5555
|
+
const batchResults = await this.processSingleBatch(batch);
|
|
5556
|
+
results.push(...batchResults);
|
|
5557
|
+
if (i < batches.length - 1) {
|
|
5500
5558
|
await new Promise((resolve4) => setImmediate(resolve4));
|
|
5501
5559
|
}
|
|
5502
5560
|
}
|
|
5503
5561
|
return results;
|
|
5504
5562
|
}
|
|
5563
|
+
/**
|
|
5564
|
+
* Process batches with controlled concurrency.
|
|
5565
|
+
*/
|
|
5566
|
+
async embedBatchesConcurrent(batches) {
|
|
5567
|
+
const results = new Array(batches.length);
|
|
5568
|
+
let inFlight = 0;
|
|
5569
|
+
const maxConcurrent = this.config.maxInFlightBatches;
|
|
5570
|
+
await Promise.all(
|
|
5571
|
+
batches.map(async (batch, idx) => {
|
|
5572
|
+
while (inFlight >= maxConcurrent) {
|
|
5573
|
+
await new Promise((resolve4) => setImmediate(resolve4));
|
|
5574
|
+
}
|
|
5575
|
+
inFlight++;
|
|
5576
|
+
try {
|
|
5577
|
+
results[idx] = await this.processSingleBatch(batch);
|
|
5578
|
+
} finally {
|
|
5579
|
+
inFlight--;
|
|
5580
|
+
}
|
|
5581
|
+
})
|
|
5582
|
+
);
|
|
5583
|
+
return results.flat();
|
|
5584
|
+
}
|
|
5585
|
+
/**
|
|
5586
|
+
* Process a single batch and return embeddings.
|
|
5587
|
+
*/
|
|
5588
|
+
async processSingleBatch(batch) {
|
|
5589
|
+
if (this.extractor === null) {
|
|
5590
|
+
throw new Error("Extractor not initialized");
|
|
5591
|
+
}
|
|
5592
|
+
const prefixedBatch = batch.map((text) => this.config.docPrefix + text);
|
|
5593
|
+
const output = await this.extractor(prefixedBatch, {
|
|
5594
|
+
pooling: this.config.pooling,
|
|
5595
|
+
normalize: this.config.normalize
|
|
5596
|
+
});
|
|
5597
|
+
const dim = output.dims[output.dims.length - 1] ?? 0;
|
|
5598
|
+
const batchResults = [];
|
|
5599
|
+
for (let b = 0; b < batch.length; b++) {
|
|
5600
|
+
const start = b * dim;
|
|
5601
|
+
const end = start + dim;
|
|
5602
|
+
batchResults.push(Float32Array.from(output.data.slice(start, end)));
|
|
5603
|
+
}
|
|
5604
|
+
this._dimensions ??= dim;
|
|
5605
|
+
return batchResults;
|
|
5606
|
+
}
|
|
5505
5607
|
/**
|
|
5506
5608
|
* Get cached embedding dimensions. Throws if embed() hasn't been called yet.
|
|
5507
5609
|
* Use ensureDimensions() if you need to guarantee dimensions are available.
|
|
@@ -5512,13 +5614,38 @@ var EmbeddingEngine = class {
|
|
|
5512
5614
|
}
|
|
5513
5615
|
return this._dimensions;
|
|
5514
5616
|
}
|
|
5617
|
+
/**
|
|
5618
|
+
* Check if the embedding pipeline is initialized.
|
|
5619
|
+
*/
|
|
5620
|
+
isInitialized() {
|
|
5621
|
+
return this.extractor !== null;
|
|
5622
|
+
}
|
|
5623
|
+
/**
|
|
5624
|
+
* Check if this engine has been disposed.
|
|
5625
|
+
*/
|
|
5626
|
+
isDisposed() {
|
|
5627
|
+
return this.disposed;
|
|
5628
|
+
}
|
|
5629
|
+
/**
|
|
5630
|
+
* Reset the engine to uninitialized state, allowing reuse after disposal.
|
|
5631
|
+
* If currently initialized, disposes the pipeline first.
|
|
5632
|
+
*/
|
|
5633
|
+
async reset() {
|
|
5634
|
+
if (this.extractor !== null) {
|
|
5635
|
+
await this.extractor.dispose();
|
|
5636
|
+
this.extractor = null;
|
|
5637
|
+
}
|
|
5638
|
+
this.initPromise = null;
|
|
5639
|
+
this._dimensions = null;
|
|
5640
|
+
this.disposed = false;
|
|
5641
|
+
}
|
|
5515
5642
|
/**
|
|
5516
5643
|
* Ensure dimensions are available, initializing the model if needed.
|
|
5517
5644
|
* Returns the embedding dimensions for the current model.
|
|
5518
5645
|
*/
|
|
5519
5646
|
async ensureDimensions() {
|
|
5520
5647
|
if (this._dimensions === null) {
|
|
5521
|
-
await this.
|
|
5648
|
+
await this.embedText("dimension probe");
|
|
5522
5649
|
}
|
|
5523
5650
|
if (this._dimensions === null) {
|
|
5524
5651
|
throw new Error("Failed to determine embedding dimensions");
|
|
@@ -5543,6 +5670,88 @@ var EmbeddingEngine = class {
|
|
|
5543
5670
|
|
|
5544
5671
|
// src/db/lance.ts
|
|
5545
5672
|
import * as lancedb from "@lancedb/lancedb";
|
|
5673
|
+
import { LanceSchema } from "@lancedb/lancedb/embedding";
|
|
5674
|
+
import { Utf8 } from "apache-arrow";
|
|
5675
|
+
|
|
5676
|
+
// src/db/lance-embedding-function.ts
|
|
5677
|
+
import { TextEmbeddingFunction, getRegistry } from "@lancedb/lancedb/embedding";
|
|
5678
|
+
import { Float32 } from "apache-arrow";
|
|
5679
|
+
var HuggingFaceEmbeddingFunction = class extends TextEmbeddingFunction {
|
|
5680
|
+
engine;
|
|
5681
|
+
embeddingConfig;
|
|
5682
|
+
_ndims = null;
|
|
5683
|
+
constructor(optionsRaw) {
|
|
5684
|
+
super();
|
|
5685
|
+
const options = this.resolveVariables(optionsRaw ?? {});
|
|
5686
|
+
this.embeddingConfig = {
|
|
5687
|
+
model: options.model ?? "Xenova/bge-small-en-v1.5",
|
|
5688
|
+
batchSize: options.batchSize ?? 32,
|
|
5689
|
+
dtype: options.dtype ?? "fp32",
|
|
5690
|
+
pooling: options.pooling ?? "mean",
|
|
5691
|
+
normalize: options.normalize ?? true,
|
|
5692
|
+
queryPrefix: options.queryPrefix ?? "",
|
|
5693
|
+
docPrefix: options.docPrefix ?? "",
|
|
5694
|
+
maxInFlightBatches: 1
|
|
5695
|
+
// Single-threaded for LanceDB integration
|
|
5696
|
+
};
|
|
5697
|
+
this.engine = new EmbeddingEngine(this.embeddingConfig);
|
|
5698
|
+
}
|
|
5699
|
+
/**
|
|
5700
|
+
* Initialize the embedding model. Called by LanceDB before embeddings are computed.
|
|
5701
|
+
*/
|
|
5702
|
+
async init() {
|
|
5703
|
+
this._ndims = await this.engine.ensureDimensions();
|
|
5704
|
+
}
|
|
5705
|
+
/**
|
|
5706
|
+
* Return embedding dimensions. Must call init() first.
|
|
5707
|
+
*/
|
|
5708
|
+
ndims() {
|
|
5709
|
+
if (this._ndims === null) {
|
|
5710
|
+
throw new Error("HuggingFaceEmbeddingFunction not initialized. Call init() first.");
|
|
5711
|
+
}
|
|
5712
|
+
return this._ndims;
|
|
5713
|
+
}
|
|
5714
|
+
/**
|
|
5715
|
+
* Return embedding data type (always Float32 for our models).
|
|
5716
|
+
*/
|
|
5717
|
+
embeddingDataType() {
|
|
5718
|
+
return new Float32();
|
|
5719
|
+
}
|
|
5720
|
+
/**
|
|
5721
|
+
* Generate embeddings for a batch of texts (documents).
|
|
5722
|
+
* Called during table.add() operations.
|
|
5723
|
+
*/
|
|
5724
|
+
async generateEmbeddings(texts) {
|
|
5725
|
+
return this.engine.embedBatch(texts);
|
|
5726
|
+
}
|
|
5727
|
+
/**
|
|
5728
|
+
* Compute embedding for a single query.
|
|
5729
|
+
* Called during table.search(query) operations.
|
|
5730
|
+
*/
|
|
5731
|
+
async computeQueryEmbeddings(data) {
|
|
5732
|
+
const embedding = await this.engine.embedQuery(data);
|
|
5733
|
+
return Array.from(embedding);
|
|
5734
|
+
}
|
|
5735
|
+
/**
|
|
5736
|
+
* Get the model ID for provenance tracking.
|
|
5737
|
+
*/
|
|
5738
|
+
getModelId() {
|
|
5739
|
+
return this.embeddingConfig.model;
|
|
5740
|
+
}
|
|
5741
|
+
/**
|
|
5742
|
+
* Get the full embedding config.
|
|
5743
|
+
*/
|
|
5744
|
+
getConfig() {
|
|
5745
|
+
return this.embeddingConfig;
|
|
5746
|
+
}
|
|
5747
|
+
/**
|
|
5748
|
+
* Dispose the underlying engine to free resources.
|
|
5749
|
+
*/
|
|
5750
|
+
async dispose() {
|
|
5751
|
+
await this.engine.dispose();
|
|
5752
|
+
}
|
|
5753
|
+
};
|
|
5754
|
+
getRegistry().register("HuggingFaceEmbeddingFunction")(HuggingFaceEmbeddingFunction);
|
|
5546
5755
|
|
|
5547
5756
|
// src/types/document.ts
|
|
5548
5757
|
import { z as z5 } from "zod";
|
|
@@ -5560,15 +5769,51 @@ var DocumentMetadataSchema = z5.object({
|
|
|
5560
5769
|
}).loose();
|
|
5561
5770
|
|
|
5562
5771
|
// src/db/lance.ts
|
|
5772
|
+
function isSearchHit(value) {
|
|
5773
|
+
if (typeof value !== "object" || value === null) return false;
|
|
5774
|
+
return "id" in value && "content" in value && "metadata" in value && "_distance" in value && typeof value.id === "string" && typeof value.content === "string" && typeof value.metadata === "string" && typeof value._distance === "number";
|
|
5775
|
+
}
|
|
5776
|
+
function parseDocumentMetadata(jsonStr) {
|
|
5777
|
+
const parsed = DocumentMetadataSchema.parse(JSON.parse(jsonStr));
|
|
5778
|
+
return {
|
|
5779
|
+
...parsed,
|
|
5780
|
+
storeId: createStoreId(parsed.storeId)
|
|
5781
|
+
};
|
|
5782
|
+
}
|
|
5563
5783
|
var LanceStore = class {
|
|
5564
5784
|
connection = null;
|
|
5565
5785
|
tables = /* @__PURE__ */ new Map();
|
|
5566
5786
|
dataDir;
|
|
5567
5787
|
// eslint-disable-next-line @typescript-eslint/prefer-readonly -- set via setDimensions()
|
|
5568
5788
|
_dimensions = null;
|
|
5789
|
+
embeddingFunction = null;
|
|
5569
5790
|
constructor(dataDir) {
|
|
5570
5791
|
this.dataDir = dataDir;
|
|
5571
5792
|
}
|
|
5793
|
+
/**
|
|
5794
|
+
* Set the embedding function for auto-embedding queries.
|
|
5795
|
+
* Must be called before initialize() for new tables.
|
|
5796
|
+
* The embedding function is initialized and its dimensions are used for schema creation.
|
|
5797
|
+
*/
|
|
5798
|
+
async setEmbeddingFunction(config) {
|
|
5799
|
+
this.embeddingFunction = new HuggingFaceEmbeddingFunction({
|
|
5800
|
+
model: config.model,
|
|
5801
|
+
batchSize: config.batchSize,
|
|
5802
|
+
dtype: config.dtype,
|
|
5803
|
+
pooling: config.pooling,
|
|
5804
|
+
normalize: config.normalize,
|
|
5805
|
+
queryPrefix: config.queryPrefix,
|
|
5806
|
+
docPrefix: config.docPrefix
|
|
5807
|
+
});
|
|
5808
|
+
await this.embeddingFunction.init();
|
|
5809
|
+
this._dimensions = this.embeddingFunction.ndims();
|
|
5810
|
+
}
|
|
5811
|
+
/**
|
|
5812
|
+
* Check if embedding function is available for auto-embedding queries.
|
|
5813
|
+
*/
|
|
5814
|
+
hasEmbeddingFunction() {
|
|
5815
|
+
return this.embeddingFunction !== null;
|
|
5816
|
+
}
|
|
5572
5817
|
/**
|
|
5573
5818
|
* Set the embedding dimensions. Must be called before initialize().
|
|
5574
5819
|
* This allows dimensions to be derived from the embedding model at runtime.
|
|
@@ -5579,22 +5824,35 @@ var LanceStore = class {
|
|
|
5579
5824
|
}
|
|
5580
5825
|
async initialize(storeId) {
|
|
5581
5826
|
if (this._dimensions === null) {
|
|
5582
|
-
throw new Error(
|
|
5827
|
+
throw new Error(
|
|
5828
|
+
"Dimensions not set. Call setDimensions() or setEmbeddingFunction() before initialize()."
|
|
5829
|
+
);
|
|
5583
5830
|
}
|
|
5584
5831
|
this.connection ??= await lancedb.connect(this.dataDir);
|
|
5585
5832
|
const tableName = this.getTableName(storeId);
|
|
5586
5833
|
const tableNames = await this.connection.tableNames();
|
|
5587
5834
|
if (!tableNames.includes(tableName)) {
|
|
5588
|
-
|
|
5589
|
-
{
|
|
5590
|
-
id:
|
|
5591
|
-
content:
|
|
5592
|
-
vector:
|
|
5593
|
-
metadata:
|
|
5594
|
-
}
|
|
5595
|
-
|
|
5596
|
-
|
|
5597
|
-
|
|
5835
|
+
if (this.embeddingFunction !== null) {
|
|
5836
|
+
const schema = LanceSchema({
|
|
5837
|
+
id: new Utf8(),
|
|
5838
|
+
content: this.embeddingFunction.sourceField(),
|
|
5839
|
+
vector: this.embeddingFunction.vectorField(),
|
|
5840
|
+
metadata: new Utf8()
|
|
5841
|
+
});
|
|
5842
|
+
const table = await this.connection.createEmptyTable(tableName, schema);
|
|
5843
|
+
this.tables.set(tableName, table);
|
|
5844
|
+
} else {
|
|
5845
|
+
const table = await this.connection.createTable(tableName, [
|
|
5846
|
+
{
|
|
5847
|
+
id: "__init__",
|
|
5848
|
+
content: "",
|
|
5849
|
+
vector: new Array(this._dimensions).fill(0),
|
|
5850
|
+
metadata: "{}"
|
|
5851
|
+
}
|
|
5852
|
+
]);
|
|
5853
|
+
await table.delete('id = "__init__"');
|
|
5854
|
+
this.tables.set(tableName, table);
|
|
5855
|
+
}
|
|
5598
5856
|
} else {
|
|
5599
5857
|
const table = await this.connection.openTable(tableName);
|
|
5600
5858
|
this.tables.set(tableName, table);
|
|
@@ -5638,6 +5896,29 @@ var LanceStore = class {
|
|
|
5638
5896
|
};
|
|
5639
5897
|
});
|
|
5640
5898
|
}
|
|
5899
|
+
/**
|
|
5900
|
+
* Search using a text query with automatic embedding.
|
|
5901
|
+
* Requires setEmbeddingFunction() to have been called.
|
|
5902
|
+
* Uses the embedding function to compute query embeddings consistently with document embeddings.
|
|
5903
|
+
*/
|
|
5904
|
+
async searchText(storeId, query, limit) {
|
|
5905
|
+
if (this.embeddingFunction === null) {
|
|
5906
|
+
throw new Error(
|
|
5907
|
+
"Embedding function not set. Call setEmbeddingFunction() before searchText()."
|
|
5908
|
+
);
|
|
5909
|
+
}
|
|
5910
|
+
const queryEmbedding = await this.embeddingFunction.computeQueryEmbeddings(query);
|
|
5911
|
+
const table = await this.getTable(storeId);
|
|
5912
|
+
const searchQuery = table.vectorSearch(queryEmbedding).limit(limit).distanceType("cosine");
|
|
5913
|
+
const rawResults = await searchQuery.toArray();
|
|
5914
|
+
const results = rawResults.filter(isSearchHit);
|
|
5915
|
+
return results.map((r) => ({
|
|
5916
|
+
id: createDocumentId(r.id),
|
|
5917
|
+
content: r.content,
|
|
5918
|
+
score: 1 - r._distance,
|
|
5919
|
+
metadata: parseDocumentMetadata(r.metadata)
|
|
5920
|
+
}));
|
|
5921
|
+
}
|
|
5641
5922
|
async createFtsIndex(storeId) {
|
|
5642
5923
|
const table = await this.getTable(storeId);
|
|
5643
5924
|
await table.createIndex("content", {
|
|
@@ -5734,10 +6015,7 @@ var LazyServiceContainer = class {
|
|
|
5734
6015
|
get embeddings() {
|
|
5735
6016
|
if (this._embeddings === null) {
|
|
5736
6017
|
logger4.debug("Lazy-initializing EmbeddingEngine");
|
|
5737
|
-
this._embeddings = new EmbeddingEngine(
|
|
5738
|
-
this.appConfig.embedding.model,
|
|
5739
|
-
this.appConfig.embedding.batchSize
|
|
5740
|
-
);
|
|
6018
|
+
this._embeddings = new EmbeddingEngine(this.appConfig.embedding);
|
|
5741
6019
|
}
|
|
5742
6020
|
return this._embeddings;
|
|
5743
6021
|
}
|
|
@@ -5757,12 +6035,7 @@ var LazyServiceContainer = class {
|
|
|
5757
6035
|
get search() {
|
|
5758
6036
|
if (this._search === null) {
|
|
5759
6037
|
logger4.debug("Lazy-initializing SearchService");
|
|
5760
|
-
this._search = new SearchService(
|
|
5761
|
-
this.lance,
|
|
5762
|
-
this.embeddings,
|
|
5763
|
-
this.codeGraph,
|
|
5764
|
-
this.appConfig.search
|
|
5765
|
-
);
|
|
6038
|
+
this._search = new SearchService(this.lance, this.codeGraph, this.appConfig.search);
|
|
5766
6039
|
}
|
|
5767
6040
|
return this._search;
|
|
5768
6041
|
}
|
|
@@ -5821,7 +6094,8 @@ async function createLazyServices(configPath, dataDir, projectRoot) {
|
|
|
5821
6094
|
const storeOptions = {
|
|
5822
6095
|
definitionService,
|
|
5823
6096
|
gitignoreService,
|
|
5824
|
-
projectRoot: resolvedProjectRoot
|
|
6097
|
+
projectRoot: resolvedProjectRoot,
|
|
6098
|
+
embeddingModelId: appConfig.embedding.model
|
|
5825
6099
|
};
|
|
5826
6100
|
const store = new StoreService(resolvedDataDir, storeOptions);
|
|
5827
6101
|
await store.initialize();
|
|
@@ -5840,21 +6114,23 @@ async function createServices(configPath, dataDir, projectRoot) {
|
|
|
5840
6114
|
const pythonBridge = new PythonBridge();
|
|
5841
6115
|
await pythonBridge.start();
|
|
5842
6116
|
const lance = new LanceStore(resolvedDataDir);
|
|
5843
|
-
const embeddings = new EmbeddingEngine(appConfig.embedding
|
|
6117
|
+
const embeddings = new EmbeddingEngine(appConfig.embedding);
|
|
5844
6118
|
await embeddings.initialize();
|
|
6119
|
+
await lance.setEmbeddingFunction(appConfig.embedding);
|
|
5845
6120
|
const resolvedProjectRoot = config.resolveProjectRoot();
|
|
5846
6121
|
const definitionService = new StoreDefinitionService(resolvedProjectRoot);
|
|
5847
6122
|
const gitignoreService = new GitignoreService(resolvedProjectRoot);
|
|
5848
6123
|
const storeOptions = {
|
|
5849
6124
|
definitionService,
|
|
5850
6125
|
gitignoreService,
|
|
5851
|
-
projectRoot: resolvedProjectRoot
|
|
6126
|
+
projectRoot: resolvedProjectRoot,
|
|
6127
|
+
embeddingModelId: appConfig.embedding.model
|
|
5852
6128
|
};
|
|
5853
6129
|
const store = new StoreService(resolvedDataDir, storeOptions);
|
|
5854
6130
|
await store.initialize();
|
|
5855
6131
|
const codeGraph = new CodeGraphService(resolvedDataDir, pythonBridge);
|
|
5856
6132
|
const manifest = new ManifestService(resolvedDataDir);
|
|
5857
|
-
const search = new SearchService(lance,
|
|
6133
|
+
const search = new SearchService(lance, codeGraph, appConfig.search);
|
|
5858
6134
|
const index = new IndexService(lance, embeddings, {
|
|
5859
6135
|
codeGraphService: codeGraph,
|
|
5860
6136
|
manifestService: manifest,
|
|
@@ -5946,4 +6222,4 @@ export {
|
|
|
5946
6222
|
createServices,
|
|
5947
6223
|
destroyServices
|
|
5948
6224
|
};
|
|
5949
|
-
//# sourceMappingURL=chunk-
|
|
6225
|
+
//# sourceMappingURL=chunk-ZR23KJPJ.js.map
|