@vectorstores/azure 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/index.cjs +1656 -0
- package/dist/index.d.cts +585 -0
- package/dist/index.d.ts +585 -0
- package/dist/index.edge-light.d.ts +585 -0
- package/dist/index.edge-light.js +1636 -0
- package/dist/index.js +1636 -0
- package/dist/storage.cjs +458 -0
- package/dist/storage.d.cts +220 -0
- package/dist/storage.d.ts +220 -0
- package/dist/storage.edge-light.d.ts +220 -0
- package/dist/storage.edge-light.js +451 -0
- package/dist/storage.js +451 -0
- package/package.json +77 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,1636 @@
|
|
|
1
|
+
import { Document, metadataDictToNode, BaseVectorStore, MetadataMode, nodeToMetadata, FilterOperator, FilterCondition, VectorStoreQueryMode } from '@vectorstores/core';
|
|
2
|
+
export * from './storage.js';
|
|
3
|
+
import { KnownVectorSearchAlgorithmKind, KnownAnalyzerNames, KnownSearchFieldDataType, KnownVectorSearchAlgorithmMetric, KnownVectorSearchCompressionKind, SearchClient, SearchIndexClient, AzureKeyCredential, IndexDocumentsBatch } from '@azure/search-documents';
|
|
4
|
+
import { DefaultAzureCredential, ManagedIdentityCredential } from '@azure/identity';
|
|
5
|
+
import { consoleLogger, getEnv } from '@vectorstores/env';
|
|
6
|
+
import { MongoClient } from 'mongodb';
|
|
7
|
+
import { VectorEmbeddingDistanceFunction, VectorEmbeddingDataType, VectorIndexType, CosmosClient } from '@azure/cosmos';
|
|
8
|
+
|
|
9
|
+
/**
 * Reads items from a Cosmos DB container and converts each one into a Document.
 */
class SimpleCosmosDBReader {
  /**
   * @param client Cosmos client; only `client.database(name).container(name).items.query(q).fetchAll()` is used.
   */
  constructor(client) {
    this.client = client;
  }

  /**
   * Loads data from a Cosmos DB container.
   *
   * @param config Reader configuration:
   *   - `databaseName`, `containerName` (required)
   *   - `query` (defaults to "SELECT * FROM c")
   *   - `fields`: item properties joined to form the document text (defaults to ["text"])
   *   - `fieldSeparator`: string placed between the joined field values (defaults to "")
   *   - `metadataFields`: item properties copied into the document metadata (optional)
   * @returns {Promise<Document[]>}
   * @throws {Error} When `databaseName` or `containerName` is missing, or when the
   *   query fails (the underlying error is attached as `cause`).
   */
  async loadData(config) {
    if (!config.databaseName || !config.containerName) {
      throw new Error("databaseName and containerName are required");
    }
    const container = this.client
      .database(config.databaseName)
      .container(config.containerName);
    const query = config.query || "SELECT * FROM c";
    const fields = config.fields || ["text"];
    const fieldSeparator = config.fieldSeparator || "";
    const metadataFields = config.metadataFields;
    try {
      const res = await container.items.query(query).fetchAll();
      const documents = [];
      for (const item of res.resources) {
        // Array-valued fields are flattened one level before joining.
        const text = fields.flatMap((name) => item[name]).join(fieldSeparator);
        const metadata = metadataFields
          ? Object.fromEntries(metadataFields.map((name) => [name, item[name]]))
          : {};
        documents.push(new Document({ id_: item.id, text, metadata }));
      }
      return documents;
    } catch (error) {
      // Keep the original error (and its stack) reachable through `cause`.
      throw new Error(`Error loading data from Cosmos DB: ${error}`, { cause: error });
    }
  }
}
|
|
56
|
+
|
|
57
|
+
/**
 * Fixed resource names and default limits used when configuring and querying
 * an Azure AI Search index.
 */
const AzureAISearchVectorStoreConfig = {
  // Names given to the vector search algorithms, profiles and compressions.
  ALGORITHM_HNSW_NAME: "myHnsw",
  ALGORITHM_EXHAUSTIVE_KNN_NAME: "myExhaustiveKnn",
  PROFILE_HNSW_NAME: "myHnswProfile",
  PROFILE_EXHAUSTIVE_KNN_NAME: "myExhaustiveKnnProfile",
  COMPRESSION_TYPE_SCALAR: "myScalarCompression",
  COMPRESSION_TYPE_BINARY: "myBinaryCompression",
  // Name of the semantic configuration.
  SEMANTIC_CONFIG_NAME: "mySemanticConfig",
  // Default maximum number of documents that can be sent in a single request.
  DEFAULT_MAX_BATCH_SIZE: 700,
  // 14MB in bytes
  DEFAULT_MAX_MB_SIZE: 14 * 1024 * 1024,
  DEFAULT_USER_AGENT_PREFIX: "vectorstores-ts",
  DEFAULT_AZURE_API_VERSION: "2024-09-01-preview",
};
|
|
72
|
+
|
|
73
|
+
/**
 * Base query runner: issues a search through the supplied search client and
 * converts the returned documents into nodes, scores and ids.
 * Subclasses override `createSearchQuery` / `createQueryVector` to change what is searched.
 */
class AzureQueryResultSearchBase {
  /**
   * @param query Query object; this class reads `similarityTopK` from it.
   * @param fieldMapping Maps the logical names "id", "chunk", "metadata" and "doc_id" to index field names.
   * @param odataFilter Optional filter string forwarded to the search call ("" when falsy).
   * @param searchClient Client whose `search(searchQuery, options)` is awaited.
   */
  constructor(query, fieldMapping, odataFilter, searchClient) {
    this._query = query;
    this.fieldMapping = fieldMapping;
    this.odataFilter = odataFilter;
    this.searchClient = searchClient;
  }

  /** Index field names requested for every result. */
  get selectFields() {
    return [
      this.fieldMapping["id"],
      this.fieldMapping["chunk"],
      this.fieldMapping["metadata"],
      this.fieldMapping["doc_id"],
    ];
  }

  /** @returns The search text; "*" in the base class. */
  createSearchQuery() {
    return "*";
  }

  /** @returns The vector queries; `null` (no vector query) in the base class. */
  createQueryVector() {
    return null;
  }

  /**
   * Runs the search and collects the results.
   *
   * @param searchQuery Search text passed to `searchClient.search`.
   * @param vectorQueries Vector queries; a falsy value is sent as an empty list.
   * @returns `{ nodes, similarities, ids }`, index-aligned. Documents whose metadata
   *   cannot be parsed or converted into a node are logged and left out.
   * @throws {Error} When no search client was supplied.
   */
  async _createQueryResult(searchQuery, vectorQueries) {
    const queries = vectorQueries || [];
    if (!this.searchClient) {
      throw new Error("SearchClient is not set");
    }
    const searchResults = await this.searchClient.search(searchQuery, {
      top: this._query.similarityTopK,
      select: this.selectFields,
      filter: this.odataFilter || "",
      vectorSearchOptions: {
        queries,
      },
    });
    const idResult = [];
    const nodeResult = [];
    const scoreResult = [];
    for await (const result of searchResults.results) {
      const { document } = result;
      const nodeId = document[this.fieldMapping["id"]];
      const score = result["score"];
      const chunk = document[this.fieldMapping["chunk"]];
      try {
        // Parsing happens inside the try so that one malformed metadata string
        // is logged and skipped instead of aborting the whole search.
        const metadataStr = document[this.fieldMapping["metadata"]];
        const metadata = typeof metadataStr === "string" ? JSON.parse(metadataStr) : {};
        const node = metadataDictToNode(metadata);
        node.setContent(chunk);
        consoleLogger.log(`Retrieved node id ${nodeId}`);
        idResult.push(nodeId);
        nodeResult.push(node);
        scoreResult.push(score);
      } catch (err) {
        consoleLogger.error(`Error while parsing metadata for node id ${nodeId}. Error: ${err}`);
      }
    }
    consoleLogger.log(`Search query '${searchQuery}' returned ${idResult.length} results.`);
    return {
      nodes: nodeResult,
      similarities: scoreResult,
      ids: idResult,
    };
  }

  /** Builds the query text and vector queries, then runs the search. */
  async search() {
    const searchQuery = this.createSearchQuery();
    const vectorQueries = this.createQueryVector();
    return await this._createQueryResult(searchQuery, vectorQueries);
  }
}
|
|
145
|
+
/**
 * Vector-only search: contributes a single vector query built from the query
 * embedding and keeps the inherited search text.
 */
class AzureQueryResultSearchDefault extends AzureQueryResultSearchBase {
  /**
   * @returns A one-element list holding the vector query against the mapped "embedding" field.
   * @throws {Error} When the query carries no embedding.
   */
  createQueryVector() {
    const embedding = this._query.queryEmbedding;
    if (!embedding) {
      throw new Error("query.queryEmbedding is missing");
    }
    const vectorQuery = {
      kind: "vector",
      vector: embedding,
      kNearestNeighborsCount: this._query.similarityTopK,
      fields: [this.fieldMapping["embedding"]],
    };
    return [vectorQuery];
  }
}
|
|
162
|
+
/**
 * Text-only search: uses the query string as the search text and keeps the
 * inherited (absent) vector query.
 */
class AzureQueryResultSearchSparse extends AzureQueryResultSearchBase {
  /**
   * @returns The query string.
   * @throws {Error} When the query string is missing or empty.
   */
  createSearchQuery() {
    const text = this._query.queryStr;
    if (text) {
      return text;
    }
    throw new Error("Query missing query string");
  }
}
|
|
170
|
+
/**
 * Hybrid search: combines the vector query of AzureQueryResultSearchDefault
 * with the search text of AzureQueryResultSearchSparse.
 */
class AzureQueryResultSearchHybrid extends AzureQueryResultSearchBase {
  /** @returns The vector queries produced by the vector-only strategy. */
  createQueryVector() {
    const vectorSearch = new AzureQueryResultSearchDefault(
      this._query,
      this.fieldMapping,
      this.odataFilter,
      this.searchClient,
    );
    return vectorSearch.createQueryVector();
  }

  /** @returns The search text produced by the text-only strategy. */
  createSearchQuery() {
    const textSearch = new AzureQueryResultSearchSparse(
      this._query,
      this.fieldMapping,
      this.odataFilter,
      this.searchClient,
    );
    return textSearch.createSearchQuery();
  }
}
|
|
178
|
+
/**
 * Hybrid search with semantic ranking: issues the search with
 * `queryType: "semantic"` and reports each result's `rerankerScore`.
 */
class AzureQueryResultSearchSemanticHybrid extends AzureQueryResultSearchHybrid {
  /**
   * @returns A one-element list holding the vector query against the mapped "embedding" field.
   * @throws {Error} When the query carries no embedding.
   */
  createQueryVector() {
    if (!this._query.queryEmbedding) {
      throw new Error("query.queryEmbedding is missing");
    }
    return [
      {
        kind: "vector",
        vector: this._query.queryEmbedding,
        // kNearestNeighborsCount is set to 50 to align with the number of documents
        // accepted by the Azure semantic reranking model.
        // https://learn.microsoft.com/azure/search/semantic-search-overview
        kNearestNeighborsCount: 50,
        fields: [this.fieldMapping["embedding"]],
      },
    ];
  }

  /**
   * Runs the semantic search and collects the results.
   *
   * @param searchQuery Search text passed to `searchClient.search`.
   * @param vectorQueries Vector queries forwarded as-is.
   * @returns `{ nodes, similarities, ids }`, index-aligned; `similarities` holds the
   *   reranker scores. Documents whose metadata cannot be parsed or converted into
   *   a node are logged and left out.
   * @throws {Error} When no search client was supplied.
   */
  async _createQueryResult(searchQuery, vectorQueries) {
    if (!this.searchClient) {
      throw new Error("SearchClient not set");
    }
    const searchResults = await this.searchClient.search(searchQuery, {
      vectorSearchOptions: {
        queries: vectorQueries,
      },
      semanticSearchOptions: {
        configurationName: AzureAISearchVectorStoreConfig.SEMANTIC_CONFIG_NAME,
      },
      top: this._query.similarityTopK,
      select: this.selectFields,
      filter: this.odataFilter || "",
      queryType: "semantic",
    });
    const idResult = [];
    const nodeResult = [];
    const scoreResult = [];
    for await (const result of searchResults.results) {
      const { document } = result;
      const nodeId = document[this.fieldMapping["id"]];
      const chunk = document[this.fieldMapping["chunk"]];
      const score = result["rerankerScore"];
      try {
        // Parsing happens inside the try so that one malformed metadata string
        // is logged and skipped instead of aborting the whole search.
        const metadataStr = document[this.fieldMapping["metadata"]];
        const metadata = metadataStr ? JSON.parse(metadataStr) : {};
        const node = metadataDictToNode(metadata);
        node.setContent(chunk);
        idResult.push(nodeId);
        nodeResult.push(node);
        scoreResult.push(score);
      } catch (err) {
        consoleLogger.error(`Error while parsing metadata for node id ${nodeId}. Error: ${err}`);
      }
    }
    return {
      nodes: nodeResult,
      similarities: scoreResult,
      ids: idResult,
    };
  }
}
|
|
241
|
+
|
|
242
|
+
/**
 * Enumeration representing the supported index management operations.
 * Each member's value is the string carried in `options.indexManagement`.
 */
const IndexManagement = {
  NO_VALIDATION: "NoValidation",
  VALIDATE_INDEX: "ValidateIndex",
  CREATE_IF_NOT_EXISTS: "CreateIfNotExists",
};
|
|
250
|
+
/**
 * Enumeration representing the supported types for metadata fields in an
 * Azure AI Search Index, corresponds with types supported in a flat
 * metadata dictionary. Each member's value is the EDM type name string.
 */
const MetadataIndexFieldType = {
  STRING: "Edm.String",
  BOOLEAN: "Edm.Boolean",
  INT32: "Edm.Int32",
  INT64: "Edm.Int64",
  DOUBLE: "Edm.Double",
  COLLECTION: "Collection(Edm.String)",
};
|
|
263
|
+
/**
 * Builds the options object for one page of a filtered search.
 *
 * @param fieldMapping Object whose own keys become the `select` list.
 * @param filterStr Filter expression placed in `filter`.
 * @param batchSize Page size placed in `top`.
 * @param offset Number of results to skip, placed in `skip`.
 * @returns `{ filter, top, skip, select }`
 */
// NOTE(review): `select` lists the mapping's keys, whereas
// AzureQueryResultSearchBase#selectFields lists the mapped values — confirm this is intended.
const createSearchRequest = (fieldMapping, filterStr, batchSize, offset) => ({
  filter: filterStr,
  top: batchSize,
  skip: offset,
  select: Object.keys(fieldMapping),
});
|
|
271
|
+
/**
|
|
272
|
+
* Azure AI Search vector store.
|
|
273
|
+
*
|
|
274
|
+
* @example
|
|
275
|
+
```typescript
|
|
276
|
+
import { DefaultAzureCredential, getBearerTokenProvider} from "@azure/identity";
|
|
277
|
+
import {KnownAnalyzerNames, KnownVectorSearchAlgorithmKind } from "@azure/search-documents";
|
|
278
|
+
|
|
279
|
+
// 1- Setup Azure OpenAI
|
|
280
|
+
const azureADTokenProvider = getBearerTokenProvider(
|
|
281
|
+
new DefaultAzureCredential(),
|
|
282
|
+
"https://cognitiveservices.azure.com/.default",
|
|
283
|
+
);
|
|
284
|
+
|
|
285
|
+
// IMPORTANT: You need to deploy your own embedding model as well as your own chat completion model
|
|
286
|
+
// NOTE: You can use whatever embedding model and language model that is supported by vectorstores
|
|
287
|
+
const azure = {
|
|
288
|
+
azureADTokenProvider,
|
|
289
|
+
deployment: process.env.AZURE_DEPLOYMENT_NAME,
|
|
290
|
+
};
|
|
291
|
+
Settings.llm = new OpenAI({ azure });
|
|
292
|
+
Settings.embedModel = new OpenAIEmbedding({
|
|
293
|
+
model: process.env.EMBEDDING_MODEL,
|
|
294
|
+
azure: {
|
|
295
|
+
...azure,
|
|
296
|
+
deployment: process.env.EMBEDDING_MODEL,
|
|
297
|
+
},
|
|
298
|
+
});
|
|
299
|
+
|
|
300
|
+
// ---------------------------------------------------------
|
|
301
|
+
// 2- Setup Azure AI Search
|
|
302
|
+
// Define env variables in .env file
|
|
303
|
+
// AZURE_AI_SEARCH_ENDPOINT=
|
|
304
|
+
// AZURE_AI_SEARCH_KEY=
|
|
305
|
+
// AZURE_OPENAI_ENDPOINT=
|
|
306
|
+
// EMBEDDING_MODEL=text-embedding-ada-002
|
|
307
|
+
// AZURE_DEPLOYMENT_NAME=gpt-4
|
|
308
|
+
// AZURE_API_VERSION=2024-09-01-preview
|
|
309
|
+
|
|
310
|
+
// Define index name
|
|
311
|
+
const indexName = "vectorstores-vector-store";
|
|
312
|
+
|
|
313
|
+
// ---------------------------------------------------------
|
|
314
|
+
// 3a- Create Index (if it does not exist)
|
|
315
|
+
// id: Edm.String
|
|
316
|
+
// chunk: Edm.String
|
|
317
|
+
// embedding: Collection(Edm.Single)
|
|
318
|
+
// metadata: Edm.String
|
|
319
|
+
// doc_id: Edm.String
|
|
320
|
+
// author: Edm.String
|
|
321
|
+
// theme: Edm.String
|
|
322
|
+
// director: Edm.String
|
|
323
|
+
|
|
324
|
+
// Define metadata fields with their respective configurations
|
|
325
|
+
const metadataFields = {
|
|
326
|
+
author: "author",
|
|
327
|
+
theme: ["theme", MetadataIndexFieldType.STRING],
|
|
328
|
+
director: "director",
|
|
329
|
+
};
|
|
330
|
+
|
|
331
|
+
// Define index parameters and vector store configuration
|
|
332
|
+
// Index validation:
|
|
333
|
+
// - IndexManagement.VALIDATE_INDEX: will validate before creating the embedding index and will throw a runtime error if the index does not exist
|
|
334
|
+
// - IndexManagement.NO_VALIDATION: will try to access the index and will throw a runtime error if the index does not exist
|
|
335
|
+
// - IndexManagement.CREATE_IF_NOT_EXISTS: will create the index if it does not exist
|
|
336
|
+
|
|
337
|
+
const vectorStore = new AzureAISearchVectorStore({
|
|
338
|
+
filterableMetadataFieldKeys:
|
|
339
|
+
metadataFields as unknown as FilterableMetadataFieldKeysType,
|
|
340
|
+
indexName,
|
|
341
|
+
indexManagement: IndexManagement.CREATE_IF_NOT_EXISTS,
|
|
342
|
+
idFieldKey: "id",
|
|
343
|
+
chunkFieldKey: "chunk",
|
|
344
|
+
embeddingFieldKey: "embedding",
|
|
345
|
+
metadataStringFieldKey: "metadata",
|
|
346
|
+
docIdFieldKey: "doc_id",
|
|
347
|
+
embeddingDimensionality: 1536,
|
|
348
|
+
hiddenFieldKeys: ["embedding"],
|
|
349
|
+
languageAnalyzer: KnownAnalyzerNames.EnLucene,
|
|
350
|
+
// store vectors on disk
|
|
351
|
+
vectorAlgorithmType: KnownVectorSearchAlgorithmKind.ExhaustiveKnn,
|
|
352
|
+
|
|
353
|
+
// Optional: Set to "scalar" or "binary" if using HNSW
|
|
354
|
+
compressionType: KnownVectorSearchCompressionKind.BinaryQuantization,
|
|
355
|
+
});
|
|
356
|
+
|
|
357
|
+
// ---------------------------------------------------------
|
|
358
|
+
// 3b- Loading documents
|
|
359
|
+
// Load the documents stored in the data/paul_graham/ using the SimpleDirectoryReader
|
|
360
|
+
// NOTE: You can use whatever reader that is supported by vectorstores
|
|
361
|
+
|
|
362
|
+
// Load documents using a directory reader
|
|
363
|
+
const documents = await new SimpleDirectoryReader().loadData(
|
|
364
|
+
"data/paul_graham/",
|
|
365
|
+
);
|
|
366
|
+
const storageContext = await storageContextFromDefaults({ vectorStore });
|
|
367
|
+
|
|
368
|
+
// Create index from documents with the specified storage context
|
|
369
|
+
const index = await VectorStoreIndex.fromDocuments(documents, {
|
|
370
|
+
storageContext,
|
|
371
|
+
docStoreStrategy: DocStoreStrategy.UPSERTS,
|
|
372
|
+
});
|
|
373
|
+
|
|
374
|
+
const queryEngine = index.asQueryEngine();
|
|
375
|
+
const response = await queryEngine.query({
|
|
376
|
+
query: "What did the author do growing up?",
|
|
377
|
+
similarityTopK: 3,
|
|
378
|
+
} as any);
|
|
379
|
+
console.log({ response });
```
|
|
380
|
+
*/ class AzureAISearchVectorStore extends BaseVectorStore {
|
|
381
|
+
#languageAnalyzer;
|
|
382
|
+
#embeddingDimensionality;
|
|
383
|
+
#vectorProfileName;
|
|
384
|
+
#compressionType;
|
|
385
|
+
#indexManagement;
|
|
386
|
+
#indexName;
|
|
387
|
+
#fieldMapping;
|
|
388
|
+
#metadataToIndexFieldMap;
|
|
389
|
+
#idFieldKey;
|
|
390
|
+
#chunkFieldKey;
|
|
391
|
+
#embeddingFieldKey;
|
|
392
|
+
#docIdFieldKey;
|
|
393
|
+
#metadataStringFieldKey;
|
|
394
|
+
#serviceApiVersion;
|
|
395
|
+
#indexMapping;
|
|
396
|
+
#hiddenFiledKeys;
|
|
397
|
+
constructor(options){
|
|
398
|
+
super(options), this.storesText = true, this.#metadataToIndexFieldMap = new Map(), this.flatMetadata = true;
|
|
399
|
+
// set default values
|
|
400
|
+
options.vectorAlgorithmType ||= KnownVectorSearchAlgorithmKind.ExhaustiveKnn;
|
|
401
|
+
options.languageAnalyzer ||= KnownAnalyzerNames.EnLucene;
|
|
402
|
+
options.indexManagement ||= "NoValidation";
|
|
403
|
+
options.embeddingDimensionality ||= 1536;
|
|
404
|
+
options.serviceApiVersion ||= getEnv("AZURE_SEARCH_API_VERSION");
|
|
405
|
+
options.hiddenFieldKeys ||= [];
|
|
406
|
+
// set props
|
|
407
|
+
this.#serviceApiVersion = options.serviceApiVersion || AzureAISearchVectorStoreConfig.DEFAULT_AZURE_API_VERSION;
|
|
408
|
+
this.#languageAnalyzer = options.languageAnalyzer;
|
|
409
|
+
this.#compressionType = options.compressionType;
|
|
410
|
+
this.#embeddingDimensionality = options.embeddingDimensionality;
|
|
411
|
+
this.#indexManagement = options.indexManagement;
|
|
412
|
+
this.#indexName = options.indexName;
|
|
413
|
+
this.#idFieldKey = options.idFieldKey;
|
|
414
|
+
this.#docIdFieldKey = options.docIdFieldKey;
|
|
415
|
+
this.#chunkFieldKey = options.chunkFieldKey;
|
|
416
|
+
this.#embeddingFieldKey = options.embeddingFieldKey;
|
|
417
|
+
this.#metadataStringFieldKey = options.metadataStringFieldKey;
|
|
418
|
+
this.#hiddenFiledKeys = options.hiddenFieldKeys;
|
|
419
|
+
this.#indexMapping = options.indexMapping || this.#defaultIndexMapping;
|
|
420
|
+
// Default field mapping
|
|
421
|
+
this.#fieldMapping = {
|
|
422
|
+
["id"]: options.idFieldKey,
|
|
423
|
+
["doc_id"]: options.docIdFieldKey,
|
|
424
|
+
["chunk"]: options.chunkFieldKey,
|
|
425
|
+
["embedding"]: options.embeddingFieldKey,
|
|
426
|
+
["metadata"]: options.metadataStringFieldKey
|
|
427
|
+
};
|
|
428
|
+
this.#setVectorProfileName(options.vectorAlgorithmType);
|
|
429
|
+
this.#valideSearchOrIndexClient(options);
|
|
430
|
+
// Normalizing metadata to index fields
|
|
431
|
+
this.#metadataToIndexFieldMap = this.#normalizeMetadataToIndexFields(options.filterableMetadataFieldKeys);
|
|
432
|
+
}
|
|
433
|
+
// private
|
|
434
|
+
#normalizeMetadataToIndexFields(filterableMetadataFieldKeys) {
|
|
435
|
+
const indexFieldSpec = new Map();
|
|
436
|
+
if (Array.isArray(filterableMetadataFieldKeys)) {
|
|
437
|
+
// if filterableMetadataFieldKeys is an array, use the field name as the index field name
|
|
438
|
+
// eg. [
|
|
439
|
+
// "author",
|
|
440
|
+
// "theme",
|
|
441
|
+
// "director"
|
|
442
|
+
// ] => {
|
|
443
|
+
// "author": ["author", "Edm.String"],
|
|
444
|
+
// "theme": ["theme", "Edm.String"],
|
|
445
|
+
// "director": ["director", "Edm.String"]
|
|
446
|
+
// }
|
|
447
|
+
filterableMetadataFieldKeys.forEach((field)=>{
|
|
448
|
+
indexFieldSpec.set(field, [
|
|
449
|
+
field,
|
|
450
|
+
"Edm.String"
|
|
451
|
+
]);
|
|
452
|
+
});
|
|
453
|
+
} else if (typeof filterableMetadataFieldKeys === "object") {
|
|
454
|
+
// if filterableMetadataFieldKeys is an object, use the key as the index field name
|
|
455
|
+
// and the value as the metadata field name
|
|
456
|
+
// eg. {
|
|
457
|
+
// "author": "author",
|
|
458
|
+
// "theme": ["topic", MetadataIndexFieldType.STRING],
|
|
459
|
+
// "director": "director"
|
|
460
|
+
// } => {
|
|
461
|
+
// "author": ["author", "Edm.String"],
|
|
462
|
+
// "theme": ["topic", "Edm.String"],
|
|
463
|
+
// "director": ["director", "Edm.String"]
|
|
464
|
+
// }
|
|
465
|
+
// we also support specifying the metadata field type
|
|
466
|
+
// MetadataIndexFieldType.INT32 --> "Edm.Int32"
|
|
467
|
+
// MetadataIndexFieldType.INT64 --> "Edm.Int64"
|
|
468
|
+
// MetadataIndexFieldType.DOUBLE --> "Edm.Double"
|
|
469
|
+
// MetadataIndexFieldType.BOOLEAN --> "Edm.Boolean"
|
|
470
|
+
// MetadataIndexFieldType.COLLECTION --> "Collection(Edm.String)"
|
|
471
|
+
Object.entries(filterableMetadataFieldKeys).forEach(([k, v])=>{
|
|
472
|
+
if (Array.isArray(v)) {
|
|
473
|
+
indexFieldSpec.set(k, [
|
|
474
|
+
v[0],
|
|
475
|
+
v[1]
|
|
476
|
+
]);
|
|
477
|
+
} else {
|
|
478
|
+
switch(v){
|
|
479
|
+
case "Edm.String":
|
|
480
|
+
indexFieldSpec.set(k, [
|
|
481
|
+
v,
|
|
482
|
+
"Edm.String"
|
|
483
|
+
]);
|
|
484
|
+
break;
|
|
485
|
+
case "Edm.Int32":
|
|
486
|
+
indexFieldSpec.set(k, [
|
|
487
|
+
v,
|
|
488
|
+
"Edm.Int32"
|
|
489
|
+
]);
|
|
490
|
+
break;
|
|
491
|
+
case "Edm.Int64":
|
|
492
|
+
indexFieldSpec.set(k, [
|
|
493
|
+
v,
|
|
494
|
+
"Edm.Int64"
|
|
495
|
+
]);
|
|
496
|
+
break;
|
|
497
|
+
case "Edm.Double":
|
|
498
|
+
indexFieldSpec.set(k, [
|
|
499
|
+
v,
|
|
500
|
+
"Edm.Double"
|
|
501
|
+
]);
|
|
502
|
+
break;
|
|
503
|
+
case "Edm.Boolean":
|
|
504
|
+
indexFieldSpec.set(k, [
|
|
505
|
+
v,
|
|
506
|
+
"Edm.Boolean"
|
|
507
|
+
]);
|
|
508
|
+
break;
|
|
509
|
+
case "Collection(Edm.String)":
|
|
510
|
+
indexFieldSpec.set(k, [
|
|
511
|
+
v,
|
|
512
|
+
"Collection(Edm.String)"
|
|
513
|
+
]);
|
|
514
|
+
break;
|
|
515
|
+
default:
|
|
516
|
+
// Index field name and metadata field name may differ
|
|
517
|
+
// Use String as the default index field type
|
|
518
|
+
indexFieldSpec.set(k, [
|
|
519
|
+
v,
|
|
520
|
+
"Edm.String"
|
|
521
|
+
]);
|
|
522
|
+
break;
|
|
523
|
+
}
|
|
524
|
+
}
|
|
525
|
+
});
|
|
526
|
+
}
|
|
527
|
+
return indexFieldSpec;
|
|
528
|
+
}
|
|
529
|
+
#defaultIndexMapping(node, metadata) {
|
|
530
|
+
// include metadata fields in the index document
|
|
531
|
+
const filterableMetadata = {};
|
|
532
|
+
for (const [fieldName, _fieldType] of this.#metadataToIndexFieldMap.values()){
|
|
533
|
+
filterableMetadata[fieldName] = metadata[fieldName];
|
|
534
|
+
}
|
|
535
|
+
return {
|
|
536
|
+
[this.#embeddingFieldKey]: node.getEmbedding(),
|
|
537
|
+
[this.#idFieldKey]: node.id_,
|
|
538
|
+
[this.#docIdFieldKey]: node.id_,
|
|
539
|
+
[this.#chunkFieldKey]: node.getContent(MetadataMode.NONE),
|
|
540
|
+
[this.#metadataStringFieldKey]: JSON.stringify(metadata),
|
|
541
|
+
...filterableMetadata
|
|
542
|
+
};
|
|
543
|
+
}
|
|
544
|
+
#setVectorProfileName(vectorAlgorithmType) {
|
|
545
|
+
if (vectorAlgorithmType === KnownVectorSearchAlgorithmKind.ExhaustiveKnn) {
|
|
546
|
+
this.#vectorProfileName = "myExhaustiveKnnProfile";
|
|
547
|
+
} else if (vectorAlgorithmType === KnownVectorSearchAlgorithmKind.Hnsw) {
|
|
548
|
+
this.#vectorProfileName = "myHnswProfile";
|
|
549
|
+
} else {
|
|
550
|
+
throw new Error("Only 'exhaustiveKnn' and 'hnsw' are supported for vectorAlgorithmType");
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
/**
|
|
554
|
+
* Create a list of index fields for storing metadata values.
|
|
555
|
+
* @returns List of index fields for storing metadata values
|
|
556
|
+
*/ #createMetadataIndexFields() {
|
|
557
|
+
const indexFields = [];
|
|
558
|
+
for (const [fieldName, fieldType] of this.#metadataToIndexFieldMap.values()){
|
|
559
|
+
if (this.#fieldMapping[fieldName]) {
|
|
560
|
+
consoleLogger.log(`Skipping metadata field ${fieldName} as it is already mapped to an index field`);
|
|
561
|
+
continue;
|
|
562
|
+
}
|
|
563
|
+
let indexFieldType;
|
|
564
|
+
switch(fieldType){
|
|
565
|
+
case "Edm.String":
|
|
566
|
+
indexFieldType = KnownSearchFieldDataType.String;
|
|
567
|
+
break;
|
|
568
|
+
case "Edm.Int32":
|
|
569
|
+
indexFieldType = KnownSearchFieldDataType.Int32;
|
|
570
|
+
break;
|
|
571
|
+
case "Edm.Int64":
|
|
572
|
+
indexFieldType = KnownSearchFieldDataType.Int64;
|
|
573
|
+
break;
|
|
574
|
+
case "Edm.Double":
|
|
575
|
+
indexFieldType = KnownSearchFieldDataType.Double;
|
|
576
|
+
break;
|
|
577
|
+
case "Edm.Boolean":
|
|
578
|
+
indexFieldType = KnownSearchFieldDataType.Boolean;
|
|
579
|
+
break;
|
|
580
|
+
case "Collection(Edm.String)":
|
|
581
|
+
indexFieldType = `Collection(${KnownSearchFieldDataType.String})`;
|
|
582
|
+
break;
|
|
583
|
+
default:
|
|
584
|
+
throw new Error(`Unsupported field type: ${fieldType}`);
|
|
585
|
+
}
|
|
586
|
+
indexFields.push({
|
|
587
|
+
name: fieldName,
|
|
588
|
+
type: indexFieldType,
|
|
589
|
+
filterable: true
|
|
590
|
+
});
|
|
591
|
+
}
|
|
592
|
+
return indexFields;
|
|
593
|
+
}
|
|
594
|
+
// index management
|
|
595
|
+
async #indexExists(indexName) {
|
|
596
|
+
if (!indexName) {
|
|
597
|
+
throw new Error(`options.indexName is not valid`);
|
|
598
|
+
}
|
|
599
|
+
const availableIndexNames = await this._indexClient?.listIndexesNames();
|
|
600
|
+
if (!availableIndexNames) {
|
|
601
|
+
return false;
|
|
602
|
+
}
|
|
603
|
+
let listOfIndexNames = await availableIndexNames.next();
|
|
604
|
+
const indexNames = [];
|
|
605
|
+
while(!listOfIndexNames.done){
|
|
606
|
+
indexNames.push(listOfIndexNames.value);
|
|
607
|
+
listOfIndexNames = await availableIndexNames.next();
|
|
608
|
+
}
|
|
609
|
+
return indexNames.includes(indexName);
|
|
610
|
+
}
|
|
611
|
+
async #createIndexIfNotExists(indexName) {
|
|
612
|
+
const indexExists = await this.#indexExists(indexName);
|
|
613
|
+
if (!indexExists) {
|
|
614
|
+
consoleLogger.log(`Index ${indexName} does not exist in Azure AI Search, creating index`);
|
|
615
|
+
await this.#createIndex(indexName);
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
/**
|
|
619
|
+
* Creates a default index based on the supplied index name, key field names and
|
|
620
|
+
* metadata filtering keys.
|
|
621
|
+
* @param indexName The name of the index to create
|
|
622
|
+
*/ async #createIndex(indexName) {
|
|
623
|
+
consoleLogger.log(`Configuring ${indexName} fields for Azure AI Search`);
|
|
624
|
+
const fields = [
|
|
625
|
+
{
|
|
626
|
+
name: this.#fieldMapping["id"],
|
|
627
|
+
type: KnownSearchFieldDataType.String,
|
|
628
|
+
hidden: this.#hiddenFiledKeys?.includes(this.#fieldMapping["id"]),
|
|
629
|
+
key: true,
|
|
630
|
+
filterable: true,
|
|
631
|
+
retrievable: true,
|
|
632
|
+
searchable: true
|
|
633
|
+
},
|
|
634
|
+
{
|
|
635
|
+
name: this.#fieldMapping["chunk"],
|
|
636
|
+
type: KnownSearchFieldDataType.String,
|
|
637
|
+
hidden: this.#hiddenFiledKeys?.includes(this.#fieldMapping["chunk"]),
|
|
638
|
+
analyzerName: this.#languageAnalyzer,
|
|
639
|
+
searchable: true
|
|
640
|
+
},
|
|
641
|
+
{
|
|
642
|
+
name: this.#fieldMapping["embedding"],
|
|
643
|
+
type: `Collection(${KnownSearchFieldDataType.Single})`,
|
|
644
|
+
vectorSearchDimensions: this.#embeddingDimensionality,
|
|
645
|
+
vectorSearchProfileName: this.#vectorProfileName,
|
|
646
|
+
hidden: this.#hiddenFiledKeys?.includes(this.#fieldMapping["embedding"]),
|
|
647
|
+
searchable: true
|
|
648
|
+
},
|
|
649
|
+
{
|
|
650
|
+
name: this.#fieldMapping["metadata"],
|
|
651
|
+
hidden: this.#hiddenFiledKeys?.includes(this.#fieldMapping["metadata"]),
|
|
652
|
+
type: KnownSearchFieldDataType.String
|
|
653
|
+
},
|
|
654
|
+
{
|
|
655
|
+
name: this.#fieldMapping["doc_id"],
|
|
656
|
+
type: KnownSearchFieldDataType.String,
|
|
657
|
+
hidden: this.#hiddenFiledKeys?.includes(this.#fieldMapping["doc_id"]),
|
|
658
|
+
filterable: true,
|
|
659
|
+
retrievable: true,
|
|
660
|
+
searchable: true
|
|
661
|
+
}
|
|
662
|
+
];
|
|
663
|
+
consoleLogger.log(`Configuring ${indexName} metadata fields`);
|
|
664
|
+
const metadataIndexFields = this.#createMetadataIndexFields();
|
|
665
|
+
fields.push(...metadataIndexFields);
|
|
666
|
+
consoleLogger.log(`Configuring ${indexName} vector search`);
|
|
667
|
+
const compressions = this.#getCompressions();
|
|
668
|
+
consoleLogger.log(`Configuring ${indexName} vector search with ${this.#compressionType} compression`);
|
|
669
|
+
const vectorSearch = {
|
|
670
|
+
algorithms: [
|
|
671
|
+
{
|
|
672
|
+
name: AzureAISearchVectorStoreConfig.ALGORITHM_HNSW_NAME,
|
|
673
|
+
kind: KnownVectorSearchAlgorithmKind.Hnsw,
|
|
674
|
+
parameters: {
|
|
675
|
+
m: 4,
|
|
676
|
+
efConstruction: 400,
|
|
677
|
+
efSearch: 500,
|
|
678
|
+
metric: KnownVectorSearchAlgorithmMetric.Cosine
|
|
679
|
+
}
|
|
680
|
+
},
|
|
681
|
+
{
|
|
682
|
+
name: AzureAISearchVectorStoreConfig.ALGORITHM_EXHAUSTIVE_KNN_NAME,
|
|
683
|
+
kind: KnownVectorSearchAlgorithmKind.ExhaustiveKnn,
|
|
684
|
+
parameters: {
|
|
685
|
+
metric: KnownVectorSearchAlgorithmMetric.Cosine
|
|
686
|
+
}
|
|
687
|
+
}
|
|
688
|
+
],
|
|
689
|
+
compressions,
|
|
690
|
+
profiles: [
|
|
691
|
+
{
|
|
692
|
+
name: AzureAISearchVectorStoreConfig.PROFILE_HNSW_NAME,
|
|
693
|
+
algorithmConfigurationName: AzureAISearchVectorStoreConfig.ALGORITHM_HNSW_NAME,
|
|
694
|
+
compressionName: compressions?.[0]?.compressionName
|
|
695
|
+
},
|
|
696
|
+
{
|
|
697
|
+
name: AzureAISearchVectorStoreConfig.PROFILE_EXHAUSTIVE_KNN_NAME,
|
|
698
|
+
algorithmConfigurationName: AzureAISearchVectorStoreConfig.ALGORITHM_EXHAUSTIVE_KNN_NAME
|
|
699
|
+
}
|
|
700
|
+
]
|
|
701
|
+
};
|
|
702
|
+
consoleLogger.log(`Configuring ${indexName} semantic search`);
|
|
703
|
+
const semanticConfig = {
|
|
704
|
+
name: AzureAISearchVectorStoreConfig.SEMANTIC_CONFIG_NAME,
|
|
705
|
+
prioritizedFields: {
|
|
706
|
+
contentFields: [
|
|
707
|
+
{
|
|
708
|
+
name: this.#fieldMapping["chunk"]
|
|
709
|
+
}
|
|
710
|
+
],
|
|
711
|
+
keywordsFields: [
|
|
712
|
+
{
|
|
713
|
+
name: this.#fieldMapping["metadata"]
|
|
714
|
+
}
|
|
715
|
+
],
|
|
716
|
+
titleField: {
|
|
717
|
+
name: this.#fieldMapping["id"]
|
|
718
|
+
}
|
|
719
|
+
}
|
|
720
|
+
};
|
|
721
|
+
const semanticSearch = {
|
|
722
|
+
configurations: [
|
|
723
|
+
semanticConfig
|
|
724
|
+
]
|
|
725
|
+
};
|
|
726
|
+
const index = {
|
|
727
|
+
name: indexName,
|
|
728
|
+
fields: fields,
|
|
729
|
+
vectorSearch: vectorSearch,
|
|
730
|
+
semanticSearch: semanticSearch
|
|
731
|
+
};
|
|
732
|
+
consoleLogger.log(`Creating ${indexName} search index with configuration:`);
|
|
733
|
+
consoleLogger.log({
|
|
734
|
+
index
|
|
735
|
+
});
|
|
736
|
+
await this._indexClient?.createIndex(index);
|
|
737
|
+
}
|
|
738
|
+
/**
|
|
739
|
+
* Get the compressions for the vector search
|
|
740
|
+
* @returns Array of compressions. See {@link VectorSearchCompression}
|
|
741
|
+
*/ #getCompressions() {
|
|
742
|
+
const compressions = [];
|
|
743
|
+
if (this.#compressionType === KnownVectorSearchCompressionKind.BinaryQuantization) {
|
|
744
|
+
compressions.push({
|
|
745
|
+
compressionName: AzureAISearchVectorStoreConfig.COMPRESSION_TYPE_BINARY,
|
|
746
|
+
kind: KnownVectorSearchCompressionKind.BinaryQuantization
|
|
747
|
+
});
|
|
748
|
+
} else if (this.#compressionType === KnownVectorSearchCompressionKind.ScalarQuantization) {
|
|
749
|
+
compressions.push({
|
|
750
|
+
compressionName: AzureAISearchVectorStoreConfig.COMPRESSION_TYPE_SCALAR,
|
|
751
|
+
kind: KnownVectorSearchCompressionKind.ScalarQuantization
|
|
752
|
+
});
|
|
753
|
+
}
|
|
754
|
+
return compressions;
|
|
755
|
+
}
|
|
756
|
+
#valideSearchOrIndexClient(options) {
|
|
757
|
+
if (options.searchClient) {
|
|
758
|
+
if (options.searchClient instanceof SearchClient) {
|
|
759
|
+
consoleLogger.log("Using provided Azure SearchClient");
|
|
760
|
+
this._searchClient = options.searchClient;
|
|
761
|
+
if (options.indexName) {
|
|
762
|
+
throw new Error("options.indexName cannot be supplied if using options.searchClient");
|
|
763
|
+
}
|
|
764
|
+
} else {
|
|
765
|
+
throw new Error("options.searchClient must be an instance of SearchClient");
|
|
766
|
+
}
|
|
767
|
+
} else {
|
|
768
|
+
this.createSearchClient(options);
|
|
769
|
+
}
|
|
770
|
+
if (options.indexClient) {
|
|
771
|
+
if (options.indexClient instanceof SearchIndexClient) {
|
|
772
|
+
if (!options.indexName) {
|
|
773
|
+
throw new Error("options.indexName must be supplied if using options.indexClient");
|
|
774
|
+
}
|
|
775
|
+
this._indexClient = options.indexClient;
|
|
776
|
+
} else {
|
|
777
|
+
throw new Error("options.indexClient must be an instance of SearchIndexClient");
|
|
778
|
+
}
|
|
779
|
+
} else {
|
|
780
|
+
this.createSearchIndexClient(options);
|
|
781
|
+
}
|
|
782
|
+
if (options.indexManagement === "CreateIfNotExists" && !this._indexClient) {
|
|
783
|
+
throw new Error("IndexManagement.CREATE_IF_NOT_EXISTS requires options.indexClient");
|
|
784
|
+
}
|
|
785
|
+
if (!this._searchClient && !this._indexClient) {
|
|
786
|
+
throw new Error("Either options.searchClient or options.indexClient must be supplied");
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
#buildCredentials(options) {
|
|
790
|
+
let { credential: credential, key, endpoint, indexName } = options;
|
|
791
|
+
// validate and use credential
|
|
792
|
+
if (credential) {
|
|
793
|
+
// if credential are provided, ensure they are an instance of valid credential instances
|
|
794
|
+
if (!(credential instanceof AzureKeyCredential || credential instanceof DefaultAzureCredential || credential instanceof ManagedIdentityCredential)) {
|
|
795
|
+
throw new Error("options.credential must be an instance of AzureKeyCredential or DefaultAzureCredential or ManagedIdentityCredential");
|
|
796
|
+
}
|
|
797
|
+
} else {
|
|
798
|
+
key ??= getEnv("AZURE_AI_SEARCH_KEY");
|
|
799
|
+
if (key) {
|
|
800
|
+
consoleLogger.log("Using provided Azure Search key");
|
|
801
|
+
credential = new AzureKeyCredential(key);
|
|
802
|
+
} else {
|
|
803
|
+
const clientId = getEnv("AZURE_CLIENT_ID");
|
|
804
|
+
if (clientId) {
|
|
805
|
+
consoleLogger.log("Using Azure Managed identity");
|
|
806
|
+
credential = new ManagedIdentityCredential(clientId);
|
|
807
|
+
} else {
|
|
808
|
+
// if key wasn't provided, try using DefaultAzureCredential
|
|
809
|
+
consoleLogger.log("Using Default Azure Credential");
|
|
810
|
+
credential = new DefaultAzureCredential();
|
|
811
|
+
}
|
|
812
|
+
}
|
|
813
|
+
}
|
|
814
|
+
// validate and use endpoint
|
|
815
|
+
endpoint ??= getEnv("AZURE_AI_SEARCH_ENDPOINT");
|
|
816
|
+
if (!endpoint) {
|
|
817
|
+
throw new Error("options.endpoint must be provided or set as an environment variable: AZURE_AI_SEARCH_ENDPOINT");
|
|
818
|
+
} else {
|
|
819
|
+
// check if enpoint is a valid URL
|
|
820
|
+
try {
|
|
821
|
+
new URL(endpoint);
|
|
822
|
+
} catch (error) {
|
|
823
|
+
throw new Error(`options.endpoint must be a valid URL.`);
|
|
824
|
+
}
|
|
825
|
+
}
|
|
826
|
+
// validate and use indexName
|
|
827
|
+
if (!indexName) {
|
|
828
|
+
if (this._searchClient) {
|
|
829
|
+
indexName = this._searchClient.indexName;
|
|
830
|
+
} else {
|
|
831
|
+
throw new Error("options.indexName must be provided");
|
|
832
|
+
}
|
|
833
|
+
}
|
|
834
|
+
return {
|
|
835
|
+
credential,
|
|
836
|
+
endpoint,
|
|
837
|
+
indexName
|
|
838
|
+
};
|
|
839
|
+
}
|
|
840
|
+
createSearchIndexClient(options) {
|
|
841
|
+
const { credential, endpoint } = this.#buildCredentials(options);
|
|
842
|
+
this._indexClient = new SearchIndexClient(endpoint, credential, {
|
|
843
|
+
serviceVersion: this.#serviceApiVersion,
|
|
844
|
+
userAgentOptions: {
|
|
845
|
+
userAgentPrefix: options.userAgent ?? AzureAISearchVectorStoreConfig.DEFAULT_USER_AGENT_PREFIX
|
|
846
|
+
}
|
|
847
|
+
});
|
|
848
|
+
}
|
|
849
|
+
createSearchClient(options) {
|
|
850
|
+
const { credential, endpoint, indexName } = this.#buildCredentials(options);
|
|
851
|
+
this._searchClient = new SearchClient(endpoint, indexName, credential, {
|
|
852
|
+
serviceVersion: this.#serviceApiVersion,
|
|
853
|
+
userAgentOptions: {
|
|
854
|
+
userAgentPrefix: options.userAgent ?? AzureAISearchVectorStoreConfig.DEFAULT_USER_AGENT_PREFIX
|
|
855
|
+
}
|
|
856
|
+
});
|
|
857
|
+
}
|
|
858
|
+
async #validateIndex(indexName) {
|
|
859
|
+
if (this._indexClient && indexName && !await this.#indexExists(indexName)) {
|
|
860
|
+
throw new Error(`Validation failed, index ${indexName} does not exist.`);
|
|
861
|
+
}
|
|
862
|
+
}
|
|
863
|
+
/**
|
|
864
|
+
* Create AI Search index document from embedding result.
|
|
865
|
+
* @param node The node to create the index document from
|
|
866
|
+
* @returns The mapped index document from the node
|
|
867
|
+
*/ #createIndexDocument(node) {
|
|
868
|
+
consoleLogger.log(`Mapping indexed document: ${node.id_}`);
|
|
869
|
+
const metadata = nodeToMetadata(node, true, this.#chunkFieldKey, this.flatMetadata);
|
|
870
|
+
return this.#indexMapping(node, metadata);
|
|
871
|
+
}
|
|
872
|
+
/**
|
|
873
|
+
* Generate an OData filter string using supplied metadata filters.
|
|
874
|
+
* @param metadataFilters
|
|
875
|
+
* @returns
|
|
876
|
+
*/ #createOdataFilter(metadataFilters) {
|
|
877
|
+
const odataFilter = [];
|
|
878
|
+
for (const subfilter of metadataFilters.filters){
|
|
879
|
+
// Join values with ' or ' to create an OR condition inside the any function
|
|
880
|
+
const metadataMapping = this.#metadataToIndexFieldMap.get(subfilter.key);
|
|
881
|
+
if (!metadataMapping) {
|
|
882
|
+
throw new Error(`Metadata field '${subfilter.key}' is missing a mapping to an index field. Please provide an entry in 'filterableMetadataFieldKeys' for this vector store.`);
|
|
883
|
+
}
|
|
884
|
+
const indexField = metadataMapping[0];
|
|
885
|
+
if (subfilter.operator === FilterOperator.IN) {
|
|
886
|
+
let valueStr;
|
|
887
|
+
if (Array.isArray(subfilter.value)) {
|
|
888
|
+
valueStr = subfilter.value.map((value)=>typeof value === "string" ? `t eq '${value}'` : `t eq ${value}`).join(" or ");
|
|
889
|
+
} else {
|
|
890
|
+
valueStr = typeof subfilter.value === "string" ? `t eq '${subfilter.value}'` : `t eq ${subfilter.value}`;
|
|
891
|
+
}
|
|
892
|
+
odataFilter.push(`${indexField}/any(t: ${valueStr})`);
|
|
893
|
+
} else if (subfilter.operator === FilterOperator.EQ) {
|
|
894
|
+
const escapedValue = typeof subfilter.value === "string" ? subfilter.value.replace(/'/g, "''") : subfilter.value;
|
|
895
|
+
odataFilter.push(`${indexField} eq '${escapedValue}'`);
|
|
896
|
+
} else {
|
|
897
|
+
throw new Error(`Unsupported filter operator ${subfilter.operator}. Supported operators are 'IN' and 'EQ'`);
|
|
898
|
+
}
|
|
899
|
+
}
|
|
900
|
+
let odataExpr = "";
|
|
901
|
+
if (metadataFilters.condition === FilterCondition.AND) {
|
|
902
|
+
odataExpr = odataFilter.join(" and ");
|
|
903
|
+
} else if (metadataFilters.condition === FilterCondition.OR) {
|
|
904
|
+
odataExpr = odataFilter.join(" or ");
|
|
905
|
+
} else {
|
|
906
|
+
throw new Error(`Unsupported filter condition ${metadataFilters.condition}. Supported conditions are 'AND' and 'OR'`);
|
|
907
|
+
}
|
|
908
|
+
consoleLogger.log(`OData filter: ${odataExpr}`);
|
|
909
|
+
return odataExpr;
|
|
910
|
+
}
|
|
911
|
+
#createNodeFromResult(result, fieldMapping) {
|
|
912
|
+
const { document } = result;
|
|
913
|
+
const metadataStr = document[fieldMapping["metadata"]];
|
|
914
|
+
const metadata = metadataStr ? JSON.parse(metadataStr) : {};
|
|
915
|
+
try {
|
|
916
|
+
const node = metadataDictToNode(metadata);
|
|
917
|
+
node.setContent(document[fieldMapping["chunk"]]);
|
|
918
|
+
node.embedding = document[fieldMapping["embedding"]];
|
|
919
|
+
return node;
|
|
920
|
+
} catch (error) {
|
|
921
|
+
throw new Error(`Failed to create node from search result`);
|
|
922
|
+
}
|
|
923
|
+
}
|
|
924
|
+
#buildFilterString(fieldMapping, nodeIds, filters) {
|
|
925
|
+
let filterStr = "";
|
|
926
|
+
if (nodeIds && nodeIds.length > 0) {
|
|
927
|
+
filterStr = nodeIds.map((nodeId)=>`${fieldMapping["id"]} eq '${nodeId}'`).join(" or ");
|
|
928
|
+
}
|
|
929
|
+
if (filters) {
|
|
930
|
+
const metadataFilter = this.#createOdataFilter(filters);
|
|
931
|
+
if (filterStr) {
|
|
932
|
+
filterStr = `(${filterStr}) or (${metadataFilter})`;
|
|
933
|
+
} else {
|
|
934
|
+
filterStr = metadataFilter;
|
|
935
|
+
}
|
|
936
|
+
}
|
|
937
|
+
return filterStr;
|
|
938
|
+
}
|
|
939
|
+
#processBatchResults(batchNodes, nodes, batchSize, limit) {
|
|
940
|
+
if (batchNodes.length === 0) {
|
|
941
|
+
return [
|
|
942
|
+
nodes,
|
|
943
|
+
false
|
|
944
|
+
];
|
|
945
|
+
}
|
|
946
|
+
nodes = [
|
|
947
|
+
...nodes,
|
|
948
|
+
...batchNodes
|
|
949
|
+
];
|
|
950
|
+
// If we've hit the requested limit, stop
|
|
951
|
+
if (limit && nodes.length >= limit) {
|
|
952
|
+
return [
|
|
953
|
+
nodes.slice(0, limit),
|
|
954
|
+
false
|
|
955
|
+
];
|
|
956
|
+
}
|
|
957
|
+
// If we got fewer results than batch size, we've hit the end
|
|
958
|
+
if (batchNodes.length < batchSize) {
|
|
959
|
+
return [
|
|
960
|
+
nodes,
|
|
961
|
+
false
|
|
962
|
+
];
|
|
963
|
+
}
|
|
964
|
+
return [
|
|
965
|
+
nodes,
|
|
966
|
+
true
|
|
967
|
+
];
|
|
968
|
+
}
|
|
969
|
+
// public
|
|
970
|
+
/**
|
|
971
|
+
* Get search client
|
|
972
|
+
* @returns Azure AI Search client. See {@link SearchClient}
|
|
973
|
+
*/ client() {
|
|
974
|
+
return this._searchClient;
|
|
975
|
+
}
|
|
976
|
+
/**
|
|
977
|
+
* Get index client
|
|
978
|
+
* @returns Azure AI Search index client. See {@link SearchIndexClient}
|
|
979
|
+
*/ indexClient() {
|
|
980
|
+
return this._indexClient;
|
|
981
|
+
}
|
|
982
|
+
/**
|
|
983
|
+
* Add nodes to index associated with the configured search client.
|
|
984
|
+
* @param nodes List of nodes with embeddings to add to the index
|
|
985
|
+
* @returns List of node IDs that were added to the index
|
|
986
|
+
*/ async add(nodes) {
|
|
987
|
+
if (!this._searchClient) {
|
|
988
|
+
throw new Error("Async Search client not initialized");
|
|
989
|
+
}
|
|
990
|
+
if (!nodes || nodes.length === 0) {
|
|
991
|
+
return [];
|
|
992
|
+
}
|
|
993
|
+
if (nodes.length > 0) {
|
|
994
|
+
if (this.#indexManagement === "CreateIfNotExists" && this.#indexName) {
|
|
995
|
+
await this.#createIndexIfNotExists(this.#indexName);
|
|
996
|
+
}
|
|
997
|
+
if (this.#indexManagement === "ValidateIndex") {
|
|
998
|
+
await this.#validateIndex(this.#indexName);
|
|
999
|
+
}
|
|
1000
|
+
}
|
|
1001
|
+
const accumulator = new IndexDocumentsBatch();
|
|
1002
|
+
let documents = [];
|
|
1003
|
+
const ids = [];
|
|
1004
|
+
let accumulatedSize = 0;
|
|
1005
|
+
const maxSize = AzureAISearchVectorStoreConfig.DEFAULT_MAX_MB_SIZE;
|
|
1006
|
+
const maxDocs = AzureAISearchVectorStoreConfig.DEFAULT_MAX_BATCH_SIZE;
|
|
1007
|
+
for (const node of nodes){
|
|
1008
|
+
consoleLogger.log(`Processing embedding: ${node.id_}`);
|
|
1009
|
+
const indexDocument = this.#createIndexDocument(node);
|
|
1010
|
+
const documentSize = JSON.stringify(indexDocument).length; // in bytes
|
|
1011
|
+
documents.push(indexDocument);
|
|
1012
|
+
accumulatedSize += documentSize;
|
|
1013
|
+
accumulator.upload(documents);
|
|
1014
|
+
if (documents.length >= maxDocs || accumulatedSize >= maxSize) {
|
|
1015
|
+
consoleLogger.log(`Uploading batch of size ${documents.length}, current progress ${ids.length} of ${nodes.length}, accumulated size ${(accumulatedSize / (1024 * 1024)).toFixed(2)} MB`);
|
|
1016
|
+
await this._searchClient.indexDocuments(accumulator);
|
|
1017
|
+
documents = [];
|
|
1018
|
+
accumulatedSize = 0;
|
|
1019
|
+
}
|
|
1020
|
+
ids.push(node.id_);
|
|
1021
|
+
}
|
|
1022
|
+
if (documents.length > 0) {
|
|
1023
|
+
consoleLogger.log(`Uploading remaining batch of size ${documents.length}, current progress ${ids.length} of ${nodes.length}, accumulated size ${(accumulatedSize / (1024 * 1024)).toFixed(2)} MB`);
|
|
1024
|
+
await this._searchClient.indexDocuments(accumulator);
|
|
1025
|
+
}
|
|
1026
|
+
return ids;
|
|
1027
|
+
}
|
|
1028
|
+
/**
|
|
1029
|
+
* Delete documents from the AI Search Index with docIdFieldKey (doc_id) field equal to refDocId.
|
|
1030
|
+
* @param refDocId The reference document ID to delete from the index
|
|
1031
|
+
*/ async delete(refDocId) {
|
|
1032
|
+
// Check if index exists
|
|
1033
|
+
if (!await this.#indexExists(this.#indexName)) {
|
|
1034
|
+
return;
|
|
1035
|
+
}
|
|
1036
|
+
if (!this._searchClient) {
|
|
1037
|
+
throw new Error("searchClient is not initialized");
|
|
1038
|
+
}
|
|
1039
|
+
// Define filter and batch size
|
|
1040
|
+
const filterExpr = `${this.#fieldMapping["doc_id"]} eq '${refDocId}'`;
|
|
1041
|
+
const batchSize = 1000;
|
|
1042
|
+
while(true){
|
|
1043
|
+
// Search for documents to delete
|
|
1044
|
+
consoleLogger.log(`Searching with filter ${filterExpr}`);
|
|
1045
|
+
const searchResults = await this._searchClient.search("*", {
|
|
1046
|
+
filter: filterExpr,
|
|
1047
|
+
top: batchSize
|
|
1048
|
+
});
|
|
1049
|
+
// Collect document IDs to delete
|
|
1050
|
+
const docsToDelete = [];
|
|
1051
|
+
for await (const result of searchResults.results){
|
|
1052
|
+
const { document } = result;
|
|
1053
|
+
docsToDelete.push(document);
|
|
1054
|
+
}
|
|
1055
|
+
// Delete documents if found
|
|
1056
|
+
if (docsToDelete.length > 0) {
|
|
1057
|
+
consoleLogger.log(`Deleting ${docsToDelete.length} documents`);
|
|
1058
|
+
await this._searchClient.deleteDocuments(docsToDelete);
|
|
1059
|
+
} else {
|
|
1060
|
+
consoleLogger.log("No documents found to delete");
|
|
1061
|
+
break;
|
|
1062
|
+
}
|
|
1063
|
+
}
|
|
1064
|
+
}
|
|
1065
|
+
/**
|
|
1066
|
+
* Get nodes asynchronously from the Azure AI Search index.
|
|
1067
|
+
* @param nodeIds List of node IDs to retrieve from the index
|
|
1068
|
+
* @param filters Metadata filters to apply to the search
|
|
1069
|
+
* @param limit Maximum number of nodes to retrieve
|
|
1070
|
+
* @returns List of nodes retrieved from the index
|
|
1071
|
+
*/ async getNodes(nodeIds, filters, limit) {
|
|
1072
|
+
if (!this._searchClient) {
|
|
1073
|
+
throw new Error("SearchClient not initialized");
|
|
1074
|
+
}
|
|
1075
|
+
const filterStr = this.#buildFilterString(this.#fieldMapping, nodeIds, filters);
|
|
1076
|
+
const nodes = [];
|
|
1077
|
+
const batchSize = 1000; // Azure Search batch size limit
|
|
1078
|
+
while(true){
|
|
1079
|
+
try {
|
|
1080
|
+
const searchRequest = createSearchRequest(this.#fieldMapping, filterStr, batchSize, nodes.length);
|
|
1081
|
+
const results = await this._searchClient.search("*", searchRequest);
|
|
1082
|
+
const batchNodes = [];
|
|
1083
|
+
for await (const result of results.results){
|
|
1084
|
+
batchNodes.push(this.#createNodeFromResult(result, this.#fieldMapping));
|
|
1085
|
+
}
|
|
1086
|
+
const [updatedNodes, continueFetching] = this.#processBatchResults(batchNodes, nodes, batchSize, limit);
|
|
1087
|
+
nodes.push(...updatedNodes);
|
|
1088
|
+
if (!continueFetching) {
|
|
1089
|
+
break;
|
|
1090
|
+
}
|
|
1091
|
+
} catch (error) {
|
|
1092
|
+
throw new Error(`Failed to get nodes from Azure AI Search: ${error}`);
|
|
1093
|
+
}
|
|
1094
|
+
}
|
|
1095
|
+
return nodes;
|
|
1096
|
+
}
|
|
1097
|
+
async query(query) {
|
|
1098
|
+
let odataFilter;
|
|
1099
|
+
if (query.filters) {
|
|
1100
|
+
odataFilter = this.#createOdataFilter(query.filters);
|
|
1101
|
+
consoleLogger.log(`Querying with OData filter: ${odataFilter}`);
|
|
1102
|
+
}
|
|
1103
|
+
consoleLogger.log({
|
|
1104
|
+
query
|
|
1105
|
+
});
|
|
1106
|
+
// Define base AzureQueryResultSearch object based on query mode
|
|
1107
|
+
let azureQueryResultSearch = new AzureQueryResultSearchDefault(query, this.#fieldMapping, odataFilter, this._searchClient);
|
|
1108
|
+
switch(query.mode){
|
|
1109
|
+
case VectorStoreQueryMode.SPARSE:
|
|
1110
|
+
azureQueryResultSearch = new AzureQueryResultSearchSparse(query, this.#fieldMapping, odataFilter, this._searchClient);
|
|
1111
|
+
break;
|
|
1112
|
+
case VectorStoreQueryMode.HYBRID:
|
|
1113
|
+
azureQueryResultSearch = new AzureQueryResultSearchHybrid(query, this.#fieldMapping, odataFilter, this._searchClient);
|
|
1114
|
+
break;
|
|
1115
|
+
case VectorStoreQueryMode.SEMANTIC_HYBRID:
|
|
1116
|
+
azureQueryResultSearch = new AzureQueryResultSearchSemanticHybrid(query, this.#fieldMapping, odataFilter, this._searchClient);
|
|
1117
|
+
break;
|
|
1118
|
+
}
|
|
1119
|
+
// Execute the search and return the result
|
|
1120
|
+
return await azureQueryResultSearch.search();
|
|
1121
|
+
}
|
|
1122
|
+
}
|
|
1123
|
+
|
|
1124
|
+
// Package version string and the package-info object that carries it.
var version = "0.1.0";
var pkg = {
    version
};
|
|
1127
|
+
|
|
1128
|
+
/**
 * Similarity metrics accepted by the Azure Cosmos DB for MongoDB vCore
 * vector index.
 */
const AzureCosmosDBMongoDBSimilarityType = {
    // Cosine similarity
    COS: "COS",
    // Inner product
    IP: "IP",
    // Euclidean distance
    L2: "L2"
};
|
|
1133
|
+
/**
 * Azure Cosmos DB for MongoDB vCore vector store.
 * To use this, you should have both:
 * - the `mongodb` NPM package installed
 * - a connection string associated with a MongoDB VCore Cluster
 *
 * You do not need to create a database or collection, it will be created
 * automatically.
 *
 * You also need a vector index on the collection; it can be created with the
 * `createIndex` method.
 */
class AzureCosmosDBMongoDBVectorStore extends BaseVectorStore {
    /**
     * @param init Store options. `mongodbClient` is used when given; otherwise
     * a client is created from the `AZURE_COSMOSDB_MONGODB_CONNECTION_STRING`
     * environment variable. The remaining options (`dbName`, `collectionName`,
     * `indexedMetadataFields`, `indexName`, `embeddingKey`, `idKey`, `textKey`,
     * `metadataKey`, `indexOptions`) fall back to the defaults below.
     * @throws Error when neither a client nor a connection string is available.
     */
    constructor(init){
        super(init);
        this.storesText = true;
        this.flatMetadata = true;
        if (init.mongodbClient) {
            this.mongodbClient = init.mongodbClient;
        } else {
            const mongoUri = getEnv("AZURE_COSMOSDB_MONGODB_CONNECTION_STRING");
            if (!mongoUri) {
                throw new Error("AzureCosmosDBMongoDBVectorStore client or connection string must be set.");
            }
            this.mongodbClient = new MongoClient(mongoUri, {
                appName: "VECTORSTORES_JS"
            });
        }
        this.mongodbClient.appendMetadata({
            name: "VECTORSTORES_AZURE_COSMOS_VCORE_VECTOR_STORE",
            version: pkg.version
        });
        this.dbName = init.dbName ?? "documentsDB";
        this.collectionName = init.collectionName ?? "documents";
        this.indexedMetadataFields = init.indexedMetadataFields ?? [];
        this.indexName = init.indexName ?? "vectorSearchIndex";
        this.embeddingKey = init.embeddingKey ?? "vectorContent";
        this.idKey = init.idKey ?? "id";
        this.textKey = init.textKey ?? "text";
        this.metadataKey = init.metadataKey ?? "metadata";
        this.indexOptions = init.indexOptions ?? {};
        this.database = this.mongodbClient.db(this.dbName);
    }
    /**
     * @returns The underlying MongoDB client.
     */
    client() {
        return this.mongodbClient;
    }
    /**
     * Create the collection handle on first use and cache it.
     * @returns The collection used by this store.
     */
    async ensureCollection() {
        if (!this.collection) {
            const collection = await this.mongodbClient.db(this.dbName).createCollection(this.collectionName);
            this.collection = collection;
        }
        return this.collection;
    }
    /**
     * Insert nodes into the collection.
     *
     * Each document stores the node id, embedding, text and metadata under the
     * configured keys. Fields listed in `indexedMetadataFields` are also
     * copied to the top level of the document (to help filter).
     * @param nodes Nodes with embeddings to insert.
     * @returns The inserted ids reported by the driver, as strings.
     */
    async add(nodes) {
        if (!nodes || nodes.length === 0) {
            return [];
        }
        const dataToInsert = nodes.map((node)=>{
            const metadata = nodeToMetadata(node, true, this.textKey, this.flatMetadata);
            const populatedMetadata = {};
            for (const field of this.indexedMetadataFields){
                populatedMetadata[field] = metadata[field];
            }
            return {
                [this.idKey]: node.id_,
                [this.embeddingKey]: node.getEmbedding(),
                [this.textKey]: node.getContent(MetadataMode.NONE) || "",
                [this.metadataKey]: metadata,
                ...populatedMetadata
            };
        });
        const collection = await this.ensureCollection();
        const insertResult = await collection.insertMany(dataToInsert);
        return Object.values(insertResult.insertedIds).map((id)=>String(id));
    }
    /**
     * Removes the documents whose id field equals `id`.
     * @param id Value to match against the configured id field (`idKey`).
     * @param deleteOptions Options forwarded to `deleteMany`.
     * @returns A promise that resolves when the documents have been removed.
     */
    async delete(id, deleteOptions) {
        const collection = await this.ensureCollection();
        // Match on the configured id field, which is the field `add` writes to.
        await collection.deleteMany({
            [this.idKey]: id
        }, deleteOptions);
    }
    /**
     * Run a vector similarity search.
     * @param query Query carrying `queryEmbedding` and optionally
     * `similarityTopK` (defaults to 4).
     * @param options Optional search tuning: `lSearch` (default 40),
     * `efSearch` (default 40) and `oversampling` (default 1.0).
     * @returns Object with parallel arrays `nodes`, `similarities` and `ids`.
     */
    async query(query, options = {}) {
        const pipeline = [
            {
                $search: {
                    cosmosSearch: {
                        vector: query.queryEmbedding,
                        path: this.embeddingKey,
                        k: query.similarityTopK ?? 4,
                        lSearch: options.lSearch ?? 40,
                        efSearch: options.efSearch ?? 40,
                        oversampling: options.oversampling ?? 1.0
                    },
                    returnStoredSource: true
                }
            }
        ];
        const collection = await this.ensureCollection();
        const cursor = collection.aggregate(pipeline);
        const nodes = [];
        const ids = [];
        const similarities = [];
        for await (const res of cursor){
            const node = metadataDictToNode(res[this.metadataKey]);
            node.setContent(res[this.textKey]);
            ids.push(res[this.idKey]);
            nodes.push(node);
            similarities.push(res.score);
        }
        return {
            nodes,
            similarities,
            ids
        };
    }
    /**
     * Creates a vector index on the collection, using the index name given
     * during instance construction.
     *
     * Setting the numLists parameter correctly is important for achieving good
     * accuracy and performance.
     * When the IVF index type is used, you should create the index only after
     * you have loaded a large enough sample of documents to ensure that the
     * centroids for the respective buckets are fairly distributed.
     *
     * As for the compression, the following options are available:
     * - "half" - half precision compression for HNSW and IVF indexes
     * - "pq" - product quantization compression for DiskANN indexes
     * More information on the compression options can be found in the:
     * https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/product-quantization
     *
     * @param dimensions Number of dimensions for vector similarity.
     *    If no number is provided, 1536 is used.
     * @param indexType Index type for the Mongo vCore index: "hnsw",
     *    "diskann", or anything else for IVF (the default).
     * @param similarity Similarity metric to use with the index.
     *    Possible options are:
     *      - AzureCosmosDBMongoDBSimilarityType.COS (cosine distance)
     *      - AzureCosmosDBMongoDBSimilarityType.L2 (Euclidean distance)
     *      - AzureCosmosDBMongoDBSimilarityType.IP (inner product)
     * @returns A promise that resolves when the index has been created.
     */
    async createIndex(dimensions = undefined, indexType = "ivf", similarity = AzureCosmosDBMongoDBSimilarityType.COS) {
        const vectorLength = dimensions === undefined ? 1536 : dimensions;
        const cosmosSearchOptions = {
            kind: "",
            similarity,
            dimensions: vectorLength
        };
        if (indexType === "hnsw") {
            cosmosSearchOptions.kind = "vector-hnsw";
            cosmosSearchOptions.m = this.indexOptions.m ?? 16;
            cosmosSearchOptions.efConstruction = this.indexOptions.efConstruction ?? 200;
            if (this.indexOptions.compression === "half") {
                cosmosSearchOptions.compression = "half";
            }
        } else if (indexType === "diskann") {
            cosmosSearchOptions.kind = "vector-diskann";
            cosmosSearchOptions.maxDegree = this.indexOptions.maxDegree ?? 40;
            cosmosSearchOptions.lBuild = this.indexOptions.lBuild ?? 50;
            if (this.indexOptions.compression === "pq") {
                cosmosSearchOptions.compression = "pq";
                cosmosSearchOptions.pqCompressedDims = this.indexOptions.pqCompressedDims ?? this.indexOptions.dimensions;
                cosmosSearchOptions.pqSampleSize = this.indexOptions.pqSampleSize ?? 1000;
            }
        } else {
            // Default to IVF index
            cosmosSearchOptions.kind = "vector-ivf";
            cosmosSearchOptions.numLists = this.indexOptions.numLists ?? 100;
            if (this.indexOptions.compression === "half") {
                cosmosSearchOptions.compression = "half";
            }
        }
        // Resolve the collection first so the command always carries a
        // collection name, even when this is the first call on the store.
        const collection = await this.ensureCollection();
        const createIndexCommands = {
            createIndexes: collection.collectionName,
            indexes: [
                {
                    name: this.indexName,
                    key: {
                        [this.embeddingKey]: "cosmosSearch"
                    },
                    cosmosSearchOptions
                }
            ]
        };
        await this.database.command(createIndexCommands);
    }
    /**
     * Checks if the specified index name during instance construction exists
     * on the collection.
     * @returns A promise that resolves to a boolean indicating if the index exists.
     */
    async checkIndexExists() {
        const collection = await this.ensureCollection();
        const indexes = await collection.listIndexes().toArray();
        return indexes.some((index)=>index.name === this.indexName);
    }
    /**
     * Deletes the named index if it exists.
     * @param indexName Name of the index to drop. Defaults to the index name
     * specified during instance construction.
     * @returns A promise that resolves when the index has been deleted.
     */
    async deleteIndex(indexName = this.indexName) {
        const collection = await this.ensureCollection();
        const indexes = await collection.listIndexes().toArray();
        const indexToDelete = indexes.find((index)=>index.name === indexName);
        if (indexToDelete) {
            await collection.dropIndex(indexName);
        }
    }
}
|
|
1352
|
+
|
|
1353
|
+
// Suffix passed as `userAgentSuffix` when this module creates a CosmosClient.
const USER_AGENT_SUFFIX = "vectorstores-cdbnosql-vectorstore-javascript";
// Embedding policy used when the caller does not provide one:
// a single Float32 cosine embedding of 1536 dimensions stored at /embedding.
const DEFAULT_VECTOR_EMBEDDING_POLICY = {
    vectorEmbeddings: [
        {
            path: "/embedding",
            dataType: VectorEmbeddingDataType.Float32,
            distanceFunction: VectorEmbeddingDistanceFunction.Cosine,
            dimensions: 1536
        }
    ]
};
// Vector indexes used when the caller does not provide an indexing policy:
// one quantized-flat index over the same /embedding path.
const DEFAULT_VECTOR_INDEXING_POLICY = [
    {
        path: "/embedding",
        type: VectorIndexType.QuantizedFlat
    }
];
|
|
1370
|
+
/**
 * Extract the account endpoint and key from a Cosmos DB connection string of
 * the form `AccountEndpoint=...;AccountKey=...;`.
 *
 * Each `;`-separated segment is split on its first `=` only, so values that
 * themselves contain `=` (for example the trailing padding of a base64 key)
 * are kept intact.
 * @param connectionString The connection string to parse.
 * @returns Object with `endpoint` and `key`.
 * @throws Error when the endpoint or the key is missing or empty.
 */
function parseConnectionString(connectionString) {
    let endpoint = "";
    let accountKey = "";
    for (const part of connectionString.split(";")){
        const separatorIndex = part.indexOf("=");
        const hasValue = separatorIndex !== -1;
        const name = (hasValue ? part.slice(0, separatorIndex) : part).trim();
        const value = hasValue ? part.slice(separatorIndex + 1).trim() : "";
        if (name === "AccountEndpoint") {
            endpoint = value;
        } else if (name === "AccountKey") {
            accountKey = value;
        }
    }
    if (!endpoint || !accountKey) {
        throw new Error("Invalid connection string: missing AccountEndpoint or AccountKey.");
    }
    return {
        endpoint,
        key: accountKey
    };
}
|
|
1390
|
+
/**
 * Assemble the parameterized Cosmos DB SQL text for a vector search.
 *
 * The similarity score column is included unless `includeVectorDistance` is
 * exactly `false`; a WHERE clause is added when `whereClause` is truthy.
 * @param options Object with optional `includeVectorDistance` and `whereClause`.
 * @returns The query text.
 */
function queryBuilder(options) {
    const segments = [
        "SELECT TOP @k c[@id] as id, c[@text] as text, c[@metadata] as metadata"
    ];
    if (options.includeVectorDistance !== false) {
        segments.push(`, VectorDistance(c[@embeddingKey],@embedding) AS SimilarityScore`);
    }
    segments.push(` FROM c`);
    if (options.whereClause) {
        segments.push(` WHERE ${options.whereClause}`);
    }
    segments.push(` ORDER BY VectorDistance(c[@embeddingKey],@embedding)`);
    return segments.join("");
}
|
|
1404
|
+
/**
 * Vector store that persists nodes as items in an Azure Cosmos DB for NoSQL
 * container and retrieves them with a `VectorDistance` ordered query.
 *
 * Each stored item carries four configurable properties: the node id
 * (`idKey`), the node text (`textKey`), the node metadata (`metadataKey`) and
 * the embedding (the property named by the first path of the vector embedding
 * policy).
 *
 * The database and container are created lazily: nothing is sent to Cosmos DB
 * until `initialize()` is called, which `add`, `delete` and `query` do on
 * every invocation.
 */
class AzureCosmosDBNoSqlVectorStore extends BaseVectorStore {
    /**
     * @returns The CosmosClient instance this store was constructed with.
     */
    client() {
        return this.cosmosClient;
    }

    /**
     * @param dbConfig - Store configuration.
     * @param dbConfig.client - CosmosClient to use (required).
     * @param dbConfig.databaseName - Database id; defaults to "vectorSearchDB".
     * @param dbConfig.containerName - Container id; defaults to "vectorSearchContainer".
     * @param dbConfig.idKey - Item property holding the node id; defaults to "id".
     * @param dbConfig.textKey - Item property holding the node text; defaults to "text".
     * @param dbConfig.metadataKey - Item property holding the metadata; defaults to "metadata".
     * @param dbConfig.flatMetadata - Forwarded to `nodeToMetadata`; defaults to true.
     * @param dbConfig.vectorEmbeddingPolicy - Defaults to DEFAULT_VECTOR_EMBEDDING_POLICY.
     * @param dbConfig.indexingPolicy - Defaults to `{ vectorIndexes: DEFAULT_VECTOR_INDEXING_POLICY }`.
     * @param dbConfig.createDatabaseOptions - Extra options spread into the database creation request.
     * @param dbConfig.createContainerOptions - Extra options spread into the container creation request.
     * @throws Error when no client is supplied or the embedding policy has no usable path.
     */
    constructor(dbConfig) {
        super(dbConfig);
        this.storesText = true;
        // Optional chaining so a missing config yields the descriptive error below
        // instead of a TypeError on property access.
        if (!dbConfig?.client) {
            throw new Error("CosmosClient is required for AzureCosmosDBNoSQLVectorStore initialization");
        }
        this.cosmosClient = dbConfig.client;
        const databaseName = dbConfig.databaseName ?? "vectorSearchDB";
        const containerName = dbConfig.containerName ?? "vectorSearchContainer";
        this.idKey = dbConfig.idKey ?? "id";
        this.textKey = dbConfig.textKey ?? "text";
        this.flatMetadata = dbConfig.flatMetadata ?? true;
        this.metadataKey = dbConfig.metadataKey ?? "metadata";
        const vectorEmbeddingPolicy = dbConfig.vectorEmbeddingPolicy ?? DEFAULT_VECTOR_EMBEDDING_POLICY;
        const indexingPolicy = dbConfig.indexingPolicy ?? {
            vectorIndexes: DEFAULT_VECTOR_INDEXING_POLICY
        };
        // The policy path has a one-character prefix (e.g. "/embedding"); dropping it
        // gives the item property name used to store and query the vector.
        this.embeddingKey = vectorEmbeddingPolicy.vectorEmbeddings?.[0]?.path?.slice(1) ?? "";
        if (!this.embeddingKey) {
            throw new Error("AzureCosmosDBNoSQLVectorStore requires a valid vectorEmbeddings path");
        }
        // Deferring initialization to the first call to `initialize`.
        // The in-flight/successful promise is cached so the database and container
        // are only set up once.
        this.initialize = () => {
            if (this.initPromise === undefined) {
                this.initPromise = this.init(this.cosmosClient, databaseName, containerName, {
                    vectorEmbeddingPolicy,
                    indexingPolicy,
                    createContainerOptions: dbConfig.createContainerOptions,
                    createDatabaseOptions: dbConfig.createDatabaseOptions
                }).catch((error) => {
                    console.error("Error during AzureCosmosDBNoSQLVectorStore initialization", error);
                    // Drop the failed attempt so the next call retries, and propagate the
                    // real cause: swallowing it would leave `this.container` unset and make
                    // every later operation fail with an unrelated TypeError.
                    this.initPromise = undefined;
                    throw error;
                });
            }
            return this.initPromise;
        };
    }

    /**
     * Static method for creating an instance using a connection string.
     * If no connection string is provided, it will attempt to use the env variable
     * `AZURE_COSMOSDB_NOSQL_CONNECTION_STRING` as connection string.
     *
     * @param config - Store configuration plus an optional `connectionString`.
     * @returns Instance of AzureCosmosDBNoSqlVectorStore
     * @throws Error when no connection string can be resolved.
     */
    static fromConnectionString(config = {}) {
        const cosmosConnectionString = config.connectionString || getEnv("AZURE_COSMOSDB_NOSQL_CONNECTION_STRING");
        if (!cosmosConnectionString) {
            throw new Error("Azure CosmosDB connection string must be provided");
        }
        const { endpoint, key } = parseConnectionString(cosmosConnectionString);
        const client = new CosmosClient({
            endpoint,
            key,
            userAgentSuffix: USER_AGENT_SUFFIX
        });
        return new AzureCosmosDBNoSqlVectorStore({
            ...config,
            client
        });
    }

    /**
     * Static method for creating an instance using an account endpoint and key.
     * If no endpoint and key are provided, it will attempt to use the env variables
     * `AZURE_COSMOSDB_NOSQL_ENDPOINT` (or `AZURE_COSMOSDB_NOSQL_ACCOUNT_ENDPOINT`) as
     * endpoint and `AZURE_COSMOSDB_NOSQL_KEY` (or `AZURE_COSMOSDB_NOSQL_ACCOUNT_KEY`)
     * as key.
     *
     * @param config - Store configuration plus optional `endpoint` and `key`.
     * @returns Instance of AzureCosmosDBNoSqlVectorStore
     * @throws Error when the endpoint or the key cannot be resolved.
     */
    static fromAccountAndKey(config = {}) {
        // The un-prefixed names are what this method has always read, so they keep
        // precedence; the `_ACCOUNT_` names match the documented variables and the
        // one used by `fromUriAndManagedIdentity`.
        const cosmosEndpoint = config.endpoint
            || getEnv("AZURE_COSMOSDB_NOSQL_ENDPOINT")
            || getEnv("AZURE_COSMOSDB_NOSQL_ACCOUNT_ENDPOINT");
        const cosmosKey = config.key
            || getEnv("AZURE_COSMOSDB_NOSQL_KEY")
            || getEnv("AZURE_COSMOSDB_NOSQL_ACCOUNT_KEY");
        if (!cosmosEndpoint || !cosmosKey) {
            throw new Error("Azure CosmosDB account endpoint and key must be provided");
        }
        const client = new CosmosClient({
            endpoint: cosmosEndpoint,
            key: cosmosKey,
            userAgentSuffix: USER_AGENT_SUFFIX
        });
        return new AzureCosmosDBNoSqlVectorStore({
            ...config,
            client
        });
    }

    /**
     * Static method for creating an instance using account endpoint and managed identity.
     * If no endpoint and credentials are provided, it will attempt to use the env variable
     * `AZURE_COSMOSDB_NOSQL_ACCOUNT_ENDPOINT` as endpoint and use DefaultAzureCredential()
     * as credentials.
     *
     * @param config - Store configuration plus optional `endpoint` and `credential`.
     * @returns Instance of AzureCosmosDBNoSqlVectorStore
     * @throws Error when the endpoint cannot be resolved.
     */
    static fromUriAndManagedIdentity(config = {}) {
        const cosmosEndpoint = config.endpoint || getEnv("AZURE_COSMOSDB_NOSQL_ACCOUNT_ENDPOINT");
        if (!cosmosEndpoint) {
            throw new Error("Azure CosmosDB account endpoint must be provided");
        }
        const credentials = config.credential ?? new DefaultAzureCredential();
        const client = new CosmosClient({
            endpoint: cosmosEndpoint,
            aadCredentials: credentials,
            userAgentSuffix: USER_AGENT_SUFFIX
        });
        return new AzureCosmosDBNoSqlVectorStore({
            ...config,
            client
        });
    }

    /**
     * Adds nodes to the CosmosDB container, one item per node.
     *
     * Items are created concurrently and a failure of one item does not abort
     * the others.
     *
     * @param nodes - Nodes to store; each must be able to return an embedding.
     * @returns One entry per input node, in input order: the id of the created
     * item, or the literal string "error: could not create item" when creation
     * of that item was rejected.
     */
    async add(nodes) {
        await this.initialize();
        if (!nodes || nodes.length === 0) {
            return [];
        }
        const docs = nodes.map((node) => {
            const metadata = nodeToMetadata(node, true, this.textKey, this.flatMetadata);
            return {
                [this.idKey]: node.id_,
                [this.embeddingKey]: node.getEmbedding(),
                [this.textKey]: node.getContent(MetadataMode.NONE) || "",
                [this.metadataKey]: metadata
            };
        });
        const results = await Promise.allSettled(docs.map((doc) => this.container.items.create(doc)));
        return results.map((result) => {
            if (result.status === "fulfilled") {
                return result.value.resource?.id ?? "";
            }
            // Keep the best-effort contract (placeholder id, no throw) but surface the
            // cause instead of discarding it.
            console.error("Error creating item in AzureCosmosDBNoSQLVectorStore", result.reason);
            return "error: could not create item";
        });
    }

    /**
     * Delete a document from the CosmosDB container.
     *
     * NOTE(review): only the item id is passed to `container.item`; no partition
     * key value is supplied. Confirm this resolves the item for containers using
     * the default `/id` partition key created by `init`.
     *
     * @param refDocId - The id of the document to delete
     * @param deleteOptions - Any options to pass to the container.item.delete function
     * @returns Promise that resolves if the delete query did not throw an error.
     */
    async delete(refDocId, deleteOptions) {
        await this.initialize();
        await this.container.item(refDocId).delete(deleteOptions);
    }

    /**
     * Performs a vector similarity search query in the CosmosDB container.
     *
     * @param query - VectorStoreQuery; `queryEmbedding` is required and
     * `similarityTopK` is bound to the query's `TOP` parameter.
     * @param options - Passed to `queryBuilder`, which reads `includeVectorDistance`
     * and `whereClause` from it.
     * @returns Object with parallel arrays `nodes`, `similarities` and `ids`.
     * @throws Error when `query.queryEmbedding` is missing or empty.
     */
    async query(query, options = {}) {
        await this.initialize();
        if (!query.queryEmbedding || query.queryEmbedding.length === 0) {
            throw new Error("queryEmbedding is required for AzureCosmosDBNoSqlVectorStore query");
        }
        const params = {
            vector: query.queryEmbedding,
            k: query.similarityTopK
        };
        const builtQuery = queryBuilder(options);
        // Property names are bound as parameters rather than interpolated into the
        // query text.
        const queryResults = await this.container.items.query({
            query: builtQuery,
            parameters: [
                { name: "@k", value: params.k },
                { name: "@id", value: this.idKey },
                { name: "@text", value: this.textKey },
                { name: "@metadata", value: this.metadataKey },
                { name: "@embedding", value: params.vector },
                { name: "@embeddingKey", value: this.embeddingKey }
            ]
        }).fetchAll();
        const nodes = [];
        const ids = [];
        const similarities = [];
        // The query aliases the configured properties to "id", "text" and "metadata",
        // so results are read with those fixed names regardless of the configured keys.
        for (const item of queryResults.resources) {
            const node = metadataDictToNode(item["metadata"], {
                fallback: {
                    id_: item["id"],
                    text: item["text"],
                    ...item["metadata"]
                }
            });
            node.setContent(item["text"]);
            nodes.push(node);
            ids.push(item["id"]);
            similarities.push(item["SimilarityScore"]);
        }
        return {
            nodes,
            similarities,
            ids
        };
    }

    /**
     * Initialize the CosmosDB container: creates the database and the container
     * if they do not exist yet and stores the container on `this.container`.
     *
     * @param client - CosmosClient used to create/look up the database.
     * @param databaseName - Id of the database.
     * @param containerName - Id of the container.
     * @param initOptions - Optional `createDatabaseOptions`, `createContainerOptions`,
     * `indexingPolicy` and `vectorEmbeddingPolicy`.
     */
    async init(client, databaseName, containerName, initOptions) {
        const { database } = await client.databases.createIfNotExists({
            ...initOptions?.createDatabaseOptions ?? {},
            id: databaseName
        });
        const { container } = await database.containers.createIfNotExists({
            // Without caller-supplied container options the container is partitioned on "/id".
            ...initOptions?.createContainerOptions ?? {
                partitionKey: {
                    paths: [
                        "/id"
                    ]
                }
            },
            // Listed after the spread so these always override same-named keys in
            // createContainerOptions.
            indexingPolicy: initOptions?.indexingPolicy || {
                vectorIndexes: DEFAULT_VECTOR_INDEXING_POLICY
            },
            vectorEmbeddingPolicy: initOptions?.vectorEmbeddingPolicy || DEFAULT_VECTOR_EMBEDDING_POLICY,
            id: containerName
        });
        this.container = container;
    }
}
|
|
1635
|
+
|
|
1636
|
+
export { AzureAISearchVectorStore, AzureAISearchVectorStoreConfig, AzureCosmosDBMongoDBSimilarityType, AzureCosmosDBMongoDBVectorStore, AzureCosmosDBNoSqlVectorStore, AzureQueryResultSearchBase, AzureQueryResultSearchDefault, AzureQueryResultSearchHybrid, AzureQueryResultSearchSemanticHybrid, AzureQueryResultSearchSparse, IndexManagement, MetadataIndexFieldType, SimpleCosmosDBReader };
|