@storecraft/database-mongodb 1.0.12 → 1.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,289 @@
1
+ /**
2
+ * @import {
3
+ * AIEmbedder, VectorStore
4
+ * } from '@storecraft/core/ai/core/types.private.js'
5
+ * @import {
6
+ * Config
7
+ * } from './types.js'
8
+ * @import {
9
+ mongo_vectorSearch_pipeline,
10
+ * MongoVectorDocument
11
+ * } from './types.private.js'
12
+ *
13
+ * @import {
14
+ * AnyBulkWriteOperation, Document, AggregationCursor
15
+ * } from 'mongodb'
16
+ * @import { ENV } from '@storecraft/core';
17
+ */
18
+
19
+ import { Collection } from 'mongodb';
20
+ import { MongoClient, ServerApiVersion } from 'mongodb';
21
+
22
+ export const EMBEDDING_KEY_PATH = 'embedding'; // document field that is indexed as the vector path and searched by $vectorSearch
23
+ export const NAMESPACE_KEY = 'namespace'; // document field indexed as a filter and used to pre-filter searches by namespace
24
+ export const DEFAULT_INDEX_NAME = 'vector_store'; // same value the MongoVectorStore constructor falls back to for `index_name`
25
+
26
+ /**
27
+ * @typedef {VectorStore} Impl
28
+ */
29
+
30
/**
 * @description MongoDB Atlas Vector Store
 * {@link https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/#:~:text=You%20can%20use%20the%20vectorSearch,to%20pre%2Dfilter%20your%20data.}
 *
 * Documents live in a collection that shares its name with the vector
 * search index (`config.index_name`).
 *
 * @implements {VectorStore}
 */
export class MongoVectorStore {

  /**
   * Names of the environment variables consulted by `onInit`.
   * @satisfies {ENV<Config>}
   */
  static EnvConfig = /** @type{const} */ ({
    db_name: 'MONGODB_VECTOR_STORE_DB_NAME',
    url: 'MONGODB_VECTOR_STORE_URL'
  });

  /** @type {Config} */
  config;

  /** @type {MongoClient} lazily created by the `client` getter */
  #client

  /**
   * Stores the config and fills in defaults for every optional
   * setting except `url` / `db_name`, which are resolved in `onInit`.
   *
   * @param {Config} config
   */
  constructor(config) {
    this.config = {
      ...config,
      index_name: config.index_name ?? 'vector_store',
      similarity: config.similarity ?? 'cosine',
      dimensions: config.dimensions ?? 1536,
      options: config.options ?? {
        ignoreUndefined: true,
        serverApi: {
          version: ServerApiVersion.v1,
          strict: false,
          deprecationErrors: true,
        }
      }
    };
  }

  /**
   * The shared `MongoClient`, constructed on first access.
   *
   * @throws {Error} when `url` or `db_name` has not been resolved yet.
   */
  get client() {
    if(!this.config.db_name || !this.config.url) {
      throw new Error('MongoVectorStore::client() - missing url or db_name');
    }

    this.#client ??= new MongoClient(
      this.config.url, this.config.options
    );

    return this.#client;
  }

  /**
   * Resolves `url` and `db_name` from the platform environment when
   * they were not given explicitly. The dedicated vector-store
   * variable wins, then the generic MongoDB variable, and `db_name`
   * finally defaults to `'main'`.
   *
   * @type {VectorStore["onInit"]}
   */
  onInit = (app) => {
    const env = app.platform.env;

    // Each lookup is a separate `env[...]` access, so a missing
    // dedicated variable really falls through to the generic one.
    this.config.url ??= (
      env[MongoVectorStore.EnvConfig.url] ?? env['MONGODB_URL']
    );
    this.config.db_name ??= (
      env[MongoVectorStore.EnvConfig.db_name] ??
      env['MONGODB_DB_NAME'] ??
      'main'
    );
  }

  /** @type {VectorStore["embedder"]} */
  get embedder() {
    return this.config.embedder
  }

  /**
   * The collection holding the vector documents; it is named after
   * the index.
   *
   * @type {Collection<MongoVectorDocument>}
   */
  get vector_collection() {
    return this.client.db(this.config.db_name).collection(this.config.index_name);
  }

  /**
   * Upserts one stored document per input document, keyed by `id`;
   * `vectors[ix]` is stored as the embedding of `documents[ix]`.
   *
   * @type {VectorStore["upsertVectors"]}
   */
  upsertVectors = async (vectors, documents, options) => {
    // Nothing to write: return before touching the client, so no
    // empty batch is ever handed to `bulkWrite`.
    if(documents.length === 0) {
      return;
    }

    /** @type {AnyBulkWriteOperation<MongoVectorDocument>[]} */
    const replace_ops = documents.map(
      (doc, ix) => {
        /** @type {MongoVectorDocument} */
        const replacement = {
          updated_at: new Date().toISOString(),
          embedding: vectors[ix],
          metadata: doc.metadata,
          pageContent: doc.pageContent,
          [NAMESPACE_KEY]: doc.namespace,
          id: doc.id
        };

        return {
          replaceOne: {
            filter: { id: doc.id },
            replacement,
            upsert: true
          }
        };
      }
    );

    await this.vector_collection.bulkWrite(replace_ops);
  }

  /**
   * Embeds the `pageContent` of every document as text and upserts
   * the resulting vectors together with the documents.
   *
   * @type {VectorStore["upsertDocuments"]}
   */
  upsertDocuments = async (documents, options) => {
    // Skip the embedder round-trip when there is nothing to embed.
    if(documents.length === 0) {
      return;
    }

    const embedding_result = await this.embedder.generateEmbeddings(
      {
        content: documents.map(
          (doc) => ({ content: doc.pageContent, type: 'text' })
        )
      }
    );

    return this.upsertVectors(
      embedding_result.content, documents, options
    );
  }

  /**
   * Deletes every stored document whose `id` is in `ids`.
   *
   * @type {VectorStore["delete"]}
   */
  delete = async (ids) => {
    await this.vector_collection.deleteMany(
      {
        id: { $in: ids }
      }
    );
  }

  /**
   * Embeds `query` as text, runs an approximate `$vectorSearch`
   * limited to `k` results and maps the hits to
   * `{ score, document }` pairs. A non-empty `namespaces` array is
   * applied as a pre-filter on the namespace field.
   *
   * @type {VectorStore["similaritySearch"]}
   */
  similaritySearch = async (query, k, namespaces) => {

    const embedding_result = await this.embedder.generateEmbeddings(
      {
        content: [
          { content: query, type: 'text' }
        ]
      }
    );
    const vector = embedding_result.content[0];

    // NOTE(review): `numCandidates` equals `limit` here. The Atlas
    // documentation recommends over-requesting candidates for better
    // recall; confirm whether this is intentional.
    /** @type {mongo_vectorSearch_pipeline} */
    const vector_search = {
      index: this.config.index_name,
      path: EMBEDDING_KEY_PATH,
      queryVector: vector,
      numCandidates: k,
      limit: k,
      exact: false,
    };

    if(Array.isArray(namespaces) && namespaces.length) {
      vector_search.filter = {
        [NAMESPACE_KEY]: { $in: namespaces }
      };
    }

    const agg = [
      {
        '$vectorSearch': vector_search
      },
      {
        // drop the internal id and the raw vector, expose the score
        '$project': {
          '_id': 0,
          [EMBEDDING_KEY_PATH]: 0,
          'score': {
            '$meta': 'vectorSearchScore'
          }
        }
      }
    ];

    /** @type {AggregationCursor<MongoVectorDocument>} */
    const cursor = this.vector_collection.aggregate(agg);
    const hits = await cursor.toArray();

    return hits.map(
      (hit) => ({
        score: hit.score,
        document: {
          id: hit.id,
          metadata: /** @type {any}*/ (hit.metadata),
          pageContent: hit.pageContent,
          namespace: hit.namespace
        }
      })
    );
  }

  /**
   * Creates the collection and its vector search index. The index
   * covers the embedding path as a vector field and the namespace
   * field as a filter field.
   *
   * @param {boolean} [disconnect_after_finish=true] close the client
   * when done, whether index creation succeeded or threw.
   * @param {boolean} [delete_index_if_exists_before=false] drop the
   * index first.
   * @returns {Promise<boolean>} `true` on success.
   * @throws {Error} when the driver does not echo back the index name.
   */
  createVectorIndex = async (disconnect_after_finish=true, delete_index_if_exists_before=false) => {
    try {
      if(delete_index_if_exists_before) {
        await this.deleteVectorIndex();
      }

      const db = this.client.db(this.config.db_name);
      // collection name will have the same name as the index
      const collection_name = this.config.index_name;

      await db.createCollection(collection_name);

      const created_index_name = await db.collection(collection_name).createSearchIndex(
        {
          name: this.config.index_name,
          type: 'vectorSearch',
          definition: {
            fields: [
              {
                type: 'vector',
                path: EMBEDDING_KEY_PATH,
                numDimensions: this.config.dimensions,
                similarity: this.config.similarity
              },
              {
                type: 'filter',
                path: NAMESPACE_KEY
              },
            ]
          }
        }
      );

      if(created_index_name !== this.config.index_name) {
        throw new Error('MongoVectorStore::createVectorIndex failed');
      }

      return true;
    } finally {
      // Release the connection on failure too, so a failed setup
      // does not leave an open client behind.
      if(disconnect_after_finish) {
        await this.client.close();
      }
    }
  }

  /**
   * Drops the vector search index from its collection.
   *
   * @returns {Promise<boolean>} always `true` once the drop call resolves.
   */
  deleteVectorIndex = async () => {
    const db = this.client.db(this.config.db_name);

    await db.collection(this.config.index_name).dropSearchIndex(
      this.config.index_name
    );

    return true;
  }

}
289
+
@@ -0,0 +1,44 @@
1
+
2
+ import type { AIEmbedder } from '@storecraft/core/ai';
3
+ import type { MongoClientOptions } from 'mongodb';
4
+ export * from './index.js';
5
+
6
+ export type Config = {
7
+ /**
8
+ * @description mongo connection url, if absent, will be inferred at init
9
+ * with env `MONGODB_VECTOR_STORE_URL` or `MONGODB_URL`
10
+ */
11
+ url?: string;
12
+
13
+ /**
14
+ * @description the name of the database, if absent, will be inferred at init
15
+ * with env `MONGODB_VECTOR_STORE_DB_NAME` or `MONGODB_DB_NAME`
16
+ * @default 'main'
17
+ */
18
+ db_name?: string;
19
+
20
+ /**
21
+ * @description mongo client options, passed to the `MongoClient` constructor
22
+ */
23
+ options?: MongoClientOptions;
24
+
25
+ /**
26
+ * @description The name of the index; the collection storing the vectors uses the same name
27
+ * @default 'vector_store'
28
+ */
29
+ index_name?: string,
30
+
31
+ /**
32
+ * @description The dimensions of the vectors to be inserted in the index.
33
+ * @default 1536
34
+ */
35
+ dimensions?: number,
36
+
37
+ /**
38
+ * @description The similarity metric
39
+ * @default 'cosine'
40
+ */
41
+ similarity?: 'euclidean' | 'cosine' | 'dotProduct',
42
+
43
+ embedder: AIEmbedder // used to generate embeddings for upserted documents and for search queries
44
+ }
@@ -0,0 +1,44 @@
1
+ import { Document } from "mongodb"
2
+
3
+ export type MongoVectorDocument = { // shape of a document stored in the vector collection
4
+ id: string, // application-level id; upserts and deletes match on this field
5
+ metadata: Record<string, any>, // copied as-is from the source document
6
+ embedding: number[], // the vector; excluded from similarity search results
7
+ pageContent: string, // the text of the source document
8
+ updated_at: string, // ISO timestamp written at upsert time
9
+ score?: number // only present on search results (projected from `vectorSearchScore`)
10
+ namespace?: string // optional grouping key; searches can pre-filter on it
11
+ }
12
+
13
+ export type mongo_vectorSearch_pipeline = { // options object of the `$vectorSearch` aggregation stage
14
+ /**
15
+ * This is required if numCandidates is omitted.
16
+ * Flag that specifies whether to run ENN or ANN search. Value can be one of the following:
17
+ * - false - to run ANN search
18
+ * - true - to run ENN search
19
+ *
20
+ * If omitted, defaults to false.
21
+ */
22
+ exact?: boolean
23
+
24
+ /**
25
+ * Any MQL match expression that compares an indexed field with a boolean, date, objectId, number (not decimals), string, or UUID to use as a pre-filter. To learn which query and aggregation pipeline operators Atlas Vector Search supports in your filter, see Atlas Vector Search Pre-Filter.
26
+ */
27
+ filter?: Document
28
+
29
+ /** Name of the Atlas Vector Search index to use. Atlas Vector Search doesn't return results if you misspell the index name or if the specified index doesn't already exist on the cluster. */
30
+ index: string
31
+
32
+ /** Number (of type int only) of documents to return in the results. This value can't exceed the value of numCandidates if you specify numCandidates. */
33
+ limit: number
34
+
35
+ /** This field is required if exact is false or omitted. Number of nearest neighbors to use during the search. Value must be less than or equal to (<=) 10000. You can't specify a number less than the number of documents to return (limit). We recommend that you specify a number higher than the number of documents to return (limit) to increase accuracy although this might impact latency. For example, we recommend a ratio of ten to twenty nearest neighbors for a limit of only one document. This overrequest pattern is the recommended way to trade off latency and recall in your ANN searches, and we recommend tuning this on your specific dataset. */
36
+ numCandidates?: number
37
+
38
+ /** Indexed vector type field to search. */
39
+ path: string
40
+
41
+ /** Array of numbers of the BSON double, BSON BinData vector subtype float32, or BSON BinData vector subtype int1 or int8 type that represent the query vector. The number type must match the indexed field value type. Otherwise, Atlas Vector Search returns neither results nor an error. */
42
+ queryVector: number[]
43
+
44
+ }