@langchain/core 0.1.61 → 0.2.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/document_loaders/base.cjs +24 -0
- package/dist/document_loaders/base.d.ts +28 -0
- package/dist/document_loaders/base.js +20 -0
- package/dist/documents/transformers.d.ts +1 -1
- package/dist/indexing/base.cjs +270 -0
- package/dist/indexing/base.d.ts +114 -0
- package/dist/indexing/base.js +261 -0
- package/dist/indexing/index.cjs +18 -0
- package/dist/indexing/index.d.ts +2 -0
- package/dist/indexing/index.js +2 -0
- package/dist/indexing/record_manager.cjs +18 -0
- package/dist/indexing/record_manager.d.ts +64 -0
- package/dist/indexing/record_manager.js +14 -0
- package/dist/runnables/base.cjs +10 -39
- package/dist/runnables/base.d.ts +11 -0
- package/dist/runnables/base.js +10 -39
- package/dist/runnables/remote.d.ts +4 -0
- package/dist/runnables/wrappers.cjs +18 -0
- package/dist/runnables/wrappers.d.ts +2 -0
- package/dist/runnables/wrappers.js +14 -0
- package/dist/tracers/base.cjs +4 -4
- package/dist/tracers/base.js +4 -4
- package/document_loaders/base.cjs +1 -0
- package/document_loaders/base.d.cts +1 -0
- package/document_loaders/base.d.ts +1 -0
- package/document_loaders/base.js +1 -0
- package/indexing.cjs +1 -0
- package/indexing.d.cts +1 -0
- package/indexing.d.ts +1 -0
- package/indexing.js +1 -0
- package/package.json +27 -1
package/dist/document_loaders/base.cjs
ADDED
@@ -0,0 +1,24 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.BaseDocumentLoader = void 0;
+/**
+ * Abstract class that provides a default implementation for the
+ * loadAndSplit() method from the DocumentLoader interface. The load()
+ * method is left abstract and needs to be implemented by subclasses.
+ */
+class BaseDocumentLoader {
+    /**
+     * @deprecated Use `this.load()` and `splitter.splitDocuments()` individually.
+     * Loads the documents and splits them using a specified text splitter.
+     * @param textSplitter The TextSplitter instance to use for splitting the loaded documents. Defaults to a RecursiveCharacterTextSplitter instance.
+     * @returns A Promise that resolves with an array of Document instances, each split according to the provided TextSplitter.
+     */
+    async loadAndSplit(splitter) {
+        if (splitter === undefined) {
+            throw new Error("You must pass a text splitter to use this method.");
+        }
+        const docs = await this.load();
+        return splitter.invoke(docs);
+    }
+}
+exports.BaseDocumentLoader = BaseDocumentLoader;
package/dist/document_loaders/base.d.ts
ADDED
@@ -0,0 +1,28 @@
+import { Document } from "../documents/document.js";
+import { BaseDocumentTransformer } from "../documents/transformers.js";
+/**
+ * Interface that defines the methods for loading and splitting documents.
+ */
+export interface DocumentLoader {
+    load(): Promise<Document[]>;
+    loadAndSplit(textSplitter?: BaseDocumentTransformer): Promise<Document[]>;
+}
+/**
+ * Abstract class that provides a default implementation for the
+ * loadAndSplit() method from the DocumentLoader interface. The load()
+ * method is left abstract and needs to be implemented by subclasses.
+ */
+export declare abstract class BaseDocumentLoader implements DocumentLoader {
+    /**
+     * Loads the documents.
+     * @returns A Promise that resolves with an array of Document instances.
+     */
+    abstract load(): Promise<Document[]>;
+    /**
+     * @deprecated Use `this.load()` and `splitter.splitDocuments()` individually.
+     * Loads the documents and splits them using a specified text splitter.
+     * @param textSplitter The TextSplitter instance to use for splitting the loaded documents. Defaults to a RecursiveCharacterTextSplitter instance.
+     * @returns A Promise that resolves with an array of Document instances, each split according to the provided TextSplitter.
+     */
+    loadAndSplit(splitter?: BaseDocumentTransformer): Promise<Document[]>;
+}
package/dist/document_loaders/base.js
ADDED
@@ -0,0 +1,20 @@
+/**
+ * Abstract class that provides a default implementation for the
+ * loadAndSplit() method from the DocumentLoader interface. The load()
+ * method is left abstract and needs to be implemented by subclasses.
+ */
+export class BaseDocumentLoader {
+    /**
+     * @deprecated Use `this.load()` and `splitter.splitDocuments()` individually.
+     * Loads the documents and splits them using a specified text splitter.
+     * @param textSplitter The TextSplitter instance to use for splitting the loaded documents. Defaults to a RecursiveCharacterTextSplitter instance.
+     * @returns A Promise that resolves with an array of Document instances, each split according to the provided TextSplitter.
+     */
+    async loadAndSplit(splitter) {
+        if (splitter === undefined) {
+            throw new Error("You must pass a text splitter to use this method.");
+        }
+        const docs = await this.load();
+        return splitter.invoke(docs);
+    }
+}
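Note: the new `BaseDocumentLoader` leaves `load()` abstract, and the deprecated `loadAndSplit()` now throws unless an explicit splitter is passed. A minimal sketch of subclassing it — the `InMemoryLoader` class below is hypothetical, for illustration only:

    import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
    import { Document } from "@langchain/core/documents";

    // Hypothetical loader that wraps an in-memory list of strings.
    class InMemoryLoader extends BaseDocumentLoader {
      constructor(private texts: string[]) {
        super();
      }

      // load() is the only abstract member that must be implemented.
      async load(): Promise<Document[]> {
        return this.texts.map((text) => new Document({ pageContent: text }));
      }
    }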
package/dist/documents/transformers.d.ts
CHANGED
@@ -26,7 +26,7 @@ export declare abstract class BaseDocumentTransformer<RunInput extends DocumentI
      * @param _options Optional configuration object to customize the behavior of callbacks.
      * @returns A Promise that resolves to the transformed documents.
      */
-    invoke(input: RunInput, _options
+    invoke(input: RunInput, _options?: BaseCallbackConfig): Promise<RunOutput>;
 }
 /**
  * Class for document transformers that return exactly one transformed document
package/dist/indexing/base.cjs
ADDED
@@ -0,0 +1,270 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.index = exports._isBaseDocumentLoader = exports._getSourceIdAssigner = exports._deduplicateInOrder = exports._batch = exports._HashedDocument = void 0;
+const uuid_1 = require("uuid");
+const record_manager_js_1 = require("./record_manager.cjs");
+const hash_js_1 = require("../utils/hash.cjs");
+const document_js_1 = require("../documents/document.cjs");
+/**
+ * HashedDocument is a Document with hashes calculated.
+ * Hashes are calculated based on page content and metadata.
+ * It is used for indexing.
+ */
+class _HashedDocument {
+    constructor(fields) {
+        Object.defineProperty(this, "uid", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "hash_", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "contentHash", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "metadataHash", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "pageContent", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "metadata", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        this.uid = fields.uid;
+        this.pageContent = fields.pageContent;
+        this.metadata = fields.metadata;
+    }
+    calculateHashes() {
+        const forbiddenKeys = ["hash_", "content_hash", "metadata_hash"];
+        for (const key of forbiddenKeys) {
+            if (key in this.metadata) {
+                throw new Error(`Metadata cannot contain key ${key} as it is reserved for internal use. Restricted keys: [${forbiddenKeys.join(", ")}]`);
+            }
+        }
+        const contentHash = this._hashStringToUUID(this.pageContent);
+        try {
+            const metadataHash = this._hashNestedDictToUUID(this.metadata);
+            this.contentHash = contentHash;
+            this.metadataHash = metadataHash;
+        }
+        catch (e) {
+            throw new Error(`Failed to hash metadata: ${e}. Please use a dict that can be serialized using json.`);
+        }
+        this.hash_ = this._hashStringToUUID(this.contentHash + this.metadataHash);
+        if (!this.uid) {
+            this.uid = this.hash_;
+        }
+    }
+    toDocument() {
+        return new document_js_1.Document({
+            pageContent: this.pageContent,
+            metadata: this.metadata,
+        });
+    }
+    static fromDocument(document, uid) {
+        const doc = new this({
+            pageContent: document.pageContent,
+            metadata: document.metadata,
+            uid: uid || document.uid,
+        });
+        doc.calculateHashes();
+        return doc;
+    }
+    _hashStringToUUID(inputString) {
+        const hash_value = (0, hash_js_1.insecureHash)(inputString);
+        return (0, uuid_1.v5)(hash_value, record_manager_js_1.UUIDV5_NAMESPACE);
+    }
+    _hashNestedDictToUUID(data) {
+        const serialized_data = JSON.stringify(data, Object.keys(data).sort());
+        const hash_value = (0, hash_js_1.insecureHash)(serialized_data);
+        return (0, uuid_1.v5)(hash_value, record_manager_js_1.UUIDV5_NAMESPACE);
+    }
+}
+exports._HashedDocument = _HashedDocument;
+function _batch(size, iterable) {
+    const batches = [];
+    let currentBatch = [];
+    iterable.forEach((item) => {
+        currentBatch.push(item);
+        if (currentBatch.length >= size) {
+            batches.push(currentBatch);
+            currentBatch = [];
+        }
+    });
+    if (currentBatch.length > 0) {
+        batches.push(currentBatch);
+    }
+    return batches;
+}
+exports._batch = _batch;
+function _deduplicateInOrder(hashedDocuments) {
+    const seen = new Set();
+    const deduplicated = [];
+    for (const hashedDoc of hashedDocuments) {
+        if (!hashedDoc.hash_) {
+            throw new Error("Hashed document does not have a hash");
+        }
+        if (!seen.has(hashedDoc.hash_)) {
+            seen.add(hashedDoc.hash_);
+            deduplicated.push(hashedDoc);
+        }
+    }
+    return deduplicated;
+}
+exports._deduplicateInOrder = _deduplicateInOrder;
+function _getSourceIdAssigner(sourceIdKey) {
+    if (sourceIdKey === null) {
+        return (_doc) => null;
+    }
+    else if (typeof sourceIdKey === "string") {
+        return (doc) => doc.metadata[sourceIdKey];
+    }
+    else if (typeof sourceIdKey === "function") {
+        return sourceIdKey;
+    }
+    else {
+        throw new Error(`sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`);
+    }
+}
+exports._getSourceIdAssigner = _getSourceIdAssigner;
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+const _isBaseDocumentLoader = (arg) => {
+    if ("load" in arg &&
+        typeof arg.load === "function" &&
+        "loadAndSplit" in arg &&
+        typeof arg.loadAndSplit === "function") {
+        return true;
+    }
+    return false;
+};
+exports._isBaseDocumentLoader = _isBaseDocumentLoader;
+/**
+ * Index data from the doc source into the vector store.
+ *
+ * Indexing functionality uses a manager to keep track of which documents
+ * are in the vector store.
+ *
+ * This allows us to keep track of which documents were updated, which
+ * documents were deleted, and which documents should be skipped.
+ *
+ * For the time being, documents are indexed using their hashes, and users
+ * are not able to specify the uid of the document.
+ *
+ * @param {IndexArgs} args
+ * @param {BaseDocumentLoader | DocumentInterface[]} args.docsSource The source of documents to index. Can be a DocumentLoader or a list of Documents.
+ * @param {RecordManagerInterface} args.recordManager The record manager to use for keeping track of indexed documents.
+ * @param {VectorStore} args.vectorStore The vector store to use for storing the documents.
+ * @param {IndexOptions | undefined} args.options Options for indexing.
+ * @returns {Promise<IndexingResult>}
+ */
+async function index(args) {
+    const { docsSource, recordManager, vectorStore, options } = args;
+    const { batchSize = 100, cleanup, sourceIdKey, cleanupBatchSize = 1000, forceUpdate = false, } = options ?? {};
+    if (cleanup === "incremental" && !sourceIdKey) {
+        throw new Error("sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.");
+    }
+    const docs = (0, exports._isBaseDocumentLoader)(docsSource)
+        ? await docsSource.load()
+        : docsSource;
+    const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);
+    const indexStartDt = await recordManager.getTime();
+    let numAdded = 0;
+    let numDeleted = 0;
+    let numUpdated = 0;
+    let numSkipped = 0;
+    const batches = _batch(batchSize ?? 100, docs);
+    for (const batch of batches) {
+        const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc)));
+        const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));
+        if (cleanup === "incremental") {
+            hashedDocs.forEach((_hashedDoc, index) => {
+                const source = sourceIds[index];
+                if (source === null) {
+                    throw new Error("sourceIdKey must be provided when cleanup is incremental");
+                }
+            });
+        }
+        const batchExists = await recordManager.exists(hashedDocs.map((doc) => doc.uid));
+        const uids = [];
+        const docsToIndex = [];
+        const docsToUpdate = [];
+        const seenDocs = new Set();
+        hashedDocs.forEach((hashedDoc, i) => {
+            const docExists = batchExists[i];
+            if (docExists) {
+                if (forceUpdate) {
+                    seenDocs.add(hashedDoc.uid);
+                }
+                else {
+                    docsToUpdate.push(hashedDoc.uid);
+                    return;
+                }
+            }
+            uids.push(hashedDoc.uid);
+            docsToIndex.push(hashedDoc.toDocument());
+        });
+        if (docsToUpdate.length > 0) {
+            await recordManager.update(docsToUpdate, { timeAtLeast: indexStartDt });
+            numSkipped += docsToUpdate.length;
+        }
+        if (docsToIndex.length > 0) {
+            await vectorStore.addDocuments(docsToIndex, { ids: uids });
+            numAdded += docsToIndex.length - seenDocs.size;
+            numUpdated += seenDocs.size;
+        }
+        await recordManager.update(hashedDocs.map((doc) => doc.uid), { timeAtLeast: indexStartDt, groupIds: sourceIds });
+        if (cleanup === "incremental") {
+            sourceIds.forEach((sourceId) => {
+                if (!sourceId)
+                    throw new Error("Source id cannot be null");
+            });
+            const uidsToDelete = await recordManager.listKeys({
+                before: indexStartDt,
+                groupIds: sourceIds,
+            });
+            await vectorStore.delete({ ids: uidsToDelete });
+            await recordManager.deleteKeys(uidsToDelete);
+            numDeleted += uidsToDelete.length;
+        }
+    }
+    if (cleanup === "full") {
+        let uidsToDelete = await recordManager.listKeys({
+            before: indexStartDt,
+            limit: cleanupBatchSize,
+        });
+        while (uidsToDelete.length > 0) {
+            await vectorStore.delete({ ids: uidsToDelete });
+            await recordManager.deleteKeys(uidsToDelete);
+            numDeleted += uidsToDelete.length;
+            uidsToDelete = await recordManager.listKeys({
+                before: indexStartDt,
+                limit: cleanupBatchSize,
+            });
+        }
+    }
+    return {
+        numAdded,
+        numDeleted,
+        numUpdated,
+        numSkipped,
+    };
+}
+exports.index = index;
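For reference, `_batch` greedily chunks an array into groups of at most `size` items, and `_getSourceIdAssigner` normalizes the `sourceIdKey` option into a lookup function. Expected values are shown in comments, assuming the exports above are pulled in via the new `@langchain/core/indexing` entrypoint:

    import { _batch, _getSourceIdAssigner } from "@langchain/core/indexing";
    import { Document } from "@langchain/core/documents";

    _batch(2, ["a", "b", "c", "d", "e"]);
    // => [["a", "b"], ["c", "d"], ["e"]]

    const assignSource = _getSourceIdAssigner("source");
    assignSource(new Document({ pageContent: "x", metadata: { source: "a.txt" } }));
    // => "a.txt"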
package/dist/indexing/base.d.ts
ADDED
@@ -0,0 +1,114 @@
+import { VectorStore } from "../vectorstores.js";
+import { RecordManagerInterface } from "./record_manager.js";
+import { DocumentInterface } from "../documents/document.js";
+import { BaseDocumentLoader } from "../document_loaders/base.js";
+type Metadata = Record<string, unknown>;
+type IndexingResult = {
+    numAdded: number;
+    numDeleted: number;
+    numUpdated: number;
+    numSkipped: number;
+};
+type StringOrDocFunc = string | ((doc: DocumentInterface) => string);
+export interface HashedDocumentInterface extends DocumentInterface {
+    uid: string;
+    hash_?: string;
+    contentHash?: string;
+    metadataHash?: string;
+    pageContent: string;
+    metadata: Metadata;
+    calculateHashes(): void;
+    toDocument(): DocumentInterface;
+}
+interface HashedDocumentArgs {
+    pageContent: string;
+    metadata: Metadata;
+    uid: string;
+}
+/**
+ * HashedDocument is a Document with hashes calculated.
+ * Hashes are calculated based on page content and metadata.
+ * It is used for indexing.
+ */
+export declare class _HashedDocument implements HashedDocumentInterface {
+    uid: string;
+    hash_?: string;
+    contentHash?: string;
+    metadataHash?: string;
+    pageContent: string;
+    metadata: Metadata;
+    constructor(fields: HashedDocumentArgs);
+    calculateHashes(): void;
+    toDocument(): DocumentInterface;
+    static fromDocument(document: DocumentInterface, uid?: string): _HashedDocument;
+    private _hashStringToUUID;
+    private _hashNestedDictToUUID;
+}
+export type CleanupMode = "full" | "incremental";
+export type IndexOptions = {
+    /**
+     * The number of documents to index in one batch.
+     */
+    batchSize?: number;
+    /**
+     * The cleanup mode to use. Can be "full", "incremental" or undefined.
+     * - **Incremental**: Cleans up all documents that haven't been updated AND
+     *   that are associated with source ids that were seen
+     *   during indexing.
+     *   Clean up is done continuously during indexing, helping
+     *   to minimize the probability of users seeing duplicated
+     *   content.
+     * - **Full**: Deletes all documents that haven't been returned by the loader.
+     *   Clean up runs after all documents have been indexed.
+     *   This means that users may see duplicated content during indexing.
+     * - **undefined**: Do not delete any documents.
+     */
+    cleanup?: CleanupMode;
+    /**
+     * Optional key that helps identify the original source of the document.
+     * Must either be a string representing the key of the source in the metadata
+     * or a function that takes a document and returns a string representing the source.
+     * **Required when cleanup is incremental**.
+     */
+    sourceIdKey?: StringOrDocFunc;
+    /**
+     * Batch size to use when cleaning up documents.
+     */
+    cleanupBatchSize?: number;
+    /**
+     * Force update documents even if they are present in the
+     * record manager. Useful if you are re-indexing with updated embeddings.
+     */
+    forceUpdate?: boolean;
+};
+export declare function _batch<T>(size: number, iterable: T[]): T[][];
+export declare function _deduplicateInOrder(hashedDocuments: HashedDocumentInterface[]): HashedDocumentInterface[];
+export declare function _getSourceIdAssigner(sourceIdKey: StringOrDocFunc | null): (doc: DocumentInterface) => string | null;
+export declare const _isBaseDocumentLoader: (arg: any) => arg is BaseDocumentLoader;
+interface IndexArgs {
+    docsSource: BaseDocumentLoader | DocumentInterface[];
+    recordManager: RecordManagerInterface;
+    vectorStore: VectorStore;
+    options?: IndexOptions;
+}
+/**
+ * Index data from the doc source into the vector store.
+ *
+ * Indexing functionality uses a manager to keep track of which documents
+ * are in the vector store.
+ *
+ * This allows us to keep track of which documents were updated, which
+ * documents were deleted, and which documents should be skipped.
+ *
+ * For the time being, documents are indexed using their hashes, and users
+ * are not able to specify the uid of the document.
+ *
+ * @param {IndexArgs} args
+ * @param {BaseDocumentLoader | DocumentInterface[]} args.docsSource The source of documents to index. Can be a DocumentLoader or a list of Documents.
+ * @param {RecordManagerInterface} args.recordManager The record manager to use for keeping track of indexed documents.
+ * @param {VectorStore} args.vectorStore The vector store to use for storing the documents.
+ * @param {IndexOptions | undefined} args.options Options for indexing.
+ * @returns {Promise<IndexingResult>}
+ */
+export declare function index(args: IndexArgs): Promise<IndexingResult>;
+export {};
package/dist/indexing/base.js
ADDED
@@ -0,0 +1,261 @@
+import { v5 as uuidv5 } from "uuid";
+import { UUIDV5_NAMESPACE } from "./record_manager.js";
+import { insecureHash } from "../utils/hash.js";
+import { Document } from "../documents/document.js";
+/**
+ * HashedDocument is a Document with hashes calculated.
+ * Hashes are calculated based on page content and metadata.
+ * It is used for indexing.
+ */
+export class _HashedDocument {
+    constructor(fields) {
+        Object.defineProperty(this, "uid", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "hash_", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "contentHash", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "metadataHash", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "pageContent", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        Object.defineProperty(this, "metadata", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        this.uid = fields.uid;
+        this.pageContent = fields.pageContent;
+        this.metadata = fields.metadata;
+    }
+    calculateHashes() {
+        const forbiddenKeys = ["hash_", "content_hash", "metadata_hash"];
+        for (const key of forbiddenKeys) {
+            if (key in this.metadata) {
+                throw new Error(`Metadata cannot contain key ${key} as it is reserved for internal use. Restricted keys: [${forbiddenKeys.join(", ")}]`);
+            }
+        }
+        const contentHash = this._hashStringToUUID(this.pageContent);
+        try {
+            const metadataHash = this._hashNestedDictToUUID(this.metadata);
+            this.contentHash = contentHash;
+            this.metadataHash = metadataHash;
+        }
+        catch (e) {
+            throw new Error(`Failed to hash metadata: ${e}. Please use a dict that can be serialized using json.`);
+        }
+        this.hash_ = this._hashStringToUUID(this.contentHash + this.metadataHash);
+        if (!this.uid) {
+            this.uid = this.hash_;
+        }
+    }
+    toDocument() {
+        return new Document({
+            pageContent: this.pageContent,
+            metadata: this.metadata,
+        });
+    }
+    static fromDocument(document, uid) {
+        const doc = new this({
+            pageContent: document.pageContent,
+            metadata: document.metadata,
+            uid: uid || document.uid,
+        });
+        doc.calculateHashes();
+        return doc;
+    }
+    _hashStringToUUID(inputString) {
+        const hash_value = insecureHash(inputString);
+        return uuidv5(hash_value, UUIDV5_NAMESPACE);
+    }
+    _hashNestedDictToUUID(data) {
+        const serialized_data = JSON.stringify(data, Object.keys(data).sort());
+        const hash_value = insecureHash(serialized_data);
+        return uuidv5(hash_value, UUIDV5_NAMESPACE);
+    }
+}
+export function _batch(size, iterable) {
+    const batches = [];
+    let currentBatch = [];
+    iterable.forEach((item) => {
+        currentBatch.push(item);
+        if (currentBatch.length >= size) {
+            batches.push(currentBatch);
+            currentBatch = [];
+        }
+    });
+    if (currentBatch.length > 0) {
+        batches.push(currentBatch);
+    }
+    return batches;
+}
+export function _deduplicateInOrder(hashedDocuments) {
+    const seen = new Set();
+    const deduplicated = [];
+    for (const hashedDoc of hashedDocuments) {
+        if (!hashedDoc.hash_) {
+            throw new Error("Hashed document does not have a hash");
+        }
+        if (!seen.has(hashedDoc.hash_)) {
+            seen.add(hashedDoc.hash_);
+            deduplicated.push(hashedDoc);
+        }
+    }
+    return deduplicated;
+}
+export function _getSourceIdAssigner(sourceIdKey) {
+    if (sourceIdKey === null) {
+        return (_doc) => null;
+    }
+    else if (typeof sourceIdKey === "string") {
+        return (doc) => doc.metadata[sourceIdKey];
+    }
+    else if (typeof sourceIdKey === "function") {
+        return sourceIdKey;
+    }
+    else {
+        throw new Error(`sourceIdKey should be null, a string or a function, got ${typeof sourceIdKey}`);
+    }
+}
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+export const _isBaseDocumentLoader = (arg) => {
+    if ("load" in arg &&
+        typeof arg.load === "function" &&
+        "loadAndSplit" in arg &&
+        typeof arg.loadAndSplit === "function") {
+        return true;
+    }
+    return false;
+};
+/**
+ * Index data from the doc source into the vector store.
+ *
+ * Indexing functionality uses a manager to keep track of which documents
+ * are in the vector store.
+ *
+ * This allows us to keep track of which documents were updated, which
+ * documents were deleted, and which documents should be skipped.
+ *
+ * For the time being, documents are indexed using their hashes, and users
+ * are not able to specify the uid of the document.
+ *
+ * @param {IndexArgs} args
+ * @param {BaseDocumentLoader | DocumentInterface[]} args.docsSource The source of documents to index. Can be a DocumentLoader or a list of Documents.
+ * @param {RecordManagerInterface} args.recordManager The record manager to use for keeping track of indexed documents.
+ * @param {VectorStore} args.vectorStore The vector store to use for storing the documents.
+ * @param {IndexOptions | undefined} args.options Options for indexing.
+ * @returns {Promise<IndexingResult>}
+ */
+export async function index(args) {
+    const { docsSource, recordManager, vectorStore, options } = args;
+    const { batchSize = 100, cleanup, sourceIdKey, cleanupBatchSize = 1000, forceUpdate = false, } = options ?? {};
+    if (cleanup === "incremental" && !sourceIdKey) {
+        throw new Error("sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.");
+    }
+    const docs = _isBaseDocumentLoader(docsSource)
+        ? await docsSource.load()
+        : docsSource;
+    const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null);
+    const indexStartDt = await recordManager.getTime();
+    let numAdded = 0;
+    let numDeleted = 0;
+    let numUpdated = 0;
+    let numSkipped = 0;
+    const batches = _batch(batchSize ?? 100, docs);
+    for (const batch of batches) {
+        const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc)));
+        const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc));
+        if (cleanup === "incremental") {
+            hashedDocs.forEach((_hashedDoc, index) => {
+                const source = sourceIds[index];
+                if (source === null) {
+                    throw new Error("sourceIdKey must be provided when cleanup is incremental");
+                }
+            });
+        }
+        const batchExists = await recordManager.exists(hashedDocs.map((doc) => doc.uid));
+        const uids = [];
+        const docsToIndex = [];
+        const docsToUpdate = [];
+        const seenDocs = new Set();
+        hashedDocs.forEach((hashedDoc, i) => {
+            const docExists = batchExists[i];
+            if (docExists) {
+                if (forceUpdate) {
+                    seenDocs.add(hashedDoc.uid);
+                }
+                else {
+                    docsToUpdate.push(hashedDoc.uid);
+                    return;
+                }
+            }
+            uids.push(hashedDoc.uid);
+            docsToIndex.push(hashedDoc.toDocument());
+        });
+        if (docsToUpdate.length > 0) {
+            await recordManager.update(docsToUpdate, { timeAtLeast: indexStartDt });
+            numSkipped += docsToUpdate.length;
+        }
+        if (docsToIndex.length > 0) {
+            await vectorStore.addDocuments(docsToIndex, { ids: uids });
+            numAdded += docsToIndex.length - seenDocs.size;
+            numUpdated += seenDocs.size;
+        }
+        await recordManager.update(hashedDocs.map((doc) => doc.uid), { timeAtLeast: indexStartDt, groupIds: sourceIds });
+        if (cleanup === "incremental") {
+            sourceIds.forEach((sourceId) => {
+                if (!sourceId)
+                    throw new Error("Source id cannot be null");
+            });
+            const uidsToDelete = await recordManager.listKeys({
+                before: indexStartDt,
+                groupIds: sourceIds,
+            });
+            await vectorStore.delete({ ids: uidsToDelete });
+            await recordManager.deleteKeys(uidsToDelete);
+            numDeleted += uidsToDelete.length;
+        }
+    }
+    if (cleanup === "full") {
+        let uidsToDelete = await recordManager.listKeys({
+            before: indexStartDt,
+            limit: cleanupBatchSize,
+        });
+        while (uidsToDelete.length > 0) {
+            await vectorStore.delete({ ids: uidsToDelete });
+            await recordManager.deleteKeys(uidsToDelete);
+            numDeleted += uidsToDelete.length;
+            uidsToDelete = await recordManager.listKeys({
+                before: indexStartDt,
+                limit: cleanupBatchSize,
+            });
+        }
+    }
+    return {
+        numAdded,
+        numDeleted,
+        numUpdated,
+        numSkipped,
+    };
+}
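A sketch of how the new `index()` entrypoint is called, assuming concrete `recordManager` (a `RecordManagerInterface` implementation) and `vectorStore` instances are already constructed; the option values are illustrative:

    import { index } from "@langchain/core/indexing";

    const result = await index({
      docsSource: docs, // a BaseDocumentLoader or an array of documents
      recordManager,
      vectorStore,
      options: {
        cleanup: "incremental", // requires sourceIdKey
        sourceIdKey: "source",  // metadata key identifying each document's origin
        batchSize: 100,
      },
    });
    // result is an IndexingResult: { numAdded, numDeleted, numUpdated, numSkipped }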
package/dist/indexing/index.cjs
ADDED
@@ -0,0 +1,18 @@
+"use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+        desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __exportStar = (this && this.__exportStar) || function(m, exports) {
+    for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+__exportStar(require("./record_manager.cjs"), exports);
+__exportStar(require("./base.cjs"), exports);
package/dist/indexing/record_manager.cjs
ADDED
@@ -0,0 +1,18 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.RecordManager = exports.UUIDV5_NAMESPACE = void 0;
+const serializable_js_1 = require("../load/serializable.cjs");
+// Arbitrary value, used for generating namespaced UUIDs.
+exports.UUIDV5_NAMESPACE = "10f90ea3-90a4-4962-bf75-83a0f3c1c62a";
+class RecordManager extends serializable_js_1.Serializable {
+    constructor() {
+        super(...arguments);
+        Object.defineProperty(this, "lc_namespace", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: ["langchain", "recordmanagers"]
+        });
+    }
+}
+exports.RecordManager = RecordManager;
package/dist/indexing/record_manager.d.ts
ADDED
@@ -0,0 +1,64 @@
+import { Serializable } from "../load/serializable.js";
+export declare const UUIDV5_NAMESPACE = "10f90ea3-90a4-4962-bf75-83a0f3c1c62a";
+export type UpdateOptions = {
+    groupIds?: (string | null)[];
+    timeAtLeast?: number;
+};
+export type ListKeyOptions = {
+    before?: number;
+    after?: number;
+    groupIds?: (string | null)[];
+    limit?: number;
+};
+export interface RecordManagerInterface {
+    /**
+     * Creates schema in the record manager.
+     * @returns Promise
+     */
+    createSchema(): Promise<void>;
+    /**
+     * Returns current time from the record manager.
+     * @returns Current time
+     */
+    getTime(): Promise<number>;
+    /**
+     * Updates keys in the record manager.
+     * @param keys List of keys to update
+     * @param groupIds List of groupIds to update
+     * @param timeAtLeast Update only if current time is at least this value
+     * @returns Promise
+     * @throws Error if timeAtLeast is provided and current time is less than timeAtLeast
+     * @throws Error if number of keys does not match number of groupIds
+     */
+    update(keys: string[], updateOptions: UpdateOptions): Promise<void>;
+    /**
+     * Checks if keys exist in the record manager.
+     * @param keys List of keys to check
+     * @returns List of booleans indicating if key exists in same order as provided keys
+     */
+    exists(keys: string[]): Promise<boolean[]>;
+    /**
+     * Lists keys from the record manager.
+     * @param before List keys before this timestamp
+     * @param after List keys after this timestamp
+     * @param groupIds List keys with these groupIds
+     * @param limit Limit the number of keys returned
+     * @returns List of keys
+     *
+     */
+    listKeys(options: ListKeyOptions): Promise<string[]>;
+    /**
+     * Deletes keys from the record manager.
+     * @param keys List of keys to delete
+     */
+    deleteKeys(keys: string[]): Promise<void>;
+}
+export declare abstract class RecordManager extends Serializable implements RecordManagerInterface {
+    lc_namespace: string[];
+    abstract createSchema(): Promise<void>;
+    abstract getTime(): Promise<number>;
+    abstract update(keys: string[], updateOptions?: UpdateOptions): Promise<void>;
+    abstract exists(keys: string[]): Promise<boolean[]>;
+    abstract listKeys(options?: ListKeyOptions): Promise<string[]>;
+    abstract deleteKeys(keys: string[]): Promise<void>;
+}
package/dist/indexing/record_manager.js
ADDED
@@ -0,0 +1,14 @@
+import { Serializable } from "../load/serializable.js";
+// Arbitrary value, used for generating namespaced UUIDs.
+export const UUIDV5_NAMESPACE = "10f90ea3-90a4-4962-bf75-83a0f3c1c62a";
+export class RecordManager extends Serializable {
+    constructor() {
+        super(...arguments);
+        Object.defineProperty(this, "lc_namespace", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: ["langchain", "recordmanagers"]
+        });
+    }
+}
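`RecordManager` above is abstract and ships with no storage backend. A toy, non-persistent in-memory sketch of what an implementation might look like — illustrative only; real record managers persist records to a database:

    import { RecordManager, type ListKeyOptions, type UpdateOptions } from "@langchain/core/indexing";

    class InMemoryRecordManager extends RecordManager {
      records = new Map<string, { updatedAt: number; groupId: string | null }>();

      async createSchema(): Promise<void> {} // nothing to set up in memory
      async getTime(): Promise<number> {
        return Date.now();
      }
      async update(keys: string[], options?: UpdateOptions): Promise<void> {
        const updatedAt = await this.getTime();
        keys.forEach((key, i) => {
          this.records.set(key, { updatedAt, groupId: options?.groupIds?.[i] ?? null });
        });
      }
      async exists(keys: string[]): Promise<boolean[]> {
        return keys.map((key) => this.records.has(key));
      }
      async listKeys(options?: ListKeyOptions): Promise<string[]> {
        const { before, after, limit, groupIds } = options ?? {};
        const keys = [...this.records.entries()]
          .filter(([, v]) =>
            (before === undefined || v.updatedAt < before) &&
            (after === undefined || v.updatedAt > after) &&
            (groupIds === undefined || groupIds.includes(v.groupId)))
          .map(([key]) => key);
        return limit === undefined ? keys : keys.slice(0, limit);
      }
      async deleteKeys(keys: string[]): Promise<void> {
        keys.forEach((key) => this.records.delete(key));
      }
    }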
package/dist/runnables/base.cjs
CHANGED
@@ -16,6 +16,7 @@ const root_listener_js_1 = require("../tracers/root_listener.cjs");
 const utils_js_1 = require("./utils.cjs");
 const index_js_1 = require("../singletons/index.cjs");
 const graph_js_1 = require("./graph.cjs");
+const wrappers_js_1 = require("./wrappers.cjs");
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
 function _coerceToDict(value, defaultKey) {
     return value &&
@@ -448,46 +449,16 @@ class Runnable extends serializable_js_1.Serializable {
             await runnableStreamConsumePromise;
         }
     }
-    /**
-     * Generate a stream of events emitted by the internal steps of the runnable.
-     *
-     * Use to create an iterator over StreamEvents that provide real-time information
-     * about the progress of the runnable, including StreamEvents from intermediate
-     * results.
-     *
-     * A StreamEvent is a dictionary with the following schema:
-     *
-     * - `event`: string - Event names are of the format: on_[runnable_type]_(start|stream|end).
-     * - `name`: string - The name of the runnable that generated the event.
-     * - `run_id`: string - Randomly generated ID associated with the given execution of
-     *   the runnable that emitted the event. A child runnable that gets invoked as part of the execution of a
-     *   parent runnable is assigned its own unique ID.
-     * - `tags`: string[] - The tags of the runnable that generated the event.
-     * - `metadata`: Record<string, any> - The metadata of the runnable that generated the event.
-     * - `data`: Record<string, any>
-     *
-     * Below is a table that illustrates some events that might be emitted by various
-     * chains. Metadata fields have been omitted from the table for brevity.
-     * Chain definitions have been included after the table.
-     *
-     * | event | name | chunk | input | output |
-     * |----------------------|------------------|------------------------------------|-----------------------------------------------|-------------------------------------------------|
-     * | on_llm_start | [model name] | | {'input': 'hello'} | |
-     * | on_llm_stream | [model name] | 'Hello' OR AIMessageChunk("hello") | | |
-     * | on_llm_end | [model name] | | 'Hello human!' |
-     * | on_chain_start | format_docs | | | |
-     * | on_chain_stream | format_docs | "hello world!, goodbye world!" | | |
-     * | on_chain_end | format_docs | | [Document(...)] | "hello world!, goodbye world!" |
-     * | on_tool_start | some_tool | | {"x": 1, "y": "2"} | |
-     * | on_tool_stream | some_tool | {"x": 1, "y": "2"} | | |
-     * | on_tool_end | some_tool | | | {"x": 1, "y": "2"} |
-     * | on_retriever_start | [retriever name] | | {"query": "hello"} | |
-     * | on_retriever_chunk | [retriever name] | {documents: [...]} | | |
-     * | on_retriever_end | [retriever name] | | {"query": "hello"} | {documents: [...]} |
-     * | on_prompt_start | [template_name] | | {"question": "hello"} | |
-     * | on_prompt_end | [template_name] | | {"question": "hello"} | ChatPromptValue(messages: [SystemMessage, ...]) |
-     */
     async *streamEvents(input, options, streamOptions) {
+        if (options.encoding === "text/event-stream") {
+            const stream = await this._streamEvents(input, options, streamOptions);
+            yield* (0, wrappers_js_1.convertToHttpEventStream)(stream);
+        }
+        else {
+            yield* this._streamEvents(input, options, streamOptions);
+        }
+    }
+    async *_streamEvents(input, options, streamOptions) {
         if (options.version !== "v1") {
             throw new Error(`Only version "v1" of the events schema is currently supported.`);
         }
package/dist/runnables/base.d.ts
CHANGED
@@ -199,6 +199,13 @@ export declare abstract class Runnable<RunInput = any, RunOutput = any, CallOpti
     streamEvents(input: RunInput, options: Partial<CallOptions> & {
         version: "v1";
     }, streamOptions?: Omit<LogStreamCallbackHandlerInput, "autoClose">): AsyncGenerator<StreamEvent>;
+    streamEvents(input: RunInput, options: Partial<CallOptions> & {
+        version: "v1";
+        encoding: "text/event-stream";
+    }, streamOptions?: Omit<LogStreamCallbackHandlerInput, "autoClose">): AsyncGenerator<Uint8Array>;
+    _streamEvents(input: RunInput, options: Partial<CallOptions> & {
+        version: "v1";
+    }, streamOptions?: Omit<LogStreamCallbackHandlerInput, "autoClose">): AsyncGenerator<StreamEvent>;
     static isRunnable(thing: any): thing is Runnable;
     /**
      * Bind lifecycle listeners to a Runnable, returning a new Runnable.
@@ -257,6 +264,10 @@ export declare class RunnableBinding<RunInput, RunOutput, CallOptions extends Ru
     streamEvents(input: RunInput, options: Partial<CallOptions> & {
         version: "v1";
     }, streamOptions?: Omit<LogStreamCallbackHandlerInput, "autoClose">): AsyncGenerator<StreamEvent>;
+    streamEvents(input: RunInput, options: Partial<CallOptions> & {
+        version: "v1";
+        encoding: "text/event-stream";
+    }, streamOptions?: Omit<LogStreamCallbackHandlerInput, "autoClose">): AsyncGenerator<Uint8Array>;
     static isRunnableBinding(thing: any): thing is RunnableBinding<any, any, any>;
     /**
      * Bind lifecycle listeners to a Runnable, returning a new Runnable.
package/dist/runnables/base.js
CHANGED
@@ -10,6 +10,7 @@ import { RootListenersTracer } from "../tracers/root_listener.js";
 import { _RootEventFilter, isRunnableInterface } from "./utils.js";
 import { AsyncLocalStorageProviderSingleton } from "../singletons/index.js";
 import { Graph } from "./graph.js";
+import { convertToHttpEventStream } from "./wrappers.js";
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
 export function _coerceToDict(value, defaultKey) {
     return value &&
@@ -441,46 +442,16 @@ export class Runnable extends Serializable {
             await runnableStreamConsumePromise;
         }
     }
-    /**
-     * Generate a stream of events emitted by the internal steps of the runnable.
-     *
-     * Use to create an iterator over StreamEvents that provide real-time information
-     * about the progress of the runnable, including StreamEvents from intermediate
-     * results.
-     *
-     * A StreamEvent is a dictionary with the following schema:
-     *
-     * - `event`: string - Event names are of the format: on_[runnable_type]_(start|stream|end).
-     * - `name`: string - The name of the runnable that generated the event.
-     * - `run_id`: string - Randomly generated ID associated with the given execution of
-     *   the runnable that emitted the event. A child runnable that gets invoked as part of the execution of a
-     *   parent runnable is assigned its own unique ID.
-     * - `tags`: string[] - The tags of the runnable that generated the event.
-     * - `metadata`: Record<string, any> - The metadata of the runnable that generated the event.
-     * - `data`: Record<string, any>
-     *
-     * Below is a table that illustrates some events that might be emitted by various
-     * chains. Metadata fields have been omitted from the table for brevity.
-     * Chain definitions have been included after the table.
-     *
-     * | event | name | chunk | input | output |
-     * |----------------------|------------------|------------------------------------|-----------------------------------------------|-------------------------------------------------|
-     * | on_llm_start | [model name] | | {'input': 'hello'} | |
-     * | on_llm_stream | [model name] | 'Hello' OR AIMessageChunk("hello") | | |
-     * | on_llm_end | [model name] | | 'Hello human!' |
-     * | on_chain_start | format_docs | | | |
-     * | on_chain_stream | format_docs | "hello world!, goodbye world!" | | |
-     * | on_chain_end | format_docs | | [Document(...)] | "hello world!, goodbye world!" |
-     * | on_tool_start | some_tool | | {"x": 1, "y": "2"} | |
-     * | on_tool_stream | some_tool | {"x": 1, "y": "2"} | | |
-     * | on_tool_end | some_tool | | | {"x": 1, "y": "2"} |
-     * | on_retriever_start | [retriever name] | | {"query": "hello"} | |
-     * | on_retriever_chunk | [retriever name] | {documents: [...]} | | |
-     * | on_retriever_end | [retriever name] | | {"query": "hello"} | {documents: [...]} |
-     * | on_prompt_start | [template_name] | | {"question": "hello"} | |
-     * | on_prompt_end | [template_name] | | {"question": "hello"} | ChatPromptValue(messages: [SystemMessage, ...]) |
-     */
     async *streamEvents(input, options, streamOptions) {
+        if (options.encoding === "text/event-stream") {
+            const stream = await this._streamEvents(input, options, streamOptions);
+            yield* convertToHttpEventStream(stream);
+        }
+        else {
+            yield* this._streamEvents(input, options, streamOptions);
+        }
+    }
+    async *_streamEvents(input, options, streamOptions) {
         if (options.version !== "v1") {
             throw new Error(`Only version "v1" of the events schema is currently supported.`);
         }

package/dist/runnables/remote.d.ts
CHANGED
@@ -30,5 +30,9 @@ export declare class RemoteRunnable<RunInput, RunOutput, CallOptions extends Run
     streamEvents(input: RunInput, options: Partial<CallOptions> & {
         version: "v1";
     }, streamOptions?: Omit<LogStreamCallbackHandlerInput, "autoClose">): AsyncGenerator<StreamEvent>;
+    streamEvents(input: RunInput, options: Partial<CallOptions> & {
+        version: "v1";
+        encoding: "text/event-stream";
+    }, streamOptions?: Omit<LogStreamCallbackHandlerInput, "autoClose">): AsyncGenerator<Uint8Array>;
 }
 export {};

package/dist/runnables/wrappers.cjs
ADDED
@@ -0,0 +1,18 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.convertToHttpEventStream = void 0;
+const stream_js_1 = require("../utils/stream.cjs");
+function convertToHttpEventStream(stream) {
+    const encoder = new TextEncoder();
+    const finalStream = new ReadableStream({
+        async start(controller) {
+            for await (const chunk of stream) {
+                controller.enqueue(encoder.encode(`event: data\ndata: ${JSON.stringify(chunk)}\n\n`));
+            }
+            controller.enqueue(encoder.encode("event: end\n\n"));
+            controller.close();
+        },
+    });
+    return stream_js_1.IterableReadableStream.fromReadableStream(finalStream);
+}
+exports.convertToHttpEventStream = convertToHttpEventStream;

package/dist/runnables/wrappers.js
ADDED
@@ -0,0 +1,14 @@
+import { IterableReadableStream } from "../utils/stream.js";
+export function convertToHttpEventStream(stream) {
+    const encoder = new TextEncoder();
+    const finalStream = new ReadableStream({
+        async start(controller) {
+            for await (const chunk of stream) {
+                controller.enqueue(encoder.encode(`event: data\ndata: ${JSON.stringify(chunk)}\n\n`));
+            }
+            controller.enqueue(encoder.encode("event: end\n\n"));
+            controller.close();
+        },
+    });
+    return IterableReadableStream.fromReadableStream(finalStream);
+}
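With the new overload, passing `encoding: "text/event-stream"` routes `streamEvents` output through `convertToHttpEventStream`, yielding UTF-8 encoded `Uint8Array` chunks in server-sent-event framing instead of `StreamEvent` objects. A minimal sketch, assuming `chain` is any `Runnable`:

    for await (const chunk of chain.streamEvents(input, {
      version: "v1",
      encoding: "text/event-stream",
    })) {
      // chunk is a Uint8Array holding `event: data\ndata: {...}\n\n`;
      // the stream is terminated by an `event: end\n\n` frame.
    }
    // Omit `encoding` to receive structured StreamEvent objects instead.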
package/dist/tracers/base.cjs
CHANGED
@@ -11,9 +11,9 @@ function _coerceToDict(value, defaultKey) {
 function stripNonAlphanumeric(input) {
     return input.replace(/[-:.]/g, "");
 }
-function convertToDottedOrderFormat(epoch, runId) {
-
-
+function convertToDottedOrderFormat(epoch, runId, executionOrder) {
+    const paddedOrder = executionOrder.toFixed(0).slice(0, 3).padStart(3, "0");
+    return (stripNonAlphanumeric(`${new Date(epoch).toISOString().slice(0, -1)}${paddedOrder}Z`) + runId);
 }
 class BaseTracer extends base_js_1.BaseCallbackHandler {
     constructor(_fields) {
@@ -42,7 +42,7 @@ class BaseTracer extends base_js_1.BaseCallbackHandler {
         parentRun.child_runs.push(childRun);
     }
     async _startTrace(run) {
-        const currentDottedOrder = convertToDottedOrderFormat(run.start_time, run.id);
+        const currentDottedOrder = convertToDottedOrderFormat(run.start_time, run.id, run.execution_order);
         const storedRun = { ...run };
         if (storedRun.parent_run_id !== undefined) {
             const parentRun = this.runMap.get(storedRun.parent_run_id);
package/dist/tracers/base.js
CHANGED
@@ -8,9 +8,9 @@ function _coerceToDict(value, defaultKey) {
 function stripNonAlphanumeric(input) {
     return input.replace(/[-:.]/g, "");
 }
-function convertToDottedOrderFormat(epoch, runId) {
-
-
+function convertToDottedOrderFormat(epoch, runId, executionOrder) {
+    const paddedOrder = executionOrder.toFixed(0).slice(0, 3).padStart(3, "0");
+    return (stripNonAlphanumeric(`${new Date(epoch).toISOString().slice(0, -1)}${paddedOrder}Z`) + runId);
 }
 export class BaseTracer extends BaseCallbackHandler {
     constructor(_fields) {
@@ -39,7 +39,7 @@ export class BaseTracer extends BaseCallbackHandler {
         parentRun.child_runs.push(childRun);
     }
     async _startTrace(run) {
-        const currentDottedOrder = convertToDottedOrderFormat(run.start_time, run.id);
+        const currentDottedOrder = convertToDottedOrderFormat(run.start_time, run.id, run.execution_order);
         const storedRun = { ...run };
         if (storedRun.parent_run_id !== undefined) {
             const parentRun = this.runMap.get(storedRun.parent_run_id);

package/document_loaders/base.cjs
ADDED
@@ -0,0 +1 @@
+module.exports = require('../dist/document_loaders/base.cjs');

package/document_loaders/base.d.cts
ADDED
@@ -0,0 +1 @@
+export * from '../dist/document_loaders/base.js'

package/document_loaders/base.d.ts
ADDED
@@ -0,0 +1 @@
+export * from '../dist/document_loaders/base.js'

package/document_loaders/base.js
ADDED
@@ -0,0 +1 @@
+export * from '../dist/document_loaders/base.js'
package/indexing.cjs
ADDED
@@ -0,0 +1 @@
+module.exports = require('./dist/indexing/index.cjs');
package/indexing.d.cts
ADDED
@@ -0,0 +1 @@
+export * from './dist/indexing/index.js'
package/indexing.d.ts
ADDED
@@ -0,0 +1 @@
+export * from './dist/indexing/index.js'
package/indexing.js
ADDED
@@ -0,0 +1 @@
+export * from './dist/indexing/index.js'
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@langchain/core",
-  "version": "0.1.61",
+  "version": "0.2.0-rc.0",
   "description": "Core LangChain.js abstractions and schemas",
   "type": "module",
   "engines": {
@@ -158,6 +158,15 @@
       "import": "./documents.js",
       "require": "./documents.cjs"
     },
+    "./document_loaders/base": {
+      "types": {
+        "import": "./document_loaders/base.d.ts",
+        "require": "./document_loaders/base.d.cts",
+        "default": "./document_loaders/base.d.ts"
+      },
+      "import": "./document_loaders/base.js",
+      "require": "./document_loaders/base.cjs"
+    },
     "./embeddings": {
       "types": {
         "import": "./embeddings.d.ts",
@@ -176,6 +185,15 @@
       "import": "./example_selectors.js",
       "require": "./example_selectors.cjs"
    },
+    "./indexing": {
+      "types": {
+        "import": "./indexing.d.ts",
+        "require": "./indexing.d.cts",
+        "default": "./indexing.d.ts"
+      },
+      "import": "./indexing.js",
+      "require": "./indexing.cjs"
+    },
     "./language_models/base": {
       "types": {
         "import": "./language_models/base.d.ts",
@@ -604,6 +622,10 @@
     "documents.js",
     "documents.d.ts",
     "documents.d.cts",
+    "document_loaders/base.cjs",
+    "document_loaders/base.js",
+    "document_loaders/base.d.ts",
+    "document_loaders/base.d.cts",
     "embeddings.cjs",
     "embeddings.js",
     "embeddings.d.ts",
@@ -612,6 +634,10 @@
     "example_selectors.js",
     "example_selectors.d.ts",
     "example_selectors.d.cts",
+    "indexing.cjs",
+    "indexing.js",
+    "indexing.d.ts",
+    "indexing.d.cts",
     "language_models/base.cjs",
     "language_models/base.js",
     "language_models/base.d.ts",