@mastra/rag 0.1.19 → 0.1.20-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +7 -7
- package/CHANGELOG.md +23 -0
- package/dist/_tsup-dts-rollup.d.cts +225 -86
- package/dist/_tsup-dts-rollup.d.ts +225 -86
- package/dist/index.cjs +417 -131
- package/dist/index.js +385 -99
- package/package.json +2 -3
- package/src/document/document.ts +6 -9
- package/src/document/extractors/base.ts +30 -0
- package/src/document/extractors/index.ts +1 -1
- package/src/document/extractors/keywords.test.ts +1 -1
- package/src/document/extractors/keywords.ts +7 -19
- package/src/document/extractors/questions.test.ts +1 -1
- package/src/document/extractors/questions.ts +7 -25
- package/src/document/extractors/summary.test.ts +1 -1
- package/src/document/extractors/summary.ts +7 -19
- package/src/document/extractors/title.test.ts +1 -1
- package/src/document/extractors/title.ts +7 -44
- package/src/document/extractors/types.ts +1 -1
- package/src/document/prompts/base.ts +77 -0
- package/src/document/prompts/format.ts +9 -0
- package/src/document/prompts/index.ts +15 -0
- package/src/document/prompts/prompt.ts +60 -0
- package/src/document/prompts/types.ts +29 -0
- package/src/document/schema/index.ts +3 -0
- package/src/document/schema/node.ts +187 -0
- package/src/document/schema/types.ts +40 -0
- package/src/document/transformers/html.ts +1 -1
- package/src/document/transformers/json.ts +1 -1
- package/src/document/transformers/markdown.ts +1 -1
- package/src/document/transformers/text.ts +1 -1
- package/src/document/transformers/transformer.ts +1 -1
package/dist/index.cjs
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
var
|
|
3
|
+
var crypto = require('crypto');
|
|
4
4
|
var zod = require('zod');
|
|
5
5
|
var nodeHtmlBetterParser = require('node-html-better-parser');
|
|
6
6
|
var jsTiktoken = require('js-tiktoken');
|
|
@@ -14,6 +14,11 @@ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
|
14
14
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
15
15
|
var __getProtoOf = Object.getPrototypeOf;
|
|
16
16
|
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
17
|
+
var __knownSymbol = (name14, symbol15) => (symbol15 = Symbol[name14]) ? symbol15 : Symbol.for("Symbol." + name14);
|
|
18
|
+
var __typeError = (msg) => {
|
|
19
|
+
throw TypeError(msg);
|
|
20
|
+
};
|
|
21
|
+
var __defNormalProp = (obj, key, value) => key in obj ? __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value }) : obj[key] = value;
|
|
17
22
|
var __commonJS = (cb, mod) => function __require() {
|
|
18
23
|
return mod || (0, cb[__getOwnPropNames(cb)[0]])((mod = { exports: {} }).exports, mod), mod.exports;
|
|
19
24
|
};
|
|
@@ -33,6 +38,43 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
33
38
|
__defProp(target, "default", { value: mod, enumerable: true }) ,
|
|
34
39
|
mod
|
|
35
40
|
));
|
|
41
|
+
var __decoratorStart = (base) => [, , , __create(null)];
|
|
42
|
+
var __decoratorStrings = ["class", "method", "getter", "setter", "accessor", "field", "value", "get", "set"];
|
|
43
|
+
var __expectFn = (fn) => fn !== void 0 && typeof fn !== "function" ? __typeError("Function expected") : fn;
|
|
44
|
+
var __decoratorContext = (kind, name14, done, metadata, fns) => ({ kind: __decoratorStrings[kind], name: name14, metadata, addInitializer: (fn) => done._ ? __typeError("Already initialized") : fns.push(__expectFn(fn || null)) });
|
|
45
|
+
var __decoratorMetadata = (array, target) => __defNormalProp(target, __knownSymbol("metadata"), array[3]);
|
|
46
|
+
var __runInitializers = (array, flags, self, value) => {
|
|
47
|
+
for (var i = 0, fns = array[flags >> 1], n = fns && fns.length; i < n; i++) flags & 1 ? fns[i].call(self) : value = fns[i].call(self, value);
|
|
48
|
+
return value;
|
|
49
|
+
};
|
|
50
|
+
var __decorateElement = (array, flags, name14, decorators, target, extra) => {
|
|
51
|
+
var fn, it, done, ctx, access, k = flags & 7, s = false, p = false;
|
|
52
|
+
var j = array.length + 1 , key = __decoratorStrings[k + 5];
|
|
53
|
+
var initializers = (array[j - 1] = []), extraInitializers = array[j] || (array[j] = []);
|
|
54
|
+
var desc = ((target = target.prototype), __getOwnPropDesc({ get [name14]() {
|
|
55
|
+
return __privateGet(this, extra);
|
|
56
|
+
}, set [name14](x) {
|
|
57
|
+
return __privateSet(this, extra, x);
|
|
58
|
+
} }, name14));
|
|
59
|
+
for (var i = decorators.length - 1; i >= 0; i--) {
|
|
60
|
+
ctx = __decoratorContext(k, name14, done = {}, array[3], extraInitializers);
|
|
61
|
+
{
|
|
62
|
+
ctx.static = s, ctx.private = p, access = ctx.access = { has: (x) => name14 in x };
|
|
63
|
+
access.get = (x) => x[name14];
|
|
64
|
+
access.set = (x, y) => x[name14] = y;
|
|
65
|
+
}
|
|
66
|
+
it = (0, decorators[i])({ get: desc.get, set: desc.set } , ctx), done._ = 1;
|
|
67
|
+
if (it === void 0) __expectFn(it) && (desc[key] = it );
|
|
68
|
+
else if (typeof it !== "object" || it === null) __typeError("Object expected");
|
|
69
|
+
else __expectFn(fn = it.get) && (desc.get = fn), __expectFn(fn = it.set) && (desc.set = fn), __expectFn(fn = it.init) && initializers.unshift(fn);
|
|
70
|
+
}
|
|
71
|
+
return desc && __defProp(target, name14, desc), target;
|
|
72
|
+
};
|
|
73
|
+
var __publicField = (obj, key, value) => __defNormalProp(obj, typeof key !== "symbol" ? key + "" : key, value);
|
|
74
|
+
var __accessCheck = (obj, member, msg) => member.has(obj) || __typeError("Cannot " + msg);
|
|
75
|
+
var __privateGet = (obj, member, getter) => (__accessCheck(obj, member, "read from private field"), member.get(obj));
|
|
76
|
+
var __privateAdd = (obj, member, value) => member.has(obj) ? __typeError("Cannot add the same private member more than once") : member instanceof WeakSet ? member.add(obj) : member.set(obj, value);
|
|
77
|
+
var __privateSet = (obj, member, value, setter) => (__accessCheck(obj, member, "write to private field"), member.set(obj, value), value);
|
|
36
78
|
|
|
37
79
|
// ../../node_modules/.pnpm/secure-json-parse@2.7.0/node_modules/secure-json-parse/index.js
|
|
38
80
|
var require_secure_json_parse = __commonJS({
|
|
@@ -137,6 +179,275 @@ var require_secure_json_parse = __commonJS({
|
|
|
137
179
|
}
|
|
138
180
|
});
|
|
139
181
|
|
|
182
|
+
// src/document/prompts/format.ts
|
|
183
|
+
function format(str, params) {
|
|
184
|
+
return str.replace(/{(\w+)}/g, (_, k) => params[k] ?? "");
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
// src/document/prompts/base.ts
|
|
188
|
+
var BasePromptTemplate = class {
|
|
189
|
+
templateVars = /* @__PURE__ */ new Set();
|
|
190
|
+
options = {};
|
|
191
|
+
constructor(options) {
|
|
192
|
+
const { templateVars } = options;
|
|
193
|
+
if (templateVars) {
|
|
194
|
+
this.templateVars = new Set(templateVars);
|
|
195
|
+
}
|
|
196
|
+
if (options.options) {
|
|
197
|
+
this.options = options.options;
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
};
|
|
201
|
+
var PromptTemplate = class _PromptTemplate extends BasePromptTemplate {
|
|
202
|
+
#template;
|
|
203
|
+
constructor(options) {
|
|
204
|
+
const { template, ...rest } = options;
|
|
205
|
+
super(rest);
|
|
206
|
+
this.#template = template;
|
|
207
|
+
}
|
|
208
|
+
partialFormat(options) {
|
|
209
|
+
const prompt = new _PromptTemplate({
|
|
210
|
+
template: this.template,
|
|
211
|
+
templateVars: [...this.templateVars],
|
|
212
|
+
options: this.options
|
|
213
|
+
});
|
|
214
|
+
prompt.options = {
|
|
215
|
+
...prompt.options,
|
|
216
|
+
...options
|
|
217
|
+
};
|
|
218
|
+
return prompt;
|
|
219
|
+
}
|
|
220
|
+
format(options) {
|
|
221
|
+
const allOptions = {
|
|
222
|
+
...this.options,
|
|
223
|
+
...options
|
|
224
|
+
};
|
|
225
|
+
return format(this.template, allOptions);
|
|
226
|
+
}
|
|
227
|
+
formatMessages(options) {
|
|
228
|
+
const prompt = this.format(options);
|
|
229
|
+
return [
|
|
230
|
+
{
|
|
231
|
+
role: "user",
|
|
232
|
+
content: prompt
|
|
233
|
+
}
|
|
234
|
+
];
|
|
235
|
+
}
|
|
236
|
+
get template() {
|
|
237
|
+
return this.#template;
|
|
238
|
+
}
|
|
239
|
+
};
|
|
240
|
+
|
|
241
|
+
// src/document/prompts/prompt.ts
|
|
242
|
+
var defaultSummaryPrompt = new PromptTemplate({
|
|
243
|
+
templateVars: ["context"],
|
|
244
|
+
template: `Write a summary of the following. Try to use only the information provided. Try to include as many key details as possible.
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
{context}
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
SUMMARY:"""
|
|
251
|
+
`
|
|
252
|
+
});
|
|
253
|
+
var defaultKeywordExtractPrompt = new PromptTemplate({
|
|
254
|
+
templateVars: ["maxKeywords", "context"],
|
|
255
|
+
template: `
|
|
256
|
+
Some text is provided below. Given the text, extract up to {maxKeywords} keywords from the text. Avoid stopwords.
|
|
257
|
+
---------------------
|
|
258
|
+
{context}
|
|
259
|
+
---------------------
|
|
260
|
+
Provide keywords in the following comma-separated format: 'KEYWORDS: <keywords>'
|
|
261
|
+
`
|
|
262
|
+
}).partialFormat({
|
|
263
|
+
maxKeywords: "10"
|
|
264
|
+
});
|
|
265
|
+
var defaultQuestionExtractPrompt = new PromptTemplate({
|
|
266
|
+
templateVars: ["numQuestions", "context"],
|
|
267
|
+
template: `(
|
|
268
|
+
"Given the contextual informations below, generate {numQuestions} questions this context can provides specific answers to which are unlikely to be found else where. Higher-level summaries of surrounding context may be provided as well. "
|
|
269
|
+
"Try using these summaries to generate better questions that this context can answer."
|
|
270
|
+
"---------------------"
|
|
271
|
+
"{context}"
|
|
272
|
+
"---------------------"
|
|
273
|
+
"Provide questions in the following format: 'QUESTIONS: <questions>'"
|
|
274
|
+
)`
|
|
275
|
+
}).partialFormat({
|
|
276
|
+
numQuestions: "5"
|
|
277
|
+
});
|
|
278
|
+
var defaultTitleExtractorPromptTemplate = new PromptTemplate({
|
|
279
|
+
templateVars: ["context"],
|
|
280
|
+
template: `{context}
|
|
281
|
+
Give a title that summarizes all of the unique entities, titles or themes found in the context.
|
|
282
|
+
Title: `
|
|
283
|
+
});
|
|
284
|
+
var defaultTitleCombinePromptTemplate = new PromptTemplate({
|
|
285
|
+
templateVars: ["context"],
|
|
286
|
+
template: `{context}
|
|
287
|
+
Based on the above candidate titles and contents, what is the comprehensive title for this document?
|
|
288
|
+
Title: `
|
|
289
|
+
});
|
|
290
|
+
var _hash_dec, _init, _hash;
|
|
291
|
+
_hash_dec = [lazyInitHash];
|
|
292
|
+
var BaseNode = class {
|
|
293
|
+
constructor(init) {
|
|
294
|
+
__publicField(this, "id_");
|
|
295
|
+
__publicField(this, "metadata");
|
|
296
|
+
__publicField(this, "relationships");
|
|
297
|
+
__privateAdd(this, _hash, __runInitializers(_init, 8, this, "")), __runInitializers(_init, 11, this);
|
|
298
|
+
const { id_, metadata, relationships } = init || {};
|
|
299
|
+
this.id_ = id_ ?? crypto.randomUUID();
|
|
300
|
+
this.metadata = metadata ?? {};
|
|
301
|
+
this.relationships = relationships ?? {};
|
|
302
|
+
}
|
|
303
|
+
get sourceNode() {
|
|
304
|
+
const relationship = this.relationships["SOURCE" /* SOURCE */];
|
|
305
|
+
if (Array.isArray(relationship)) {
|
|
306
|
+
throw new Error("Source object must be a single RelatedNodeInfo object");
|
|
307
|
+
}
|
|
308
|
+
return relationship;
|
|
309
|
+
}
|
|
310
|
+
get prevNode() {
|
|
311
|
+
const relationship = this.relationships["PREVIOUS" /* PREVIOUS */];
|
|
312
|
+
if (Array.isArray(relationship)) {
|
|
313
|
+
throw new Error("Previous object must be a single RelatedNodeInfo object");
|
|
314
|
+
}
|
|
315
|
+
return relationship;
|
|
316
|
+
}
|
|
317
|
+
get nextNode() {
|
|
318
|
+
const relationship = this.relationships["NEXT" /* NEXT */];
|
|
319
|
+
if (Array.isArray(relationship)) {
|
|
320
|
+
throw new Error("Next object must be a single RelatedNodeInfo object");
|
|
321
|
+
}
|
|
322
|
+
return relationship;
|
|
323
|
+
}
|
|
324
|
+
get parentNode() {
|
|
325
|
+
const relationship = this.relationships["PARENT" /* PARENT */];
|
|
326
|
+
if (Array.isArray(relationship)) {
|
|
327
|
+
throw new Error("Parent object must be a single RelatedNodeInfo object");
|
|
328
|
+
}
|
|
329
|
+
return relationship;
|
|
330
|
+
}
|
|
331
|
+
get childNodes() {
|
|
332
|
+
const relationship = this.relationships["CHILD" /* CHILD */];
|
|
333
|
+
if (!Array.isArray(relationship)) {
|
|
334
|
+
throw new Error("Child object must be a an array of RelatedNodeInfo objects");
|
|
335
|
+
}
|
|
336
|
+
return relationship;
|
|
337
|
+
}
|
|
338
|
+
};
|
|
339
|
+
_init = __decoratorStart();
|
|
340
|
+
_hash = new WeakMap();
|
|
341
|
+
__decorateElement(_init, 4, "hash", _hash_dec, BaseNode, _hash);
|
|
342
|
+
__decoratorMetadata(_init, BaseNode);
|
|
343
|
+
var TextNode = class extends BaseNode {
|
|
344
|
+
text;
|
|
345
|
+
startCharIdx;
|
|
346
|
+
endCharIdx;
|
|
347
|
+
metadataSeparator;
|
|
348
|
+
constructor(init = {}) {
|
|
349
|
+
super(init);
|
|
350
|
+
const { text, startCharIdx, endCharIdx, metadataSeparator } = init;
|
|
351
|
+
this.text = text ?? "";
|
|
352
|
+
if (startCharIdx) {
|
|
353
|
+
this.startCharIdx = startCharIdx;
|
|
354
|
+
}
|
|
355
|
+
if (endCharIdx) {
|
|
356
|
+
this.endCharIdx = endCharIdx;
|
|
357
|
+
}
|
|
358
|
+
this.metadataSeparator = metadataSeparator ?? "\n";
|
|
359
|
+
}
|
|
360
|
+
/**
|
|
361
|
+
* Generate a hash of the text node.
|
|
362
|
+
* The ID is not part of the hash as it can change independent of content.
|
|
363
|
+
* @returns
|
|
364
|
+
*/
|
|
365
|
+
generateHash() {
|
|
366
|
+
const hashFunction = createSHA256();
|
|
367
|
+
hashFunction.update(`type=${this.type}`);
|
|
368
|
+
hashFunction.update(`startCharIdx=${this.startCharIdx} endCharIdx=${this.endCharIdx}`);
|
|
369
|
+
hashFunction.update(this.getContent());
|
|
370
|
+
return hashFunction.digest();
|
|
371
|
+
}
|
|
372
|
+
get type() {
|
|
373
|
+
return "TEXT" /* TEXT */;
|
|
374
|
+
}
|
|
375
|
+
getContent() {
|
|
376
|
+
const metadataStr = this.getMetadataStr().trim();
|
|
377
|
+
return `${metadataStr}
|
|
378
|
+
|
|
379
|
+
${this.text}`.trim();
|
|
380
|
+
}
|
|
381
|
+
getMetadataStr() {
|
|
382
|
+
const usableMetadataKeys = new Set(Object.keys(this.metadata).sort());
|
|
383
|
+
return [...usableMetadataKeys].map((key) => `${key}: ${this.metadata[key]}`).join(this.metadataSeparator);
|
|
384
|
+
}
|
|
385
|
+
getNodeInfo() {
|
|
386
|
+
return { start: this.startCharIdx, end: this.endCharIdx };
|
|
387
|
+
}
|
|
388
|
+
getText() {
|
|
389
|
+
return this.text;
|
|
390
|
+
}
|
|
391
|
+
};
|
|
392
|
+
var Document = class extends TextNode {
|
|
393
|
+
constructor(init) {
|
|
394
|
+
super(init);
|
|
395
|
+
}
|
|
396
|
+
get type() {
|
|
397
|
+
return "DOCUMENT" /* DOCUMENT */;
|
|
398
|
+
}
|
|
399
|
+
};
|
|
400
|
+
function lazyInitHash(value, _context) {
|
|
401
|
+
return {
|
|
402
|
+
get() {
|
|
403
|
+
const oldValue = value.get.call(this);
|
|
404
|
+
if (oldValue === "") {
|
|
405
|
+
const hash = this.generateHash();
|
|
406
|
+
value.set.call(this, hash);
|
|
407
|
+
}
|
|
408
|
+
return value.get.call(this);
|
|
409
|
+
},
|
|
410
|
+
set(newValue) {
|
|
411
|
+
value.set.call(this, newValue);
|
|
412
|
+
},
|
|
413
|
+
init(value2) {
|
|
414
|
+
return value2;
|
|
415
|
+
}
|
|
416
|
+
};
|
|
417
|
+
}
|
|
418
|
+
function createSHA256() {
|
|
419
|
+
const hash = crypto.createHash("sha256");
|
|
420
|
+
return {
|
|
421
|
+
update(data) {
|
|
422
|
+
hash.update(data);
|
|
423
|
+
},
|
|
424
|
+
digest() {
|
|
425
|
+
return hash.digest("base64");
|
|
426
|
+
}
|
|
427
|
+
};
|
|
428
|
+
}
|
|
429
|
+
|
|
430
|
+
// src/document/extractors/base.ts
|
|
431
|
+
var BaseExtractor = class {
|
|
432
|
+
isTextNodeOnly = true;
|
|
433
|
+
/**
|
|
434
|
+
*
|
|
435
|
+
* @param nodes Nodes to extract metadata from.
|
|
436
|
+
* @returns Metadata extracted from the nodes.
|
|
437
|
+
*/
|
|
438
|
+
async processNodes(nodes) {
|
|
439
|
+
let newNodes = nodes;
|
|
440
|
+
const curMetadataList = await this.extract(newNodes);
|
|
441
|
+
for (const idx in newNodes) {
|
|
442
|
+
newNodes[idx].metadata = {
|
|
443
|
+
...newNodes[idx].metadata,
|
|
444
|
+
...curMetadataList[idx]
|
|
445
|
+
};
|
|
446
|
+
}
|
|
447
|
+
return newNodes;
|
|
448
|
+
}
|
|
449
|
+
};
|
|
450
|
+
|
|
140
451
|
// ../../node_modules/.pnpm/@ai-sdk+provider@1.1.3/node_modules/@ai-sdk/provider/dist/index.mjs
|
|
141
452
|
var marker = "vercel.ai.error";
|
|
142
453
|
var symbol = Symbol.for(marker);
|
|
@@ -2430,8 +2741,10 @@ var openaiTextEmbeddingResponseSchema = zod.z.object({
|
|
|
2430
2741
|
});
|
|
2431
2742
|
var modelMaxImagesPerCall = {
|
|
2432
2743
|
"dall-e-3": 1,
|
|
2433
|
-
"dall-e-2": 10
|
|
2744
|
+
"dall-e-2": 10,
|
|
2745
|
+
"gpt-image-1": 10
|
|
2434
2746
|
};
|
|
2747
|
+
var hasDefaultResponseFormat = /* @__PURE__ */ new Set(["gpt-image-1"]);
|
|
2435
2748
|
var OpenAIImageModel = class {
|
|
2436
2749
|
constructor(modelId, settings, config) {
|
|
2437
2750
|
this.modelId = modelId;
|
|
@@ -2481,7 +2794,7 @@ var OpenAIImageModel = class {
|
|
|
2481
2794
|
n,
|
|
2482
2795
|
size,
|
|
2483
2796
|
...(_d = providerOptions.openai) != null ? _d : {},
|
|
2484
|
-
response_format: "b64_json"
|
|
2797
|
+
...!hasDefaultResponseFormat.has(this.modelId) ? { response_format: "b64_json" } : {}
|
|
2485
2798
|
},
|
|
2486
2799
|
failedResponseHandler: openaiFailedResponseHandler,
|
|
2487
2800
|
successfulResponseHandler: createJsonResponseHandler(
|
|
@@ -2976,8 +3289,15 @@ var OpenAIResponsesLanguageModel = class {
|
|
|
2976
3289
|
user: openaiOptions == null ? void 0 : openaiOptions.user,
|
|
2977
3290
|
instructions: openaiOptions == null ? void 0 : openaiOptions.instructions,
|
|
2978
3291
|
// model-specific settings:
|
|
2979
|
-
...modelConfig.isReasoningModel && (openaiOptions == null ? void 0 : openaiOptions.reasoningEffort) != null && {
|
|
2980
|
-
reasoning: {
|
|
3292
|
+
...modelConfig.isReasoningModel && ((openaiOptions == null ? void 0 : openaiOptions.reasoningEffort) != null || (openaiOptions == null ? void 0 : openaiOptions.reasoningSummary) != null) && {
|
|
3293
|
+
reasoning: {
|
|
3294
|
+
...(openaiOptions == null ? void 0 : openaiOptions.reasoningEffort) != null && {
|
|
3295
|
+
effort: openaiOptions.reasoningEffort
|
|
3296
|
+
},
|
|
3297
|
+
...(openaiOptions == null ? void 0 : openaiOptions.reasoningSummary) != null && {
|
|
3298
|
+
summary: openaiOptions.reasoningSummary
|
|
3299
|
+
}
|
|
3300
|
+
}
|
|
2981
3301
|
},
|
|
2982
3302
|
...modelConfig.requiredAutoTruncation && {
|
|
2983
3303
|
truncation: "auto"
|
|
@@ -3059,7 +3379,7 @@ var OpenAIResponsesLanguageModel = class {
|
|
|
3059
3379
|
}
|
|
3060
3380
|
}
|
|
3061
3381
|
async doGenerate(options) {
|
|
3062
|
-
var _a15, _b, _c, _d, _e;
|
|
3382
|
+
var _a15, _b, _c, _d, _e, _f, _g;
|
|
3063
3383
|
const { args: body, warnings } = this.getArgs(options);
|
|
3064
3384
|
const {
|
|
3065
3385
|
responseHeaders,
|
|
@@ -3112,7 +3432,13 @@ var OpenAIResponsesLanguageModel = class {
|
|
|
3112
3432
|
type: zod.z.literal("computer_call")
|
|
3113
3433
|
}),
|
|
3114
3434
|
zod.z.object({
|
|
3115
|
-
type: zod.z.literal("reasoning")
|
|
3435
|
+
type: zod.z.literal("reasoning"),
|
|
3436
|
+
summary: zod.z.array(
|
|
3437
|
+
zod.z.object({
|
|
3438
|
+
type: zod.z.literal("summary_text"),
|
|
3439
|
+
text: zod.z.string()
|
|
3440
|
+
})
|
|
3441
|
+
)
|
|
3116
3442
|
})
|
|
3117
3443
|
])
|
|
3118
3444
|
),
|
|
@@ -3130,6 +3456,7 @@ var OpenAIResponsesLanguageModel = class {
|
|
|
3130
3456
|
toolName: output.name,
|
|
3131
3457
|
args: output.arguments
|
|
3132
3458
|
}));
|
|
3459
|
+
const reasoningSummary = (_b = (_a15 = response.output.find((item) => item.type === "reasoning")) == null ? void 0 : _a15.summary) != null ? _b : null;
|
|
3133
3460
|
return {
|
|
3134
3461
|
text: outputTextElements.map((content) => content.text).join("\n"),
|
|
3135
3462
|
sources: outputTextElements.flatMap(
|
|
@@ -3144,10 +3471,14 @@ var OpenAIResponsesLanguageModel = class {
|
|
|
3144
3471
|
})
|
|
3145
3472
|
),
|
|
3146
3473
|
finishReason: mapOpenAIResponseFinishReason({
|
|
3147
|
-
finishReason: (
|
|
3474
|
+
finishReason: (_c = response.incomplete_details) == null ? void 0 : _c.reason,
|
|
3148
3475
|
hasToolCalls: toolCalls.length > 0
|
|
3149
3476
|
}),
|
|
3150
3477
|
toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
|
|
3478
|
+
reasoning: reasoningSummary ? reasoningSummary.map((summary) => ({
|
|
3479
|
+
type: "text",
|
|
3480
|
+
text: summary.text
|
|
3481
|
+
})) : void 0,
|
|
3151
3482
|
usage: {
|
|
3152
3483
|
promptTokens: response.usage.input_tokens,
|
|
3153
3484
|
completionTokens: response.usage.output_tokens
|
|
@@ -3171,8 +3502,8 @@ var OpenAIResponsesLanguageModel = class {
|
|
|
3171
3502
|
providerMetadata: {
|
|
3172
3503
|
openai: {
|
|
3173
3504
|
responseId: response.id,
|
|
3174
|
-
cachedPromptTokens: (
|
|
3175
|
-
reasoningTokens: (
|
|
3505
|
+
cachedPromptTokens: (_e = (_d = response.usage.input_tokens_details) == null ? void 0 : _d.cached_tokens) != null ? _e : null,
|
|
3506
|
+
reasoningTokens: (_g = (_f = response.usage.output_tokens_details) == null ? void 0 : _f.reasoning_tokens) != null ? _g : null
|
|
3176
3507
|
}
|
|
3177
3508
|
},
|
|
3178
3509
|
warnings
|
|
@@ -3255,6 +3586,11 @@ var OpenAIResponsesLanguageModel = class {
|
|
|
3255
3586
|
type: "text-delta",
|
|
3256
3587
|
textDelta: value.delta
|
|
3257
3588
|
});
|
|
3589
|
+
} else if (isResponseReasoningSummaryTextDeltaChunk(value)) {
|
|
3590
|
+
controller.enqueue({
|
|
3591
|
+
type: "reasoning",
|
|
3592
|
+
textDelta: value.delta
|
|
3593
|
+
});
|
|
3258
3594
|
} else if (isResponseOutputItemDoneChunk(value) && value.item.type === "function_call") {
|
|
3259
3595
|
ongoingToolCalls[value.output_index] = void 0;
|
|
3260
3596
|
hasToolCalls = true;
|
|
@@ -3386,6 +3722,13 @@ var responseAnnotationAddedSchema = zod.z.object({
|
|
|
3386
3722
|
title: zod.z.string()
|
|
3387
3723
|
})
|
|
3388
3724
|
});
|
|
3725
|
+
var responseReasoningSummaryTextDeltaSchema = zod.z.object({
|
|
3726
|
+
type: zod.z.literal("response.reasoning_summary_text.delta"),
|
|
3727
|
+
item_id: zod.z.string(),
|
|
3728
|
+
output_index: zod.z.number(),
|
|
3729
|
+
summary_index: zod.z.number(),
|
|
3730
|
+
delta: zod.z.string()
|
|
3731
|
+
});
|
|
3389
3732
|
var openaiResponsesChunkSchema = zod.z.union([
|
|
3390
3733
|
textDeltaChunkSchema,
|
|
3391
3734
|
responseFinishedChunkSchema,
|
|
@@ -3394,6 +3737,7 @@ var openaiResponsesChunkSchema = zod.z.union([
|
|
|
3394
3737
|
responseFunctionCallArgumentsDeltaSchema,
|
|
3395
3738
|
responseOutputItemAddedSchema,
|
|
3396
3739
|
responseAnnotationAddedSchema,
|
|
3740
|
+
responseReasoningSummaryTextDeltaSchema,
|
|
3397
3741
|
zod.z.object({ type: zod.z.string() }).passthrough()
|
|
3398
3742
|
// fallback for unknown chunks
|
|
3399
3743
|
]);
|
|
@@ -3418,6 +3762,9 @@ function isResponseOutputItemAddedChunk(chunk) {
|
|
|
3418
3762
|
function isResponseAnnotationAddedChunk(chunk) {
|
|
3419
3763
|
return chunk.type === "response.output_text.annotation.added";
|
|
3420
3764
|
}
|
|
3765
|
+
function isResponseReasoningSummaryTextDeltaChunk(chunk) {
|
|
3766
|
+
return chunk.type === "response.reasoning_summary_text.delta";
|
|
3767
|
+
}
|
|
3421
3768
|
function getResponsesModelConfig(modelId) {
|
|
3422
3769
|
if (modelId.startsWith("o")) {
|
|
3423
3770
|
if (modelId.startsWith("o1-mini") || modelId.startsWith("o1-preview")) {
|
|
@@ -3447,7 +3794,8 @@ var openaiResponsesProviderOptionsSchema = zod.z.object({
|
|
|
3447
3794
|
user: zod.z.string().nullish(),
|
|
3448
3795
|
reasoningEffort: zod.z.string().nullish(),
|
|
3449
3796
|
strictSchemas: zod.z.boolean().nullish(),
|
|
3450
|
-
instructions: zod.z.string().nullish()
|
|
3797
|
+
instructions: zod.z.string().nullish(),
|
|
3798
|
+
reasoningSummary: zod.z.string().nullish()
|
|
3451
3799
|
});
|
|
3452
3800
|
var WebSearchPreviewParameters = zod.z.object({});
|
|
3453
3801
|
function webSearchPreviewTool({
|
|
@@ -3667,53 +4015,24 @@ var openai2 = createOpenAI({ apiKey: process.env.OPENAI_API_KEY });
|
|
|
3667
4015
|
var baseLLM = openai2("gpt-4o");
|
|
3668
4016
|
|
|
3669
4017
|
// src/document/extractors/title.ts
|
|
3670
|
-
var TitleExtractor = class extends
|
|
3671
|
-
/**
|
|
3672
|
-
* MastraLanguageModel instance.
|
|
3673
|
-
* @type {MastraLanguageModel}
|
|
3674
|
-
*/
|
|
4018
|
+
var TitleExtractor = class extends BaseExtractor {
|
|
3675
4019
|
llm;
|
|
3676
|
-
/**
|
|
3677
|
-
* Can work for mixture of text and non-text nodes
|
|
3678
|
-
* @type {boolean}
|
|
3679
|
-
* @default false
|
|
3680
|
-
*/
|
|
3681
4020
|
isTextNodeOnly = false;
|
|
3682
|
-
/**
|
|
3683
|
-
* Number of nodes to extrct titles from.
|
|
3684
|
-
* @type {number}
|
|
3685
|
-
* @default 5
|
|
3686
|
-
*/
|
|
3687
4021
|
nodes = 5;
|
|
3688
|
-
/**
|
|
3689
|
-
* The prompt template to use for the title extractor.
|
|
3690
|
-
* @type {string}
|
|
3691
|
-
*/
|
|
3692
4022
|
nodeTemplate;
|
|
3693
|
-
/**
|
|
3694
|
-
* The prompt template to merge title with..
|
|
3695
|
-
* @type {string}
|
|
3696
|
-
*/
|
|
3697
4023
|
combineTemplate;
|
|
3698
|
-
/**
|
|
3699
|
-
* Constructor for the TitleExtractor class.
|
|
3700
|
-
* @param {MastraLanguageModel} llm MastraLanguageModel instance.
|
|
3701
|
-
* @param {number} nodes Number of nodes to extract titles from.
|
|
3702
|
-
* @param {TitleExtractorPrompt} nodeTemplate The prompt template to use for the title extractor.
|
|
3703
|
-
* @param {string} combineTemplate The prompt template to merge title with..
|
|
3704
|
-
*/
|
|
3705
4024
|
constructor(options) {
|
|
3706
4025
|
super();
|
|
3707
4026
|
this.llm = options?.llm ?? baseLLM;
|
|
3708
4027
|
this.nodes = options?.nodes ?? 5;
|
|
3709
|
-
this.nodeTemplate = options?.nodeTemplate ? new
|
|
4028
|
+
this.nodeTemplate = options?.nodeTemplate ? new PromptTemplate({
|
|
3710
4029
|
templateVars: ["context"],
|
|
3711
4030
|
template: options.nodeTemplate
|
|
3712
|
-
}) :
|
|
3713
|
-
this.combineTemplate = options?.combineTemplate ? new
|
|
4031
|
+
}) : defaultTitleExtractorPromptTemplate;
|
|
4032
|
+
this.combineTemplate = options?.combineTemplate ? new PromptTemplate({
|
|
3714
4033
|
templateVars: ["context"],
|
|
3715
4034
|
template: options.combineTemplate
|
|
3716
|
-
}) :
|
|
4035
|
+
}) : defaultTitleCombinePromptTemplate;
|
|
3717
4036
|
}
|
|
3718
4037
|
/**
|
|
3719
4038
|
* Extract titles from a list of nodes.
|
|
@@ -3725,7 +4044,7 @@ var TitleExtractor = class extends llamaindex.BaseExtractor {
|
|
|
3725
4044
|
const nodesToExtractTitle = [];
|
|
3726
4045
|
const nodeIndexes = [];
|
|
3727
4046
|
nodes.forEach((node, idx) => {
|
|
3728
|
-
const text = node.getContent(
|
|
4047
|
+
const text = node.getContent();
|
|
3729
4048
|
if (!text || text.trim() === "") {
|
|
3730
4049
|
results[idx] = { documentTitle: "" };
|
|
3731
4050
|
} else {
|
|
@@ -3753,7 +4072,7 @@ var TitleExtractor = class extends llamaindex.BaseExtractor {
|
|
|
3753
4072
|
}
|
|
3754
4073
|
filterNodes(nodes) {
|
|
3755
4074
|
return nodes.filter((node) => {
|
|
3756
|
-
if (this.isTextNodeOnly && !(node instanceof
|
|
4075
|
+
if (this.isTextNodeOnly && !(node instanceof TextNode)) {
|
|
3757
4076
|
return false;
|
|
3758
4077
|
}
|
|
3759
4078
|
return true;
|
|
@@ -3812,7 +4131,7 @@ var TitleExtractor = class extends llamaindex.BaseExtractor {
|
|
|
3812
4131
|
{
|
|
3813
4132
|
type: "text",
|
|
3814
4133
|
text: this.nodeTemplate.format({
|
|
3815
|
-
context: node.getContent(
|
|
4134
|
+
context: node.getContent()
|
|
3816
4135
|
})
|
|
3817
4136
|
}
|
|
3818
4137
|
]
|
|
@@ -3829,21 +4148,11 @@ var TitleExtractor = class extends llamaindex.BaseExtractor {
|
|
|
3829
4148
|
return await Promise.all(titleJobs);
|
|
3830
4149
|
}
|
|
3831
4150
|
};
|
|
3832
|
-
|
|
3833
|
-
|
|
3834
|
-
|
|
3835
|
-
* @type {MastraLanguageModel}
|
|
3836
|
-
*/
|
|
4151
|
+
|
|
4152
|
+
// src/document/extractors/summary.ts
|
|
4153
|
+
var SummaryExtractor = class extends BaseExtractor {
|
|
3837
4154
|
llm;
|
|
3838
|
-
/**
|
|
3839
|
-
* List of summaries to extract: 'self', 'prev', 'next'
|
|
3840
|
-
* @type {string[]}
|
|
3841
|
-
*/
|
|
3842
4155
|
summaries;
|
|
3843
|
-
/**
|
|
3844
|
-
* The prompt template to use for the summary extractor.
|
|
3845
|
-
* @type {string}
|
|
3846
|
-
*/
|
|
3847
4156
|
promptTemplate;
|
|
3848
4157
|
selfSummary;
|
|
3849
4158
|
prevSummary;
|
|
@@ -3855,10 +4164,10 @@ var SummaryExtractor = class extends llamaindex.BaseExtractor {
|
|
|
3855
4164
|
super();
|
|
3856
4165
|
this.llm = options?.llm ?? baseLLM;
|
|
3857
4166
|
this.summaries = summaries;
|
|
3858
|
-
this.promptTemplate = options?.promptTemplate ? new
|
|
4167
|
+
this.promptTemplate = options?.promptTemplate ? new PromptTemplate({
|
|
3859
4168
|
templateVars: ["context"],
|
|
3860
4169
|
template: options.promptTemplate
|
|
3861
|
-
}) :
|
|
4170
|
+
}) : defaultSummaryPrompt;
|
|
3862
4171
|
this.selfSummary = summaries?.includes("self") ?? false;
|
|
3863
4172
|
this.prevSummary = summaries?.includes("prev") ?? false;
|
|
3864
4173
|
this.nextSummary = summaries?.includes("next") ?? false;
|
|
@@ -3869,14 +4178,14 @@ var SummaryExtractor = class extends llamaindex.BaseExtractor {
|
|
|
3869
4178
|
* @returns {Promise<string>} Summary extracted from the node.
|
|
3870
4179
|
*/
|
|
3871
4180
|
async generateNodeSummary(node) {
|
|
3872
|
-
const text = node.getContent(
|
|
4181
|
+
const text = node.getContent();
|
|
3873
4182
|
if (!text || text.trim() === "") {
|
|
3874
4183
|
return "";
|
|
3875
4184
|
}
|
|
3876
|
-
if (this.isTextNodeOnly && !(node instanceof
|
|
4185
|
+
if (this.isTextNodeOnly && !(node instanceof TextNode)) {
|
|
3877
4186
|
return "";
|
|
3878
4187
|
}
|
|
3879
|
-
const context = node.getContent(
|
|
4188
|
+
const context = node.getContent();
|
|
3880
4189
|
const prompt = this.promptTemplate.format({
|
|
3881
4190
|
context
|
|
3882
4191
|
});
|
|
@@ -3904,7 +4213,7 @@ var SummaryExtractor = class extends llamaindex.BaseExtractor {
|
|
|
3904
4213
|
* @returns {Promise<ExtractSummary[]>} Summaries extracted from the nodes.
|
|
3905
4214
|
*/
|
|
3906
4215
|
async extract(nodes) {
|
|
3907
|
-
if (!nodes.every((n) => n instanceof
|
|
4216
|
+
if (!nodes.every((n) => n instanceof TextNode)) throw new Error("Only `TextNode` is allowed for `Summary` extractor");
|
|
3908
4217
|
const nodeSummaries = await Promise.all(nodes.map((node) => this.generateNodeSummary(node)));
|
|
3909
4218
|
const metadataList = nodes.map(() => ({}));
|
|
3910
4219
|
for (let i = 0; i < nodes.length; i++) {
|
|
@@ -3921,28 +4230,12 @@ var SummaryExtractor = class extends llamaindex.BaseExtractor {
|
|
|
3921
4230
|
return metadataList;
|
|
3922
4231
|
}
|
|
3923
4232
|
};
|
|
3924
|
-
|
|
3925
|
-
|
|
3926
|
-
|
|
3927
|
-
* @type {MastraLanguageModel}
|
|
3928
|
-
*/
|
|
4233
|
+
|
|
4234
|
+
// src/document/extractors/questions.ts
|
|
4235
|
+
var QuestionsAnsweredExtractor = class extends BaseExtractor {
|
|
3929
4236
|
llm;
|
|
3930
|
-
/**
|
|
3931
|
-
* Number of questions to generate.
|
|
3932
|
-
* @type {number}
|
|
3933
|
-
* @default 5
|
|
3934
|
-
*/
|
|
3935
4237
|
questions = 5;
|
|
3936
|
-
/**
|
|
3937
|
-
* The prompt template to use for the question extractor.
|
|
3938
|
-
* @type {string}
|
|
3939
|
-
*/
|
|
3940
4238
|
promptTemplate;
|
|
3941
|
-
/**
|
|
3942
|
-
* Wheter to use metadata for embeddings only
|
|
3943
|
-
* @type {boolean}
|
|
3944
|
-
* @default false
|
|
3945
|
-
*/
|
|
3946
4239
|
embeddingOnly = false;
|
|
3947
4240
|
/**
|
|
3948
4241
|
* Constructor for the QuestionsAnsweredExtractor class.
|
|
@@ -3956,12 +4249,12 @@ var QuestionsAnsweredExtractor = class extends llamaindex.BaseExtractor {
|
|
|
3956
4249
|
super();
|
|
3957
4250
|
this.llm = options?.llm ?? baseLLM;
|
|
3958
4251
|
this.questions = options?.questions ?? 5;
|
|
3959
|
-
this.promptTemplate = options?.promptTemplate ? new
|
|
4252
|
+
this.promptTemplate = options?.promptTemplate ? new PromptTemplate({
|
|
3960
4253
|
templateVars: ["numQuestions", "context"],
|
|
3961
4254
|
template: options.promptTemplate
|
|
3962
4255
|
}).partialFormat({
|
|
3963
4256
|
numQuestions: "5"
|
|
3964
|
-
}) :
|
|
4257
|
+
}) : defaultQuestionExtractPrompt;
|
|
3965
4258
|
this.embeddingOnly = options?.embeddingOnly ?? false;
|
|
3966
4259
|
}
|
|
3967
4260
|
/**
|
|
@@ -3970,14 +4263,14 @@ var QuestionsAnsweredExtractor = class extends llamaindex.BaseExtractor {
|
|
|
3970
4263
|
* @returns {Promise<Array<ExtractQuestion> | Array<{}>>} Questions extracted from the node.
|
|
3971
4264
|
*/
|
|
3972
4265
|
async extractQuestionsFromNode(node) {
|
|
3973
|
-
const text = node.getContent(
|
|
4266
|
+
const text = node.getContent();
|
|
3974
4267
|
if (!text || text.trim() === "") {
|
|
3975
4268
|
return { questionsThisExcerptCanAnswer: "" };
|
|
3976
4269
|
}
|
|
3977
|
-
if (this.isTextNodeOnly && !(node instanceof
|
|
4270
|
+
if (this.isTextNodeOnly && !(node instanceof TextNode)) {
|
|
3978
4271
|
return { questionsThisExcerptCanAnswer: "" };
|
|
3979
4272
|
}
|
|
3980
|
-
const contextStr = node.getContent(
|
|
4273
|
+
const contextStr = node.getContent();
|
|
3981
4274
|
const prompt = this.promptTemplate.format({
|
|
3982
4275
|
context: contextStr,
|
|
3983
4276
|
numQuestions: this.questions.toString()
|
|
@@ -4016,22 +4309,11 @@ var QuestionsAnsweredExtractor = class extends llamaindex.BaseExtractor {
|
|
|
4016
4309
|
return results;
|
|
4017
4310
|
}
|
|
4018
4311
|
};
|
|
4019
|
-
|
|
4020
|
-
|
|
4021
|
-
|
|
4022
|
-
* @type {MastraLanguageModel}
|
|
4023
|
-
*/
|
|
4312
|
+
|
|
4313
|
+
// src/document/extractors/keywords.ts
|
|
4314
|
+
var KeywordExtractor = class extends BaseExtractor {
|
|
4024
4315
|
llm;
|
|
4025
|
-
/**
|
|
4026
|
-
* Number of keywords to extract.
|
|
4027
|
-
* @type {number}
|
|
4028
|
-
* @default 5
|
|
4029
|
-
*/
|
|
4030
4316
|
keywords = 5;
|
|
4031
|
-
/**
|
|
4032
|
-
* The prompt template to use for the question extractor.
|
|
4033
|
-
* @type {string}
|
|
4034
|
-
*/
|
|
4035
4317
|
promptTemplate;
|
|
4036
4318
|
/**
|
|
4037
4319
|
* Constructor for the KeywordExtractor class.
|
|
@@ -4045,10 +4327,10 @@ var KeywordExtractor = class extends llamaindex.BaseExtractor {
|
|
|
4045
4327
|
super();
|
|
4046
4328
|
this.llm = options?.llm ?? baseLLM;
|
|
4047
4329
|
this.keywords = options?.keywords ?? 5;
|
|
4048
|
-
this.promptTemplate = options?.promptTemplate ? new
|
|
4330
|
+
this.promptTemplate = options?.promptTemplate ? new PromptTemplate({
|
|
4049
4331
|
templateVars: ["context", "maxKeywords"],
|
|
4050
4332
|
template: options.promptTemplate
|
|
4051
|
-
}) :
|
|
4333
|
+
}) : defaultKeywordExtractPrompt;
|
|
4052
4334
|
}
|
|
4053
4335
|
/**
|
|
4054
4336
|
*
|
|
@@ -4060,11 +4342,11 @@ var KeywordExtractor = class extends llamaindex.BaseExtractor {
|
|
|
4060
4342
|
* Adds error handling for malformed/empty LLM output.
|
|
4061
4343
|
*/
|
|
4062
4344
|
async extractKeywordsFromNodes(node) {
|
|
4063
|
-
const text = node.getContent(
|
|
4345
|
+
const text = node.getContent();
|
|
4064
4346
|
if (!text || text.trim() === "") {
|
|
4065
4347
|
return { excerptKeywords: "" };
|
|
4066
4348
|
}
|
|
4067
|
-
if (this.isTextNodeOnly && !(node instanceof
|
|
4349
|
+
if (this.isTextNodeOnly && !(node instanceof TextNode)) {
|
|
4068
4350
|
return { excerptKeywords: "" };
|
|
4069
4351
|
}
|
|
4070
4352
|
let keywords = "";
|
|
@@ -4079,7 +4361,7 @@ var KeywordExtractor = class extends llamaindex.BaseExtractor {
|
|
|
4079
4361
|
{
|
|
4080
4362
|
type: "text",
|
|
4081
4363
|
text: this.promptTemplate.format({
|
|
4082
|
-
context: node.getContent(
|
|
4364
|
+
context: node.getContent(),
|
|
4083
4365
|
maxKeywords: this.keywords.toString()
|
|
4084
4366
|
})
|
|
4085
4367
|
}
|
|
@@ -4144,6 +4426,8 @@ var Language = /* @__PURE__ */ ((Language2) => {
|
|
|
4144
4426
|
Language2["POWERSHELL"] = "powershell";
|
|
4145
4427
|
return Language2;
|
|
4146
4428
|
})(Language || {});
|
|
4429
|
+
|
|
4430
|
+
// src/document/transformers/text.ts
|
|
4147
4431
|
var TextTransformer = class {
|
|
4148
4432
|
size;
|
|
4149
4433
|
overlap;
|
|
@@ -4187,7 +4471,7 @@ var TextTransformer = class {
|
|
|
4187
4471
|
previousChunkLen = chunk.length;
|
|
4188
4472
|
}
|
|
4189
4473
|
documents.push(
|
|
4190
|
-
new
|
|
4474
|
+
new Document({
|
|
4191
4475
|
text: chunk,
|
|
4192
4476
|
metadata
|
|
4193
4477
|
})
|
|
@@ -4517,7 +4801,7 @@ var HTMLHeaderTransformer = class {
|
|
|
4517
4801
|
});
|
|
4518
4802
|
});
|
|
4519
4803
|
return this.returnEachElement ? elements.map(
|
|
4520
|
-
(el) => new
|
|
4804
|
+
(el) => new Document({
|
|
4521
4805
|
text: el.content,
|
|
4522
4806
|
metadata: { ...el.metadata, xpath: el.xpath }
|
|
4523
4807
|
})
|
|
@@ -4569,7 +4853,7 @@ var HTMLHeaderTransformer = class {
|
|
|
4569
4853
|
}
|
|
4570
4854
|
}
|
|
4571
4855
|
return aggregatedChunks.map(
|
|
4572
|
-
(chunk) => new
|
|
4856
|
+
(chunk) => new Document({
|
|
4573
4857
|
text: chunk.content,
|
|
4574
4858
|
metadata: { ...chunk.metadata, xpath: chunk.xpath }
|
|
4575
4859
|
})
|
|
@@ -4591,7 +4875,7 @@ var HTMLHeaderTransformer = class {
|
|
|
4591
4875
|
}
|
|
4592
4876
|
}
|
|
4593
4877
|
documents.push(
|
|
4594
|
-
new
|
|
4878
|
+
new Document({
|
|
4595
4879
|
text: chunk.text,
|
|
4596
4880
|
metadata: { ...metadata, ...chunkMetadata }
|
|
4597
4881
|
})
|
|
@@ -4620,7 +4904,7 @@ var HTMLSectionTransformer = class {
|
|
|
4620
4904
|
splitText(text) {
|
|
4621
4905
|
const sections = this.splitHtmlByHeaders(text);
|
|
4622
4906
|
return sections.map(
|
|
4623
|
-
(section) => new
|
|
4907
|
+
(section) => new Document({
|
|
4624
4908
|
text: section.content,
|
|
4625
4909
|
metadata: {
|
|
4626
4910
|
[this.headersToSplitOn[section.tagName.toLowerCase()]]: section.header,
|
|
@@ -4703,7 +4987,7 @@ var HTMLSectionTransformer = class {
|
|
|
4703
4987
|
}
|
|
4704
4988
|
}
|
|
4705
4989
|
documents.push(
|
|
4706
|
-
new
|
|
4990
|
+
new Document({
|
|
4707
4991
|
text: chunk.text,
|
|
4708
4992
|
metadata: { ...metadata, ...chunkMetadata }
|
|
4709
4993
|
})
|
|
@@ -4722,6 +5006,8 @@ var HTMLSectionTransformer = class {
|
|
|
4722
5006
|
return this.createDocuments(texts, metadatas);
|
|
4723
5007
|
}
|
|
4724
5008
|
};
|
|
5009
|
+
|
|
5010
|
+
// src/document/transformers/json.ts
|
|
4725
5011
|
var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
|
|
4726
5012
|
maxSize;
|
|
4727
5013
|
minSize;
|
|
@@ -5093,7 +5379,7 @@ var RecursiveJsonTransformer = class _RecursiveJsonTransformer {
|
|
|
5093
5379
|
chunks.forEach((chunk) => {
|
|
5094
5380
|
const metadata = { ..._metadatas[i] || {} };
|
|
5095
5381
|
documents.push(
|
|
5096
|
-
new
|
|
5382
|
+
new Document({
|
|
5097
5383
|
text: chunk,
|
|
5098
5384
|
metadata
|
|
5099
5385
|
})
|
|
@@ -5129,6 +5415,8 @@ var LatexTransformer = class extends RecursiveCharacterTransformer {
|
|
|
5129
5415
|
super({ separators, isSeparatorRegex: true, options });
|
|
5130
5416
|
}
|
|
5131
5417
|
};
|
|
5418
|
+
|
|
5419
|
+
// src/document/transformers/markdown.ts
|
|
5132
5420
|
var MarkdownTransformer = class extends RecursiveCharacterTransformer {
|
|
5133
5421
|
constructor(options = {}) {
|
|
5134
5422
|
const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage("markdown" /* MARKDOWN */);
|
|
@@ -5149,7 +5437,7 @@ var MarkdownHeaderTransformer = class {
|
|
|
5149
5437
|
return lines.flatMap((line) => {
|
|
5150
5438
|
const contentLines = line.content.split("\n");
|
|
5151
5439
|
return contentLines.filter((l) => l.trim() !== "" || this.headersToSplitOn.some(([sep]) => l.trim().startsWith(sep))).map(
|
|
5152
|
-
(l) => new
|
|
5440
|
+
(l) => new Document({
|
|
5153
5441
|
text: l.trim(),
|
|
5154
5442
|
metadata: line.metadata
|
|
5155
5443
|
})
|
|
@@ -5174,7 +5462,7 @@ var MarkdownHeaderTransformer = class {
|
|
|
5174
5462
|
}
|
|
5175
5463
|
}
|
|
5176
5464
|
return aggregatedChunks.map(
|
|
5177
|
-
(chunk) => new
|
|
5465
|
+
(chunk) => new Document({
|
|
5178
5466
|
text: chunk.content,
|
|
5179
5467
|
metadata: chunk.metadata
|
|
5180
5468
|
})
|
|
@@ -5276,7 +5564,7 @@ var MarkdownHeaderTransformer = class {
|
|
|
5276
5564
|
this.splitText({ text }).forEach((chunk) => {
|
|
5277
5565
|
const metadata = { ..._metadatas[i], ...chunk.metadata };
|
|
5278
5566
|
documents.push(
|
|
5279
|
-
new
|
|
5567
|
+
new Document({
|
|
5280
5568
|
text: chunk.text,
|
|
5281
5569
|
metadata
|
|
5282
5570
|
})
|
|
@@ -5392,7 +5680,7 @@ var MDocument = class _MDocument {
|
|
|
5392
5680
|
// e.g., 'text', 'html', 'markdown', 'json'
|
|
5393
5681
|
constructor({ docs, type }) {
|
|
5394
5682
|
this.chunks = docs.map((d) => {
|
|
5395
|
-
return new
|
|
5683
|
+
return new Document({ text: d.text, metadata: d.metadata });
|
|
5396
5684
|
});
|
|
5397
5685
|
this.type = type;
|
|
5398
5686
|
}
|
|
@@ -5410,26 +5698,24 @@ var MDocument = class _MDocument {
|
|
|
5410
5698
|
if (typeof title !== "undefined") {
|
|
5411
5699
|
transformations.push(new TitleExtractor(typeof title === "boolean" ? {} : title));
|
|
5412
5700
|
this.chunks = this.chunks.map(
|
|
5413
|
-
(doc) => doc?.metadata?.docId ? new
|
|
5701
|
+
(doc) => doc?.metadata?.docId ? new Document({
|
|
5414
5702
|
...doc,
|
|
5415
5703
|
relationships: {
|
|
5416
|
-
[
|
|
5704
|
+
["SOURCE" /* SOURCE */]: {
|
|
5417
5705
|
nodeId: doc.metadata.docId,
|
|
5418
|
-
nodeType:
|
|
5706
|
+
nodeType: "DOCUMENT" /* DOCUMENT */,
|
|
5419
5707
|
metadata: doc.metadata
|
|
5420
5708
|
}
|
|
5421
5709
|
}
|
|
5422
5710
|
}) : doc
|
|
5423
5711
|
);
|
|
5424
5712
|
}
|
|
5425
|
-
|
|
5426
|
-
|
|
5427
|
-
|
|
5428
|
-
|
|
5429
|
-
documents: this.chunks
|
|
5430
|
-
});
|
|
5713
|
+
let nodes = this.chunks;
|
|
5714
|
+
for (const extractor of transformations) {
|
|
5715
|
+
nodes = await extractor.processNodes(nodes);
|
|
5716
|
+
}
|
|
5431
5717
|
this.chunks = this.chunks.map((doc, i) => {
|
|
5432
|
-
return new
|
|
5718
|
+
return new Document({
|
|
5433
5719
|
text: doc.text,
|
|
5434
5720
|
metadata: {
|
|
5435
5721
|
...doc.metadata,
|