@mastra/rag 1.0.7 → 1.0.8-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +11 -0
- package/dist/document/document.d.ts +4 -1
- package/dist/document/document.d.ts.map +1 -1
- package/dist/document/transformers/semantic-markdown.d.ts +25 -0
- package/dist/document/transformers/semantic-markdown.d.ts.map +1 -0
- package/dist/document/types.d.ts +13 -1
- package/dist/document/types.d.ts.map +1 -1
- package/dist/document/validation.d.ts.map +1 -1
- package/dist/index.cjs +197 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +197 -11
- package/dist/index.js.map +1 -1
- package/package.json +4 -4
- package/src/document/document.test.ts +644 -1
- package/src/document/document.ts +32 -12
- package/src/document/transformers/semantic-markdown.ts +227 -0
- package/src/document/types.ts +21 -2
- package/src/document/validation.ts +11 -0
package/dist/index.js
CHANGED
|
@@ -5644,6 +5644,168 @@ var MarkdownHeaderTransformer = class {
|
|
|
5644
5644
|
return this.createDocuments(texts, metadatas);
|
|
5645
5645
|
}
|
|
5646
5646
|
};
|
|
5647
|
+
var SemanticMarkdownTransformer = class _SemanticMarkdownTransformer extends TextTransformer {
|
|
5648
|
+
tokenizer;
|
|
5649
|
+
joinThreshold;
|
|
5650
|
+
allowedSpecial;
|
|
5651
|
+
disallowedSpecial;
|
|
5652
|
+
constructor({
|
|
5653
|
+
joinThreshold = 500,
|
|
5654
|
+
encodingName = "cl100k_base",
|
|
5655
|
+
modelName,
|
|
5656
|
+
allowedSpecial = /* @__PURE__ */ new Set(),
|
|
5657
|
+
disallowedSpecial = "all",
|
|
5658
|
+
...baseOptions
|
|
5659
|
+
} = {}) {
|
|
5660
|
+
super(baseOptions);
|
|
5661
|
+
this.joinThreshold = joinThreshold;
|
|
5662
|
+
this.allowedSpecial = allowedSpecial;
|
|
5663
|
+
this.disallowedSpecial = disallowedSpecial;
|
|
5664
|
+
try {
|
|
5665
|
+
this.tokenizer = modelName ? encodingForModel(modelName) : getEncoding(encodingName);
|
|
5666
|
+
} catch {
|
|
5667
|
+
throw new Error("Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.");
|
|
5668
|
+
}
|
|
5669
|
+
}
|
|
5670
|
+
countTokens(text) {
|
|
5671
|
+
const allowed = this.allowedSpecial === "all" ? "all" : Array.from(this.allowedSpecial);
|
|
5672
|
+
const disallowed = this.disallowedSpecial === "all" ? "all" : Array.from(this.disallowedSpecial);
|
|
5673
|
+
const processedText = this.stripWhitespace ? text.trim() : text;
|
|
5674
|
+
return this.tokenizer.encode(processedText, allowed, disallowed).length;
|
|
5675
|
+
}
|
|
5676
|
+
splitMarkdownByHeaders(markdown) {
|
|
5677
|
+
const sections = [];
|
|
5678
|
+
const lines = markdown.split("\n");
|
|
5679
|
+
let currentContent = "";
|
|
5680
|
+
let currentTitle = "";
|
|
5681
|
+
let currentDepth = 0;
|
|
5682
|
+
let inCodeBlock = false;
|
|
5683
|
+
const headerRegex = /^(#+)\s+(.+)$/;
|
|
5684
|
+
for (let i = 0; i < lines.length; i++) {
|
|
5685
|
+
const line = lines[i];
|
|
5686
|
+
const headerMatch = line.match(headerRegex);
|
|
5687
|
+
if (line.startsWith("```") || line.startsWith("~~~")) {
|
|
5688
|
+
inCodeBlock = !inCodeBlock;
|
|
5689
|
+
}
|
|
5690
|
+
if (headerMatch && !inCodeBlock) {
|
|
5691
|
+
if (currentContent.trim() !== "" || currentTitle && currentDepth > 0) {
|
|
5692
|
+
sections.push({
|
|
5693
|
+
title: currentTitle,
|
|
5694
|
+
content: currentContent.trim(),
|
|
5695
|
+
depth: currentDepth,
|
|
5696
|
+
length: this.countTokens(currentContent.trim())
|
|
5697
|
+
});
|
|
5698
|
+
}
|
|
5699
|
+
currentContent = "";
|
|
5700
|
+
currentDepth = headerMatch[1].length;
|
|
5701
|
+
currentTitle = headerMatch[2];
|
|
5702
|
+
} else {
|
|
5703
|
+
currentContent += line + "\n";
|
|
5704
|
+
}
|
|
5705
|
+
}
|
|
5706
|
+
if (currentContent.trim() !== "") {
|
|
5707
|
+
sections.push({
|
|
5708
|
+
title: currentTitle,
|
|
5709
|
+
content: currentContent.trim(),
|
|
5710
|
+
depth: currentDepth,
|
|
5711
|
+
length: this.countTokens(currentContent.trim())
|
|
5712
|
+
});
|
|
5713
|
+
}
|
|
5714
|
+
if (sections.length > 1 && sections[0].title === "" && sections[0].content.trim() === "") {
|
|
5715
|
+
sections.shift();
|
|
5716
|
+
}
|
|
5717
|
+
return sections;
|
|
5718
|
+
}
|
|
5719
|
+
mergeSemanticSections(sections) {
|
|
5720
|
+
if (sections.length === 0) return sections;
|
|
5721
|
+
const workingSections = [...sections];
|
|
5722
|
+
const deepest = Math.max(...workingSections.map((s) => s.depth));
|
|
5723
|
+
for (let depth = deepest; depth > 0; depth--) {
|
|
5724
|
+
for (let j = 1; j < workingSections.length; j++) {
|
|
5725
|
+
const current = workingSections[j];
|
|
5726
|
+
if (current.depth === depth) {
|
|
5727
|
+
const prev = workingSections[j - 1];
|
|
5728
|
+
if (prev.length + current.length < this.joinThreshold && prev.depth <= current.depth) {
|
|
5729
|
+
const title = `${"#".repeat(current.depth)} ${current.title}`;
|
|
5730
|
+
const formattedTitle = `
|
|
5731
|
+
|
|
5732
|
+
${title}`;
|
|
5733
|
+
prev.content += `${formattedTitle}
|
|
5734
|
+
${current.content}`;
|
|
5735
|
+
prev.length = this.countTokens(prev.content);
|
|
5736
|
+
workingSections.splice(j, 1);
|
|
5737
|
+
j--;
|
|
5738
|
+
}
|
|
5739
|
+
}
|
|
5740
|
+
}
|
|
5741
|
+
}
|
|
5742
|
+
return workingSections;
|
|
5743
|
+
}
|
|
5744
|
+
splitText({ text }) {
|
|
5745
|
+
if (!text.trim()) return [];
|
|
5746
|
+
const initialSections = this.splitMarkdownByHeaders(text);
|
|
5747
|
+
const mergedSections = this.mergeSemanticSections(initialSections);
|
|
5748
|
+
return mergedSections.map((section) => {
|
|
5749
|
+
if (section.title) {
|
|
5750
|
+
const header = `${"#".repeat(section.depth)} ${section.title}`;
|
|
5751
|
+
return `${header}
|
|
5752
|
+
${section.content}`;
|
|
5753
|
+
}
|
|
5754
|
+
return section.content;
|
|
5755
|
+
});
|
|
5756
|
+
}
|
|
5757
|
+
createDocuments(texts, metadatas) {
|
|
5758
|
+
const _metadatas = metadatas || Array(texts.length).fill({});
|
|
5759
|
+
const documents = [];
|
|
5760
|
+
texts.forEach((text, i) => {
|
|
5761
|
+
this.splitText({ text }).forEach((chunk) => {
|
|
5762
|
+
const metadata = {
|
|
5763
|
+
..._metadatas[i],
|
|
5764
|
+
tokenCount: this.countTokens(chunk)
|
|
5765
|
+
};
|
|
5766
|
+
documents.push(
|
|
5767
|
+
new Document({
|
|
5768
|
+
text: chunk,
|
|
5769
|
+
metadata
|
|
5770
|
+
})
|
|
5771
|
+
);
|
|
5772
|
+
});
|
|
5773
|
+
});
|
|
5774
|
+
return documents;
|
|
5775
|
+
}
|
|
5776
|
+
transformDocuments(documents) {
|
|
5777
|
+
const texts = [];
|
|
5778
|
+
const metadatas = [];
|
|
5779
|
+
for (const doc of documents) {
|
|
5780
|
+
texts.push(doc.text);
|
|
5781
|
+
metadatas.push(doc.metadata);
|
|
5782
|
+
}
|
|
5783
|
+
return this.createDocuments(texts, metadatas);
|
|
5784
|
+
}
|
|
5785
|
+
static fromTikToken({
|
|
5786
|
+
encodingName = "cl100k_base",
|
|
5787
|
+
modelName,
|
|
5788
|
+
options = {}
|
|
5789
|
+
}) {
|
|
5790
|
+
let tokenizer;
|
|
5791
|
+
try {
|
|
5792
|
+
tokenizer = modelName ? encodingForModel(modelName) : getEncoding(encodingName);
|
|
5793
|
+
} catch {
|
|
5794
|
+
throw new Error("Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.");
|
|
5795
|
+
}
|
|
5796
|
+
const tikTokenCounter = (text) => {
|
|
5797
|
+
const allowed = options.allowedSpecial === "all" ? "all" : options.allowedSpecial ? Array.from(options.allowedSpecial) : [];
|
|
5798
|
+
const disallowed = options.disallowedSpecial === "all" ? "all" : options.disallowedSpecial ? Array.from(options.disallowedSpecial) : [];
|
|
5799
|
+
return tokenizer.encode(text, allowed, disallowed).length;
|
|
5800
|
+
};
|
|
5801
|
+
return new _SemanticMarkdownTransformer({
|
|
5802
|
+
...options,
|
|
5803
|
+
encodingName,
|
|
5804
|
+
modelName,
|
|
5805
|
+
lengthFunction: tikTokenCounter
|
|
5806
|
+
});
|
|
5807
|
+
}
|
|
5808
|
+
};
|
|
5647
5809
|
|
|
5648
5810
|
// src/document/transformers/sentence.ts
|
|
5649
5811
|
var SentenceTransformer = class extends TextTransformer {
|
|
@@ -6031,6 +6193,13 @@ var markdownChunkOptionsSchema = baseChunkOptionsSchema.extend({
|
|
|
6031
6193
|
returnEachLine: z.boolean().optional(),
|
|
6032
6194
|
stripHeaders: z.boolean().optional()
|
|
6033
6195
|
}).strict();
|
|
6196
|
+
var semanticMarkdownChunkOptionsSchema = baseChunkOptionsSchema.extend({
|
|
6197
|
+
joinThreshold: z.number().positive().optional(),
|
|
6198
|
+
encodingName: z.string().optional(),
|
|
6199
|
+
modelName: z.string().optional(),
|
|
6200
|
+
allowedSpecial: setOrAllSchema,
|
|
6201
|
+
disallowedSpecial: setOrAllSchema
|
|
6202
|
+
}).strict();
|
|
6034
6203
|
var latexChunkOptionsSchema = baseChunkOptionsSchema.strict();
|
|
6035
6204
|
var validationSchemas = {
|
|
6036
6205
|
character: characterChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
@@ -6040,6 +6209,7 @@ var validationSchemas = {
|
|
|
6040
6209
|
json: jsonChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6041
6210
|
html: htmlChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6042
6211
|
markdown: markdownChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6212
|
+
"semantic-markdown": semanticMarkdownChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6043
6213
|
latex: latexChunkOptionsSchema.transform(handleDeprecatedSize)
|
|
6044
6214
|
};
|
|
6045
6215
|
function validateChunkParams(strategy, params) {
|
|
@@ -6169,18 +6339,25 @@ var MDocument = class _MDocument {
|
|
|
6169
6339
|
return "recursive";
|
|
6170
6340
|
}
|
|
6171
6341
|
}
|
|
6342
|
+
_strategyMap;
|
|
6343
|
+
get strategyMap() {
|
|
6344
|
+
if (!this._strategyMap) {
|
|
6345
|
+
this._strategyMap = {
|
|
6346
|
+
recursive: (options) => this.chunkRecursive(options),
|
|
6347
|
+
character: (options) => this.chunkCharacter(options),
|
|
6348
|
+
token: (options) => this.chunkToken(options),
|
|
6349
|
+
markdown: (options) => this.chunkMarkdown(options),
|
|
6350
|
+
html: (options) => this.chunkHTML(options),
|
|
6351
|
+
json: (options) => this.chunkJSON(options),
|
|
6352
|
+
latex: (options) => this.chunkLatex(options),
|
|
6353
|
+
sentence: (options) => this.chunkSentence(options),
|
|
6354
|
+
"semantic-markdown": (options) => this.chunkSemanticMarkdown(options)
|
|
6355
|
+
};
|
|
6356
|
+
}
|
|
6357
|
+
return this._strategyMap;
|
|
6358
|
+
}
|
|
6172
6359
|
async chunkBy(strategy, options) {
|
|
6173
|
-
const strategyMap = {
|
|
6174
|
-
recursive: (options2) => this.chunkRecursive(options2),
|
|
6175
|
-
character: (options2) => this.chunkCharacter(options2),
|
|
6176
|
-
token: (options2) => this.chunkToken(options2),
|
|
6177
|
-
markdown: (options2) => this.chunkMarkdown(options2),
|
|
6178
|
-
html: (options2) => this.chunkHTML(options2),
|
|
6179
|
-
json: (options2) => this.chunkJSON(options2),
|
|
6180
|
-
latex: (options2) => this.chunkLatex(options2),
|
|
6181
|
-
sentence: (options2) => this.chunkSentence(options2)
|
|
6182
|
-
};
|
|
6183
|
-
const chunkingFunc = strategyMap[strategy];
|
|
6360
|
+
const chunkingFunc = this.strategyMap[strategy];
|
|
6184
6361
|
if (chunkingFunc) {
|
|
6185
6362
|
await chunkingFunc(options);
|
|
6186
6363
|
} else {
|
|
@@ -6282,6 +6459,15 @@ var MDocument = class _MDocument {
|
|
|
6282
6459
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
6283
6460
|
this.chunks = textSplit;
|
|
6284
6461
|
}
|
|
6462
|
+
async chunkSemanticMarkdown(options) {
|
|
6463
|
+
const rt = SemanticMarkdownTransformer.fromTikToken({
|
|
6464
|
+
options,
|
|
6465
|
+
encodingName: options?.encodingName,
|
|
6466
|
+
modelName: options?.modelName
|
|
6467
|
+
});
|
|
6468
|
+
const textSplit = rt.transformDocuments(this.chunks);
|
|
6469
|
+
this.chunks = textSplit;
|
|
6470
|
+
}
|
|
6285
6471
|
async chunk(params) {
|
|
6286
6472
|
const { strategy: passedStrategy, extract, ...chunkOptions } = params || {};
|
|
6287
6473
|
const strategy = passedStrategy || this.defaultStrategy();
|