@mastra/rag 2.1.1 → 2.1.2

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -6066,31 +6066,34 @@ var MarkdownHeaderTransformer = class {
6066
6066
  var SemanticMarkdownTransformer = class _SemanticMarkdownTransformer extends TextTransformer {
6067
6067
  tokenizer;
6068
6068
  joinThreshold;
6069
- allowedSpecial;
6070
- disallowedSpecial;
6069
+ allowedArray;
6070
+ disallowedArray;
6071
6071
  constructor({
6072
6072
  joinThreshold = 500,
6073
6073
  encodingName = "cl100k_base",
6074
6074
  modelName,
6075
+ tokenizer: existingTokenizer,
6075
6076
  allowedSpecial = /* @__PURE__ */ new Set(),
6076
6077
  disallowedSpecial = "all",
6077
6078
  ...baseOptions
6078
6079
  } = {}) {
6079
6080
  super(baseOptions);
6080
6081
  this.joinThreshold = joinThreshold;
6081
- this.allowedSpecial = allowedSpecial;
6082
- this.disallowedSpecial = disallowedSpecial;
6083
- try {
6084
- this.tokenizer = modelName ? encodingForModel(modelName) : getEncoding(encodingName);
6085
- } catch {
6086
- throw new Error("Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.");
6082
+ this.allowedArray = allowedSpecial === "all" ? "all" : Array.from(allowedSpecial);
6083
+ this.disallowedArray = disallowedSpecial === "all" ? "all" : Array.from(disallowedSpecial);
6084
+ if (existingTokenizer) {
6085
+ this.tokenizer = existingTokenizer;
6086
+ } else {
6087
+ try {
6088
+ this.tokenizer = modelName ? encodingForModel(modelName) : getEncoding(encodingName);
6089
+ } catch {
6090
+ throw new Error("Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.");
6091
+ }
6087
6092
  }
6088
6093
  }
6089
6094
  countTokens(text) {
6090
- const allowed = this.allowedSpecial === "all" ? "all" : Array.from(this.allowedSpecial);
6091
- const disallowed = this.disallowedSpecial === "all" ? "all" : Array.from(this.disallowedSpecial);
6092
6095
  const processedText = this.stripWhitespace ? text.trim() : text;
6093
- return this.tokenizer.encode(processedText, allowed, disallowed).length;
6096
+ return this.tokenizer.encode(processedText, this.allowedArray, this.disallowedArray).length;
6094
6097
  }
6095
6098
  splitMarkdownByHeaders(markdown) {
6096
6099
  const sections = [];
@@ -6144,14 +6147,21 @@ var SemanticMarkdownTransformer = class _SemanticMarkdownTransformer extends Tex
6144
6147
  const current = workingSections[j];
6145
6148
  if (current.depth === depth) {
6146
6149
  const prev = workingSections[j - 1];
6147
- if (prev.length + current.length < this.joinThreshold && prev.depth <= current.depth) {
6148
- const title = `${"#".repeat(current.depth)} ${current.title}`;
6149
- const formattedTitle = `
6150
+ const title = `${"#".repeat(current.depth)} ${current.title}`;
6151
+ const formattedTitle = `
6150
6152
 
6151
6153
  ${title}`;
6154
+ const headerLength = this.tokenizer.encode(
6155
+ `${formattedTitle}
6156
+ `,
6157
+ this.allowedArray,
6158
+ this.disallowedArray
6159
+ ).length;
6160
+ const mergedLength = prev.length + current.length + headerLength;
6161
+ if (mergedLength < this.joinThreshold && prev.depth <= current.depth) {
6152
6162
  prev.content += `${formattedTitle}
6153
6163
  ${current.content}`;
6154
- prev.length = this.countTokens(prev.content);
6164
+ prev.length = mergedLength;
6155
6165
  workingSections.splice(j, 1);
6156
6166
  j--;
6157
6167
  }
@@ -6221,6 +6231,7 @@ ${section.content}`;
6221
6231
  ...options,
6222
6232
  encodingName,
6223
6233
  modelName,
6234
+ tokenizer,
6224
6235
  lengthFunction: tikTokenCounter
6225
6236
  });
6226
6237
  }
@@ -6475,30 +6486,33 @@ function splitTextOnTokens({ text, tokenizer }) {
6475
6486
  }
6476
6487
  var TokenTransformer = class _TokenTransformer extends TextTransformer {
6477
6488
  tokenizer;
6478
- allowedSpecial;
6479
- disallowedSpecial;
6489
+ allowedArray;
6490
+ disallowedArray;
6480
6491
  constructor({
6481
6492
  encodingName = "cl100k_base",
6482
6493
  modelName,
6494
+ tokenizer: existingTokenizer,
6483
6495
  allowedSpecial = /* @__PURE__ */ new Set(),
6484
6496
  disallowedSpecial = "all",
6485
6497
  options = {}
6486
6498
  }) {
6487
6499
  super(options);
6488
- try {
6489
- this.tokenizer = modelName ? encodingForModel(modelName) : getEncoding(encodingName);
6490
- } catch {
6491
- throw new Error("Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.");
6500
+ if (existingTokenizer) {
6501
+ this.tokenizer = existingTokenizer;
6502
+ } else {
6503
+ try {
6504
+ this.tokenizer = modelName ? encodingForModel(modelName) : getEncoding(encodingName);
6505
+ } catch {
6506
+ throw new Error("Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.");
6507
+ }
6492
6508
  }
6493
- this.allowedSpecial = allowedSpecial;
6494
- this.disallowedSpecial = disallowedSpecial;
6509
+ this.allowedArray = allowedSpecial === "all" ? "all" : Array.from(allowedSpecial);
6510
+ this.disallowedArray = disallowedSpecial === "all" ? "all" : Array.from(disallowedSpecial);
6495
6511
  }
6496
6512
  splitText({ text }) {
6497
6513
  const encode = (text2) => {
6498
- const allowed = this.allowedSpecial === "all" ? "all" : Array.from(this.allowedSpecial);
6499
- const disallowed = this.disallowedSpecial === "all" ? "all" : Array.from(this.disallowedSpecial);
6500
6514
  const processedText = this.stripWhitespace ? text2.trim() : text2;
6501
- return Array.from(this.tokenizer.encode(processedText, allowed, disallowed));
6515
+ return Array.from(this.tokenizer.encode(processedText, this.allowedArray, this.disallowedArray));
6502
6516
  };
6503
6517
  const decode = (tokens) => {
6504
6518
  const text2 = this.tokenizer.decode(tokens);
@@ -6535,6 +6549,7 @@ var TokenTransformer = class _TokenTransformer extends TextTransformer {
6535
6549
  return new _TokenTransformer({
6536
6550
  encodingName,
6537
6551
  modelName,
6552
+ tokenizer,
6538
6553
  allowedSpecial: options.allowedSpecial,
6539
6554
  disallowedSpecial: options.disallowedSpecial,
6540
6555
  options: {