@mastra/rag 2.0.0-beta.2 → 2.0.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -266,7 +266,7 @@ Provide keywords in the following comma-separated format: 'KEYWORDS: <keywords>'
266
266
  var defaultQuestionExtractPrompt = new PromptTemplate({
267
267
  templateVars: ["numQuestions", "context"],
268
268
  template: `(
269
- "Given the contextual informations below, generate {numQuestions} questions this context can provides specific answers to which are unlikely to be found else where. Higher-level summaries of surrounding context may be provided as well. "
269
+ "Given the contextual information below, generate {numQuestions} questions this context can provide specific answers to which are unlikely to be found elsewhere. Higher-level summaries of surrounding context may be provided as well. "
270
270
  "Try using these summaries to generate better questions that this context can answer."
271
271
  "---------------------"
272
272
  "{context}"
@@ -4993,23 +4993,47 @@ var HTMLSectionTransformer = class {
4993
4993
  }
4994
4994
  return "/" + parts.join("/");
4995
4995
  }
4996
+ getTextContent(element) {
4997
+ if (!element) return "";
4998
+ if (!element.tagName) {
4999
+ return element.text || "";
5000
+ }
5001
+ let content = element.text || "";
5002
+ if (element.childNodes) {
5003
+ for (const child of element.childNodes) {
5004
+ const childText = this.getTextContent(child);
5005
+ if (childText) {
5006
+ content += " " + childText;
5007
+ }
5008
+ }
5009
+ }
5010
+ return content.trim();
5011
+ }
4996
5012
  splitHtmlByHeaders(htmlDoc) {
4997
5013
  const sections = [];
4998
5014
  const root = parse(htmlDoc);
4999
5015
  const headers = Object.keys(this.headersToSplitOn);
5000
5016
  const headerElements = root.querySelectorAll(headers.join(","));
5001
- headerElements.forEach((headerElement, index) => {
5017
+ headerElements.forEach((headerElement) => {
5002
5018
  const header = headerElement.text?.trim() || "";
5003
5019
  const tagName = headerElement.tagName;
5004
5020
  const xpath = this.getXPath(headerElement);
5005
5021
  let content = "";
5006
- let currentElement = headerElement.nextElementSibling;
5007
- const nextHeader = headerElements[index + 1];
5008
- while (currentElement && (!nextHeader || currentElement !== nextHeader)) {
5009
- if (currentElement.text) {
5010
- content += currentElement.text.trim() + " ";
5022
+ const parentNode = headerElement.parentNode;
5023
+ if (parentNode && parentNode.childNodes) {
5024
+ let foundHeader = false;
5025
+ for (const node of parentNode.childNodes) {
5026
+ if (node === headerElement) {
5027
+ foundHeader = true;
5028
+ continue;
5029
+ }
5030
+ if (foundHeader && node.tagName && headers.includes(node.tagName.toLowerCase())) {
5031
+ break;
5032
+ }
5033
+ if (foundHeader) {
5034
+ content += this.getTextContent(node) + " ";
5035
+ }
5011
5036
  }
5012
- currentElement = currentElement.nextElementSibling;
5013
5037
  }
5014
5038
  content = content.trim();
5015
5039
  sections.push({
@@ -6386,13 +6410,33 @@ var MDocument = class _MDocument {
6386
6410
  async chunkHTML(options) {
6387
6411
  if (options?.headers?.length) {
6388
6412
  const rt = new HTMLHeaderTransformer(options);
6389
- const textSplit = rt.transformDocuments(this.chunks);
6413
+ let textSplit = rt.transformDocuments(this.chunks);
6414
+ if (options?.maxSize) {
6415
+ const textSplitter = new RecursiveCharacterTransformer({
6416
+ maxSize: options.maxSize,
6417
+ overlap: options.overlap,
6418
+ keepSeparator: options.keepSeparator,
6419
+ addStartIndex: options.addStartIndex,
6420
+ stripWhitespace: options.stripWhitespace
6421
+ });
6422
+ textSplit = textSplitter.splitDocuments(textSplit);
6423
+ }
6390
6424
  this.chunks = textSplit;
6391
6425
  return;
6392
6426
  }
6393
6427
  if (options?.sections?.length) {
6394
6428
  const rt = new HTMLSectionTransformer(options);
6395
- const textSplit = rt.transformDocuments(this.chunks);
6429
+ let textSplit = rt.transformDocuments(this.chunks);
6430
+ if (options?.maxSize) {
6431
+ const textSplitter = new RecursiveCharacterTransformer({
6432
+ maxSize: options.maxSize,
6433
+ overlap: options.overlap,
6434
+ keepSeparator: options.keepSeparator,
6435
+ addStartIndex: options.addStartIndex,
6436
+ stripWhitespace: options.stripWhitespace
6437
+ });
6438
+ textSplit = textSplitter.splitDocuments(textSplit);
6439
+ }
6396
6440
  this.chunks = textSplit;
6397
6441
  return;
6398
6442
  }