@mastra/rag 2.0.0-beta.2 → 2.0.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,47 @@
1
1
  # @mastra/rag
2
2
 
3
+ ## 2.0.0-beta.3
4
+
5
+ ### Patch Changes
6
+
7
+ - Add maxSize support for HTML chunking strategies ([#10654](https://github.com/mastra-ai/mastra/pull/10654))
8
+
9
+ Added support for the `maxSize` option in HTML chunking strategies (`headers` and `sections`), allowing users to control the maximum chunk size when chunking HTML documents. Previously, HTML chunks could be excessively large when sections contained substantial content.
10
+
11
+ **Changes:**
12
+ - Added `maxSize` support to `headers` strategy - applies `RecursiveCharacterTransformer` after header-based splitting
13
+ - Added `maxSize` support to `sections` strategy - applies `RecursiveCharacterTransformer` after section-based splitting
14
+ - Fixed `splitHtmlByHeaders` content extraction bug - changed from broken `nextElementSibling` to working `parentNode.childNodes` approach
15
+ - Added comprehensive test coverage including integration test with real arXiv paper
16
+
17
+ **Usage:**
18
+
19
+ ```typescript
20
+ import { MDocument } from '@mastra/rag';
21
+
22
+ const doc = MDocument.fromHTML(htmlContent);
23
+
24
+ const chunks = await doc.chunk({
25
+ strategy: 'html',
26
+ headers: [
27
+ ['h1', 'Header 1'],
28
+ ['h2', 'Header 2'],
29
+ ['h3', 'Header 3'],
30
+ ],
31
+ maxSize: 512, // Control chunk size
32
+ overlap: 50, // Optional overlap for context
33
+ });
34
+ ```
35
+
36
+ **Results from real arXiv paper test:**
37
+ - Without maxSize: 22 chunks, max 45,531 chars (too big!)
38
+ - With maxSize=512: 499 chunks, max 512 chars (properly sized)
39
+
40
+ Fixes #7942
41
+
42
+ - Updated dependencies [[`ac0d2f4`](https://github.com/mastra-ai/mastra/commit/ac0d2f4ff8831f72c1c66c2be809706d17f65789), [`1a0d3fc`](https://github.com/mastra-ai/mastra/commit/1a0d3fc811482c9c376cdf79ee615c23bae9b2d6), [`85a628b`](https://github.com/mastra-ai/mastra/commit/85a628b1224a8f64cd82ea7f033774bf22df7a7e), [`c237233`](https://github.com/mastra-ai/mastra/commit/c23723399ccedf7f5744b3f40997b79246bfbe64), [`15f9e21`](https://github.com/mastra-ai/mastra/commit/15f9e216177201ea6e3f6d0bfb063fcc0953444f), [`ff94dea`](https://github.com/mastra-ai/mastra/commit/ff94dea935f4e34545c63bcb6c29804732698809), [`5b2ff46`](https://github.com/mastra-ai/mastra/commit/5b2ff4651df70c146523a7fca773f8eb0a2272f8), [`db41688`](https://github.com/mastra-ai/mastra/commit/db4168806d007417e2e60b4f68656dca4e5f40c9), [`5ca599d`](https://github.com/mastra-ai/mastra/commit/5ca599d0bb59a1595f19f58473fcd67cc71cef58), [`bff1145`](https://github.com/mastra-ai/mastra/commit/bff114556b3cbadad9b2768488708f8ad0e91475), [`5c8ca24`](https://github.com/mastra-ai/mastra/commit/5c8ca247094e0cc2cdbd7137822fb47241f86e77), [`e191844`](https://github.com/mastra-ai/mastra/commit/e1918444ca3f80e82feef1dad506cd4ec6e2875f), [`22553f1`](https://github.com/mastra-ai/mastra/commit/22553f11c63ee5e966a9c034a349822249584691), [`7237163`](https://github.com/mastra-ai/mastra/commit/72371635dbf96a87df4b073cc48fc655afbdce3d), [`2500740`](https://github.com/mastra-ai/mastra/commit/2500740ea23da067d6e50ec71c625ab3ce275e64), [`873ecbb`](https://github.com/mastra-ai/mastra/commit/873ecbb517586aa17d2f1e99283755b3ebb2863f), [`4f9bbe5`](https://github.com/mastra-ai/mastra/commit/4f9bbe5968f42c86f4930b8193de3c3c17e5bd36), [`02e51fe`](https://github.com/mastra-ai/mastra/commit/02e51feddb3d4155cfbcc42624fd0d0970d032c0), [`8f3fa3a`](https://github.com/mastra-ai/mastra/commit/8f3fa3a652bb77da092f913ec51ae46e3a7e27dc), [`cd29ad2`](https://github.com/mastra-ai/mastra/commit/cd29ad23a255534e8191f249593849ed29160886), [`bdf4d8c`](https://github.com/mastra-ai/mastra/commit/bdf4d8cdc656d8a2c21d81834bfa3bfa70f56c16), [`854e3da`](https://github.com/mastra-ai/mastra/commit/854e3dad5daac17a91a20986399d3a51f54bf68b), [`ce18d38`](https://github.com/mastra-ai/mastra/commit/ce18d38678c65870350d123955014a8432075fd9), [`cccf9c8`](https://github.com/mastra-ai/mastra/commit/cccf9c8b2d2dfc1a5e63919395b83d78c89682a0), [`61a5705`](https://github.com/mastra-ai/mastra/commit/61a570551278b6743e64243b3ce7d73de915ca8a), [`db70a48`](https://github.com/mastra-ai/mastra/commit/db70a48aeeeeb8e5f92007e8ede52c364ce15287), [`f0fdc14`](https://github.com/mastra-ai/mastra/commit/f0fdc14ee233d619266b3d2bbdeea7d25cfc6d13), [`db18bc9`](https://github.com/mastra-ai/mastra/commit/db18bc9c3825e2c1a0ad9a183cc9935f6691bfa1), [`9b37b56`](https://github.com/mastra-ai/mastra/commit/9b37b565e1f2a76c24f728945cc740c2b09be9da), [`41a23c3`](https://github.com/mastra-ai/mastra/commit/41a23c32f9877d71810f37e24930515df2ff7a0f), [`5d171ad`](https://github.com/mastra-ai/mastra/commit/5d171ad9ef340387276b77c2bb3e83e83332d729), [`f03ae60`](https://github.com/mastra-ai/mastra/commit/f03ae60500fe350c9d828621006cdafe1975fdd8), [`d1e74a0`](https://github.com/mastra-ai/mastra/commit/d1e74a0a293866dece31022047f5dbab65a304d0), [`39e7869`](https://github.com/mastra-ai/mastra/commit/39e7869bc7d0ee391077ce291474d8a84eedccff), [`5761926`](https://github.com/mastra-ai/mastra/commit/57619260c4a2cdd598763abbacd90de594c6bc76), [`c900fdd`](https://github.com/mastra-ai/mastra/commit/c900fdd504c41348efdffb205cfe80d48c38fa33), [`604a79f`](https://github.com/mastra-ai/mastra/commit/604a79fecf276e26a54a3fe01bb94e65315d2e0e), [`887f0b4`](https://github.com/mastra-ai/mastra/commit/887f0b4746cdbd7cb7d6b17ac9f82aeb58037ea5), [`2562143`](https://github.com/mastra-ai/mastra/commit/256214336b4faa78646c9c1776612393790d8784), [`ef11a61`](https://github.com/mastra-ai/mastra/commit/ef11a61920fa0ed08a5b7ceedd192875af119749)]:
43
+ - @mastra/core@1.0.0-beta.6
44
+
3
45
  ## 2.0.0-beta.2
4
46
 
5
47
  ### Patch Changes
@@ -1 +1 @@
1
- {"version":3,"file":"document.d.ts","sourceRoot":"","sources":["../../src/document/document.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,QAAQ,IAAI,KAAK,EAAgC,MAAM,UAAU,CAAC;AAU3E,OAAO,KAAK,EACV,WAAW,EAEX,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EACrB,qBAAqB,EACrB,iBAAiB,EACjB,oBAAoB,EACpB,4BAA4B,EAC5B,gBAAgB,EAChB,iBAAiB,EACjB,oBAAoB,EAErB,MAAM,SAAS,CAAC;AAGjB,qBAAa,SAAS;IACpB,OAAO,CAAC,MAAM,CAAU;IACxB,OAAO,CAAC,IAAI,CAAS;gBAET,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE;QAAE,IAAI,EAAE;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;SAAE,EAAE,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE;IAOhG,eAAe,CAAC,EAAE,KAAK,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,EAAE,aAAa,GAAG,OAAO,CAAC,SAAS,CAAC;IAmDjG,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,YAAY,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYhF,MAAM,CAAC,QAAQ,CAAC,UAAU,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAY9E,OAAO,CAAC,eAAe;IAevB,OAAO,CAAC,YAAY,CAAC,CAA4E;IAEjG,OAAO,KAAK,WAAW,GAetB;YAEa,OAAO;IASf,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAa9D,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAU9D,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IAoBpD,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IAmBpD,UAAU,CAAC,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,IAAI,CAAC;IAMtD,UAAU,CAAC,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,IAAI,CAAC;IAUtD,aAAa,CAAC,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAa5D,aAAa,CAAC,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAuB5D,qBAAqB,CAAC,OAAO,CAAC,EAAE,4BAA4B,GAAG,OAAO,CAAC,IAAI,CAAC;IAU5E,KAAK,CAAC,MAAM,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC;IAiBnD,OAAO,IAAI,KAAK,EAAE;IAIlB,OAAO,IAAI,MAAM,EAAE;IAInB,WAAW,IAAI,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE;CAGrC"}
1
+ {"version":3,"file":"document.d.ts","sourceRoot":"","sources":["../../src/document/document.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,QAAQ,IAAI,KAAK,EAAgC,MAAM,UAAU,CAAC;AAU3E,OAAO,KAAK,EACV,WAAW,EAEX,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EACrB,qBAAqB,EACrB,iBAAiB,EACjB,oBAAoB,EACpB,4BAA4B,EAC5B,gBAAgB,EAChB,iBAAiB,EACjB,oBAAoB,EAErB,MAAM,SAAS,CAAC;AAGjB,qBAAa,SAAS;IACpB,OAAO,CAAC,MAAM,CAAU;IACxB,OAAO,CAAC,IAAI,CAAS;gBAET,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE;QAAE,IAAI,EAAE;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;SAAE,EAAE,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE;IAOhG,eAAe,CAAC,EAAE,KAAK,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,EAAE,aAAa,GAAG,OAAO,CAAC,SAAS,CAAC;IAmDjG,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,YAAY,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYhF,MAAM,CAAC,QAAQ,CAAC,UAAU,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAY9E,OAAO,CAAC,eAAe;IAevB,OAAO,CAAC,YAAY,CAAC,CAA4E;IAEjG,OAAO,KAAK,WAAW,GAetB;YAEa,OAAO;IASf,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAa9D,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAU9D,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IA8CpD,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IAmBpD,UAAU,CAAC,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,IAAI,CAAC;IAMtD,UAAU,CAAC,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,IAAI,CAAC;IAUtD,aAAa,CAAC,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAa5D,aAAa,CAAC,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAuB5D,qBAAqB,CAAC,OAAO,CAAC,EAAE,4BAA4B,GAAG,OAAO,CAAC,IAAI,CAAC;IAU5E,KAAK,CAAC,MAAM,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC;IAiBnD,OAAO,IAAI,KAAK,EAAE;IAIlB,OAAO,IAAI,MAAM,EAAE;IAInB,WAAW,IAAI,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE;CAGrC"}
@@ -23,6 +23,7 @@ export declare class HTMLSectionTransformer {
23
23
  });
24
24
  splitText(text: string): Document[];
25
25
  private getXPath;
26
+ private getTextContent;
26
27
  private splitHtmlByHeaders;
27
28
  splitDocuments(documents: Document[]): Promise<Document[]>;
28
29
  createDocuments(texts: string[], metadatas?: Record<string, any>[]): Document[];
@@ -1 +1 @@
1
- {"version":3,"file":"html.d.ts","sourceRoot":"","sources":["../../../src/document/transformers/html.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AACrC,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAWjD,qBAAa,qBAAqB;IAChC,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,iBAAiB,CAAU;gBAEvB,OAAO,EAAE,gBAAgB,GAAG;QAAE,OAAO,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAA;KAAE;IAKvE,SAAS,CAAC,EAAE,IAAI,EAAE,EAAE;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,GAAG,QAAQ,EAAE;IAwDjD,OAAO,CAAC,QAAQ;IA2BhB,OAAO,CAAC,cAAc;IAuBtB,OAAO,CAAC,yBAAyB;IAyBjC,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,GAAG,QAAQ,EAAE;IA8B/E,kBAAkB,CAAC,SAAS,EAAE,QAAQ,EAAE,GAAG,QAAQ,EAAE;CAWtD;AAED,qBAAa,sBAAsB;IACjC,OAAO,CAAC,gBAAgB,CAAyB;IACjD,OAAO,CAAC,YAAY,CAAgC;gBAExC,OAAO,EAAE,gBAAgB,GAAG;QAAE,QAAQ,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAA;KAAE;IAKxE,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,QAAQ,EAAE;IAenC,OAAO,CAAC,QAAQ;IAwBhB,OAAO,CAAC,kBAAkB;IA8CpB,cAAc,CAAC,SAAS,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;IAahE,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,GAAG,QAAQ,EAAE;IA+B/E,kBAAkB,CAAC,SAAS,EAAE,QAAQ,EAAE,GAAG,QAAQ,EAAE;CAWtD"}
1
+ {"version":3,"file":"html.d.ts","sourceRoot":"","sources":["../../../src/document/transformers/html.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AACrC,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAWjD,qBAAa,qBAAqB;IAChC,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,iBAAiB,CAAU;gBAEvB,OAAO,EAAE,gBAAgB,GAAG;QAAE,OAAO,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAA;KAAE;IAKvE,SAAS,CAAC,EAAE,IAAI,EAAE,EAAE;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,GAAG,QAAQ,EAAE;IAwDjD,OAAO,CAAC,QAAQ;IA2BhB,OAAO,CAAC,cAAc;IAuBtB,OAAO,CAAC,yBAAyB;IAyBjC,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,GAAG,QAAQ,EAAE;IA8B/E,kBAAkB,CAAC,SAAS,EAAE,QAAQ,EAAE,GAAG,QAAQ,EAAE;CAWtD;AAED,qBAAa,sBAAsB;IACjC,OAAO,CAAC,gBAAgB,CAAyB;IACjD,OAAO,CAAC,YAAY,CAAgC;gBAExC,OAAO,EAAE,gBAAgB,GAAG;QAAE,QAAQ,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAA;KAAE;IAKxE,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,QAAQ,EAAE;IAenC,OAAO,CAAC,QAAQ;IAwBhB,OAAO,CAAC,cAAc;IAuBtB,OAAO,CAAC,kBAAkB;IA6DpB,cAAc,CAAC,SAAS,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;IAahE,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,GAAG,QAAQ,EAAE;IA+B/E,kBAAkB,CAAC,SAAS,EAAE,QAAQ,EAAE,GAAG,QAAQ,EAAE;CAWtD"}
package/dist/index.cjs CHANGED
@@ -272,7 +272,7 @@ Provide keywords in the following comma-separated format: 'KEYWORDS: <keywords>'
272
272
  var defaultQuestionExtractPrompt = new PromptTemplate({
273
273
  templateVars: ["numQuestions", "context"],
274
274
  template: `(
275
- "Given the contextual informations below, generate {numQuestions} questions this context can provides specific answers to which are unlikely to be found else where. Higher-level summaries of surrounding context may be provided as well. "
275
+ "Given the contextual information below, generate {numQuestions} questions this context can provide specific answers to which are unlikely to be found elsewhere. Higher-level summaries of surrounding context may be provided as well. "
276
276
  "Try using these summaries to generate better questions that this context can answer."
277
277
  "---------------------"
278
278
  "{context}"
@@ -4999,23 +4999,47 @@ var HTMLSectionTransformer = class {
4999
4999
  }
5000
5000
  return "/" + parts.join("/");
5001
5001
  }
5002
+ getTextContent(element) {
5003
+ if (!element) return "";
5004
+ if (!element.tagName) {
5005
+ return element.text || "";
5006
+ }
5007
+ let content = element.text || "";
5008
+ if (element.childNodes) {
5009
+ for (const child of element.childNodes) {
5010
+ const childText = this.getTextContent(child);
5011
+ if (childText) {
5012
+ content += " " + childText;
5013
+ }
5014
+ }
5015
+ }
5016
+ return content.trim();
5017
+ }
5002
5018
  splitHtmlByHeaders(htmlDoc) {
5003
5019
  const sections = [];
5004
5020
  const root = nodeHtmlBetterParser.parse(htmlDoc);
5005
5021
  const headers = Object.keys(this.headersToSplitOn);
5006
5022
  const headerElements = root.querySelectorAll(headers.join(","));
5007
- headerElements.forEach((headerElement, index) => {
5023
+ headerElements.forEach((headerElement) => {
5008
5024
  const header = headerElement.text?.trim() || "";
5009
5025
  const tagName = headerElement.tagName;
5010
5026
  const xpath = this.getXPath(headerElement);
5011
5027
  let content = "";
5012
- let currentElement = headerElement.nextElementSibling;
5013
- const nextHeader = headerElements[index + 1];
5014
- while (currentElement && (!nextHeader || currentElement !== nextHeader)) {
5015
- if (currentElement.text) {
5016
- content += currentElement.text.trim() + " ";
5028
+ const parentNode = headerElement.parentNode;
5029
+ if (parentNode && parentNode.childNodes) {
5030
+ let foundHeader = false;
5031
+ for (const node of parentNode.childNodes) {
5032
+ if (node === headerElement) {
5033
+ foundHeader = true;
5034
+ continue;
5035
+ }
5036
+ if (foundHeader && node.tagName && headers.includes(node.tagName.toLowerCase())) {
5037
+ break;
5038
+ }
5039
+ if (foundHeader) {
5040
+ content += this.getTextContent(node) + " ";
5041
+ }
5017
5042
  }
5018
- currentElement = currentElement.nextElementSibling;
5019
5043
  }
5020
5044
  content = content.trim();
5021
5045
  sections.push({
@@ -6392,13 +6416,33 @@ var MDocument = class _MDocument {
6392
6416
  async chunkHTML(options) {
6393
6417
  if (options?.headers?.length) {
6394
6418
  const rt = new HTMLHeaderTransformer(options);
6395
- const textSplit = rt.transformDocuments(this.chunks);
6419
+ let textSplit = rt.transformDocuments(this.chunks);
6420
+ if (options?.maxSize) {
6421
+ const textSplitter = new RecursiveCharacterTransformer({
6422
+ maxSize: options.maxSize,
6423
+ overlap: options.overlap,
6424
+ keepSeparator: options.keepSeparator,
6425
+ addStartIndex: options.addStartIndex,
6426
+ stripWhitespace: options.stripWhitespace
6427
+ });
6428
+ textSplit = textSplitter.splitDocuments(textSplit);
6429
+ }
6396
6430
  this.chunks = textSplit;
6397
6431
  return;
6398
6432
  }
6399
6433
  if (options?.sections?.length) {
6400
6434
  const rt = new HTMLSectionTransformer(options);
6401
- const textSplit = rt.transformDocuments(this.chunks);
6435
+ let textSplit = rt.transformDocuments(this.chunks);
6436
+ if (options?.maxSize) {
6437
+ const textSplitter = new RecursiveCharacterTransformer({
6438
+ maxSize: options.maxSize,
6439
+ overlap: options.overlap,
6440
+ keepSeparator: options.keepSeparator,
6441
+ addStartIndex: options.addStartIndex,
6442
+ stripWhitespace: options.stripWhitespace
6443
+ });
6444
+ textSplit = textSplitter.splitDocuments(textSplit);
6445
+ }
6402
6446
  this.chunks = textSplit;
6403
6447
  return;
6404
6448
  }