@mastra/rag 2.0.0-beta.2 → 2.0.0-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +42 -0
- package/dist/document/document.d.ts.map +1 -1
- package/dist/document/transformers/html.d.ts +1 -0
- package/dist/document/transformers/html.d.ts.map +1 -1
- package/dist/index.cjs +54 -10
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +54 -10
- package/dist/index.js.map +1 -1
- package/package.json +4 -4
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,47 @@
|
|
|
1
1
|
# @mastra/rag
|
|
2
2
|
|
|
3
|
+
## 2.0.0-beta.3
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- Add maxSize support for HTML chunking strategies ([#10654](https://github.com/mastra-ai/mastra/pull/10654))
|
|
8
|
+
|
|
9
|
+
Added support for the `maxSize` option in HTML chunking strategies (`headers` and `sections`), allowing users to control the maximum chunk size when chunking HTML documents. Previously, HTML chunks could be excessively large when sections contained substantial content.
|
|
10
|
+
|
|
11
|
+
**Changes:**
|
|
12
|
+
- Added `maxSize` support to `headers` strategy - applies `RecursiveCharacterTransformer` after header-based splitting
|
|
13
|
+
- Added `maxSize` support to `sections` strategy - applies `RecursiveCharacterTransformer` after section-based splitting
|
|
14
|
+
- Fixed a content extraction bug in `splitHtmlByHeaders` - replaced the broken `nextElementSibling` traversal with iteration over `parentNode.childNodes`
|
|
15
|
+
- Added comprehensive test coverage, including an integration test with a real arXiv paper
|
|
16
|
+
|
|
17
|
+
**Usage:**
|
|
18
|
+
|
|
19
|
+
```typescript
|
|
20
|
+
import { MDocument } from '@mastra/rag';
|
|
21
|
+
|
|
22
|
+
const doc = MDocument.fromHTML(htmlContent);
|
|
23
|
+
|
|
24
|
+
const chunks = await doc.chunk({
|
|
25
|
+
strategy: 'html',
|
|
26
|
+
headers: [
|
|
27
|
+
['h1', 'Header 1'],
|
|
28
|
+
['h2', 'Header 2'],
|
|
29
|
+
['h3', 'Header 3'],
|
|
30
|
+
],
|
|
31
|
+
maxSize: 512, // Control chunk size
|
|
32
|
+
overlap: 50, // Optional overlap for context
|
|
33
|
+
});
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
**Results from the real arXiv paper test:**
|
|
37
|
+
- Without maxSize: 22 chunks, max 45,531 chars (too big!)
|
|
38
|
+
- With maxSize=512: 499 chunks, max 512 chars (properly sized)
|
|
39
|
+
|
|
40
|
+
Fixes #7942
|
|
41
|
+
|
|
42
|
+
- Updated dependencies [[`ac0d2f4`](https://github.com/mastra-ai/mastra/commit/ac0d2f4ff8831f72c1c66c2be809706d17f65789), [`1a0d3fc`](https://github.com/mastra-ai/mastra/commit/1a0d3fc811482c9c376cdf79ee615c23bae9b2d6), [`85a628b`](https://github.com/mastra-ai/mastra/commit/85a628b1224a8f64cd82ea7f033774bf22df7a7e), [`c237233`](https://github.com/mastra-ai/mastra/commit/c23723399ccedf7f5744b3f40997b79246bfbe64), [`15f9e21`](https://github.com/mastra-ai/mastra/commit/15f9e216177201ea6e3f6d0bfb063fcc0953444f), [`ff94dea`](https://github.com/mastra-ai/mastra/commit/ff94dea935f4e34545c63bcb6c29804732698809), [`5b2ff46`](https://github.com/mastra-ai/mastra/commit/5b2ff4651df70c146523a7fca773f8eb0a2272f8), [`db41688`](https://github.com/mastra-ai/mastra/commit/db4168806d007417e2e60b4f68656dca4e5f40c9), [`5ca599d`](https://github.com/mastra-ai/mastra/commit/5ca599d0bb59a1595f19f58473fcd67cc71cef58), [`bff1145`](https://github.com/mastra-ai/mastra/commit/bff114556b3cbadad9b2768488708f8ad0e91475), [`5c8ca24`](https://github.com/mastra-ai/mastra/commit/5c8ca247094e0cc2cdbd7137822fb47241f86e77), [`e191844`](https://github.com/mastra-ai/mastra/commit/e1918444ca3f80e82feef1dad506cd4ec6e2875f), [`22553f1`](https://github.com/mastra-ai/mastra/commit/22553f11c63ee5e966a9c034a349822249584691), [`7237163`](https://github.com/mastra-ai/mastra/commit/72371635dbf96a87df4b073cc48fc655afbdce3d), [`2500740`](https://github.com/mastra-ai/mastra/commit/2500740ea23da067d6e50ec71c625ab3ce275e64), [`873ecbb`](https://github.com/mastra-ai/mastra/commit/873ecbb517586aa17d2f1e99283755b3ebb2863f), [`4f9bbe5`](https://github.com/mastra-ai/mastra/commit/4f9bbe5968f42c86f4930b8193de3c3c17e5bd36), [`02e51fe`](https://github.com/mastra-ai/mastra/commit/02e51feddb3d4155cfbcc42624fd0d0970d032c0), [`8f3fa3a`](https://github.com/mastra-ai/mastra/commit/8f3fa3a652bb77da092f913ec51ae46e3a7e27dc), [`cd29ad2`](https://github.com/mastra-ai/mastra/commit/cd29ad23a255534e8191f249593849ed29160886), 
[`bdf4d8c`](https://github.com/mastra-ai/mastra/commit/bdf4d8cdc656d8a2c21d81834bfa3bfa70f56c16), [`854e3da`](https://github.com/mastra-ai/mastra/commit/854e3dad5daac17a91a20986399d3a51f54bf68b), [`ce18d38`](https://github.com/mastra-ai/mastra/commit/ce18d38678c65870350d123955014a8432075fd9), [`cccf9c8`](https://github.com/mastra-ai/mastra/commit/cccf9c8b2d2dfc1a5e63919395b83d78c89682a0), [`61a5705`](https://github.com/mastra-ai/mastra/commit/61a570551278b6743e64243b3ce7d73de915ca8a), [`db70a48`](https://github.com/mastra-ai/mastra/commit/db70a48aeeeeb8e5f92007e8ede52c364ce15287), [`f0fdc14`](https://github.com/mastra-ai/mastra/commit/f0fdc14ee233d619266b3d2bbdeea7d25cfc6d13), [`db18bc9`](https://github.com/mastra-ai/mastra/commit/db18bc9c3825e2c1a0ad9a183cc9935f6691bfa1), [`9b37b56`](https://github.com/mastra-ai/mastra/commit/9b37b565e1f2a76c24f728945cc740c2b09be9da), [`41a23c3`](https://github.com/mastra-ai/mastra/commit/41a23c32f9877d71810f37e24930515df2ff7a0f), [`5d171ad`](https://github.com/mastra-ai/mastra/commit/5d171ad9ef340387276b77c2bb3e83e83332d729), [`f03ae60`](https://github.com/mastra-ai/mastra/commit/f03ae60500fe350c9d828621006cdafe1975fdd8), [`d1e74a0`](https://github.com/mastra-ai/mastra/commit/d1e74a0a293866dece31022047f5dbab65a304d0), [`39e7869`](https://github.com/mastra-ai/mastra/commit/39e7869bc7d0ee391077ce291474d8a84eedccff), [`5761926`](https://github.com/mastra-ai/mastra/commit/57619260c4a2cdd598763abbacd90de594c6bc76), [`c900fdd`](https://github.com/mastra-ai/mastra/commit/c900fdd504c41348efdffb205cfe80d48c38fa33), [`604a79f`](https://github.com/mastra-ai/mastra/commit/604a79fecf276e26a54a3fe01bb94e65315d2e0e), [`887f0b4`](https://github.com/mastra-ai/mastra/commit/887f0b4746cdbd7cb7d6b17ac9f82aeb58037ea5), [`2562143`](https://github.com/mastra-ai/mastra/commit/256214336b4faa78646c9c1776612393790d8784), [`ef11a61`](https://github.com/mastra-ai/mastra/commit/ef11a61920fa0ed08a5b7ceedd192875af119749)]:
|
|
43
|
+
- @mastra/core@1.0.0-beta.6
|
|
44
|
+
|
|
3
45
|
## 2.0.0-beta.2
|
|
4
46
|
|
|
5
47
|
### Patch Changes
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"document.d.ts","sourceRoot":"","sources":["../../src/document/document.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,QAAQ,IAAI,KAAK,EAAgC,MAAM,UAAU,CAAC;AAU3E,OAAO,KAAK,EACV,WAAW,EAEX,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EACrB,qBAAqB,EACrB,iBAAiB,EACjB,oBAAoB,EACpB,4BAA4B,EAC5B,gBAAgB,EAChB,iBAAiB,EACjB,oBAAoB,EAErB,MAAM,SAAS,CAAC;AAGjB,qBAAa,SAAS;IACpB,OAAO,CAAC,MAAM,CAAU;IACxB,OAAO,CAAC,IAAI,CAAS;gBAET,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE;QAAE,IAAI,EAAE;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;SAAE,EAAE,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE;IAOhG,eAAe,CAAC,EAAE,KAAK,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,EAAE,aAAa,GAAG,OAAO,CAAC,SAAS,CAAC;IAmDjG,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,YAAY,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYhF,MAAM,CAAC,QAAQ,CAAC,UAAU,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAY9E,OAAO,CAAC,eAAe;IAevB,OAAO,CAAC,YAAY,CAAC,CAA4E;IAEjG,OAAO,KAAK,WAAW,GAetB;YAEa,OAAO;IASf,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAa9D,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAU9D,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;
|
|
1
|
+
{"version":3,"file":"document.d.ts","sourceRoot":"","sources":["../../src/document/document.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,QAAQ,IAAI,KAAK,EAAgC,MAAM,UAAU,CAAC;AAU3E,OAAO,KAAK,EACV,WAAW,EAEX,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EACrB,qBAAqB,EACrB,iBAAiB,EACjB,oBAAoB,EACpB,4BAA4B,EAC5B,gBAAgB,EAChB,iBAAiB,EACjB,oBAAoB,EAErB,MAAM,SAAS,CAAC;AAGjB,qBAAa,SAAS;IACpB,OAAO,CAAC,MAAM,CAAU;IACxB,OAAO,CAAC,IAAI,CAAS;gBAET,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE;QAAE,IAAI,EAAE;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;SAAE,EAAE,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE;IAOhG,eAAe,CAAC,EAAE,KAAK,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,EAAE,aAAa,GAAG,OAAO,CAAC,SAAS,CAAC;IAmDjG,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,YAAY,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYhF,MAAM,CAAC,QAAQ,CAAC,UAAU,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAY9E,OAAO,CAAC,eAAe;IAevB,OAAO,CAAC,YAAY,CAAC,CAA4E;IAEjG,OAAO,KAAK,WAAW,GAetB;YAEa,OAAO;IASf,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAa9D,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAU9D,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IA8CpD,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IAmBpD,UAAU,CAAC,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,IAAI,CAAC;IAMtD,UAAU,CAAC,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,IAAI,CAAC;IAUtD,aAAa,CAAC,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAa5D,aAAa,CAAC,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAuB5D,qBAAqB,CAAC,OAAO,CAAC,EAAE,4BAA4B,GAAG,OAAO,CAAC,IAAI,CAAC;IAU5E,KAAK,CAAC,MAAM,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC;IAiBnD,OAAO,IAAI,KAAK,EAAE;IAIlB,OAAO,IAAI,MAAM,EAAE;IAInB,WAAW,IAAI,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE;CAGrC"}
|
|
@@ -23,6 +23,7 @@ export declare class HTMLSectionTransformer {
|
|
|
23
23
|
});
|
|
24
24
|
splitText(text: string): Document[];
|
|
25
25
|
private getXPath;
|
|
26
|
+
private getTextContent;
|
|
26
27
|
private splitHtmlByHeaders;
|
|
27
28
|
splitDocuments(documents: Document[]): Promise<Document[]>;
|
|
28
29
|
createDocuments(texts: string[], metadatas?: Record<string, any>[]): Document[];
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html.d.ts","sourceRoot":"","sources":["../../../src/document/transformers/html.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AACrC,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAWjD,qBAAa,qBAAqB;IAChC,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,iBAAiB,CAAU;gBAEvB,OAAO,EAAE,gBAAgB,GAAG;QAAE,OAAO,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAA;KAAE;IAKvE,SAAS,CAAC,EAAE,IAAI,EAAE,EAAE;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,GAAG,QAAQ,EAAE;IAwDjD,OAAO,CAAC,QAAQ;IA2BhB,OAAO,CAAC,cAAc;IAuBtB,OAAO,CAAC,yBAAyB;IAyBjC,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,GAAG,QAAQ,EAAE;IA8B/E,kBAAkB,CAAC,SAAS,EAAE,QAAQ,EAAE,GAAG,QAAQ,EAAE;CAWtD;AAED,qBAAa,sBAAsB;IACjC,OAAO,CAAC,gBAAgB,CAAyB;IACjD,OAAO,CAAC,YAAY,CAAgC;gBAExC,OAAO,EAAE,gBAAgB,GAAG;QAAE,QAAQ,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAA;KAAE;IAKxE,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,QAAQ,EAAE;IAenC,OAAO,CAAC,QAAQ;IAwBhB,OAAO,CAAC,kBAAkB;
|
|
1
|
+
{"version":3,"file":"html.d.ts","sourceRoot":"","sources":["../../../src/document/transformers/html.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AACrC,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAWjD,qBAAa,qBAAqB;IAChC,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,iBAAiB,CAAU;gBAEvB,OAAO,EAAE,gBAAgB,GAAG;QAAE,OAAO,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAA;KAAE;IAKvE,SAAS,CAAC,EAAE,IAAI,EAAE,EAAE;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,GAAG,QAAQ,EAAE;IAwDjD,OAAO,CAAC,QAAQ;IA2BhB,OAAO,CAAC,cAAc;IAuBtB,OAAO,CAAC,yBAAyB;IAyBjC,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,GAAG,QAAQ,EAAE;IA8B/E,kBAAkB,CAAC,SAAS,EAAE,QAAQ,EAAE,GAAG,QAAQ,EAAE;CAWtD;AAED,qBAAa,sBAAsB;IACjC,OAAO,CAAC,gBAAgB,CAAyB;IACjD,OAAO,CAAC,YAAY,CAAgC;gBAExC,OAAO,EAAE,gBAAgB,GAAG;QAAE,QAAQ,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAA;KAAE;IAKxE,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,QAAQ,EAAE;IAenC,OAAO,CAAC,QAAQ;IAwBhB,OAAO,CAAC,cAAc;IAuBtB,OAAO,CAAC,kBAAkB;IA6DpB,cAAc,CAAC,SAAS,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;IAahE,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,GAAG,QAAQ,EAAE;IA+B/E,kBAAkB,CAAC,SAAS,EAAE,QAAQ,EAAE,GAAG,QAAQ,EAAE;CAWtD"}
|
package/dist/index.cjs
CHANGED
|
@@ -272,7 +272,7 @@ Provide keywords in the following comma-separated format: 'KEYWORDS: <keywords>'
|
|
|
272
272
|
var defaultQuestionExtractPrompt = new PromptTemplate({
|
|
273
273
|
templateVars: ["numQuestions", "context"],
|
|
274
274
|
template: `(
|
|
275
|
-
"Given the contextual
|
|
275
|
+
"Given the contextual information below, generate {numQuestions} questions this context can provide specific answers to which are unlikely to be found elsewhere. Higher-level summaries of surrounding context may be provided as well. "
|
|
276
276
|
"Try using these summaries to generate better questions that this context can answer."
|
|
277
277
|
"---------------------"
|
|
278
278
|
"{context}"
|
|
@@ -4999,23 +4999,47 @@ var HTMLSectionTransformer = class {
|
|
|
4999
4999
|
}
|
|
5000
5000
|
return "/" + parts.join("/");
|
|
5001
5001
|
}
|
|
5002
|
+
getTextContent(element) {
|
|
5003
|
+
if (!element) return "";
|
|
5004
|
+
if (!element.tagName) {
|
|
5005
|
+
return element.text || "";
|
|
5006
|
+
}
|
|
5007
|
+
let content = element.text || "";
|
|
5008
|
+
if (element.childNodes) {
|
|
5009
|
+
for (const child of element.childNodes) {
|
|
5010
|
+
const childText = this.getTextContent(child);
|
|
5011
|
+
if (childText) {
|
|
5012
|
+
content += " " + childText;
|
|
5013
|
+
}
|
|
5014
|
+
}
|
|
5015
|
+
}
|
|
5016
|
+
return content.trim();
|
|
5017
|
+
}
|
|
5002
5018
|
splitHtmlByHeaders(htmlDoc) {
|
|
5003
5019
|
const sections = [];
|
|
5004
5020
|
const root = nodeHtmlBetterParser.parse(htmlDoc);
|
|
5005
5021
|
const headers = Object.keys(this.headersToSplitOn);
|
|
5006
5022
|
const headerElements = root.querySelectorAll(headers.join(","));
|
|
5007
|
-
headerElements.forEach((headerElement
|
|
5023
|
+
headerElements.forEach((headerElement) => {
|
|
5008
5024
|
const header = headerElement.text?.trim() || "";
|
|
5009
5025
|
const tagName = headerElement.tagName;
|
|
5010
5026
|
const xpath = this.getXPath(headerElement);
|
|
5011
5027
|
let content = "";
|
|
5012
|
-
|
|
5013
|
-
|
|
5014
|
-
|
|
5015
|
-
|
|
5016
|
-
|
|
5028
|
+
const parentNode = headerElement.parentNode;
|
|
5029
|
+
if (parentNode && parentNode.childNodes) {
|
|
5030
|
+
let foundHeader = false;
|
|
5031
|
+
for (const node of parentNode.childNodes) {
|
|
5032
|
+
if (node === headerElement) {
|
|
5033
|
+
foundHeader = true;
|
|
5034
|
+
continue;
|
|
5035
|
+
}
|
|
5036
|
+
if (foundHeader && node.tagName && headers.includes(node.tagName.toLowerCase())) {
|
|
5037
|
+
break;
|
|
5038
|
+
}
|
|
5039
|
+
if (foundHeader) {
|
|
5040
|
+
content += this.getTextContent(node) + " ";
|
|
5041
|
+
}
|
|
5017
5042
|
}
|
|
5018
|
-
currentElement = currentElement.nextElementSibling;
|
|
5019
5043
|
}
|
|
5020
5044
|
content = content.trim();
|
|
5021
5045
|
sections.push({
|
|
@@ -6392,13 +6416,33 @@ var MDocument = class _MDocument {
|
|
|
6392
6416
|
async chunkHTML(options) {
|
|
6393
6417
|
if (options?.headers?.length) {
|
|
6394
6418
|
const rt = new HTMLHeaderTransformer(options);
|
|
6395
|
-
|
|
6419
|
+
let textSplit = rt.transformDocuments(this.chunks);
|
|
6420
|
+
if (options?.maxSize) {
|
|
6421
|
+
const textSplitter = new RecursiveCharacterTransformer({
|
|
6422
|
+
maxSize: options.maxSize,
|
|
6423
|
+
overlap: options.overlap,
|
|
6424
|
+
keepSeparator: options.keepSeparator,
|
|
6425
|
+
addStartIndex: options.addStartIndex,
|
|
6426
|
+
stripWhitespace: options.stripWhitespace
|
|
6427
|
+
});
|
|
6428
|
+
textSplit = textSplitter.splitDocuments(textSplit);
|
|
6429
|
+
}
|
|
6396
6430
|
this.chunks = textSplit;
|
|
6397
6431
|
return;
|
|
6398
6432
|
}
|
|
6399
6433
|
if (options?.sections?.length) {
|
|
6400
6434
|
const rt = new HTMLSectionTransformer(options);
|
|
6401
|
-
|
|
6435
|
+
let textSplit = rt.transformDocuments(this.chunks);
|
|
6436
|
+
if (options?.maxSize) {
|
|
6437
|
+
const textSplitter = new RecursiveCharacterTransformer({
|
|
6438
|
+
maxSize: options.maxSize,
|
|
6439
|
+
overlap: options.overlap,
|
|
6440
|
+
keepSeparator: options.keepSeparator,
|
|
6441
|
+
addStartIndex: options.addStartIndex,
|
|
6442
|
+
stripWhitespace: options.stripWhitespace
|
|
6443
|
+
});
|
|
6444
|
+
textSplit = textSplitter.splitDocuments(textSplit);
|
|
6445
|
+
}
|
|
6402
6446
|
this.chunks = textSplit;
|
|
6403
6447
|
return;
|
|
6404
6448
|
}
|