@mastra/rag 0.1.0-alpha.83 → 0.1.0-alpha.85
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +13 -0
- package/dist/index.js +97 -55
- package/package.json +3 -4
- package/src/document/document.test.ts +171 -0
- package/src/document/transformers/html.ts +137 -76
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,18 @@
|
|
|
1
1
|
# @mastra/rag
|
|
2
2
|
|
|
3
|
+
## 0.1.0-alpha.85
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- b27bdb8: Swap jsdom for node-html-better-parser in rag
|
|
8
|
+
|
|
9
|
+
## 0.1.0-alpha.84
|
|
10
|
+
|
|
11
|
+
### Patch Changes
|
|
12
|
+
|
|
13
|
+
- Updated dependencies [4d4f6b6]
|
|
14
|
+
- @mastra/core@0.2.0-alpha.92
|
|
15
|
+
|
|
3
16
|
## 0.1.0-alpha.83
|
|
4
17
|
|
|
5
18
|
### Patch Changes
|
package/dist/index.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { Document, SummaryExtractor, QuestionsAnsweredExtractor, KeywordExtractor, TitleExtractor, IngestionPipeline } from 'llamaindex';
|
|
2
|
-
import { JSDOM } from 'jsdom';
|
|
2
|
+
import { parse } from 'node-html-better-parser';
|
|
3
3
|
import { encodingForModel, getEncoding } from 'js-tiktoken';
|
|
4
4
|
import { embed as embed$1, embedMany as embedMany$1 } from '@mastra/core/embeddings';
|
|
5
5
|
import { MastraAgentRelevanceScorer, CohereRelevanceScorer } from '@mastra/core/relevance';
|
|
@@ -353,50 +353,81 @@ var HTMLHeaderTransformer = class {
|
|
|
353
353
|
this.headersToSplitOn = [...headersToSplitOn].sort();
|
|
354
354
|
}
|
|
355
355
|
splitText({ text }) {
|
|
356
|
-
const dom = new JSDOM(text);
|
|
357
|
-
const { document } = dom.window;
|
|
356
|
+
const root = parse(text);
|
|
358
357
|
const headerFilter = this.headersToSplitOn.map(([header]) => header);
|
|
359
358
|
const headerMapping = Object.fromEntries(this.headersToSplitOn);
|
|
360
359
|
const elements = [];
|
|
361
|
-
const headers =
|
|
360
|
+
const headers = root.querySelectorAll(headerFilter.join(","));
|
|
362
361
|
headers.forEach((header) => {
|
|
363
362
|
let content = "";
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
363
|
+
const parentNode = header.parentNode;
|
|
364
|
+
if (parentNode && parentNode.childNodes) {
|
|
365
|
+
let foundHeader = false;
|
|
366
|
+
for (const node of parentNode.childNodes) {
|
|
367
|
+
if (node === header) {
|
|
368
|
+
foundHeader = true;
|
|
369
|
+
continue;
|
|
370
|
+
}
|
|
371
|
+
if (foundHeader && node.tagName && headerFilter.includes(node.tagName.toLowerCase())) {
|
|
372
|
+
break;
|
|
373
|
+
}
|
|
374
|
+
if (foundHeader) {
|
|
375
|
+
content += this.getTextContent(node) + " ";
|
|
376
|
+
}
|
|
377
|
+
}
|
|
368
378
|
}
|
|
369
379
|
elements.push({
|
|
370
380
|
url: text,
|
|
371
381
|
xpath: this.getXPath(header),
|
|
372
382
|
content: content.trim(),
|
|
373
383
|
metadata: {
|
|
374
|
-
[headerMapping?.[header.tagName.toLowerCase()]]: header.
|
|
384
|
+
[headerMapping?.[header.tagName.toLowerCase()]]: header.text || ""
|
|
375
385
|
}
|
|
376
386
|
});
|
|
377
387
|
});
|
|
378
388
|
return this.returnEachElement ? elements.map(
|
|
379
389
|
(el) => new Document({
|
|
380
390
|
text: el.content,
|
|
381
|
-
metadata: el.metadata
|
|
391
|
+
metadata: { ...el.metadata, xpath: el.xpath }
|
|
382
392
|
})
|
|
383
393
|
) : this.aggregateElementsToChunks(elements);
|
|
384
394
|
}
|
|
385
395
|
getXPath(element) {
|
|
396
|
+
if (!element) return "";
|
|
386
397
|
const parts = [];
|
|
387
398
|
let current = element;
|
|
388
|
-
while (current && current.
|
|
399
|
+
while (current && current.tagName) {
|
|
389
400
|
let index = 1;
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
401
|
+
const parent = current.parentNode;
|
|
402
|
+
if (parent && parent.childNodes) {
|
|
403
|
+
for (const sibling of parent.childNodes) {
|
|
404
|
+
if (sibling === current) break;
|
|
405
|
+
if (sibling.tagName === current.tagName) {
|
|
406
|
+
index++;
|
|
407
|
+
}
|
|
393
408
|
}
|
|
394
409
|
}
|
|
395
410
|
parts.unshift(`${current.tagName.toLowerCase()}[${index}]`);
|
|
396
|
-
current = current.
|
|
411
|
+
current = current.parentNode;
|
|
397
412
|
}
|
|
398
413
|
return "/" + parts.join("/");
|
|
399
414
|
}
|
|
415
|
+
getTextContent(element) {
|
|
416
|
+
if (!element) return "";
|
|
417
|
+
if (!element.tagName) {
|
|
418
|
+
return element.text || "";
|
|
419
|
+
}
|
|
420
|
+
let content = element.text || "";
|
|
421
|
+
if (element.childNodes) {
|
|
422
|
+
for (const child of element.childNodes) {
|
|
423
|
+
const childText = this.getTextContent(child);
|
|
424
|
+
if (childText) {
|
|
425
|
+
content += " " + childText;
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
}
|
|
429
|
+
return content.trim();
|
|
430
|
+
}
|
|
400
431
|
aggregateElementsToChunks(elements) {
|
|
401
432
|
const aggregatedChunks = [];
|
|
402
433
|
for (const element of elements) {
|
|
@@ -409,7 +440,7 @@ var HTMLHeaderTransformer = class {
|
|
|
409
440
|
return aggregatedChunks.map(
|
|
410
441
|
(chunk) => new Document({
|
|
411
442
|
text: chunk.content,
|
|
412
|
-
metadata: chunk.metadata
|
|
443
|
+
metadata: { ...chunk.metadata, xpath: chunk.xpath }
|
|
413
444
|
})
|
|
414
445
|
);
|
|
415
446
|
}
|
|
@@ -450,7 +481,7 @@ var HTMLHeaderTransformer = class {
|
|
|
450
481
|
};
|
|
451
482
|
var HTMLSectionTransformer = class {
|
|
452
483
|
constructor(headersToSplitOn, options = {}) {
|
|
453
|
-
this.headersToSplitOn = Object.fromEntries(headersToSplitOn);
|
|
484
|
+
this.headersToSplitOn = Object.fromEntries(headersToSplitOn.map(([tag, name]) => [tag.toLowerCase(), name]));
|
|
454
485
|
this.options = options;
|
|
455
486
|
}
|
|
456
487
|
splitText(text) {
|
|
@@ -459,11 +490,59 @@ var HTMLSectionTransformer = class {
|
|
|
459
490
|
(section) => new Document({
|
|
460
491
|
text: section.content,
|
|
461
492
|
metadata: {
|
|
462
|
-
[this.headersToSplitOn[section.tagName]]: section.header
|
|
493
|
+
[this.headersToSplitOn[section.tagName.toLowerCase()]]: section.header,
|
|
494
|
+
xpath: section.xpath
|
|
463
495
|
}
|
|
464
496
|
})
|
|
465
497
|
);
|
|
466
498
|
}
|
|
499
|
+
getXPath(element) {
|
|
500
|
+
const parts = [];
|
|
501
|
+
let current = element;
|
|
502
|
+
while (current && current.nodeType === 1) {
|
|
503
|
+
let index = 1;
|
|
504
|
+
let sibling = current.previousSibling;
|
|
505
|
+
while (sibling) {
|
|
506
|
+
if (sibling.nodeType === 1 && sibling.tagName === current.tagName) {
|
|
507
|
+
index++;
|
|
508
|
+
}
|
|
509
|
+
sibling = sibling.previousSibling;
|
|
510
|
+
}
|
|
511
|
+
if (current.tagName) {
|
|
512
|
+
parts.unshift(`${current.tagName.toLowerCase()}[${index}]`);
|
|
513
|
+
}
|
|
514
|
+
current = current.parentNode;
|
|
515
|
+
}
|
|
516
|
+
return "/" + parts.join("/");
|
|
517
|
+
}
|
|
518
|
+
splitHtmlByHeaders(htmlDoc) {
|
|
519
|
+
const sections = [];
|
|
520
|
+
const root = parse(htmlDoc);
|
|
521
|
+
const headers = Object.keys(this.headersToSplitOn);
|
|
522
|
+
const headerElements = root.querySelectorAll(headers.join(","));
|
|
523
|
+
headerElements.forEach((headerElement, index) => {
|
|
524
|
+
const header = headerElement.text?.trim() || "";
|
|
525
|
+
const tagName = headerElement.tagName;
|
|
526
|
+
const xpath = this.getXPath(headerElement);
|
|
527
|
+
let content = "";
|
|
528
|
+
let currentElement = headerElement.nextElementSibling;
|
|
529
|
+
const nextHeader = headerElements[index + 1];
|
|
530
|
+
while (currentElement && (!nextHeader || currentElement !== nextHeader)) {
|
|
531
|
+
if (currentElement.text) {
|
|
532
|
+
content += currentElement.text.trim() + " ";
|
|
533
|
+
}
|
|
534
|
+
currentElement = currentElement.nextElementSibling;
|
|
535
|
+
}
|
|
536
|
+
content = content.trim();
|
|
537
|
+
sections.push({
|
|
538
|
+
header,
|
|
539
|
+
content,
|
|
540
|
+
tagName,
|
|
541
|
+
xpath
|
|
542
|
+
});
|
|
543
|
+
});
|
|
544
|
+
return sections;
|
|
545
|
+
}
|
|
467
546
|
async splitDocuments(documents) {
|
|
468
547
|
const texts = [];
|
|
469
548
|
const metadatas = [];
|
|
@@ -500,43 +579,6 @@ var HTMLSectionTransformer = class {
|
|
|
500
579
|
}
|
|
501
580
|
return documents;
|
|
502
581
|
}
|
|
503
|
-
splitHtmlByHeaders(htmlDoc) {
|
|
504
|
-
const sections = [];
|
|
505
|
-
const dom = new JSDOM(htmlDoc);
|
|
506
|
-
const { document } = dom.window;
|
|
507
|
-
const headers = ["body", ...Object.keys(this.headersToSplitOn)];
|
|
508
|
-
const headerElements = Array.from(document.querySelectorAll(headers.join(",")));
|
|
509
|
-
for (let i = 0; i < headerElements.length; i++) {
|
|
510
|
-
const headerElement = headerElements[i];
|
|
511
|
-
let currentHeader;
|
|
512
|
-
let currentHeaderTag;
|
|
513
|
-
let sectionContent = [];
|
|
514
|
-
if (i === 0) {
|
|
515
|
-
currentHeader = "#TITLE#";
|
|
516
|
-
currentHeaderTag = "h1";
|
|
517
|
-
} else {
|
|
518
|
-
currentHeader = headerElement.textContent?.trim() || "";
|
|
519
|
-
currentHeaderTag = headerElement.tagName.toLowerCase();
|
|
520
|
-
}
|
|
521
|
-
let currentNode = headerElement.nextSibling;
|
|
522
|
-
const nextHeader = headerElements[i + 1];
|
|
523
|
-
while (currentNode && currentNode !== nextHeader) {
|
|
524
|
-
if (currentNode.textContent) {
|
|
525
|
-
sectionContent.push(currentNode.textContent);
|
|
526
|
-
}
|
|
527
|
-
currentNode = currentNode.nextSibling;
|
|
528
|
-
}
|
|
529
|
-
const content = sectionContent.join(" ").trim();
|
|
530
|
-
if (content) {
|
|
531
|
-
sections.push({
|
|
532
|
-
header: currentHeader,
|
|
533
|
-
content,
|
|
534
|
-
tagName: currentHeaderTag
|
|
535
|
-
});
|
|
536
|
-
}
|
|
537
|
-
}
|
|
538
|
-
return sections;
|
|
539
|
-
}
|
|
540
582
|
transformDocuments(documents) {
|
|
541
583
|
const texts = [];
|
|
542
584
|
const metadatas = [];
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/rag",
|
|
3
|
-
"version": "0.1.0-alpha.83",
|
|
3
|
+
"version": "0.1.0-alpha.85",
|
|
4
4
|
"description": "",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -21,17 +21,16 @@
|
|
|
21
21
|
"@llamaindex/env": "^0.1.20",
|
|
22
22
|
"@paralleldrive/cuid2": "^2.2.2",
|
|
23
23
|
"js-tiktoken": "^1.0.15",
|
|
24
|
-
"jsdom": "^25.0.1",
|
|
25
24
|
"llamaindex": "^0.8.15",
|
|
25
|
+
"node-html-better-parser": "^1.4.7",
|
|
26
26
|
"pathe": "^2.0.2",
|
|
27
27
|
"zod": "^3.24.1",
|
|
28
|
-
"@mastra/core": "^0.2.0-alpha.
|
|
28
|
+
"@mastra/core": "^0.2.0-alpha.92"
|
|
29
29
|
},
|
|
30
30
|
"devDependencies": {
|
|
31
31
|
"@babel/preset-env": "^7.26.0",
|
|
32
32
|
"@babel/preset-typescript": "^7.26.0",
|
|
33
33
|
"@tsconfig/recommended": "^1.0.7",
|
|
34
|
-
"@types/jsdom": "^21.1.7",
|
|
35
34
|
"@types/node": "^22.9.0",
|
|
36
35
|
"tsup": "^8.0.1",
|
|
37
36
|
"vitest": "^3.0.4"
|
|
package/src/document/document.test.ts
CHANGED
|
@@ -461,6 +461,177 @@ describe('MDocument', () => {
|
|
|
461
461
|
expect(doc?.metadata).toHaveProperty('Header 1');
|
|
462
462
|
});
|
|
463
463
|
});
|
|
464
|
+
|
|
465
|
+
it('should handle empty or invalid HTML', async () => {
|
|
466
|
+
const emptyHtml = '';
|
|
467
|
+
const invalidHtml = '<unclosed>test';
|
|
468
|
+
const noHeadersHtml = '<div>test</div>';
|
|
469
|
+
|
|
470
|
+
const doc1 = MDocument.fromHTML(emptyHtml, { meta: 'data' });
|
|
471
|
+
const doc2 = MDocument.fromHTML(invalidHtml, { meta: 'data' });
|
|
472
|
+
const doc3 = MDocument.fromHTML(noHeadersHtml, { meta: 'data' });
|
|
473
|
+
|
|
474
|
+
await doc1.chunk({
|
|
475
|
+
strategy: 'html',
|
|
476
|
+
headers: [
|
|
477
|
+
['h1', 'Header 1'],
|
|
478
|
+
['h2', 'Header 2'],
|
|
479
|
+
],
|
|
480
|
+
});
|
|
481
|
+
|
|
482
|
+
await doc2.chunk({
|
|
483
|
+
strategy: 'html',
|
|
484
|
+
headers: [
|
|
485
|
+
['h1', 'Header 1'],
|
|
486
|
+
['h2', 'Header 2'],
|
|
487
|
+
],
|
|
488
|
+
});
|
|
489
|
+
|
|
490
|
+
await doc3.chunk({
|
|
491
|
+
strategy: 'html',
|
|
492
|
+
headers: [
|
|
493
|
+
['h1', 'Header 1'],
|
|
494
|
+
['h2', 'Header 2'],
|
|
495
|
+
],
|
|
496
|
+
});
|
|
497
|
+
|
|
498
|
+
expect(doc1.getDocs()).toHaveLength(0);
|
|
499
|
+
expect(doc2.getDocs()).toHaveLength(0);
|
|
500
|
+
expect(doc3.getDocs()).toHaveLength(0);
|
|
501
|
+
});
|
|
502
|
+
|
|
503
|
+
it('should handle complex nested header hierarchies', async () => {
|
|
504
|
+
const html = `
|
|
505
|
+
<html>
|
|
506
|
+
<body>
|
|
507
|
+
<h1>Main Title</h1>
|
|
508
|
+
<p>Main content</p>
|
|
509
|
+
<h2>Section 1</h2>
|
|
510
|
+
<p>Section 1 content</p>
|
|
511
|
+
<h3>Subsection 1.1</h3>
|
|
512
|
+
<p>Subsection 1.1 content</p>
|
|
513
|
+
<h2>Section 2</h2>
|
|
514
|
+
<h3>Subsection 2.1</h3>
|
|
515
|
+
<p>Subsection 2.1 content</p>
|
|
516
|
+
</body>
|
|
517
|
+
</html>
|
|
518
|
+
`;
|
|
519
|
+
|
|
520
|
+
const doc = MDocument.fromHTML(html, { meta: 'data' });
|
|
521
|
+
await doc.chunk({
|
|
522
|
+
strategy: 'html',
|
|
523
|
+
headers: [
|
|
524
|
+
['h1', 'Header 1'],
|
|
525
|
+
['h2', 'Header 2'],
|
|
526
|
+
['h3', 'Header 3'],
|
|
527
|
+
],
|
|
528
|
+
});
|
|
529
|
+
|
|
530
|
+
const docs = doc.getDocs();
|
|
531
|
+
expect(docs.length).toBeGreaterThan(3);
|
|
532
|
+
expect(docs.some(d => d.metadata?.['Header 1'] === 'Main Title')).toBe(true);
|
|
533
|
+
expect(docs.some(d => d.metadata?.['Header 2'] === 'Section 1')).toBe(true);
|
|
534
|
+
expect(docs.some(d => d.metadata?.['Header 3'] === 'Subsection 1.1')).toBe(true);
|
|
535
|
+
});
|
|
536
|
+
|
|
537
|
+
it('should handle headers with mixed content and special characters', async () => {
|
|
538
|
+
const html = `
|
|
539
|
+
<html>
|
|
540
|
+
<body>
|
|
541
|
+
<h1>Title with <strong>bold</strong> & <em>emphasis</em></h1>
|
|
542
|
+
<p>Content 1</p>
|
|
543
|
+
<h2>Section with <tags> & symbols</h2>
|
|
544
|
+
<p>Content 2</p>
|
|
545
|
+
</body>
|
|
546
|
+
</html>
|
|
547
|
+
`;
|
|
548
|
+
|
|
549
|
+
const doc = MDocument.fromHTML(html, { meta: 'data' });
|
|
550
|
+
await doc.chunk({
|
|
551
|
+
strategy: 'html',
|
|
552
|
+
headers: [
|
|
553
|
+
['h1', 'Header 1'],
|
|
554
|
+
['h2', 'Header 2'],
|
|
555
|
+
],
|
|
556
|
+
});
|
|
557
|
+
|
|
558
|
+
const docs = doc.getDocs();
|
|
559
|
+
expect(docs.length).toBeGreaterThan(1);
|
|
560
|
+
expect(docs[0]?.metadata?.['Header 1']).toContain('bold');
|
|
561
|
+
expect(docs[0]?.metadata?.['Header 1']).toContain('&');
|
|
562
|
+
expect(docs[0]?.metadata?.['Header 1']).toContain('emphasis');
|
|
563
|
+
expect(docs[1]?.metadata?.['Header 2']).toContain('<tags>');
|
|
564
|
+
});
|
|
565
|
+
|
|
566
|
+
it('should handle headers with no content or whitespace content', async () => {
|
|
567
|
+
const html = `
|
|
568
|
+
<html>
|
|
569
|
+
<body>
|
|
570
|
+
<h1>Empty Section</h1>
|
|
571
|
+
<h2>Whitespace Section</h2>
|
|
572
|
+
|
|
573
|
+
<h2>Valid Section</h2>
|
|
574
|
+
<p>Content</p>
|
|
575
|
+
</body>
|
|
576
|
+
</html>
|
|
577
|
+
`;
|
|
578
|
+
|
|
579
|
+
const doc = MDocument.fromHTML(html, { meta: 'data' });
|
|
580
|
+
await doc.chunk({
|
|
581
|
+
strategy: 'html',
|
|
582
|
+
headers: [
|
|
583
|
+
['h1', 'Header 1'],
|
|
584
|
+
['h2', 'Header 2'],
|
|
585
|
+
],
|
|
586
|
+
});
|
|
587
|
+
|
|
588
|
+
const docs = doc.getDocs();
|
|
589
|
+
expect(docs.some(d => d.metadata?.['Header 1'] === 'Empty Section')).toBe(true);
|
|
590
|
+
expect(docs.some(d => d.metadata?.['Header 2'] === 'Valid Section')).toBe(true);
|
|
591
|
+
expect(docs.find(d => d.metadata?.['Header 2'] === 'Valid Section')?.text).toContain('Content');
|
|
592
|
+
});
|
|
593
|
+
|
|
594
|
+
it('should generate correct XPaths for deeply nested elements', async () => {
|
|
595
|
+
const html = `
|
|
596
|
+
<html>
|
|
597
|
+
<body>
|
|
598
|
+
<div class="container">
|
|
599
|
+
<section id="main">
|
|
600
|
+
<div>
|
|
601
|
+
<h1>Deeply Nested Title</h1>
|
|
602
|
+
<p>Content</p>
|
|
603
|
+
</div>
|
|
604
|
+
<div>
|
|
605
|
+
<h1>Second Title</h1>
|
|
606
|
+
<p>More Content</p>
|
|
607
|
+
</div>
|
|
608
|
+
</section>
|
|
609
|
+
</div>
|
|
610
|
+
</body>
|
|
611
|
+
</html>
|
|
612
|
+
`;
|
|
613
|
+
|
|
614
|
+
const doc = MDocument.fromHTML(html, { meta: 'data' });
|
|
615
|
+
await doc.chunk({
|
|
616
|
+
strategy: 'html',
|
|
617
|
+
headers: [['h1', 'Header 1']],
|
|
618
|
+
});
|
|
619
|
+
|
|
620
|
+
const docs = doc.getDocs();
|
|
621
|
+
expect(docs).toHaveLength(2);
|
|
622
|
+
|
|
623
|
+
// First h1
|
|
624
|
+
expect(docs[0]?.metadata?.['Header 1']).toBe('Deeply Nested Title');
|
|
625
|
+
const xpath1 = docs[0]?.metadata?.xpath as string;
|
|
626
|
+
expect(xpath1).toBeDefined();
|
|
627
|
+
expect(xpath1).toMatch(/^\/html\[1\]\/body\[1\]\/div\[1\]\/section\[1\]\/div\[1\]\/h1\[1\]$/);
|
|
628
|
+
|
|
629
|
+
// Second h1
|
|
630
|
+
expect(docs[1]?.metadata?.['Header 1']).toBe('Second Title');
|
|
631
|
+
const xpath2 = docs[1]?.metadata?.xpath as string;
|
|
632
|
+
expect(xpath2).toBeDefined();
|
|
633
|
+
expect(xpath2).toMatch(/^\/html\[1\]\/body\[1\]\/div\[1\]\/section\[1\]\/div\[2\]\/h1\[1\]$/);
|
|
634
|
+
});
|
|
464
635
|
});
|
|
465
636
|
|
|
466
637
|
describe('chunkJson', () => {
|
|
package/src/document/transformers/html.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { JSDOM } from 'jsdom';
|
|
2
1
|
import { Document } from 'llamaindex';
|
|
2
|
+
import { parse } from 'node-html-better-parser';
|
|
3
3
|
|
|
4
4
|
import { RecursiveCharacterTransformer } from './character';
|
|
5
5
|
|
|
@@ -20,22 +20,37 @@ export class HTMLHeaderTransformer {
|
|
|
20
20
|
}
|
|
21
21
|
|
|
22
22
|
splitText({ text }: { text: string }): Document[] {
|
|
23
|
-
const dom = new JSDOM(text);
|
|
24
|
-
const { document } = dom.window;
|
|
23
|
+
const root = parse(text);
|
|
25
24
|
|
|
26
25
|
const headerFilter = this.headersToSplitOn.map(([header]) => header);
|
|
27
26
|
const headerMapping = Object.fromEntries(this.headersToSplitOn);
|
|
28
27
|
|
|
29
28
|
const elements: ElementType[] = [];
|
|
30
|
-
const headers =
|
|
29
|
+
const headers = root.querySelectorAll(headerFilter.join(','));
|
|
31
30
|
|
|
32
31
|
headers.forEach(header => {
|
|
33
32
|
let content = '';
|
|
34
|
-
|
|
33
|
+
const parentNode = header.parentNode;
|
|
34
|
+
|
|
35
|
+
if (parentNode && parentNode.childNodes) {
|
|
36
|
+
let foundHeader = false;
|
|
37
|
+
for (const node of parentNode.childNodes) {
|
|
38
|
+
// Start collecting content after we find our header
|
|
39
|
+
if (node === header) {
|
|
40
|
+
foundHeader = true;
|
|
41
|
+
continue;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// If we found our header and hit another header, stop
|
|
45
|
+
if (foundHeader && node.tagName && headerFilter.includes(node.tagName.toLowerCase())) {
|
|
46
|
+
break;
|
|
47
|
+
}
|
|
35
48
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
49
|
+
// Collect content between headers
|
|
50
|
+
if (foundHeader) {
|
|
51
|
+
content += this.getTextContent(node) + ' ';
|
|
52
|
+
}
|
|
53
|
+
}
|
|
39
54
|
}
|
|
40
55
|
|
|
41
56
|
elements.push({
|
|
@@ -43,7 +58,7 @@ export class HTMLHeaderTransformer {
|
|
|
43
58
|
xpath: this.getXPath(header),
|
|
44
59
|
content: content.trim(),
|
|
45
60
|
metadata: {
|
|
46
|
-
[headerMapping?.[header.tagName.toLowerCase()]!]: header.
|
|
61
|
+
[headerMapping?.[header.tagName.toLowerCase()]!]: header.text || '',
|
|
47
62
|
},
|
|
48
63
|
});
|
|
49
64
|
});
|
|
@@ -53,30 +68,62 @@ export class HTMLHeaderTransformer {
|
|
|
53
68
|
el =>
|
|
54
69
|
new Document({
|
|
55
70
|
text: el.content,
|
|
56
|
-
metadata: el.metadata,
|
|
71
|
+
metadata: { ...el.metadata, xpath: el.xpath },
|
|
57
72
|
}),
|
|
58
73
|
)
|
|
59
74
|
: this.aggregateElementsToChunks(elements);
|
|
60
75
|
}
|
|
61
76
|
|
|
62
|
-
private getXPath(element:
|
|
77
|
+
private getXPath(element: any): string {
|
|
78
|
+
if (!element) return '';
|
|
79
|
+
|
|
63
80
|
const parts: string[] = [];
|
|
64
|
-
let current
|
|
81
|
+
let current = element;
|
|
65
82
|
|
|
66
|
-
while (current && current.
|
|
83
|
+
while (current && current.tagName) {
|
|
67
84
|
let index = 1;
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
85
|
+
const parent = current.parentNode;
|
|
86
|
+
|
|
87
|
+
if (parent && parent.childNodes) {
|
|
88
|
+
// Count preceding siblings with same tag
|
|
89
|
+
for (const sibling of parent.childNodes) {
|
|
90
|
+
if (sibling === current) break;
|
|
91
|
+
if (sibling.tagName === current.tagName) {
|
|
92
|
+
index++;
|
|
93
|
+
}
|
|
71
94
|
}
|
|
72
95
|
}
|
|
96
|
+
|
|
73
97
|
parts.unshift(`${current.tagName.toLowerCase()}[${index}]`);
|
|
74
|
-
current = current.
|
|
98
|
+
current = current.parentNode;
|
|
75
99
|
}
|
|
76
100
|
|
|
77
101
|
return '/' + parts.join('/');
|
|
78
102
|
}
|
|
79
103
|
|
|
104
|
+
private getTextContent(element: any): string {
|
|
105
|
+
if (!element) return '';
|
|
106
|
+
|
|
107
|
+
// For text nodes, return their content
|
|
108
|
+
if (!element.tagName) {
|
|
109
|
+
return element.text || '';
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
// For element nodes, combine their text with children's text
|
|
113
|
+
let content = element.text || '';
|
|
114
|
+
|
|
115
|
+
if (element.childNodes) {
|
|
116
|
+
for (const child of element.childNodes) {
|
|
117
|
+
const childText = this.getTextContent(child);
|
|
118
|
+
if (childText) {
|
|
119
|
+
content += ' ' + childText;
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
return content.trim();
|
|
125
|
+
}
|
|
126
|
+
|
|
80
127
|
private aggregateElementsToChunks(elements: ElementType[]): Document[] {
|
|
81
128
|
const aggregatedChunks: ElementType[] = [];
|
|
82
129
|
|
|
@@ -97,7 +144,7 @@ export class HTMLHeaderTransformer {
|
|
|
97
144
|
chunk =>
|
|
98
145
|
new Document({
|
|
99
146
|
text: chunk.content,
|
|
100
|
-
metadata: chunk.metadata,
|
|
147
|
+
metadata: { ...chunk.metadata, xpath: chunk.xpath },
|
|
101
148
|
}),
|
|
102
149
|
);
|
|
103
150
|
}
|
|
@@ -110,7 +157,6 @@ export class HTMLHeaderTransformer {
|
|
|
110
157
|
const chunks = this.splitText({ text: texts[i]! });
|
|
111
158
|
for (const chunk of chunks) {
|
|
112
159
|
const metadata = { ...(_metadatas[i] || {}) };
|
|
113
|
-
|
|
114
160
|
const chunkMetadata = chunk.metadata;
|
|
115
161
|
|
|
116
162
|
if (chunkMetadata) {
|
|
@@ -151,7 +197,7 @@ export class HTMLSectionTransformer {
|
|
|
151
197
|
private options: Record<string, any>;
|
|
152
198
|
|
|
153
199
|
constructor(headersToSplitOn: [string, string][], options: Record<string, any> = {}) {
|
|
154
|
-
this.headersToSplitOn = Object.fromEntries(headersToSplitOn);
|
|
200
|
+
this.headersToSplitOn = Object.fromEntries(headersToSplitOn.map(([tag, name]) => [tag.toLowerCase(), name]));
|
|
155
201
|
this.options = options;
|
|
156
202
|
}
|
|
157
203
|
|
|
@@ -163,12 +209,82 @@ export class HTMLSectionTransformer {
|
|
|
163
209
|
new Document({
|
|
164
210
|
text: section.content,
|
|
165
211
|
metadata: {
|
|
166
|
-
[this.headersToSplitOn[section.tagName]!]: section.header,
|
|
212
|
+
[this.headersToSplitOn[section.tagName.toLowerCase()]!]: section.header,
|
|
213
|
+
xpath: section.xpath,
|
|
167
214
|
},
|
|
168
215
|
}),
|
|
169
216
|
);
|
|
170
217
|
}
|
|
171
218
|
|
|
219
|
+
private getXPath(element: any): string {
|
|
220
|
+
const parts: string[] = [];
|
|
221
|
+
let current = element;
|
|
222
|
+
|
|
223
|
+
while (current && current.nodeType === 1) {
|
|
224
|
+
let index = 1;
|
|
225
|
+
let sibling = current.previousSibling;
|
|
226
|
+
|
|
227
|
+
while (sibling) {
|
|
228
|
+
if (sibling.nodeType === 1 && sibling.tagName === current.tagName) {
|
|
229
|
+
index++;
|
|
230
|
+
}
|
|
231
|
+
sibling = sibling.previousSibling;
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
if (current.tagName) {
|
|
235
|
+
parts.unshift(`${current.tagName.toLowerCase()}[${index}]`);
|
|
236
|
+
}
|
|
237
|
+
current = current.parentNode;
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
return '/' + parts.join('/');
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
private splitHtmlByHeaders(htmlDoc: string): Array<{
|
|
244
|
+
header: string;
|
|
245
|
+
content: string;
|
|
246
|
+
tagName: string;
|
|
247
|
+
xpath: string;
|
|
248
|
+
}> {
|
|
249
|
+
const sections: Array<{
|
|
250
|
+
header: string;
|
|
251
|
+
content: string;
|
|
252
|
+
tagName: string;
|
|
253
|
+
xpath: string;
|
|
254
|
+
}> = [];
|
|
255
|
+
|
|
256
|
+
const root = parse(htmlDoc);
|
|
257
|
+
const headers = Object.keys(this.headersToSplitOn);
|
|
258
|
+
const headerElements = root.querySelectorAll(headers.join(','));
|
|
259
|
+
|
|
260
|
+
headerElements.forEach((headerElement, index) => {
|
|
261
|
+
const header = headerElement.text?.trim() || '';
|
|
262
|
+
const tagName = headerElement.tagName;
|
|
263
|
+
const xpath = this.getXPath(headerElement);
|
|
264
|
+
let content = '';
|
|
265
|
+
|
|
266
|
+
let currentElement = headerElement.nextElementSibling;
|
|
267
|
+
const nextHeader = headerElements[index + 1];
|
|
268
|
+
|
|
269
|
+
while (currentElement && (!nextHeader || currentElement !== nextHeader)) {
|
|
270
|
+
if (currentElement.text) {
|
|
271
|
+
content += currentElement.text.trim() + ' ';
|
|
272
|
+
}
|
|
273
|
+
currentElement = currentElement.nextElementSibling;
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
content = content.trim();
|
|
277
|
+
sections.push({
|
|
278
|
+
header,
|
|
279
|
+
content,
|
|
280
|
+
tagName,
|
|
281
|
+
xpath,
|
|
282
|
+
});
|
|
283
|
+
});
|
|
284
|
+
|
|
285
|
+
return sections;
|
|
286
|
+
}
|
|
287
|
+
|
|
172
288
|
async splitDocuments(documents: Document[]): Promise<Document[]> {
|
|
173
289
|
const texts: string[] = [];
|
|
174
290
|
const metadatas: Record<string, any>[] = [];
|
|
@@ -214,61 +330,6 @@ export class HTMLSectionTransformer {
|
|
|
214
330
|
return documents;
|
|
215
331
|
}
|
|
216
332
|
|
|
217
|
-
private splitHtmlByHeaders(htmlDoc: string): Array<{
|
|
218
|
-
header: string;
|
|
219
|
-
content: string;
|
|
220
|
-
tagName: string;
|
|
221
|
-
}> {
|
|
222
|
-
const sections: Array<{
|
|
223
|
-
header: string;
|
|
224
|
-
content: string;
|
|
225
|
-
tagName: string;
|
|
226
|
-
}> = [];
|
|
227
|
-
|
|
228
|
-
const dom = new JSDOM(htmlDoc);
|
|
229
|
-
const { document } = dom.window;
|
|
230
|
-
const headers = ['body', ...Object.keys(this.headersToSplitOn)];
|
|
231
|
-
|
|
232
|
-
const headerElements = Array.from(document.querySelectorAll(headers.join(',')));
|
|
233
|
-
|
|
234
|
-
for (let i = 0; i < headerElements.length; i++) {
|
|
235
|
-
const headerElement = headerElements[i]!;
|
|
236
|
-
let currentHeader: string;
|
|
237
|
-
let currentHeaderTag: string;
|
|
238
|
-
let sectionContent: string[] = [];
|
|
239
|
-
|
|
240
|
-
if (i === 0) {
|
|
241
|
-
currentHeader = '#TITLE#';
|
|
242
|
-
currentHeaderTag = 'h1';
|
|
243
|
-
} else {
|
|
244
|
-
currentHeader = headerElement.textContent?.trim() || '';
|
|
245
|
-
currentHeaderTag = headerElement.tagName.toLowerCase();
|
|
246
|
-
}
|
|
247
|
-
|
|
248
|
-
// Get content until next header
|
|
249
|
-
let currentNode = headerElement.nextSibling;
|
|
250
|
-
const nextHeader = headerElements[i + 1];
|
|
251
|
-
|
|
252
|
-
while (currentNode && currentNode !== nextHeader) {
|
|
253
|
-
if (currentNode.textContent) {
|
|
254
|
-
sectionContent.push(currentNode.textContent);
|
|
255
|
-
}
|
|
256
|
-
currentNode = currentNode.nextSibling;
|
|
257
|
-
}
|
|
258
|
-
|
|
259
|
-
const content = sectionContent.join(' ').trim();
|
|
260
|
-
if (content) {
|
|
261
|
-
sections.push({
|
|
262
|
-
header: currentHeader,
|
|
263
|
-
content,
|
|
264
|
-
tagName: currentHeaderTag,
|
|
265
|
-
});
|
|
266
|
-
}
|
|
267
|
-
}
|
|
268
|
-
|
|
269
|
-
return sections;
|
|
270
|
-
}
|
|
271
|
-
|
|
272
333
|
transformDocuments(documents: Document[]): Document[] {
|
|
273
334
|
const texts: string[] = [];
|
|
274
335
|
const metadatas: Record<string, any>[] = [];
|