@mastra/rag 0.1.0-alpha.83 → 0.1.0-alpha.85

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,18 @@
1
1
  # @mastra/rag
2
2
 
3
+ ## 0.1.0-alpha.85
4
+
5
+ ### Patch Changes
6
+
7
+ - b27bdb8: Swap jsdom for node-html-better-parser in rag
8
+
9
+ ## 0.1.0-alpha.84
10
+
11
+ ### Patch Changes
12
+
13
+ - Updated dependencies [4d4f6b6]
14
+ - @mastra/core@0.2.0-alpha.92
15
+
3
16
  ## 0.1.0-alpha.83
4
17
 
5
18
  ### Patch Changes
package/dist/index.js CHANGED
@@ -1,5 +1,5 @@
1
1
  import { Document, SummaryExtractor, QuestionsAnsweredExtractor, KeywordExtractor, TitleExtractor, IngestionPipeline } from 'llamaindex';
2
- import { JSDOM } from 'jsdom';
2
+ import { parse } from 'node-html-better-parser';
3
3
  import { encodingForModel, getEncoding } from 'js-tiktoken';
4
4
  import { embed as embed$1, embedMany as embedMany$1 } from '@mastra/core/embeddings';
5
5
  import { MastraAgentRelevanceScorer, CohereRelevanceScorer } from '@mastra/core/relevance';
@@ -353,50 +353,81 @@ var HTMLHeaderTransformer = class {
353
353
  this.headersToSplitOn = [...headersToSplitOn].sort();
354
354
  }
355
355
  splitText({ text }) {
356
- const dom = new JSDOM(text);
357
- const { document } = dom.window;
356
+ const root = parse(text);
358
357
  const headerFilter = this.headersToSplitOn.map(([header]) => header);
359
358
  const headerMapping = Object.fromEntries(this.headersToSplitOn);
360
359
  const elements = [];
361
- const headers = document.querySelectorAll(headerFilter.join(","));
360
+ const headers = root.querySelectorAll(headerFilter.join(","));
362
361
  headers.forEach((header) => {
363
362
  let content = "";
364
- let nextElement = header.nextElementSibling;
365
- while (nextElement && !headerFilter.includes(nextElement.tagName.toLowerCase())) {
366
- content += nextElement.textContent + " ";
367
- nextElement = nextElement.nextElementSibling;
363
+ const parentNode = header.parentNode;
364
+ if (parentNode && parentNode.childNodes) {
365
+ let foundHeader = false;
366
+ for (const node of parentNode.childNodes) {
367
+ if (node === header) {
368
+ foundHeader = true;
369
+ continue;
370
+ }
371
+ if (foundHeader && node.tagName && headerFilter.includes(node.tagName.toLowerCase())) {
372
+ break;
373
+ }
374
+ if (foundHeader) {
375
+ content += this.getTextContent(node) + " ";
376
+ }
377
+ }
368
378
  }
369
379
  elements.push({
370
380
  url: text,
371
381
  xpath: this.getXPath(header),
372
382
  content: content.trim(),
373
383
  metadata: {
374
- [headerMapping?.[header.tagName.toLowerCase()]]: header.textContent?.trim() || ""
384
+ [headerMapping?.[header.tagName.toLowerCase()]]: header.text || ""
375
385
  }
376
386
  });
377
387
  });
378
388
  return this.returnEachElement ? elements.map(
379
389
  (el) => new Document({
380
390
  text: el.content,
381
- metadata: el.metadata
391
+ metadata: { ...el.metadata, xpath: el.xpath }
382
392
  })
383
393
  ) : this.aggregateElementsToChunks(elements);
384
394
  }
385
395
  getXPath(element) {
396
+ if (!element) return "";
386
397
  const parts = [];
387
398
  let current = element;
388
- while (current && current.nodeType === 1) {
399
+ while (current && current.tagName) {
389
400
  let index = 1;
390
- for (let sibling = current.previousElementSibling; sibling; sibling = sibling.previousElementSibling) {
391
- if (sibling.nodeName === current.nodeName) {
392
- index++;
401
+ const parent = current.parentNode;
402
+ if (parent && parent.childNodes) {
403
+ for (const sibling of parent.childNodes) {
404
+ if (sibling === current) break;
405
+ if (sibling.tagName === current.tagName) {
406
+ index++;
407
+ }
393
408
  }
394
409
  }
395
410
  parts.unshift(`${current.tagName.toLowerCase()}[${index}]`);
396
- current = current.parentElement;
411
+ current = current.parentNode;
397
412
  }
398
413
  return "/" + parts.join("/");
399
414
  }
415
+ getTextContent(element) {
416
+ if (!element) return "";
417
+ if (!element.tagName) {
418
+ return element.text || "";
419
+ }
420
+ let content = element.text || "";
421
+ if (element.childNodes) {
422
+ for (const child of element.childNodes) {
423
+ const childText = this.getTextContent(child);
424
+ if (childText) {
425
+ content += " " + childText;
426
+ }
427
+ }
428
+ }
429
+ return content.trim();
430
+ }
400
431
  aggregateElementsToChunks(elements) {
401
432
  const aggregatedChunks = [];
402
433
  for (const element of elements) {
@@ -409,7 +440,7 @@ var HTMLHeaderTransformer = class {
409
440
  return aggregatedChunks.map(
410
441
  (chunk) => new Document({
411
442
  text: chunk.content,
412
- metadata: chunk.metadata
443
+ metadata: { ...chunk.metadata, xpath: chunk.xpath }
413
444
  })
414
445
  );
415
446
  }
@@ -450,7 +481,7 @@ var HTMLHeaderTransformer = class {
450
481
  };
451
482
  var HTMLSectionTransformer = class {
452
483
  constructor(headersToSplitOn, options = {}) {
453
- this.headersToSplitOn = Object.fromEntries(headersToSplitOn);
484
+ this.headersToSplitOn = Object.fromEntries(headersToSplitOn.map(([tag, name]) => [tag.toLowerCase(), name]));
454
485
  this.options = options;
455
486
  }
456
487
  splitText(text) {
@@ -459,11 +490,59 @@ var HTMLSectionTransformer = class {
459
490
  (section) => new Document({
460
491
  text: section.content,
461
492
  metadata: {
462
- [this.headersToSplitOn[section.tagName]]: section.header
493
+ [this.headersToSplitOn[section.tagName.toLowerCase()]]: section.header,
494
+ xpath: section.xpath
463
495
  }
464
496
  })
465
497
  );
466
498
  }
499
+ getXPath(element) {
500
+ const parts = [];
501
+ let current = element;
502
+ while (current && current.nodeType === 1) {
503
+ let index = 1;
504
+ let sibling = current.previousSibling;
505
+ while (sibling) {
506
+ if (sibling.nodeType === 1 && sibling.tagName === current.tagName) {
507
+ index++;
508
+ }
509
+ sibling = sibling.previousSibling;
510
+ }
511
+ if (current.tagName) {
512
+ parts.unshift(`${current.tagName.toLowerCase()}[${index}]`);
513
+ }
514
+ current = current.parentNode;
515
+ }
516
+ return "/" + parts.join("/");
517
+ }
518
+ splitHtmlByHeaders(htmlDoc) {
519
+ const sections = [];
520
+ const root = parse(htmlDoc);
521
+ const headers = Object.keys(this.headersToSplitOn);
522
+ const headerElements = root.querySelectorAll(headers.join(","));
523
+ headerElements.forEach((headerElement, index) => {
524
+ const header = headerElement.text?.trim() || "";
525
+ const tagName = headerElement.tagName;
526
+ const xpath = this.getXPath(headerElement);
527
+ let content = "";
528
+ let currentElement = headerElement.nextElementSibling;
529
+ const nextHeader = headerElements[index + 1];
530
+ while (currentElement && (!nextHeader || currentElement !== nextHeader)) {
531
+ if (currentElement.text) {
532
+ content += currentElement.text.trim() + " ";
533
+ }
534
+ currentElement = currentElement.nextElementSibling;
535
+ }
536
+ content = content.trim();
537
+ sections.push({
538
+ header,
539
+ content,
540
+ tagName,
541
+ xpath
542
+ });
543
+ });
544
+ return sections;
545
+ }
467
546
  async splitDocuments(documents) {
468
547
  const texts = [];
469
548
  const metadatas = [];
@@ -500,43 +579,6 @@ var HTMLSectionTransformer = class {
500
579
  }
501
580
  return documents;
502
581
  }
503
- splitHtmlByHeaders(htmlDoc) {
504
- const sections = [];
505
- const dom = new JSDOM(htmlDoc);
506
- const { document } = dom.window;
507
- const headers = ["body", ...Object.keys(this.headersToSplitOn)];
508
- const headerElements = Array.from(document.querySelectorAll(headers.join(",")));
509
- for (let i = 0; i < headerElements.length; i++) {
510
- const headerElement = headerElements[i];
511
- let currentHeader;
512
- let currentHeaderTag;
513
- let sectionContent = [];
514
- if (i === 0) {
515
- currentHeader = "#TITLE#";
516
- currentHeaderTag = "h1";
517
- } else {
518
- currentHeader = headerElement.textContent?.trim() || "";
519
- currentHeaderTag = headerElement.tagName.toLowerCase();
520
- }
521
- let currentNode = headerElement.nextSibling;
522
- const nextHeader = headerElements[i + 1];
523
- while (currentNode && currentNode !== nextHeader) {
524
- if (currentNode.textContent) {
525
- sectionContent.push(currentNode.textContent);
526
- }
527
- currentNode = currentNode.nextSibling;
528
- }
529
- const content = sectionContent.join(" ").trim();
530
- if (content) {
531
- sections.push({
532
- header: currentHeader,
533
- content,
534
- tagName: currentHeaderTag
535
- });
536
- }
537
- }
538
- return sections;
539
- }
540
582
  transformDocuments(documents) {
541
583
  const texts = [];
542
584
  const metadatas = [];
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mastra/rag",
3
- "version": "0.1.0-alpha.83",
3
+ "version": "0.1.0-alpha.85",
4
4
  "description": "",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -21,17 +21,16 @@
21
21
  "@llamaindex/env": "^0.1.20",
22
22
  "@paralleldrive/cuid2": "^2.2.2",
23
23
  "js-tiktoken": "^1.0.15",
24
- "jsdom": "^25.0.1",
25
24
  "llamaindex": "^0.8.15",
25
+ "node-html-better-parser": "^1.4.7",
26
26
  "pathe": "^2.0.2",
27
27
  "zod": "^3.24.1",
28
- "@mastra/core": "^0.2.0-alpha.91"
28
+ "@mastra/core": "^0.2.0-alpha.92"
29
29
  },
30
30
  "devDependencies": {
31
31
  "@babel/preset-env": "^7.26.0",
32
32
  "@babel/preset-typescript": "^7.26.0",
33
33
  "@tsconfig/recommended": "^1.0.7",
34
- "@types/jsdom": "^21.1.7",
35
34
  "@types/node": "^22.9.0",
36
35
  "tsup": "^8.0.1",
37
36
  "vitest": "^3.0.4"
@@ -461,6 +461,177 @@ describe('MDocument', () => {
461
461
  expect(doc?.metadata).toHaveProperty('Header 1');
462
462
  });
463
463
  });
464
+
465
+ it('should handle empty or invalid HTML', async () => {
466
+ const emptyHtml = '';
467
+ const invalidHtml = '<unclosed>test';
468
+ const noHeadersHtml = '<div>test</div>';
469
+
470
+ const doc1 = MDocument.fromHTML(emptyHtml, { meta: 'data' });
471
+ const doc2 = MDocument.fromHTML(invalidHtml, { meta: 'data' });
472
+ const doc3 = MDocument.fromHTML(noHeadersHtml, { meta: 'data' });
473
+
474
+ await doc1.chunk({
475
+ strategy: 'html',
476
+ headers: [
477
+ ['h1', 'Header 1'],
478
+ ['h2', 'Header 2'],
479
+ ],
480
+ });
481
+
482
+ await doc2.chunk({
483
+ strategy: 'html',
484
+ headers: [
485
+ ['h1', 'Header 1'],
486
+ ['h2', 'Header 2'],
487
+ ],
488
+ });
489
+
490
+ await doc3.chunk({
491
+ strategy: 'html',
492
+ headers: [
493
+ ['h1', 'Header 1'],
494
+ ['h2', 'Header 2'],
495
+ ],
496
+ });
497
+
498
+ expect(doc1.getDocs()).toHaveLength(0);
499
+ expect(doc2.getDocs()).toHaveLength(0);
500
+ expect(doc3.getDocs()).toHaveLength(0);
501
+ });
502
+
503
+ it('should handle complex nested header hierarchies', async () => {
504
+ const html = `
505
+ <html>
506
+ <body>
507
+ <h1>Main Title</h1>
508
+ <p>Main content</p>
509
+ <h2>Section 1</h2>
510
+ <p>Section 1 content</p>
511
+ <h3>Subsection 1.1</h3>
512
+ <p>Subsection 1.1 content</p>
513
+ <h2>Section 2</h2>
514
+ <h3>Subsection 2.1</h3>
515
+ <p>Subsection 2.1 content</p>
516
+ </body>
517
+ </html>
518
+ `;
519
+
520
+ const doc = MDocument.fromHTML(html, { meta: 'data' });
521
+ await doc.chunk({
522
+ strategy: 'html',
523
+ headers: [
524
+ ['h1', 'Header 1'],
525
+ ['h2', 'Header 2'],
526
+ ['h3', 'Header 3'],
527
+ ],
528
+ });
529
+
530
+ const docs = doc.getDocs();
531
+ expect(docs.length).toBeGreaterThan(3);
532
+ expect(docs.some(d => d.metadata?.['Header 1'] === 'Main Title')).toBe(true);
533
+ expect(docs.some(d => d.metadata?.['Header 2'] === 'Section 1')).toBe(true);
534
+ expect(docs.some(d => d.metadata?.['Header 3'] === 'Subsection 1.1')).toBe(true);
535
+ });
536
+
537
+ it('should handle headers with mixed content and special characters', async () => {
538
+ const html = `
539
+ <html>
540
+ <body>
541
+ <h1>Title with <strong>bold</strong> &amp; <em>emphasis</em></h1>
542
+ <p>Content 1</p>
543
+ <h2>Section with &lt;tags&gt; &amp; symbols</h2>
544
+ <p>Content 2</p>
545
+ </body>
546
+ </html>
547
+ `;
548
+
549
+ const doc = MDocument.fromHTML(html, { meta: 'data' });
550
+ await doc.chunk({
551
+ strategy: 'html',
552
+ headers: [
553
+ ['h1', 'Header 1'],
554
+ ['h2', 'Header 2'],
555
+ ],
556
+ });
557
+
558
+ const docs = doc.getDocs();
559
+ expect(docs.length).toBeGreaterThan(1);
560
+ expect(docs[0]?.metadata?.['Header 1']).toContain('bold');
561
+ expect(docs[0]?.metadata?.['Header 1']).toContain('&');
562
+ expect(docs[0]?.metadata?.['Header 1']).toContain('emphasis');
563
+ expect(docs[1]?.metadata?.['Header 2']).toContain('<tags>');
564
+ });
565
+
566
+ it('should handle headers with no content or whitespace content', async () => {
567
+ const html = `
568
+ <html>
569
+ <body>
570
+ <h1>Empty Section</h1>
571
+ <h2>Whitespace Section</h2>
572
+
573
+ <h2>Valid Section</h2>
574
+ <p>Content</p>
575
+ </body>
576
+ </html>
577
+ `;
578
+
579
+ const doc = MDocument.fromHTML(html, { meta: 'data' });
580
+ await doc.chunk({
581
+ strategy: 'html',
582
+ headers: [
583
+ ['h1', 'Header 1'],
584
+ ['h2', 'Header 2'],
585
+ ],
586
+ });
587
+
588
+ const docs = doc.getDocs();
589
+ expect(docs.some(d => d.metadata?.['Header 1'] === 'Empty Section')).toBe(true);
590
+ expect(docs.some(d => d.metadata?.['Header 2'] === 'Valid Section')).toBe(true);
591
+ expect(docs.find(d => d.metadata?.['Header 2'] === 'Valid Section')?.text).toContain('Content');
592
+ });
593
+
594
+ it('should generate correct XPaths for deeply nested elements', async () => {
595
+ const html = `
596
+ <html>
597
+ <body>
598
+ <div class="container">
599
+ <section id="main">
600
+ <div>
601
+ <h1>Deeply Nested Title</h1>
602
+ <p>Content</p>
603
+ </div>
604
+ <div>
605
+ <h1>Second Title</h1>
606
+ <p>More Content</p>
607
+ </div>
608
+ </section>
609
+ </div>
610
+ </body>
611
+ </html>
612
+ `;
613
+
614
+ const doc = MDocument.fromHTML(html, { meta: 'data' });
615
+ await doc.chunk({
616
+ strategy: 'html',
617
+ headers: [['h1', 'Header 1']],
618
+ });
619
+
620
+ const docs = doc.getDocs();
621
+ expect(docs).toHaveLength(2);
622
+
623
+ // First h1
624
+ expect(docs[0]?.metadata?.['Header 1']).toBe('Deeply Nested Title');
625
+ const xpath1 = docs[0]?.metadata?.xpath as string;
626
+ expect(xpath1).toBeDefined();
627
+ expect(xpath1).toMatch(/^\/html\[1\]\/body\[1\]\/div\[1\]\/section\[1\]\/div\[1\]\/h1\[1\]$/);
628
+
629
+ // Second h1
630
+ expect(docs[1]?.metadata?.['Header 1']).toBe('Second Title');
631
+ const xpath2 = docs[1]?.metadata?.xpath as string;
632
+ expect(xpath2).toBeDefined();
633
+ expect(xpath2).toMatch(/^\/html\[1\]\/body\[1\]\/div\[1\]\/section\[1\]\/div\[2\]\/h1\[1\]$/);
634
+ });
464
635
  });
465
636
 
466
637
  describe('chunkJson', () => {
@@ -1,5 +1,5 @@
1
- import { JSDOM } from 'jsdom';
2
1
  import { Document } from 'llamaindex';
2
+ import { parse } from 'node-html-better-parser';
3
3
 
4
4
  import { RecursiveCharacterTransformer } from './character';
5
5
 
@@ -20,22 +20,37 @@ export class HTMLHeaderTransformer {
20
20
  }
21
21
 
22
22
  splitText({ text }: { text: string }): Document[] {
23
- const dom = new JSDOM(text);
24
- const { document } = dom.window;
23
+ const root = parse(text);
25
24
 
26
25
  const headerFilter = this.headersToSplitOn.map(([header]) => header);
27
26
  const headerMapping = Object.fromEntries(this.headersToSplitOn);
28
27
 
29
28
  const elements: ElementType[] = [];
30
- const headers = document.querySelectorAll(headerFilter.join(','));
29
+ const headers = root.querySelectorAll(headerFilter.join(','));
31
30
 
32
31
  headers.forEach(header => {
33
32
  let content = '';
34
- let nextElement = header.nextElementSibling;
33
+ const parentNode = header.parentNode;
34
+
35
+ if (parentNode && parentNode.childNodes) {
36
+ let foundHeader = false;
37
+ for (const node of parentNode.childNodes) {
38
+ // Start collecting content after we find our header
39
+ if (node === header) {
40
+ foundHeader = true;
41
+ continue;
42
+ }
43
+
44
+ // If we found our header and hit another header, stop
45
+ if (foundHeader && node.tagName && headerFilter.includes(node.tagName.toLowerCase())) {
46
+ break;
47
+ }
35
48
 
36
- while (nextElement && !headerFilter.includes(nextElement.tagName.toLowerCase())) {
37
- content += nextElement.textContent + ' ';
38
- nextElement = nextElement.nextElementSibling;
49
+ // Collect content between headers
50
+ if (foundHeader) {
51
+ content += this.getTextContent(node) + ' ';
52
+ }
53
+ }
39
54
  }
40
55
 
41
56
  elements.push({
@@ -43,7 +58,7 @@ export class HTMLHeaderTransformer {
43
58
  xpath: this.getXPath(header),
44
59
  content: content.trim(),
45
60
  metadata: {
46
- [headerMapping?.[header.tagName.toLowerCase()]!]: header.textContent?.trim() || '',
61
+ [headerMapping?.[header.tagName.toLowerCase()]!]: header.text || '',
47
62
  },
48
63
  });
49
64
  });
@@ -53,30 +68,62 @@ export class HTMLHeaderTransformer {
53
68
  el =>
54
69
  new Document({
55
70
  text: el.content,
56
- metadata: el.metadata,
71
+ metadata: { ...el.metadata, xpath: el.xpath },
57
72
  }),
58
73
  )
59
74
  : this.aggregateElementsToChunks(elements);
60
75
  }
61
76
 
62
- private getXPath(element: Element): string {
77
+ private getXPath(element: any): string {
78
+ if (!element) return '';
79
+
63
80
  const parts: string[] = [];
64
- let current: Element | null = element;
81
+ let current = element;
65
82
 
66
- while (current && current.nodeType === 1) {
83
+ while (current && current.tagName) {
67
84
  let index = 1;
68
- for (let sibling = current.previousElementSibling; sibling; sibling = sibling.previousElementSibling) {
69
- if (sibling.nodeName === current.nodeName) {
70
- index++;
85
+ const parent = current.parentNode;
86
+
87
+ if (parent && parent.childNodes) {
88
+ // Count preceding siblings with same tag
89
+ for (const sibling of parent.childNodes) {
90
+ if (sibling === current) break;
91
+ if (sibling.tagName === current.tagName) {
92
+ index++;
93
+ }
71
94
  }
72
95
  }
96
+
73
97
  parts.unshift(`${current.tagName.toLowerCase()}[${index}]`);
74
- current = current.parentElement;
98
+ current = current.parentNode;
75
99
  }
76
100
 
77
101
  return '/' + parts.join('/');
78
102
  }
79
103
 
104
+ private getTextContent(element: any): string {
105
+ if (!element) return '';
106
+
107
+ // For text nodes, return their content
108
+ if (!element.tagName) {
109
+ return element.text || '';
110
+ }
111
+
112
+ // For element nodes, combine their text with children's text
113
+ let content = element.text || '';
114
+
115
+ if (element.childNodes) {
116
+ for (const child of element.childNodes) {
117
+ const childText = this.getTextContent(child);
118
+ if (childText) {
119
+ content += ' ' + childText;
120
+ }
121
+ }
122
+ }
123
+
124
+ return content.trim();
125
+ }
126
+
80
127
  private aggregateElementsToChunks(elements: ElementType[]): Document[] {
81
128
  const aggregatedChunks: ElementType[] = [];
82
129
 
@@ -97,7 +144,7 @@ export class HTMLHeaderTransformer {
97
144
  chunk =>
98
145
  new Document({
99
146
  text: chunk.content,
100
- metadata: chunk.metadata,
147
+ metadata: { ...chunk.metadata, xpath: chunk.xpath },
101
148
  }),
102
149
  );
103
150
  }
@@ -110,7 +157,6 @@ export class HTMLHeaderTransformer {
110
157
  const chunks = this.splitText({ text: texts[i]! });
111
158
  for (const chunk of chunks) {
112
159
  const metadata = { ...(_metadatas[i] || {}) };
113
-
114
160
  const chunkMetadata = chunk.metadata;
115
161
 
116
162
  if (chunkMetadata) {
@@ -151,7 +197,7 @@ export class HTMLSectionTransformer {
151
197
  private options: Record<string, any>;
152
198
 
153
199
  constructor(headersToSplitOn: [string, string][], options: Record<string, any> = {}) {
154
- this.headersToSplitOn = Object.fromEntries(headersToSplitOn);
200
+ this.headersToSplitOn = Object.fromEntries(headersToSplitOn.map(([tag, name]) => [tag.toLowerCase(), name]));
155
201
  this.options = options;
156
202
  }
157
203
 
@@ -163,12 +209,82 @@ export class HTMLSectionTransformer {
163
209
  new Document({
164
210
  text: section.content,
165
211
  metadata: {
166
- [this.headersToSplitOn[section.tagName]!]: section.header,
212
+ [this.headersToSplitOn[section.tagName.toLowerCase()]!]: section.header,
213
+ xpath: section.xpath,
167
214
  },
168
215
  }),
169
216
  );
170
217
  }
171
218
 
219
+ private getXPath(element: any): string {
220
+ const parts: string[] = [];
221
+ let current = element;
222
+
223
+ while (current && current.nodeType === 1) {
224
+ let index = 1;
225
+ let sibling = current.previousSibling;
226
+
227
+ while (sibling) {
228
+ if (sibling.nodeType === 1 && sibling.tagName === current.tagName) {
229
+ index++;
230
+ }
231
+ sibling = sibling.previousSibling;
232
+ }
233
+
234
+ if (current.tagName) {
235
+ parts.unshift(`${current.tagName.toLowerCase()}[${index}]`);
236
+ }
237
+ current = current.parentNode;
238
+ }
239
+
240
+ return '/' + parts.join('/');
241
+ }
242
+
243
+ private splitHtmlByHeaders(htmlDoc: string): Array<{
244
+ header: string;
245
+ content: string;
246
+ tagName: string;
247
+ xpath: string;
248
+ }> {
249
+ const sections: Array<{
250
+ header: string;
251
+ content: string;
252
+ tagName: string;
253
+ xpath: string;
254
+ }> = [];
255
+
256
+ const root = parse(htmlDoc);
257
+ const headers = Object.keys(this.headersToSplitOn);
258
+ const headerElements = root.querySelectorAll(headers.join(','));
259
+
260
+ headerElements.forEach((headerElement, index) => {
261
+ const header = headerElement.text?.trim() || '';
262
+ const tagName = headerElement.tagName;
263
+ const xpath = this.getXPath(headerElement);
264
+ let content = '';
265
+
266
+ let currentElement = headerElement.nextElementSibling;
267
+ const nextHeader = headerElements[index + 1];
268
+
269
+ while (currentElement && (!nextHeader || currentElement !== nextHeader)) {
270
+ if (currentElement.text) {
271
+ content += currentElement.text.trim() + ' ';
272
+ }
273
+ currentElement = currentElement.nextElementSibling;
274
+ }
275
+
276
+ content = content.trim();
277
+ sections.push({
278
+ header,
279
+ content,
280
+ tagName,
281
+ xpath,
282
+ });
283
+ });
284
+
285
+ return sections;
286
+ }
287
+
172
288
  async splitDocuments(documents: Document[]): Promise<Document[]> {
173
289
  const texts: string[] = [];
174
290
  const metadatas: Record<string, any>[] = [];
@@ -214,61 +330,6 @@ export class HTMLSectionTransformer {
214
330
  return documents;
215
331
  }
216
332
 
217
- private splitHtmlByHeaders(htmlDoc: string): Array<{
218
- header: string;
219
- content: string;
220
- tagName: string;
221
- }> {
222
- const sections: Array<{
223
- header: string;
224
- content: string;
225
- tagName: string;
226
- }> = [];
227
-
228
- const dom = new JSDOM(htmlDoc);
229
- const { document } = dom.window;
230
- const headers = ['body', ...Object.keys(this.headersToSplitOn)];
231
-
232
- const headerElements = Array.from(document.querySelectorAll(headers.join(',')));
233
-
234
- for (let i = 0; i < headerElements.length; i++) {
235
- const headerElement = headerElements[i]!;
236
- let currentHeader: string;
237
- let currentHeaderTag: string;
238
- let sectionContent: string[] = [];
239
-
240
- if (i === 0) {
241
- currentHeader = '#TITLE#';
242
- currentHeaderTag = 'h1';
243
- } else {
244
- currentHeader = headerElement.textContent?.trim() || '';
245
- currentHeaderTag = headerElement.tagName.toLowerCase();
246
- }
247
-
248
- // Get content until next header
249
- let currentNode = headerElement.nextSibling;
250
- const nextHeader = headerElements[i + 1];
251
-
252
- while (currentNode && currentNode !== nextHeader) {
253
- if (currentNode.textContent) {
254
- sectionContent.push(currentNode.textContent);
255
- }
256
- currentNode = currentNode.nextSibling;
257
- }
258
-
259
- const content = sectionContent.join(' ').trim();
260
- if (content) {
261
- sections.push({
262
- header: currentHeader,
263
- content,
264
- tagName: currentHeaderTag,
265
- });
266
- }
267
- }
268
-
269
- return sections;
270
- }
271
-
272
333
  transformDocuments(documents: Document[]): Document[] {
273
334
  const texts: string[] = [];
274
335
  const metadatas: Record<string, any>[] = [];