@exulu/backend 1.49.2 → 1.51.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -310,6 +310,28 @@ export class MarkdownChunker {
310
310
  return result;
311
311
  }
312
312
 
313
+ /**
314
+ * Moves a proposed split position out of any complete <diagram>...</diagram> block (matched case-insensitively; the opening tag must be exactly <diagram>, with no attributes).
315
+ * Returns the index where the enclosing block's opening tag starts when position lies strictly inside that block; otherwise returns position unchanged, including when it sits exactly on the block's start or end, or when the opening tag has no closing tag.
316
+ */
317
+ private adjustForDiagramTags(text: string, position: number): number {
318
+ // Scan every closed diagram block in order; the lazy quantifier ends each match at the first closing tag after its opening tag
319
+ const diagramRegex = /<diagram>[\s\S]*?<\/diagram>/gi;
320
+ let match: RegExpExecArray | null;
321
+
322
+ while ((match = diagramRegex.exec(text)) !== null) {
323
+ const diagramStart = match.index;
324
+ const diagramEnd = match.index + match[0].length; // index one past the closing tag's last character
325
+
326
+ // Strict comparisons: a position exactly at the block's start or end does not cut through it, so only interior positions are moved back
327
+ if (position > diagramStart && position < diagramEnd) {
328
+ return diagramStart; // can be 0 when the block begins the text — NOTE(review): confirm callers tolerate a 0 breakpoint
329
+ }
330
+ }
331
+
332
+ return position;
333
+ }
334
+
313
335
  /**
314
336
  * Find the nearest logical breakpoint working backwards from the end of the text.
315
337
  * Logical breakpoints are prioritized as follows:
@@ -321,6 +343,7 @@ export class MarkdownChunker {
321
343
  *
322
344
  * Only considers breakpoints in the last 50% of the text to avoid creating very small chunks.
323
345
  * Returns the position of the breakpoint, or null if none found
346
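+ * IMPORTANT: A breakpoint that lands inside a complete <diagram>...</diagram> block is moved back to the start of that block, so the adjusted position may fall before the 50% mark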
324
347
  */
325
348
  private findLogicalBreakpoint(text: string): number | null {
326
349
  if (text.length === 0) return null;
@@ -349,7 +372,8 @@ export class MarkdownChunker {
349
372
  }
350
373
 
351
374
  if (lastHeaderPosition > 0) {
352
- return lastHeaderPosition; // Break BEFORE the header (at the \n)
375
+ // Ensure we don't break inside a diagram tag
376
+ return this.adjustForDiagramTags(text, lastHeaderPosition);
353
377
  }
354
378
 
355
379
  // Priority 2: Look for paragraph breaks (double newlines) in the latter half
@@ -365,13 +389,16 @@ export class MarkdownChunker {
365
389
  }
366
390
 
367
391
  if (lastParagraphBreak > 0) {
368
- return lastParagraphBreak + 2; // Include both newlines
392
+ // Ensure we don't break inside a diagram tag
393
+ const adjusted = this.adjustForDiagramTags(text, lastParagraphBreak + 2);
394
+ return adjusted;
369
395
  }
370
396
 
371
397
  // Priority 3: Look for single newlines in the latter half
372
398
  const newlineIndex = text.lastIndexOf('\n');
373
399
  if (newlineIndex >= minPosition) {
374
- return newlineIndex + 1; // Include the newline
400
+ // Ensure we don't break inside a diagram tag
401
+ return this.adjustForDiagramTags(text, newlineIndex + 1);
375
402
  }
376
403
 
377
404
  // Priority 4: Look for end of sentence (. ! ? followed by space or newline)
@@ -385,7 +412,8 @@ export class MarkdownChunker {
385
412
  }
386
413
 
387
414
  if (lastSentenceEnd > 0) {
388
- return lastSentenceEnd;
415
+ // Ensure we don't break inside a diagram tag
416
+ return this.adjustForDiagramTags(text, lastSentenceEnd);
389
417
  }
390
418
 
391
419
  // Priority 5: Look for any whitespace in the latter half
@@ -393,7 +421,8 @@ export class MarkdownChunker {
393
421
  while (lastSpace > minPosition) {
394
422
  const pos = text.lastIndexOf(' ', lastSpace - 1);
395
423
  if (pos >= minPosition) {
396
- return pos + 1; // Include the space
424
+ // Ensure we don't break inside a diagram tag
425
+ return this.adjustForDiagramTags(text, pos + 1);
397
426
  }
398
427
  lastSpace = pos;
399
428
  }
@@ -588,6 +617,55 @@ export class MarkdownChunker {
588
617
  }
589
618
  }
590
619
 
620
+ // Check if the current slice ends in the middle of a diagram tag
621
+ // If so, we need to adjust to include the entire diagram or exclude it entirely
622
+ const diagramCheck = /<diagram>/gi;
623
+ const diagramCloseCheck = /<\/diagram>/gi;
624
+ let openDiagramsInSlice = 0;
625
+
626
+ while (diagramCheck.exec(currentSlice) !== null) {
627
+ openDiagramsInSlice++;
628
+ }
629
+
630
+ let closeDiagramsInSlice = 0;
631
+ while (diagramCloseCheck.exec(currentSlice) !== null) {
632
+ closeDiagramsInSlice++;
633
+ }
634
+
635
+ // If we have more opening tags than closing tags, we're cutting a diagram in half
636
+ if (openDiagramsInSlice > closeDiagramsInSlice) {
637
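+ // Find the last opening diagram tag in the slice (case-sensitive lookup for the lowercase tag; NOTE(review): the counts above are case-insensitive, so a differently-cased opening tag is counted but not located here)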
638
+ const lastDiagramOpenIndex = currentSlice.lastIndexOf('<diagram>');
639
+ if (lastDiagramOpenIndex !== -1) {
640
+ // Try to extend the slice to include the closing tag
641
+ const remainingText = text.slice(currentPosition + lastDiagramOpenIndex);
642
+ const closingTagMatch = /<\/diagram>/i.exec(remainingText);
643
+
644
+ if (closingTagMatch) {
645
+ const closingTagPosition = lastDiagramOpenIndex + closingTagMatch.index + closingTagMatch[0].length;
646
+
647
+ // Check if including the full diagram would still be reasonable
648
+ // If the diagram is massive, we'll exclude it from this chunk instead
649
+ const extendedSlice = text.slice(currentPosition, currentPosition + closingTagPosition);
650
+ const extendedTokens = tokenizer.encode(extendedSlice);
651
+
652
+ if (extendedTokens.length <= adjustedChunkSize * 1.5) {
653
+ // Include the full diagram in this chunk
654
+ currentSlice = extendedSlice;
655
+ targetPosition = currentPosition + closingTagPosition;
656
+ } else {
657
+ // Diagram is too large, exclude it from this chunk
658
+ currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
659
+ targetPosition = currentPosition + lastDiagramOpenIndex;
660
+ }
661
+ } else {
662
+ // Closing tag not found, exclude the opening tag from this chunk
663
+ currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
664
+ targetPosition = currentPosition + lastDiagramOpenIndex;
665
+ }
666
+ }
667
+ }
668
+
591
669
  // Working backwards from the target position find the nearest logical
592
670
  // breakpoint near the end of the slice (i.e. a heading or a paragraph break).
593
671
  const breakpointPosition = this.findLogicalBreakpoint(currentSlice);