@exulu/backend 1.49.2 → 1.51.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/setup-python.cjs +140 -0
- package/dist/index.cjs +561 -119
- package/dist/index.d.cts +16 -3
- package/dist/index.d.ts +16 -3
- package/dist/index.js +564 -122
- package/ee/chunking/markdown.ts +83 -5
- package/ee/python/documents/processing/doc_processor.ts +380 -84
- package/ee/workers.ts +214 -18
- package/package.json +8 -1
package/ee/chunking/markdown.ts
CHANGED
|
@@ -310,6 +310,28 @@ export class MarkdownChunker {
|
|
|
310
310
|
return result;
|
|
311
311
|
}
|
|
312
312
|
|
|
313
|
+
/**
 * Guards a candidate split offset against landing inside a
 * <diagram>...</diagram> block.
 *
 * Only complete blocks (an opening tag followed by a closing tag) are
 * considered; tag matching is case-insensitive.
 *
 * @param text - The text that is scanned for diagram blocks.
 * @param position - Candidate offset into `text`.
 * @returns The start offset of the enclosing diagram block when `position`
 *   lies strictly between that block's start and end offsets; otherwise
 *   `position` unchanged.
 */
private adjustForDiagramTags(text: string, position: number): number {
  // Non-greedy so that each match stops at the first closing tag it meets.
  const blockPattern = /<diagram>[\s\S]*?<\/diagram>/gi;

  for (
    let hit = blockPattern.exec(text);
    hit !== null;
    hit = blockPattern.exec(text)
  ) {
    const blockStart = hit.index;
    const blockEnd = blockStart + hit[0].length;

    // Offsets sitting exactly on either edge of the block are left alone:
    // only a position strictly inside the block is pulled back to its start.
    const insideBlock = position > blockStart && position < blockEnd;
    if (insideBlock) {
      return blockStart;
    }
  }

  // No diagram block encloses the candidate offset.
  return position;
}
|
|
334
|
+
|
|
313
335
|
/**
|
|
314
336
|
* Find the nearest logical breakpoint working backwards from the end of the text.
|
|
315
337
|
* Logical breakpoints are prioritized as follows:
|
|
@@ -321,6 +343,7 @@ export class MarkdownChunker {
|
|
|
321
343
|
*
|
|
322
344
|
* Only considers breakpoints in the last 50% of the text to avoid creating very small chunks.
|
|
323
345
|
* Returns the position of the breakpoint, or null if none found
|
|
346
|
+
* IMPORTANT: Never splits content within <diagram> tags
|
|
324
347
|
*/
|
|
325
348
|
private findLogicalBreakpoint(text: string): number | null {
|
|
326
349
|
if (text.length === 0) return null;
|
|
@@ -349,7 +372,8 @@ export class MarkdownChunker {
|
|
|
349
372
|
}
|
|
350
373
|
|
|
351
374
|
if (lastHeaderPosition > 0) {
|
|
352
|
-
|
|
375
|
+
// Ensure we don't break inside a diagram tag
|
|
376
|
+
return this.adjustForDiagramTags(text, lastHeaderPosition);
|
|
353
377
|
}
|
|
354
378
|
|
|
355
379
|
// Priority 2: Look for paragraph breaks (double newlines) in the latter half
|
|
@@ -365,13 +389,16 @@ export class MarkdownChunker {
|
|
|
365
389
|
}
|
|
366
390
|
|
|
367
391
|
if (lastParagraphBreak > 0) {
|
|
368
|
-
|
|
392
|
+
// Ensure we don't break inside a diagram tag
|
|
393
|
+
const adjusted = this.adjustForDiagramTags(text, lastParagraphBreak + 2);
|
|
394
|
+
return adjusted;
|
|
369
395
|
}
|
|
370
396
|
|
|
371
397
|
// Priority 3: Look for single newlines in the latter half
|
|
372
398
|
const newlineIndex = text.lastIndexOf('\n');
|
|
373
399
|
if (newlineIndex >= minPosition) {
|
|
374
|
-
|
|
400
|
+
// Ensure we don't break inside a diagram tag
|
|
401
|
+
return this.adjustForDiagramTags(text, newlineIndex + 1);
|
|
375
402
|
}
|
|
376
403
|
|
|
377
404
|
// Priority 4: Look for end of sentence (. ! ? followed by space or newline)
|
|
@@ -385,7 +412,8 @@ export class MarkdownChunker {
|
|
|
385
412
|
}
|
|
386
413
|
|
|
387
414
|
if (lastSentenceEnd > 0) {
|
|
388
|
-
|
|
415
|
+
// Ensure we don't break inside a diagram tag
|
|
416
|
+
return this.adjustForDiagramTags(text, lastSentenceEnd);
|
|
389
417
|
}
|
|
390
418
|
|
|
391
419
|
// Priority 5: Look for any whitespace in the latter half
|
|
@@ -393,7 +421,8 @@ export class MarkdownChunker {
|
|
|
393
421
|
while (lastSpace > minPosition) {
|
|
394
422
|
const pos = text.lastIndexOf(' ', lastSpace - 1);
|
|
395
423
|
if (pos >= minPosition) {
|
|
396
|
-
|
|
424
|
+
// Ensure we don't break inside a diagram tag
|
|
425
|
+
return this.adjustForDiagramTags(text, pos + 1);
|
|
397
426
|
}
|
|
398
427
|
lastSpace = pos;
|
|
399
428
|
}
|
|
@@ -588,6 +617,55 @@ export class MarkdownChunker {
|
|
|
588
617
|
}
|
|
589
618
|
}
|
|
590
619
|
|
|
620
|
+
// Check if the current slice ends in the middle of a diagram tag
|
|
621
|
+
// If so, we need to adjust to include the entire diagram or exclude it entirely
|
|
622
|
+
const diagramCheck = /<diagram>/gi;
|
|
623
|
+
const diagramCloseCheck = /<\/diagram>/gi;
|
|
624
|
+
let openDiagramsInSlice = 0;
|
|
625
|
+
|
|
626
|
+
while (diagramCheck.exec(currentSlice) !== null) {
|
|
627
|
+
openDiagramsInSlice++;
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
let closeDiagramsInSlice = 0;
|
|
631
|
+
while (diagramCloseCheck.exec(currentSlice) !== null) {
|
|
632
|
+
closeDiagramsInSlice++;
|
|
633
|
+
}
|
|
634
|
+
|
|
635
|
+
// If we have more opening tags than closing tags, we're cutting a diagram in half
|
|
636
|
+
if (openDiagramsInSlice > closeDiagramsInSlice) {
|
|
637
|
+
// Find the last opening diagram tag in the slice
|
|
638
|
+
const lastDiagramOpenIndex = currentSlice.lastIndexOf('<diagram>');
|
|
639
|
+
if (lastDiagramOpenIndex !== -1) {
|
|
640
|
+
// Try to extend the slice to include the closing tag
|
|
641
|
+
const remainingText = text.slice(currentPosition + lastDiagramOpenIndex);
|
|
642
|
+
const closingTagMatch = /<\/diagram>/i.exec(remainingText);
|
|
643
|
+
|
|
644
|
+
if (closingTagMatch) {
|
|
645
|
+
const closingTagPosition = lastDiagramOpenIndex + closingTagMatch.index + closingTagMatch[0].length;
|
|
646
|
+
|
|
647
|
+
// Check if including the full diagram would still be reasonable
|
|
648
|
+
// If the diagram is massive, we'll exclude it from this chunk instead
|
|
649
|
+
const extendedSlice = text.slice(currentPosition, currentPosition + closingTagPosition);
|
|
650
|
+
const extendedTokens = tokenizer.encode(extendedSlice);
|
|
651
|
+
|
|
652
|
+
if (extendedTokens.length <= adjustedChunkSize * 1.5) {
|
|
653
|
+
// Include the full diagram in this chunk
|
|
654
|
+
currentSlice = extendedSlice;
|
|
655
|
+
targetPosition = currentPosition + closingTagPosition;
|
|
656
|
+
} else {
|
|
657
|
+
// Diagram is too large, exclude it from this chunk
|
|
658
|
+
currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
|
|
659
|
+
targetPosition = currentPosition + lastDiagramOpenIndex;
|
|
660
|
+
}
|
|
661
|
+
} else {
|
|
662
|
+
// Closing tag not found, exclude the opening tag from this chunk
|
|
663
|
+
currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
|
|
664
|
+
targetPosition = currentPosition + lastDiagramOpenIndex;
|
|
665
|
+
}
|
|
666
|
+
}
|
|
667
|
+
}
|
|
668
|
+
|
|
591
669
|
// Working backwards from the target position find the nearest logical
|
|
592
670
|
// breakpoint near the end of the slice (i.e. a heading or a paragraph break).
|
|
593
671
|
const breakpointPosition = this.findLogicalBreakpoint(currentSlice);
|