@exulu/backend 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1237 -26
- package/dist/index.d.cts +863 -7
- package/dist/index.d.ts +863 -7
- package/dist/index.js +1232 -21
- package/package.json +4 -3
package/dist/index.d.cts
CHANGED
|
@@ -5,7 +5,8 @@ import { ZodSchema, z } from 'zod';
|
|
|
5
5
|
import { Tool, LanguageModelV1 } from 'ai';
|
|
6
6
|
import { Express } from 'express';
|
|
7
7
|
import { Knex } from 'knex';
|
|
8
|
-
import {
|
|
8
|
+
import { Tiktoken } from 'tiktoken/lite';
|
|
9
|
+
import models from 'tiktoken/model_to_encoding.json';
|
|
9
10
|
|
|
10
11
|
declare function redisClient(): Promise<{
|
|
11
12
|
client: RedisClientType | null;
|
|
@@ -526,6 +527,863 @@ declare class ExuluQueues {
|
|
|
526
527
|
}
|
|
527
528
|
declare const queues: ExuluQueues;
|
|
528
529
|
|
|
530
|
+
/**
|
|
531
|
+
* Represents the data structure for a chunk object.
|
|
532
|
+
*
|
|
533
|
+
* @property {string} text - The text of the chunk.
|
|
534
|
+
* @property {number} startIndex - The starting index of the chunk in the original text.
|
|
535
|
+
* @property {number} endIndex - The ending index of the chunk in the original text.
|
|
536
|
+
* @property {number} tokenCount - The number of tokens in the chunk.
|
|
537
|
+
*/
|
|
538
|
+
interface ChunkData {
|
|
539
|
+
text: string;
|
|
540
|
+
startIndex: number;
|
|
541
|
+
endIndex: number;
|
|
542
|
+
tokenCount: number;
|
|
543
|
+
embedding?: number[];
|
|
544
|
+
}
|
|
545
|
+
/**
|
|
546
|
+
* Represents a chunk of text with associated metadata.
|
|
547
|
+
*
|
|
548
|
+
* @property {string} text - The text of the chunk.
|
|
549
|
+
* @property {number} startIndex - The starting index of the chunk in the original text.
|
|
550
|
+
* @property {number} endIndex - The ending index of the chunk in the original text.
|
|
551
|
+
* @property {number} tokenCount - The number of tokens in the chunk.
|
|
552
|
+
* @property {number[]} [embedding] - The embedding for the chunk.
|
|
553
|
+
*/
|
|
554
|
+
declare class Chunk {
|
|
555
|
+
/** The text of the chunk. */
|
|
556
|
+
text: string;
|
|
557
|
+
/** The starting index of the chunk in the original text. */
|
|
558
|
+
startIndex: number;
|
|
559
|
+
/** The ending index of the chunk in the original text. */
|
|
560
|
+
endIndex: number;
|
|
561
|
+
/** The number of tokens in the chunk. */
|
|
562
|
+
tokenCount: number;
|
|
563
|
+
/** Optional embedding for the chunk. */
|
|
564
|
+
embedding?: number[];
|
|
565
|
+
/**
|
|
566
|
+
* Constructs a new Chunk object.
|
|
567
|
+
*
|
|
568
|
+
* @param {ChunkData} data - The data to construct the Chunk from.
|
|
569
|
+
*/
|
|
570
|
+
constructor(data: {
|
|
571
|
+
text: string;
|
|
572
|
+
startIndex: number;
|
|
573
|
+
endIndex: number;
|
|
574
|
+
tokenCount: number;
|
|
575
|
+
embedding?: number[];
|
|
576
|
+
});
|
|
577
|
+
/** Return a string representation of the Chunk.
|
|
578
|
+
*
|
|
579
|
+
* @returns {string} The text of the chunk.
|
|
580
|
+
*/
|
|
581
|
+
toString(): string;
|
|
582
|
+
/** Return a detailed string representation of the Chunk.
|
|
583
|
+
*
|
|
584
|
+
* @returns {string} The detailed string representation of the Chunk.
|
|
585
|
+
*/
|
|
586
|
+
toRepresentation(): string;
|
|
587
|
+
/** Return a slice of the chunk's text.
|
|
588
|
+
*
|
|
589
|
+
* @param {number} [start] - The starting index of the slice.
|
|
590
|
+
* @param {number} [end] - The ending index of the slice.
|
|
591
|
+
* @returns {string} The slice of the chunk's text.
|
|
592
|
+
*/
|
|
593
|
+
slice(start?: number, end?: number): string;
|
|
594
|
+
/** Return the Chunk as a dictionary-like object.
|
|
595
|
+
*
|
|
596
|
+
* @returns {ChunkData} The dictionary-like object.
|
|
597
|
+
*/
|
|
598
|
+
toDict(): ChunkData;
|
|
599
|
+
/** Create a Chunk object from a dictionary-like object.
|
|
600
|
+
*
|
|
601
|
+
* @param {ChunkData} data - The dictionary-like object.
|
|
602
|
+
* @returns {Chunk} The Chunk object.
|
|
603
|
+
*/
|
|
604
|
+
static fromDict(data: ChunkData): Chunk;
|
|
605
|
+
/** Return a deep copy of the chunk.
|
|
606
|
+
*
|
|
607
|
+
* @returns {Chunk} The deep copy of the chunk.
|
|
608
|
+
*/
|
|
609
|
+
copy(): Chunk;
|
|
610
|
+
}
|
|
611
|
+
|
|
612
|
+
/** Type for include delimiter options
|
|
613
|
+
*
|
|
614
|
+
* @enum {string}
|
|
615
|
+
*/
|
|
616
|
+
type IncludeDelim = 'prev' | 'next';
|
|
617
|
+
/** Interface for RecursiveLevel data
|
|
618
|
+
*
|
|
619
|
+
* @interface RecursiveLevelData
|
|
620
|
+
* @property {string | string[]} [delimiters] - The delimiters to use for chunking.
|
|
621
|
+
* @property {boolean} [whitespace] - Whether to use whitespace as a delimiter.
|
|
622
|
+
* @property {IncludeDelim} [includeDelim] - Whether to include the delimiter in the previous or next chunk.
|
|
623
|
+
*/
|
|
624
|
+
interface RecursiveLevelData {
|
|
625
|
+
delimiters?: string | string[];
|
|
626
|
+
whitespace?: boolean;
|
|
627
|
+
includeDelim?: IncludeDelim;
|
|
628
|
+
}
|
|
629
|
+
/** Class to represent recursive chunking rules at a specific level
|
|
630
|
+
*
|
|
631
|
+
* @class RecursiveLevel
|
|
632
|
+
* @property {string | string[]} [delimiters] - The delimiters to use for chunking.
|
|
633
|
+
* @property {boolean} [whitespace] - Whether to use whitespace as a delimiter.
|
|
634
|
+
* @property {IncludeDelim} [includeDelim] - Whether to include the delimiter in the previous or next chunk.
|
|
635
|
+
*/
|
|
636
|
+
declare class RecursiveLevel {
|
|
637
|
+
/** Custom delimiters for chunking */
|
|
638
|
+
delimiters?: string | string[];
|
|
639
|
+
/** Whether to use whitespace as a delimiter */
|
|
640
|
+
whitespace: boolean;
|
|
641
|
+
/** Whether to include the delimiter in the previous or next chunk */
|
|
642
|
+
includeDelim: IncludeDelim;
|
|
643
|
+
/**
|
|
644
|
+
* Constructs a new RecursiveLevel object.
|
|
645
|
+
*
|
|
646
|
+
* @param {RecursiveLevelData} data - The data to construct the RecursiveLevel from.
|
|
647
|
+
*/
|
|
648
|
+
constructor(data?: RecursiveLevelData);
|
|
649
|
+
/**
|
|
650
|
+
* Validates the RecursiveLevel object.
|
|
651
|
+
*
|
|
652
|
+
* @private
|
|
653
|
+
*/
|
|
654
|
+
private validate;
|
|
655
|
+
/** Return a string representation of the RecursiveLevel
|
|
656
|
+
*
|
|
657
|
+
* @returns {string} The string representation of the RecursiveLevel.
|
|
658
|
+
*/
|
|
659
|
+
toString(): string;
|
|
660
|
+
/** Return the RecursiveLevel as a dictionary-like object
|
|
661
|
+
*
|
|
662
|
+
* @returns {RecursiveLevelData} The dictionary-like object.
|
|
663
|
+
*/
|
|
664
|
+
toDict(): RecursiveLevelData;
|
|
665
|
+
/** Create RecursiveLevel object from a dictionary
|
|
666
|
+
*
|
|
667
|
+
* @param {RecursiveLevelData} data - The dictionary-like object.
|
|
668
|
+
* @returns {RecursiveLevel} The RecursiveLevel object.
|
|
669
|
+
*/
|
|
670
|
+
static fromDict(data: RecursiveLevelData): RecursiveLevel;
|
|
671
|
+
/** Create RecursiveLevel object from a recipe
|
|
672
|
+
*
|
|
673
|
+
* @param {string} name - The name of the recipe.
|
|
674
|
+
* @param {string} lang - The language of the recipe.
|
|
675
|
+
* @returns {Promise<RecursiveLevel>} The RecursiveLevel object.
|
|
676
|
+
*/
|
|
677
|
+
static fromRecipe(name: string, lang?: string): Promise<RecursiveLevel>;
|
|
678
|
+
}
|
|
679
|
+
/** Interface for RecursiveRules data
|
|
680
|
+
*
|
|
681
|
+
* @interface RecursiveRulesData
|
|
682
|
+
* @property {RecursiveLevelData[]} [levels] - The recursive levels.
|
|
683
|
+
*/
|
|
684
|
+
interface RecursiveRulesData {
|
|
685
|
+
levels?: RecursiveLevelData[];
|
|
686
|
+
}
|
|
687
|
+
/** Class to represent recursive chunking rules
|
|
688
|
+
*
|
|
689
|
+
* @class RecursiveRules
|
|
690
|
+
* @property {RecursiveLevel[]} [levels] - The recursive levels.
|
|
691
|
+
*/
|
|
692
|
+
declare class RecursiveRules {
|
|
693
|
+
/** List of recursive levels */
|
|
694
|
+
levels: RecursiveLevel[];
|
|
695
|
+
constructor(data?: RecursiveRulesData);
|
|
696
|
+
/** Return a string representation of the RecursiveRules
|
|
697
|
+
*
|
|
698
|
+
* @returns {string} The string representation of the RecursiveRules.
|
|
699
|
+
*/
|
|
700
|
+
toString(): string;
|
|
701
|
+
/** Return the number of levels
|
|
702
|
+
*
|
|
703
|
+
* @returns {number} The number of levels.
|
|
704
|
+
*/
|
|
705
|
+
get length(): number;
|
|
706
|
+
/** Get a level by index
|
|
707
|
+
*
|
|
708
|
+
* @param {number} index - The index of the level.
|
|
709
|
+
* @returns {RecursiveLevel | undefined} The level.
|
|
710
|
+
*/
|
|
711
|
+
getLevel(index: number): RecursiveLevel | undefined;
|
|
712
|
+
/** Return an iterator over the levels
|
|
713
|
+
*
|
|
714
|
+
* @returns {Iterator<RecursiveLevel>} The iterator over the levels.
|
|
715
|
+
*/
|
|
716
|
+
[Symbol.iterator](): Iterator<RecursiveLevel>;
|
|
717
|
+
/** Create a RecursiveRules object from a dictionary
|
|
718
|
+
*
|
|
719
|
+
* @param {RecursiveRulesData} data - The dictionary-like object.
|
|
720
|
+
* @returns {RecursiveRules} The RecursiveRules object.
|
|
721
|
+
*/
|
|
722
|
+
static fromDict(data: RecursiveRulesData): RecursiveRules;
|
|
723
|
+
/** Return the RecursiveRules as a dictionary-like object
|
|
724
|
+
*
|
|
725
|
+
* @returns {RecursiveRulesData} The dictionary-like object.
|
|
726
|
+
*/
|
|
727
|
+
toDict(): RecursiveRulesData;
|
|
728
|
+
/** Create a RecursiveRules object from a recipe
|
|
729
|
+
*
|
|
730
|
+
* @param {string} name - The name of the recipe.
|
|
731
|
+
* @param {string} lang - The language of the recipe.
|
|
732
|
+
* @param {string} path - The path to the recipe.
|
|
733
|
+
* @returns {Promise<RecursiveRules>} The RecursiveRules object.
|
|
734
|
+
*/
|
|
735
|
+
static fromRecipe(name?: string, lang?: string, path?: string): Promise<RecursiveRules>;
|
|
736
|
+
}
|
|
737
|
+
/** Interface for RecursiveChunk data
|
|
738
|
+
*
|
|
739
|
+
* @interface RecursiveChunkData
|
|
740
|
+
* @property {string} text - The text of the chunk.
|
|
741
|
+
* @property {number} startIndex - The starting index of the chunk.
|
|
742
|
+
* @property {number} endIndex - The ending index of the chunk.
|
|
743
|
+
* @property {number} tokenCount - The number of tokens in the chunk.
|
|
744
|
+
* @property {number} [level] - The level of recursion for the chunk.
|
|
745
|
+
*/
|
|
746
|
+
interface RecursiveChunkData {
|
|
747
|
+
text: string;
|
|
748
|
+
startIndex: number;
|
|
749
|
+
endIndex: number;
|
|
750
|
+
tokenCount: number;
|
|
751
|
+
level?: number;
|
|
752
|
+
embedding?: number[];
|
|
753
|
+
}
|
|
754
|
+
/** Class to represent recursive chunks
|
|
755
|
+
*
|
|
756
|
+
* @class RecursiveChunk
|
|
757
|
+
* @property {number} [level] - The level of recursion for the chunk.
|
|
758
|
+
*/
|
|
759
|
+
declare class RecursiveChunk extends Chunk {
|
|
760
|
+
/** The level of recursion for the chunk */
|
|
761
|
+
level?: number;
|
|
762
|
+
constructor(data: {
|
|
763
|
+
text: string;
|
|
764
|
+
startIndex: number;
|
|
765
|
+
endIndex: number;
|
|
766
|
+
tokenCount: number;
|
|
767
|
+
level?: number;
|
|
768
|
+
embedding?: number[];
|
|
769
|
+
});
|
|
770
|
+
/** Return a string representation of the RecursiveChunk
|
|
771
|
+
*
|
|
772
|
+
* @returns {string} The string representation of the RecursiveChunk.
|
|
773
|
+
*/
|
|
774
|
+
toString(): string;
|
|
775
|
+
/** Return the RecursiveChunk as a dictionary-like object
|
|
776
|
+
*
|
|
777
|
+
* @returns {RecursiveChunkData} The dictionary-like object.
|
|
778
|
+
*/
|
|
779
|
+
toDict(): RecursiveChunkData;
|
|
780
|
+
/** Create a RecursiveChunk object from a dictionary
|
|
781
|
+
*
|
|
782
|
+
* @param {RecursiveChunkData} data - The dictionary-like object.
|
|
783
|
+
* @returns {RecursiveChunk} The RecursiveChunk object.
|
|
784
|
+
*/
|
|
785
|
+
static fromDict(data: RecursiveChunkData): RecursiveChunk;
|
|
786
|
+
}
|
|
787
|
+
|
|
788
|
+
type TokenizerModelName = keyof typeof models;
|
|
789
|
+
declare class ExuluTokenizer {
|
|
790
|
+
constructor();
|
|
791
|
+
encoder: Tiktoken | null;
|
|
792
|
+
create(modelName: TokenizerModelName): Promise<Tiktoken>;
|
|
793
|
+
decode(tokens: Uint32Array): Promise<string>;
|
|
794
|
+
decodeBatch(tokenSequences: Uint32Array[]): Promise<string[]>;
|
|
795
|
+
encode(text: string): Uint32Array;
|
|
796
|
+
countTokensBatch(texts: string[]): Promise<number[]>;
|
|
797
|
+
countTokens(text: string): number;
|
|
798
|
+
free(): Promise<void>;
|
|
799
|
+
}
|
|
800
|
+
|
|
801
|
+
/** Base Chunking Class. **/
|
|
802
|
+
|
|
803
|
+
/**
|
|
804
|
+
* Base class for all chunking classes.
|
|
805
|
+
*
|
|
806
|
+
* This abstract class provides a common interface and shared logic for all chunking implementations.
|
|
807
|
+
* It supports chunking a single text or a batch of texts, with optional concurrency and progress reporting.
|
|
808
|
+
*
|
|
809
|
+
* Subclasses must implement the `chunk` method to define how a single text is chunked.
|
|
810
|
+
*
|
|
811
|
+
* @template T - The type of chunk produced (usually `Chunk[]` or `string[]`).
|
|
812
|
+
*
|
|
813
|
+
* @property {Tokenizer} tokenizer - The tokenizer instance used for chunking operations.
|
|
814
|
+
* @property {boolean} _useConcurrency - Whether to use concurrent processing for batch chunking (default: true).
|
|
815
|
+
*
|
|
816
|
+
* @example
|
|
817
|
+
* class MyChunker extends BaseChunker {
|
|
818
|
+
* async chunk(text: string): Promise<Chunk[]> {
|
|
819
|
+
* // ... implementation ...
|
|
820
|
+
* }
|
|
821
|
+
* }
|
|
822
|
+
*
|
|
823
|
+
* const chunker = new MyChunker(tokenizer);
|
|
824
|
+
* const chunks = await chunker.call("Some text");
|
|
825
|
+
* const batchChunks = await chunker.call(["Text 1", "Text 2"], true);
|
|
826
|
+
*/
|
|
827
|
+
declare abstract class BaseChunker {
|
|
828
|
+
protected tokenizer: ExuluTokenizer;
|
|
829
|
+
protected _useConcurrency: boolean;
|
|
830
|
+
constructor(tokenizer: ExuluTokenizer);
|
|
831
|
+
/**
|
|
832
|
+
* Returns a string representation of the chunker instance.
|
|
833
|
+
*
|
|
834
|
+
* @returns {string} The class name and constructor signature.
|
|
835
|
+
*/
|
|
836
|
+
toString(): string;
|
|
837
|
+
/**
|
|
838
|
+
* Call the chunker with a single string or an array of strings.
|
|
839
|
+
*
|
|
840
|
+
* If a single string is provided, returns the result of `chunk(text)`.
|
|
841
|
+
* If an array of strings is provided, returns the result of `chunkBatch(texts, showProgress)`.
|
|
842
|
+
*
|
|
843
|
+
* @param {string | string[]} textOrTexts - The text or array of texts to chunk.
|
|
844
|
+
* @param {boolean} [showProgress=false] - Whether to display progress for batch operations (only applies to arrays).
|
|
845
|
+
* @returns {Promise<Chunk[] | Chunk[][]>} The chunked result(s).
|
|
846
|
+
* @throws {Error} If input is not a string or array of strings.
|
|
847
|
+
*/
|
|
848
|
+
call(text: string, showProgress?: boolean): Promise<Chunk[]>;
|
|
849
|
+
call(texts: string[], showProgress?: boolean): Promise<Chunk[][]>;
|
|
850
|
+
/**
|
|
851
|
+
* Process a batch of texts sequentially (one after another).
|
|
852
|
+
*
|
|
853
|
+
* @protected
|
|
854
|
+
* @param {string[]} texts - The texts to chunk.
|
|
855
|
+
* @param {boolean} [showProgress=false] - Whether to display progress in the console.
|
|
856
|
+
* @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
|
|
857
|
+
*/
|
|
858
|
+
protected _sequential_batch_processing(texts: string[], showProgress?: boolean): Promise<Chunk[][]>;
|
|
859
|
+
/**
|
|
860
|
+
* Process a batch of texts concurrently using Promise.all.
|
|
861
|
+
*
|
|
862
|
+
* @protected
|
|
863
|
+
* @param {string[]} texts - The texts to chunk.
|
|
864
|
+
* @param {boolean} [showProgress=false] - Whether to display progress in the console.
|
|
865
|
+
* @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
|
|
866
|
+
*/
|
|
867
|
+
protected _concurrent_batch_processing(texts: string[], showProgress?: boolean): Promise<Chunk[][]>;
|
|
868
|
+
/**
|
|
869
|
+
* Abstract method to chunk a single text. Must be implemented by subclasses.
|
|
870
|
+
*
|
|
871
|
+
* @param {string} text - The text to chunk.
|
|
872
|
+
* @returns {Promise<Chunk[]>} The chunked representation of the input text.
|
|
873
|
+
* @abstract
|
|
874
|
+
*/
|
|
875
|
+
abstract chunk(text: string): Promise<Chunk[]>;
|
|
876
|
+
/**
|
|
877
|
+
* Chunk a batch of texts, using either concurrent or sequential processing.
|
|
878
|
+
*
|
|
879
|
+
* If only one text is provided, processes it directly without batch overhead.
|
|
880
|
+
*
|
|
881
|
+
* @param {string[]} texts - The texts to chunk.
|
|
882
|
+
* @param {boolean} [showProgress=true] - Whether to display progress in the console.
|
|
883
|
+
* @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
|
|
884
|
+
*/
|
|
885
|
+
chunkBatch(texts: string[], showProgress?: boolean): Promise<Chunk[][]>;
|
|
886
|
+
}
|
|
887
|
+
|
|
888
|
+
/** Module containing RecursiveChunker class. */
|
|
889
|
+
|
|
890
|
+
/**
|
|
891
|
+
* Configuration options for creating a RecursiveChunker instance.
|
|
892
|
+
* All options are optional and have sensible defaults.
|
|
893
|
+
*
|
|
894
|
+
* @interface RecursiveChunkerOptions
|
|
895
|
+
* @property {string | Tokenizer} [tokenizer] - The tokenizer to use for text processing. Can be a string identifier (default: "Xenova/gpt2") or a Tokenizer instance.
|
|
896
|
+
* @property {number} [chunkSize] - The maximum number of tokens per chunk. Must be greater than 0. Default: 512.
|
|
897
|
+
* @property {RecursiveRules} [rules] - The rules that define how text should be recursively chunked. Default: new RecursiveRules().
|
|
898
|
+
* @property {number} [minCharactersPerChunk] - The minimum number of characters that should be in each chunk. Must be greater than 0. Default: 24.
|
|
899
|
+
*/
|
|
900
|
+
interface RecursiveChunkerOptions {
|
|
901
|
+
tokenizer?: TokenizerModelName;
|
|
902
|
+
chunkSize?: number;
|
|
903
|
+
rules?: RecursiveRules;
|
|
904
|
+
minCharactersPerChunk?: number;
|
|
905
|
+
}
|
|
906
|
+
/**
|
|
907
|
+
* Represents a RecursiveChunker instance that is also directly callable as a function.
|
|
908
|
+
*
|
|
909
|
+
* This type combines all properties and methods of {@link RecursiveChunker} with callable signatures for chunking text(s).
|
|
910
|
+
*
|
|
911
|
+
* Calling the instance executes its `call` method (from {@link BaseChunker}), which in turn calls `chunk` or `chunkBatch`.
|
|
912
|
+
*
|
|
913
|
+
* @typedef {Object} CallableRecursiveChunker
|
|
914
|
+
* @property {number} chunkSize - The maximum number of tokens per chunk.
|
|
915
|
+
* @property {number} minCharactersPerChunk - The minimum number of characters per chunk.
|
|
916
|
+
* @property {RecursiveRules} rules - The rules that define how text should be recursively chunked.
|
|
917
|
+
* @property {string} sep - The separator string used for internal splitting (usually "✄").
|
|
918
|
+
* @property {Tokenizer} tokenizer - The tokenizer instance used for chunking operations (inherited from BaseChunker).
|
|
919
|
+
*
|
|
920
|
+
* @method chunk - Recursively chunk a single text into chunks or strings.
|
|
921
|
+
* @method chunkBatch - Recursively chunk a batch of texts.
|
|
922
|
+
* @method toString - Returns a string representation of the RecursiveChunker instance.
|
|
923
|
+
* @method call - Call the chunker with a single string or an array of strings. (see callable signatures)
|
|
924
|
+
*
|
|
925
|
+
* @static
|
|
926
|
+
* @method create
|
|
927
|
+
* @memberof CallableRecursiveChunker
|
|
928
|
+
* @param {RecursiveChunkerOptions} [options] - Configuration options for the RecursiveChunker.
|
|
929
|
+
* @returns {Promise<CallableRecursiveChunker>} A Promise that resolves to a callable RecursiveChunker instance.
|
|
930
|
+
*
|
|
931
|
+
* @example
|
|
932
|
+
* const chunker = await RecursiveChunker.create({ chunkSize: 256 });
|
|
933
|
+
* const chunks = await chunker("Some text to chunk");
|
|
934
|
+
* const batchChunks = await chunker(["Text 1", "Text 2"]);
|
|
935
|
+
*/
|
|
936
|
+
type CallableRecursiveChunker = RecursiveChunker & {
|
|
937
|
+
(text: string, showProgress?: boolean): Promise<RecursiveChunk[]>;
|
|
938
|
+
(texts: string[], showProgress?: boolean): Promise<(RecursiveChunk[])[]>;
|
|
939
|
+
};
|
|
940
|
+
/**
|
|
941
|
+
* Recursively chunk text using a set of rules.
|
|
942
|
+
*
|
|
943
|
+
* This class extends the BaseChunker class and implements the chunk method.
|
|
944
|
+
* It provides a flexible way to chunk text based on custom rules, including
|
|
945
|
+
* delimiters, whitespace, and token-based chunking.
|
|
946
|
+
*
|
|
947
|
+
* @extends BaseChunker
|
|
948
|
+
* @property {number} chunkSize - The maximum number of tokens per chunk.
|
|
949
|
+
* @property {number} minCharactersPerChunk - The minimum number of characters per chunk.
|
|
950
|
+
* @property {RecursiveRules} rules - The rules that define how text should be recursively chunked.
|
|
951
|
+
* @property {string} sep - The separator string used for internal splitting (usually "✄").
|
|
952
|
+
*
|
|
953
|
+
* @method chunk - Recursively chunk a single text into chunks or strings.
|
|
954
|
+
* @method chunkBatch - Recursively chunk a batch of texts.
|
|
955
|
+
* @method toString - Returns a string representation of the RecursiveChunker instance.
|
|
956
|
+
* @method call - Call the chunker with a single string or an array of strings. (see callable signatures)
|
|
957
|
+
*
|
|
958
|
+
* @static
|
|
959
|
+
* @method create
|
|
960
|
+
* @memberof RecursiveChunker
|
|
961
|
+
* @param {RecursiveChunkerOptions} [options] - Configuration options for the RecursiveChunker.
|
|
962
|
+
* @returns {Promise<RecursiveChunker>} A Promise that resolves to a RecursiveChunker instance.
|
|
963
|
+
*
|
|
964
|
+
* @example
|
|
965
|
+
* const chunker = await RecursiveChunker.create({ chunkSize: 256 });
|
|
966
|
+
* const chunks = await chunker("Some text to chunk");
|
|
967
|
+
* const batchChunks = await chunker(["Text 1", "Text 2"]);
|
|
968
|
+
*/
|
|
969
|
+
declare class RecursiveChunker extends BaseChunker {
|
|
970
|
+
readonly chunkSize: number;
|
|
971
|
+
readonly minCharactersPerChunk: number;
|
|
972
|
+
readonly rules: RecursiveRules;
|
|
973
|
+
readonly sep: string;
|
|
974
|
+
private readonly _CHARS_PER_TOKEN;
|
|
975
|
+
/**
|
|
976
|
+
* Private constructor. Use `RecursiveChunker.create()` to instantiate.
|
|
977
|
+
*/
|
|
978
|
+
private constructor();
|
|
979
|
+
/**
|
|
980
|
+
* Creates and initializes a directly callable RecursiveChunker instance.
|
|
981
|
+
*
|
|
982
|
+
* This static factory method constructs a RecursiveChunker with the provided options and returns a callable function object.
|
|
983
|
+
* The returned instance can be used as both a function (to chunk text(s)) and as an object (with all RecursiveChunker methods and properties).
|
|
984
|
+
*
|
|
985
|
+
* @param {RecursiveChunkerOptions} [options] - Configuration options for the chunker. All options are optional:
|
|
986
|
+
* @param {string|Tokenizer} [options.tokenizer="Xenova/gpt2"] - Tokenizer to use for text processing. Can be a string identifier (e.g., "Xenova/gpt2") or a Tokenizer instance. If a string is provided, Tokenizer.create() is called internally.
|
|
987
|
+
* @param {number} [options.chunkSize=512] - Maximum number of tokens per chunk. Must be > 0.
|
|
988
|
+
* @param {RecursiveRules} [options.rules=new RecursiveRules()] - Rules for recursive chunking. See {@link RecursiveRules} for customization.
|
|
989
|
+
* @param {number} [options.minCharactersPerChunk=24] - Minimum number of characters per chunk. Must be > 0.
|
|
990
|
+
*
|
|
991
|
+
* @returns {Promise<CallableRecursiveChunker>} Promise resolving to a callable RecursiveChunker instance.
|
|
992
|
+
*
|
|
993
|
+
* @throws {Error} If any option is invalid (e.g., chunkSize <= 0).
|
|
994
|
+
*
|
|
995
|
+
* @see CallableRecursiveChunker for the callable interface and available properties/methods.
|
|
996
|
+
*
|
|
997
|
+
* @example <caption>Basic usage with default options</caption>
|
|
998
|
+
* const chunker = await RecursiveChunker.create();
|
|
999
|
+
* const chunks = await chunker("Some text to chunk");
|
|
1000
|
+
*
|
|
1001
|
+
* @example <caption>Custom options and batch chunking</caption>
|
|
1002
|
+
* const chunker = await RecursiveChunker.create({ chunkSize: 256 });
|
|
1003
|
+
* const batchChunks = await chunker(["Text 1", "Text 2"]);
|
|
1004
|
+
*
|
|
1005
|
+
* @example <caption>Accessing properties and methods</caption>
|
|
1006
|
+
* const chunker = await RecursiveChunker.create();
|
|
1007
|
+
* console.log(chunker.chunkSize); // 512
|
|
1008
|
+
* console.log(chunker.rules); // RecursiveRules instance
|
|
1009
|
+
* const chunks = await chunker.chunk("Some text"); // Use as object method
|
|
1010
|
+
*
|
|
1011
|
+
* @note
|
|
1012
|
+
* The returned instance is both callable (like a function) and has all properties/methods of RecursiveChunker.
|
|
1013
|
+
* You can use it as a drop-in replacement for a function or a class instance.
|
|
1014
|
+
*
|
|
1015
|
+
* @note
|
|
1016
|
+
* For advanced customization, pass a custom RecursiveRules object to the rules option.
|
|
1017
|
+
* See {@link RecursiveRules} and {@link RecursiveLevel} for rule structure.
|
|
1018
|
+
*/
|
|
1019
|
+
static create(options?: RecursiveChunkerOptions): Promise<CallableRecursiveChunker>;
|
|
1020
|
+
/**
|
|
1021
|
+
* Estimates the number of tokens in a given text.
|
|
1022
|
+
*
|
|
1023
|
+
* This method uses a character-to-token ratio (default: 6.5 characters per token) for quick estimation.
|
|
1024
|
+
* If the estimated token count exceeds the chunk size, it performs an actual token count.
|
|
1025
|
+
*
|
|
1026
|
+
* @param {string} text - The text to estimate token count for
|
|
1027
|
+
* @returns {Promise<number>} A promise that resolves to the estimated number of tokens
|
|
1028
|
+
* @private
|
|
1029
|
+
*/
|
|
1030
|
+
private _estimateTokenCount;
|
|
1031
|
+
/**
|
|
1032
|
+
* Split the text into chunks based on the provided recursive level rules.
|
|
1033
|
+
*
|
|
1034
|
+
* This method handles three different splitting strategies:
|
|
1035
|
+
* 1. Whitespace-based splitting: Splits text on spaces
|
|
1036
|
+
* 2. Delimiter-based splitting: Splits text on specified delimiters with options to include delimiters
|
|
1037
|
+
* 3. Token-based splitting: Splits text into chunks of maximum token size
|
|
1038
|
+
*
|
|
1039
|
+
* @param {string} text - The text to be split into chunks
|
|
1040
|
+
* @param {RecursiveLevel} recursiveLevel - The rules defining how to split the text
|
|
1041
|
+
* @returns {Promise<string[]>} A promise that resolves to an array of text chunks
|
|
1042
|
+
* @private
|
|
1043
|
+
*/
|
|
1044
|
+
private _splitText;
|
|
1045
|
+
/**
|
|
1046
|
+
* Create a RecursiveChunk object with indices based on the current offset.
|
|
1047
|
+
*
|
|
1048
|
+
* This method constructs a RecursiveChunk object that contains metadata about the chunk,
|
|
1049
|
+
* including the text content, its start and end indices, token count, and the level of recursion.
|
|
1050
|
+
*
|
|
1051
|
+
* @param {string} text - The text content of the chunk
|
|
1052
|
+
* @param {number} tokenCount - The number of tokens in the chunk
|
|
1053
|
+
*/
|
|
1054
|
+
private _makeChunks;
|
|
1055
|
+
/**
|
|
1056
|
+
* Merge short splits.
|
|
1057
|
+
*/
|
|
1058
|
+
private _mergeSplits;
|
|
1059
|
+
/**
|
|
1060
|
+
* Binary search to find the leftmost position where value should be inserted to maintain order.
|
|
1061
|
+
*
|
|
1062
|
+
* @param {number[]} arr - The array to search
|
|
1063
|
+
* @param {number} value - The value to insert
|
|
1064
|
+
* @param {number} [lo=0] - The starting index for the search
|
|
1065
|
+
* @returns {number} The index where the value should be inserted
|
|
1066
|
+
* @private
|
|
1067
|
+
*/
|
|
1068
|
+
private _bisectLeft;
|
|
1069
|
+
/**
|
|
1070
|
+
* Recursive helper for core chunking.
|
|
1071
|
+
*/
|
|
1072
|
+
private _recursiveChunk;
|
|
1073
|
+
/**
|
|
1074
|
+
* Recursively chunk text.
|
|
1075
|
+
*
|
|
1076
|
+
* This method is the main entry point for chunking text using the RecursiveChunker.
|
|
1077
|
+
* It takes a single text string and returns an array of RecursiveChunk objects.
|
|
1078
|
+
*
|
|
1079
|
+
* @param {string} text - The text to be chunked
|
|
1080
|
+
* @returns {Promise<RecursiveChunk[]>} A promise that resolves to an array of RecursiveChunk objects
|
|
1081
|
+
*/
|
|
1082
|
+
chunk(text: string): Promise<RecursiveChunk[]>;
|
|
1083
|
+
/**
|
|
1084
|
+
* Return a string representation of the RecursiveChunker.
|
|
1085
|
+
*
|
|
1086
|
+
* This method provides a string representation of the RecursiveChunker instance,
|
|
1087
|
+
* including its tokenizer, rules, chunk size, minimum characters per chunk, and return type.
|
|
1088
|
+
*
|
|
1089
|
+
* @returns {string} A string representation of the RecursiveChunker
|
|
1090
|
+
*/
|
|
1091
|
+
toString(): string;
|
|
1092
|
+
}
|
|
1093
|
+
|
|
1094
|
+
/**
|
|
1095
|
+
* Represents the essential data for a sentence within a text.
|
|
1096
|
+
*
|
|
1097
|
+
* @property text - The actual sentence string as it appears in the source text.
|
|
1098
|
+
* @property startIndex - The zero-based index indicating where the sentence starts in the original text.
|
|
1099
|
+
* @property endIndex - The zero-based index indicating where the sentence ends in the original text (inclusive).
|
|
1100
|
+
* @property tokenCount - The number of tokens (words or subwords) in the sentence, useful for NLP tasks.
|
|
1101
|
+
*/
|
|
1102
|
+
interface SentenceData {
|
|
1103
|
+
text: string;
|
|
1104
|
+
startIndex: number;
|
|
1105
|
+
endIndex: number;
|
|
1106
|
+
tokenCount: number;
|
|
1107
|
+
}
|
|
1108
|
+
/**
|
|
1109
|
+
* Class to represent a sentence.
|
|
1110
|
+
*
|
|
1111
|
+
* Represents a single sentence within a text, including its text, position, and token count.
|
|
1112
|
+
*
|
|
1113
|
+
* @class
|
|
1114
|
+
* @param {SentenceData} data - The data required to construct a Sentence instance.
|
|
1115
|
+
* @property {string} text - The text of the sentence.
|
|
1116
|
+
* @property {number} startIndex - The starting index of the sentence in the original text.
|
|
1117
|
+
* @property {number} endIndex - The ending index of the sentence in the original text.
|
|
1118
|
+
* @property {number} tokenCount - The number of tokens in the sentence.
|
|
1119
|
+
* @property {number[]} [embedding] - The embedding vector for the sentence (array of numbers, or null if not present).
|
|
1120
|
+
*
|
|
1121
|
+
* @method toString Returns a string representation of the Sentence.
|
|
1122
|
+
* @returns {string}
|
|
1123
|
+
*
|
|
1124
|
+
* @method toDict Returns the Sentence as a dictionary-like object.
|
|
1125
|
+
* @returns {SentenceData}
|
|
1126
|
+
*
|
|
1127
|
+
* @method static fromDict Creates a Sentence object from a dictionary-like object.
|
|
1128
|
+
* @param {SentenceData} data - The data to create the Sentence from.
|
|
1129
|
+
* @returns {Sentence}
|
|
1130
|
+
*/
|
|
1131
|
+
declare class Sentence {
    /** The text of the sentence */
    text: string;
    /** The starting index of the sentence in the original text */
    startIndex: number;
    /** The ending index of the sentence in the original text */
    endIndex: number;
    /** The number of tokens in the sentence */
    tokenCount: number;
    /** Construct a Sentence from its raw {@link SentenceData}. */
    constructor(data: SentenceData);
    /** Return a string representation of the Sentence */
    toString(): string;
    /** Return the Sentence as a dictionary-like object */
    toDict(): SentenceData;
    /** Create a Sentence object from a dictionary-like object */
    static fromDict(data: SentenceData): Sentence;
}
|
|
1148
|
+
/**
 * Represents the essential data for a chunk of sentences within a text.
 *
 * @property text - The combined text of all sentences in the chunk as it appears in the source text.
 * @property startIndex - The zero-based index indicating where the chunk starts in the original text.
 * @property endIndex - The zero-based index indicating where the chunk ends in the original text (inclusive).
 * @property tokenCount - The total number of tokens (words or subwords) in the chunk, useful for NLP tasks.
 * @property sentences - An array of SentenceData objects, each representing an individual sentence within the chunk.
 * @property embedding - Optional embedding vector for the chunk.
 */
interface SentenceChunkData {
    text: string;
    startIndex: number;
    endIndex: number;
    tokenCount: number;
    sentences: SentenceData[];
    embedding?: number[];
}
|
|
1165
|
+
/**
 * Represents a chunk of one or more sentences within a text.
 *
 * A SentenceChunk groups together multiple {@link Sentence} objects, providing their combined text, position, and token count within the original text.
 *
 * @class
 * @extends Chunk
 *
 * @param {Object} data - Data to construct a SentenceChunk instance.
 * @param {string} data.text - Combined text of all sentences in the chunk.
 * @param {number} data.startIndex - Zero-based index where the chunk starts in the original text.
 * @param {number} data.endIndex - Zero-based index where the chunk ends in the original text (inclusive).
 * @param {number} data.tokenCount - Total number of tokens in the chunk.
 * @param {Sentence[]} data.sentences - Array of {@link Sentence} objects in the chunk.
 * @param {number[]} [data.embedding] - Optional embedding vector for the chunk.
 *
 * @property {string} text - Combined text of all sentences in the chunk.
 * @property {number} startIndex - Starting index of the chunk in the original text.
 * @property {number} endIndex - Ending index of the chunk in the original text.
 * @property {number} tokenCount - Total number of tokens in the chunk.
 * @property {Sentence[]} sentences - List of {@link Sentence} objects in the chunk.
 *
 * @method toString Returns a detailed string representation of the SentenceChunk, including its text, start and end indices, token count, and a list of all contained sentences with their metadata.
 * @method toDict Returns the SentenceChunk as a plain object (see {@link SentenceChunkData}).
 * @method static fromDict Creates a SentenceChunk from a {@link SentenceChunkData} object.
 */
declare class SentenceChunk extends Chunk {
    /** List of sentences in the chunk */
    sentences: Sentence[];
    constructor(data: {
        text: string;
        startIndex: number;
        endIndex: number;
        tokenCount: number;
        sentences: Sentence[];
        embedding?: number[];
    });
    /**
     * Returns a detailed string representation of the SentenceChunk, including its text, start and end indices, token count, and a list of all contained sentences with their metadata.
     *
     * This method overrides the base {@link Chunk} toString method to provide a more informative output, which is especially useful for debugging and logging. Each sentence in the chunk is represented using its own toString method, and all sentences are included in the output.
     *
     * @returns {string} A string describing the SentenceChunk and all its sentences, e.g.,
     * SentenceChunk(text=..., startIndex=..., endIndex=..., tokenCount=..., sentences=[Sentence(...), ...])
     */
    toString(): string;
    /**
     * Returns the SentenceChunk as a dictionary-like object.
     *
     * This method extends the base {@link Chunk} toDict method to include the sentences in the chunk.
     *
     * @returns {SentenceChunkData} A dictionary-like object containing the chunk's text, start and end indices, token count, and an array of sentence data.
     */
    toDict(): SentenceChunkData;
    /**
     * Creates a SentenceChunk object from a dictionary-like object.
     *
     * This method extends the base {@link Chunk} fromDict method to include the sentences in the chunk.
     *
     * @param {SentenceChunkData} data - A dictionary-like object containing the chunk's text, start and end indices, token count, and an array of sentence data.
     * @returns {SentenceChunk} A new SentenceChunk object created from the provided dictionary-like object.
     */
    static fromDict(data: SentenceChunkData): SentenceChunk;
}
|
|
1228
|
+
|
|
1229
|
+
/** Module containing SentenceChunker class. */
|
|
1230
|
+
|
|
1231
|
+
/**
 * Options for creating a SentenceChunker instance.
 *
 * @property {TokenizerModelName} [tokenizer] - Name of the tokenizer model to use for token counting. NOTE(review): earlier docs said `string | Tokenizer` with default 'Xenova/gpt2'; the declared type is TokenizerModelName — confirm the default against the implementation.
 * @property {number} [chunkSize] - Maximum number of tokens per chunk. Must be > 0. Default: 512.
 * @property {number} [chunkOverlap] - Number of tokens to overlap between consecutive chunks. Must be >= 0 and < chunkSize. Default: 0.
 * @property {number} [minSentencesPerChunk] - Minimum number of sentences per chunk. Must be > 0. Default: 1.
 * @property {number} [minCharactersPerSentence] - Minimum number of characters for a valid sentence. Sentences shorter than this are merged. Must be > 0. Default: 12.
 * @property {boolean} [approximate] - (Deprecated) Whether to use approximate token counting. Default: false. Will be removed in future versions.
 * @property {string[]} [delim] - List of sentence delimiters to use for splitting. Default: ['. ', '! ', '? ', '\n'].
 * @property {('prev' | 'next' | null)} [includeDelim] - Whether to include the delimiter with the previous sentence ('prev'), next sentence ('next'), or exclude it (null). Default: 'prev'.
 */
interface SentenceChunkerOptions {
    tokenizer?: TokenizerModelName;
    chunkSize?: number;
    chunkOverlap?: number;
    minSentencesPerChunk?: number;
    minCharactersPerSentence?: number;
    approximate?: boolean;
    delim?: string[];
    includeDelim?: "prev" | "next" | null;
}
|
|
1253
|
+
/**
 * Represents a SentenceChunker instance that is also directly callable.
 * This type combines the SentenceChunker class with a function interface,
 * allowing the instance to be called directly like a function.
 *
 * When called, it executes the `call` method inherited from BaseChunker,
 * which in turn calls either `chunk` (for single text) or `chunkBatch` (for multiple texts).
 *
 * @example
 * const chunker = await SentenceChunker.create();
 * // Single text processing
 * const chunks = await chunker("This is a sample text.");
 * // Batch processing
 * const batchChunks = await chunker(["Text 1", "Text 2"]);
 */
type CallableSentenceChunker = SentenceChunker & {
    /** Chunk a single text; resolves to that text's chunks. */
    (text: string, showProgress?: boolean): Promise<SentenceChunk[]>;
    /** Chunk several texts; resolves to one chunk array per input text. */
    (texts: string[], showProgress?: boolean): Promise<SentenceChunk[][]>;
};
|
|
1277
|
+
/**
 * SentenceChunker is a class that implements the BaseChunker interface.
 * It uses a tokenizer to split text into sentences and then creates chunks of text.
 *
 * @extends BaseChunker
 *
 * @property {number} chunkSize - Maximum number of tokens per chunk.
 * @property {number} chunkOverlap - Number of tokens to overlap between consecutive chunks.
 * @property {number} minSentencesPerChunk - Minimum number of sentences per chunk.
 * @property {number} minCharactersPerSentence - Minimum number of characters for a valid sentence.
 * @property {boolean} approximate - (Deprecated) Whether to use approximate token counting.
 * @property {string[]} delim - List of sentence delimiters to use for splitting.
 * @property {('prev' | 'next' | null)} includeDelim - Whether to include the delimiter with the previous sentence ('prev'), next sentence ('next'), or exclude it (null).
 *
 * @method chunk - Chunk a single text string.
 * @method chunkBatch - Chunk an array of text strings (not declared here; presumably provided by BaseChunker — confirm).
 * @method call - (Inherited from BaseChunker) Chunk a single text string or an array of text strings.
 * @method toString - Return a string representation of the SentenceChunker.
 *
 * @example
 * const chunker = await SentenceChunker.create();
 * const chunks = await chunker("This is a sample text.");
 * const batchChunks = await chunker(["Text 1", "Text 2"]);
 *
 * @see BaseChunker
 */
declare class SentenceChunker extends BaseChunker {
    /** Maximum number of tokens per chunk. */
    readonly chunkSize: number;
    /** Number of tokens to overlap between consecutive chunks. */
    readonly chunkOverlap: number;
    /** Minimum number of sentences per chunk. */
    readonly minSentencesPerChunk: number;
    /** Minimum number of characters for a valid sentence. */
    readonly minCharactersPerSentence: number;
    /** (Deprecated) Whether to use approximate token counting. */
    readonly approximate: boolean;
    /** List of sentence delimiters used for splitting. */
    readonly delim: string[];
    /** Where to attach each delimiter: previous sentence ('prev'), next sentence ('next'), or dropped (null). */
    readonly includeDelim: "prev" | "next" | null;
    /** Separator marker; exact use is internal to the implementation — see source. */
    readonly sep: string;
    /**
     * Private constructor. Use `SentenceChunker.create()` to instantiate.
     *
     * @param {Tokenizer} tokenizer - The tokenizer to use for token counting.
     * @param {number} chunkSize - Maximum number of tokens per chunk.
     * @param {number} chunkOverlap - Number of tokens to overlap between consecutive chunks.
     * @param {number} minSentencesPerChunk - Minimum number of sentences per chunk.
     * @param {number} minCharactersPerSentence - Minimum number of characters for a valid sentence.
     * @param {boolean} approximate - Whether to use approximate token counting.
     * @param {string[]} delim - List of sentence delimiters to use for splitting.
     * @param {('prev' | 'next' | null)} includeDelim - Whether to include the delimiter with the previous sentence ('prev'), next sentence ('next'), or exclude it (null).
     */
    private constructor();
    /**
     * Creates and initializes a SentenceChunker instance that is directly callable.
     *
     * This method is a static factory function that returns a Promise resolving to a CallableSentenceChunker instance.
     * The returned instance is a callable function that can be used to chunk text strings or arrays of text strings.
     *
     * @param {SentenceChunkerOptions} [options] - Options for configuring the SentenceChunker.
     * @returns {Promise<CallableSentenceChunker>} A promise that resolves to a callable SentenceChunker instance.
     *
     * @example
     * const chunker = await SentenceChunker.create();
     * const chunks = await chunker("This is a sample text.");
     * const batchChunks = await chunker(["Text 1", "Text 2"]);
     *
     * @see SentenceChunkerOptions
     */
    static create(options?: SentenceChunkerOptions): Promise<CallableSentenceChunker>;
    /**
     * Fast sentence splitting while maintaining accuracy.
     *
     * @param {string} text - The text to split into sentences.
     * @returns {string[]} An array of sentences.
     */
    private _splitText;
    /**
     * Split text into sentences and calculate token counts for each sentence.
     *
     * @param {string} text - The text to split into sentences.
     * @returns {Promise<Sentence[]>} An array of Sentence objects.
     */
    private _prepareSentences;
    /**
     * Create a chunk from a list of sentences.
     *
     * @param {Sentence[]} sentences - The sentences to create a chunk from.
     * @returns {Promise<SentenceChunk>} A promise that resolves to a SentenceChunk object.
     */
    private _createChunk;
    /**
     * Split text into overlapping chunks based on sentences while respecting token limits.
     *
     * @param {string} text - The text to split into chunks.
     * @returns {Promise<SentenceChunk[]>} A promise that resolves to an array of SentenceChunk objects.
     */
    chunk(text: string): Promise<SentenceChunk[]>;
    /**
     * Binary search to find the leftmost position where value should be inserted to maintain order.
     *
     * @param {number[]} arr - The array to search.
     * @param {number} value - The value to search for.
     * @param {number} [lo] - The starting index of the search.
     * @returns {number} The index of the leftmost position where value should be inserted.
     */
    private _bisectLeft;
    /**
     * Return a string representation of the SentenceChunker.
     *
     * @returns {string} A string representation of the SentenceChunker.
     */
    toString(): string;
}
|
|
1386
|
+
|
|
529
1387
|
/** The possible lifecycle states of a queued job. */
type JOB_STATUS = "completed" | "failed" | "delayed" | "active" | "waiting" | "paused" | "stuck";
|
|
530
1388
|
declare const JOB_STATUS_ENUM: {
|
|
531
1389
|
completed: string;
|
|
@@ -549,12 +1407,10 @@ declare const ExuluJobs: {
|
|
|
549
1407
|
};
|
|
550
1408
|
|
|
551
1409
|
/**
 * Registry of the chunker implementations exposed by the package:
 * a sentence-based chunker, and a recursive chunker together with its rules.
 */
declare const ExuluChunkers: {
    /** Sentence-based chunker class. */
    sentence: typeof SentenceChunker;
    /** Recursive chunker and its rule definitions. */
    recursive: {
        function: typeof RecursiveChunker;
        rules: typeof RecursiveRules;
    };
};
|
|
560
1416
|
declare const ExuluDatabase: {
|