@exulu/backend 0.3.4 → 0.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -5,7 +5,8 @@ import { ZodSchema, z } from 'zod';
5
5
  import { Tool, LanguageModelV1 } from 'ai';
6
6
  import { Express } from 'express';
7
7
  import { Knex } from 'knex';
8
- import { SentenceChunker, RecursiveChunker, RecursiveRules } from 'chonkie';
8
+ import { Tiktoken } from 'tiktoken/lite';
9
+ import models from 'tiktoken/model_to_encoding.json';
9
10
 
10
11
  declare function redisClient(): Promise<{
11
12
  client: RedisClientType | null;
@@ -526,6 +527,863 @@ declare class ExuluQueues {
526
527
  }
527
528
  declare const queues: ExuluQueues;
528
529
 
530
+ /**
531
+ * Represents the data structure for a chunk object.
532
+ *
533
+ * @property {string} text - The text of the chunk.
534
+ * @property {number} startIndex - The starting index of the chunk in the original text.
535
+ * @property {number} endIndex - The ending index of the chunk in the original text.
536
+ * @property {number} tokenCount - The number of tokens in the chunk.
537
+ */
538
+ interface ChunkData {
539
+ text: string;
540
+ startIndex: number;
541
+ endIndex: number;
542
+ tokenCount: number;
543
+ embedding?: number[];
544
+ }
545
+ /**
546
+ * Represents a chunk of text with associated metadata.
547
+ *
548
+ * @property {string} text - The text of the chunk.
549
+ * @property {number} startIndex - The starting index of the chunk in the original text.
550
+ * @property {number} endIndex - The ending index of the chunk in the original text.
551
+ * @property {number} tokenCount - The number of tokens in the chunk.
552
+ * @property {number[]} [embedding] - The embedding for the chunk.
553
+ */
554
+ declare class Chunk {
555
+ /** The text of the chunk. */
556
+ text: string;
557
+ /** The starting index of the chunk in the original text. */
558
+ startIndex: number;
559
+ /** The ending index of the chunk in the original text. */
560
+ endIndex: number;
561
+ /** The number of tokens in the chunk. */
562
+ tokenCount: number;
563
+ /** Optional embedding for the chunk. */
564
+ embedding?: number[];
565
+ /**
566
+ * Constructs a new Chunk object.
567
+ *
568
+ * @param {ChunkData} data - The data to construct the Chunk from.
569
+ */
570
+ constructor(data: {
571
+ text: string;
572
+ startIndex: number;
573
+ endIndex: number;
574
+ tokenCount: number;
575
+ embedding?: number[];
576
+ });
577
+ /** Return a string representation of the Chunk.
578
+ *
579
+ * @returns {string} The text of the chunk.
580
+ */
581
+ toString(): string;
582
+ /** Return a detailed string representation of the Chunk.
583
+ *
584
+ * @returns {string} The detailed string representation of the Chunk.
585
+ */
586
+ toRepresentation(): string;
587
+ /** Return a slice of the chunk's text.
588
+ *
589
+ * @param {number} [start] - The starting index of the slice.
590
+ * @param {number} [end] - The ending index of the slice.
591
+ * @returns {string} The slice of the chunk's text.
592
+ */
593
+ slice(start?: number, end?: number): string;
594
+ /** Return the Chunk as a dictionary-like object.
595
+ *
596
+ * @returns {ChunkData} The dictionary-like object.
597
+ */
598
+ toDict(): ChunkData;
599
+ /** Create a Chunk object from a dictionary-like object.
600
+ *
601
+ * @param {ChunkData} data - The dictionary-like object.
602
+ * @returns {Chunk} The Chunk object.
603
+ */
604
+ static fromDict(data: ChunkData): Chunk;
605
+ /** Return a deep copy of the chunk.
606
+ *
607
+ * @returns {Chunk} The deep copy of the chunk.
608
+ */
609
+ copy(): Chunk;
610
+ }
611
+
612
+ /** Type for include delimiter options
613
+ *
614
+ * @enum {string}
615
+ */
616
+ type IncludeDelim = 'prev' | 'next';
617
+ /** Interface for RecursiveLevel data
618
+ *
619
+ * @interface RecursiveLevelData
620
+ * @property {string | string[]} [delimiters] - The delimiters to use for chunking.
621
+ * @property {boolean} [whitespace] - Whether to use whitespace as a delimiter.
622
+ * @property {IncludeDelim} [includeDelim] - Whether to include the delimiter in the previous or next chunk.
623
+ */
624
+ interface RecursiveLevelData {
625
+ delimiters?: string | string[];
626
+ whitespace?: boolean;
627
+ includeDelim?: IncludeDelim;
628
+ }
629
+ /** Class to represent recursive chunking rules at a specific level
630
+ *
631
+ * @class RecursiveLevel
632
+ * @property {string | string[]} [delimiters] - The delimiters to use for chunking.
633
+ * @property {boolean} [whitespace] - Whether to use whitespace as a delimiter.
634
+ * @property {IncludeDelim} [includeDelim] - Whether to include the delimiter in the previous or next chunk.
635
+ */
636
+ declare class RecursiveLevel {
637
+ /** Custom delimiters for chunking */
638
+ delimiters?: string | string[];
639
+ /** Whether to use whitespace as a delimiter */
640
+ whitespace: boolean;
641
+ /** Whether to include the delimiter in the previous or next chunk */
642
+ includeDelim: IncludeDelim;
643
+ /**
644
+ * Constructs a new RecursiveLevel object.
645
+ *
646
+ * @param {RecursiveLevelData} data - The data to construct the RecursiveLevel from.
647
+ */
648
+ constructor(data?: RecursiveLevelData);
649
+ /**
650
+ * Validates the RecursiveLevel object.
651
+ *
652
+ * @private
653
+ */
654
+ private validate;
655
+ /** Return a string representation of the RecursiveLevel
656
+ *
657
+ * @returns {string} The string representation of the RecursiveLevel.
658
+ */
659
+ toString(): string;
660
+ /** Return the RecursiveLevel as a dictionary-like object
661
+ *
662
+ * @returns {RecursiveLevelData} The dictionary-like object.
663
+ */
664
+ toDict(): RecursiveLevelData;
665
+ /** Create RecursiveLevel object from a dictionary
666
+ *
667
+ * @param {RecursiveLevelData} data - The dictionary-like object.
668
+ * @returns {RecursiveLevel} The RecursiveLevel object.
669
+ */
670
+ static fromDict(data: RecursiveLevelData): RecursiveLevel;
671
+ /** Create RecursiveLevel object from a recipe
672
+ *
673
+ * @param {string} name - The name of the recipe.
674
+ * @param {string} lang - The language of the recipe.
675
+ * @returns {Promise<RecursiveLevel>} The RecursiveLevel object.
676
+ */
677
+ static fromRecipe(name: string, lang?: string): Promise<RecursiveLevel>;
678
+ }
679
+ /** Interface for RecursiveRules data
680
+ *
681
+ * @interface RecursiveRulesData
682
+ * @property {RecursiveLevelData[]} [levels] - The recursive levels.
683
+ */
684
+ interface RecursiveRulesData {
685
+ levels?: RecursiveLevelData[];
686
+ }
687
+ /** Class to represent recursive chunking rules
688
+ *
689
+ * @class RecursiveRules
690
+ * @property {RecursiveLevel[]} [levels] - The recursive levels.
691
+ */
692
+ declare class RecursiveRules {
693
+ /** List of recursive levels */
694
+ levels: RecursiveLevel[];
695
+ constructor(data?: RecursiveRulesData);
696
+ /** Return a string representation of the RecursiveRules
697
+ *
698
+ * @returns {string} The string representation of the RecursiveRules.
699
+ */
700
+ toString(): string;
701
+ /** Return the number of levels
702
+ *
703
+ * @returns {number} The number of levels.
704
+ */
705
+ get length(): number;
706
+ /** Get a level by index
707
+ *
708
+ * @param {number} index - The index of the level.
709
+ * @returns {RecursiveLevel | undefined} The level.
710
+ */
711
+ getLevel(index: number): RecursiveLevel | undefined;
712
+ /** Return an iterator over the levels
713
+ *
714
+ * @returns {Iterator<RecursiveLevel>} The iterator over the levels.
715
+ */
716
+ [Symbol.iterator](): Iterator<RecursiveLevel>;
717
+ /** Create a RecursiveRules object from a dictionary
718
+ *
719
+ * @param {RecursiveRulesData} data - The dictionary-like object.
720
+ * @returns {RecursiveRules} The RecursiveRules object.
721
+ */
722
+ static fromDict(data: RecursiveRulesData): RecursiveRules;
723
+ /** Return the RecursiveRules as a dictionary-like object
724
+ *
725
+ * @returns {RecursiveRulesData} The dictionary-like object.
726
+ */
727
+ toDict(): RecursiveRulesData;
728
+ /** Create a RecursiveRules object from a recipe
729
+ *
730
+ * @param {string} name - The name of the recipe.
731
+ * @param {string} lang - The language of the recipe.
732
+ * @param {string} path - The path to the recipe.
733
+ * @returns {Promise<RecursiveRules>} The RecursiveRules object.
734
+ */
735
+ static fromRecipe(name?: string, lang?: string, path?: string): Promise<RecursiveRules>;
736
+ }
737
+ /** Interface for RecursiveChunk data
738
+ *
739
+ * @interface RecursiveChunkData
740
+ * @property {string} text - The text of the chunk.
741
+ * @property {number} startIndex - The starting index of the chunk.
742
+ * @property {number} endIndex - The ending index of the chunk.
743
+ * @property {number} tokenCount - The number of tokens in the chunk.
744
+ * @property {number} [level] - The level of recursion for the chunk.
745
+ */
746
+ interface RecursiveChunkData {
747
+ text: string;
748
+ startIndex: number;
749
+ endIndex: number;
750
+ tokenCount: number;
751
+ level?: number;
752
+ embedding?: number[];
753
+ }
754
+ /** Class to represent recursive chunks
755
+ *
756
+ * @class RecursiveChunk
757
+ * @property {number} [level] - The level of recursion for the chunk.
758
+ */
759
+ declare class RecursiveChunk extends Chunk {
760
+ /** The level of recursion for the chunk */
761
+ level?: number;
762
+ constructor(data: {
763
+ text: string;
764
+ startIndex: number;
765
+ endIndex: number;
766
+ tokenCount: number;
767
+ level?: number;
768
+ embedding?: number[];
769
+ });
770
+ /** Return a string representation of the RecursiveChunk
771
+ *
772
+ * @returns {string} The string representation of the RecursiveChunk.
773
+ */
774
+ toString(): string;
775
+ /** Return the RecursiveChunk as a dictionary-like object
776
+ *
777
+ * @returns {RecursiveChunkData} The dictionary-like object.
778
+ */
779
+ toDict(): RecursiveChunkData;
780
+ /** Create a RecursiveChunk object from a dictionary
781
+ *
782
+ * @param {RecursiveChunkData} data - The dictionary-like object.
783
+ * @returns {RecursiveChunk} The RecursiveChunk object.
784
+ */
785
+ static fromDict(data: RecursiveChunkData): RecursiveChunk;
786
+ }
787
+
788
+ type TokenizerModelName = keyof typeof models;
789
+ declare class ExuluTokenizer {
790
+ constructor();
791
+ encoder: Tiktoken | null;
792
+ create(modelName: TokenizerModelName): Promise<Tiktoken>;
793
+ decode(tokens: Uint32Array): Promise<string>;
794
+ decodeBatch(tokenSequences: Uint32Array[]): Promise<string[]>;
795
+ encode(text: string): Uint32Array;
796
+ countTokensBatch(texts: string[]): Promise<number[]>;
797
+ countTokens(text: string): number;
798
+ free(): Promise<void>;
799
+ }
800
+
801
+ /** Base Chunking Class. **/
802
+
803
+ /**
804
+ * Base class for all chunking classes.
805
+ *
806
+ * This abstract class provides a common interface and shared logic for all chunking implementations.
807
+ * It supports chunking a single text or a batch of texts, with optional concurrency and progress reporting.
808
+ *
809
+ * Subclasses must implement the `chunk` method to define how a single text is chunked.
810
+ *
811
+ * @template T - The type of chunk produced (usually `Chunk[]` or `string[]`).
812
+ *
813
+ * @property {ExuluTokenizer} tokenizer - The tokenizer instance used for chunking operations.
814
+ * @property {boolean} _useConcurrency - Whether to use concurrent processing for batch chunking (default: true).
815
+ *
816
+ * @example
817
+ * class MyChunker extends BaseChunker {
818
+ * async chunk(text: string): Promise<Chunk[]> {
819
+ * // ... implementation ...
820
+ * }
821
+ * }
822
+ *
823
+ * const chunker = new MyChunker(tokenizer);
824
+ * const chunks = await chunker.call("Some text");
825
+ * const batchChunks = await chunker.call(["Text 1", "Text 2"], true);
826
+ */
827
+ declare abstract class BaseChunker {
828
+ protected tokenizer: ExuluTokenizer;
829
+ protected _useConcurrency: boolean;
830
+ constructor(tokenizer: ExuluTokenizer);
831
+ /**
832
+ * Returns a string representation of the chunker instance.
833
+ *
834
+ * @returns {string} The class name and constructor signature.
835
+ */
836
+ toString(): string;
837
+ /**
838
+ * Call the chunker with a single string or an array of strings.
839
+ *
840
+ * If a single string is provided, returns the result of `chunk(text)`.
841
+ * If an array of strings is provided, returns the result of `chunkBatch(texts, showProgress)`.
842
+ *
843
+ * @param {string | string[]} textOrTexts - The text or array of texts to chunk.
844
+ * @param {boolean} [showProgress=false] - Whether to display progress for batch operations (only applies to arrays).
845
+ * @returns {Promise<Chunk[] | Chunk[][]>} The chunked result(s).
846
+ * @throws {Error} If input is not a string or array of strings.
847
+ */
848
+ call(text: string, showProgress?: boolean): Promise<Chunk[]>;
849
+ call(texts: string[], showProgress?: boolean): Promise<Chunk[][]>;
850
+ /**
851
+ * Process a batch of texts sequentially (one after another).
852
+ *
853
+ * @protected
854
+ * @param {string[]} texts - The texts to chunk.
855
+ * @param {boolean} [showProgress=false] - Whether to display progress in the console.
856
+ * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
857
+ */
858
+ protected _sequential_batch_processing(texts: string[], showProgress?: boolean): Promise<Chunk[][]>;
859
+ /**
860
+ * Process a batch of texts concurrently using Promise.all.
861
+ *
862
+ * @protected
863
+ * @param {string[]} texts - The texts to chunk.
864
+ * @param {boolean} [showProgress=false] - Whether to display progress in the console.
865
+ * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
866
+ */
867
+ protected _concurrent_batch_processing(texts: string[], showProgress?: boolean): Promise<Chunk[][]>;
868
+ /**
869
+ * Abstract method to chunk a single text. Must be implemented by subclasses.
870
+ *
871
+ * @param {string} text - The text to chunk.
872
+ * @returns {Promise<Chunk[]>} The chunked representation of the input text.
873
+ * @abstract
874
+ */
875
+ abstract chunk(text: string): Promise<Chunk[]>;
876
+ /**
877
+ * Chunk a batch of texts, using either concurrent or sequential processing.
878
+ *
879
+ * If only one text is provided, processes it directly without batch overhead.
880
+ *
881
+ * @param {string[]} texts - The texts to chunk.
882
+ * @param {boolean} [showProgress=true] - Whether to display progress in the console.
883
+ * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
884
+ */
885
+ chunkBatch(texts: string[], showProgress?: boolean): Promise<Chunk[][]>;
886
+ }
887
+
888
+ /** Module containing RecursiveChunker class. */
889
+
890
+ /**
891
+ * Configuration options for creating a RecursiveChunker instance.
892
+ * All options are optional and have sensible defaults.
893
+ *
894
+ * @interface RecursiveChunkerOptions
895
+ * @property {TokenizerModelName} [tokenizer] - The tiktoken model name used for text processing (a key of tiktoken's model-to-encoding map, e.g. "gpt-4"); the encoder is created internally.
896
+ * @property {number} [chunkSize] - The maximum number of tokens per chunk. Must be greater than 0. Default: 512.
897
+ * @property {RecursiveRules} [rules] - The rules that define how text should be recursively chunked. Default: new RecursiveRules().
898
+ * @property {number} [minCharactersPerChunk] - The minimum number of characters that should be in each chunk. Must be greater than 0. Default: 24.
899
+ */
900
+ interface RecursiveChunkerOptions {
901
+ tokenizer?: TokenizerModelName;
902
+ chunkSize?: number;
903
+ rules?: RecursiveRules;
904
+ minCharactersPerChunk?: number;
905
+ }
906
+ /**
907
+ * Represents a RecursiveChunker instance that is also directly callable as a function.
908
+ *
909
+ * This type combines all properties and methods of {@link RecursiveChunker} with callable signatures for chunking text(s).
910
+ *
911
+ * Calling the instance executes its `call` method (from {@link BaseChunker}), which in turn calls `chunk` or `chunkBatch`.
912
+ *
913
+ * @typedef {Object} CallableRecursiveChunker
914
+ * @property {number} chunkSize - The maximum number of tokens per chunk.
915
+ * @property {number} minCharactersPerChunk - The minimum number of characters per chunk.
916
+ * @property {RecursiveRules} rules - The rules that define how text should be recursively chunked.
917
+ * @property {string} sep - The separator string used for internal splitting (usually "✄").
918
+ * @property {ExuluTokenizer} tokenizer - The tokenizer instance used for chunking operations (inherited from BaseChunker).
919
+ *
920
+ * @method chunk - Recursively chunk a single text into chunks or strings.
921
+ * @method chunkBatch - Recursively chunk a batch of texts.
922
+ * @method toString - Returns a string representation of the RecursiveChunker instance.
923
+ * @method call - Call the chunker with a single string or an array of strings. (see callable signatures)
924
+ *
925
+ * @static
926
+ * @method create
927
+ * @memberof CallableRecursiveChunker
928
+ * @param {RecursiveChunkerOptions} [options] - Configuration options for the RecursiveChunker.
929
+ * @returns {Promise<CallableRecursiveChunker>} A Promise that resolves to a callable RecursiveChunker instance.
930
+ *
931
+ * @example
932
+ * const chunker = await RecursiveChunker.create({ chunkSize: 256 });
933
+ * const chunks = await chunker("Some text to chunk");
934
+ * const batchChunks = await chunker(["Text 1", "Text 2"]);
935
+ */
936
+ type CallableRecursiveChunker = RecursiveChunker & {
937
+ (text: string, showProgress?: boolean): Promise<RecursiveChunk[]>;
938
+ (texts: string[], showProgress?: boolean): Promise<(RecursiveChunk[])[]>;
939
+ };
940
+ /**
941
+ * Recursively chunk text using a set of rules.
942
+ *
943
+ * This class extends the BaseChunker class and implements the chunk method.
944
+ * It provides a flexible way to chunk text based on custom rules, including
945
+ * delimiters, whitespace, and token-based chunking.
946
+ *
947
+ * @extends BaseChunker
948
+ * @property {number} chunkSize - The maximum number of tokens per chunk.
949
+ * @property {number} minCharactersPerChunk - The minimum number of characters per chunk.
950
+ * @property {RecursiveRules} rules - The rules that define how text should be recursively chunked.
951
+ * @property {string} sep - The separator string used for internal splitting (usually "✄").
952
+ *
953
+ * @method chunk - Recursively chunk a single text into chunks or strings.
954
+ * @method chunkBatch - Recursively chunk a batch of texts.
955
+ * @method toString - Returns a string representation of the RecursiveChunker instance.
956
+ * @method call - Call the chunker with a single string or an array of strings. (see callable signatures)
957
+ *
958
+ * @static
959
+ * @method create
960
+ * @memberof RecursiveChunker
961
+ * @param {RecursiveChunkerOptions} [options] - Configuration options for the RecursiveChunker.
962
+ * @returns {Promise<RecursiveChunker>} A Promise that resolves to a RecursiveChunker instance.
963
+ *
964
+ * @example
965
+ * const chunker = await RecursiveChunker.create({ chunkSize: 256 });
966
+ * const chunks = await chunker("Some text to chunk");
967
+ * const batchChunks = await chunker(["Text 1", "Text 2"]);
968
+ */
969
+ declare class RecursiveChunker extends BaseChunker {
970
+ readonly chunkSize: number;
971
+ readonly minCharactersPerChunk: number;
972
+ readonly rules: RecursiveRules;
973
+ readonly sep: string;
974
+ private readonly _CHARS_PER_TOKEN;
975
+ /**
976
+ * Private constructor. Use `RecursiveChunker.create()` to instantiate.
977
+ */
978
+ private constructor();
979
+ /**
980
+ * Creates and initializes a directly callable RecursiveChunker instance.
981
+ *
982
+ * This static factory method constructs a RecursiveChunker with the provided options and returns a callable function object.
983
+ * The returned instance can be used as both a function (to chunk text(s)) and as an object (with all RecursiveChunker methods and properties).
984
+ *
985
+ * @param {RecursiveChunkerOptions} [options] - Configuration options for the chunker. All options are optional:
986
+ * @param {TokenizerModelName} [options.tokenizer] - Tiktoken model name used for text processing (e.g., "gpt-4"); the encoder is created internally via ExuluTokenizer.create().
987
+ * @param {number} [options.chunkSize=512] - Maximum number of tokens per chunk. Must be > 0.
988
+ * @param {RecursiveRules} [options.rules=new RecursiveRules()] - Rules for recursive chunking. See {@link RecursiveRules} for customization.
989
+ * @param {number} [options.minCharactersPerChunk=24] - Minimum number of characters per chunk. Must be > 0.
990
+ *
991
+ * @returns {Promise<CallableRecursiveChunker>} Promise resolving to a callable RecursiveChunker instance.
992
+ *
993
+ * @throws {Error} If any option is invalid (e.g., chunkSize <= 0).
994
+ *
995
+ * @see CallableRecursiveChunker for the callable interface and available properties/methods.
996
+ *
997
+ * @example <caption>Basic usage with default options</caption>
998
+ * const chunker = await RecursiveChunker.create();
999
+ * const chunks = await chunker("Some text to chunk");
1000
+ *
1001
+ * @example <caption>Custom options and batch chunking</caption>
1002
+ * const chunker = await RecursiveChunker.create({ chunkSize: 256 });
1003
+ * const batchChunks = await chunker(["Text 1", "Text 2"]);
1004
+ *
1005
+ * @example <caption>Accessing properties and methods</caption>
1006
+ * const chunker = await RecursiveChunker.create();
1007
+ * console.log(chunker.chunkSize); // 512
1008
+ * console.log(chunker.rules); // RecursiveRules instance
1009
+ * const chunks = await chunker.chunk("Some text"); // Use as object method
1010
+ *
1011
+ * @note
1012
+ * The returned instance is both callable (like a function) and has all properties/methods of RecursiveChunker.
1013
+ * You can use it as a drop-in replacement for a function or a class instance.
1014
+ *
1015
+ * @note
1016
+ * For advanced customization, pass a custom RecursiveRules object to the rules option.
1017
+ * See {@link RecursiveRules} and {@link RecursiveLevel} for rule structure.
1018
+ */
1019
+ static create(options?: RecursiveChunkerOptions): Promise<CallableRecursiveChunker>;
1020
+ /**
1021
+ * Estimates the number of tokens in a given text.
1022
+ *
1023
+ * This method uses a character-to-token ratio (default: 6.5 characters per token) for quick estimation.
1024
+ * If the estimated token count exceeds the chunk size, it performs an actual token count.
1025
+ *
1026
+ * @param {string} text - The text to estimate token count for
1027
+ * @returns {Promise<number>} A promise that resolves to the estimated number of tokens
1028
+ * @private
1029
+ */
1030
+ private _estimateTokenCount;
1031
+ /**
1032
+ * Split the text into chunks based on the provided recursive level rules.
1033
+ *
1034
+ * This method handles three different splitting strategies:
1035
+ * 1. Whitespace-based splitting: Splits text on spaces
1036
+ * 2. Delimiter-based splitting: Splits text on specified delimiters with options to include delimiters
1037
+ * 3. Token-based splitting: Splits text into chunks of maximum token size
1038
+ *
1039
+ * @param {string} text - The text to be split into chunks
1040
+ * @param {RecursiveLevel} recursiveLevel - The rules defining how to split the text
1041
+ * @returns {Promise<string[]>} A promise that resolves to an array of text chunks
1042
+ * @private
1043
+ */
1044
+ private _splitText;
1045
+ /**
1046
+ * Create a RecursiveChunk object with indices based on the current offset.
1047
+ *
1048
+ * This method constructs a RecursiveChunk object that contains metadata about the chunk,
1049
+ * including the text content, its start and end indices, token count, and the level of recursion.
1050
+ *
1051
+ * @param {string} text - The text content of the chunk
1052
+ * @param {number} tokenCount - The number of tokens in the chunk
1053
+ */
1054
+ private _makeChunks;
1055
+ /**
1056
+ * Merge short splits.
1057
+ */
1058
+ private _mergeSplits;
1059
+ /**
1060
+ * Binary search to find the leftmost position where value should be inserted to maintain order.
1061
+ *
1062
+ * @param {number[]} arr - The array to search
1063
+ * @param {number} value - The value to insert
1064
+ * @param {number} [lo=0] - The starting index for the search
1065
+ * @returns {number} The index where the value should be inserted
1066
+ * @private
1067
+ */
1068
+ private _bisectLeft;
1069
+ /**
1070
+ * Recursive helper for core chunking.
1071
+ */
1072
+ private _recursiveChunk;
1073
+ /**
1074
+ * Recursively chunk text.
1075
+ *
1076
+ * This method is the main entry point for chunking text using the RecursiveChunker.
1077
+ * It takes a single text string and returns an array of RecursiveChunk objects.
1078
+ *
1079
+ * @param {string} text - The text to be chunked
1080
+ * @returns {Promise<RecursiveChunk[]>} A promise that resolves to an array of RecursiveChunk objects
1081
+ */
1082
+ chunk(text: string): Promise<RecursiveChunk[]>;
1083
+ /**
1084
+ * Return a string representation of the RecursiveChunker.
1085
+ *
1086
+ * This method provides a string representation of the RecursiveChunker instance,
1087
+ * including its tokenizer, rules, chunk size, minimum characters per chunk, and return type.
1088
+ *
1089
+ * @returns {string} A string representation of the RecursiveChunker
1090
+ */
1091
+ toString(): string;
1092
+ }
1093
+
1094
+ /**
1095
+ * Represents the essential data for a sentence within a text.
1096
+ *
1097
+ * @property text - The actual sentence string as it appears in the source text.
1098
+ * @property startIndex - The zero-based index indicating where the sentence starts in the original text.
1099
+ * @property endIndex - The zero-based index indicating where the sentence ends in the original text (inclusive).
1100
+ * @property tokenCount - The number of tokens (words or subwords) in the sentence, useful for NLP tasks.
1101
+ */
1102
+ interface SentenceData {
1103
+ text: string;
1104
+ startIndex: number;
1105
+ endIndex: number;
1106
+ tokenCount: number;
1107
+ }
1108
+ /**
1109
+ * Class to represent a sentence.
1110
+ *
1111
+ * Represents a single sentence within a text, including its text, position, and token count.
1112
+ *
1113
+ * @class
1114
+ * @param {SentenceData} data - The data required to construct a Sentence instance.
1115
+ * @property {string} text - The text of the sentence.
1116
+ * @property {number} startIndex - The starting index of the sentence in the original text.
1117
+ * @property {number} endIndex - The ending index of the sentence in the original text.
1118
+ * @property {number} tokenCount - The number of tokens in the sentence.
1119
+ * @property {number[]} [embedding] - The embedding vector for the sentence (array of numbers, or null if not present).
1120
+ *
1121
+ * @method toString Returns a string representation of the Sentence.
1122
+ * @returns {string}
1123
+ *
1124
+ * @method toDict Returns the Sentence as a dictionary-like object.
1125
+ * @returns {SentenceData}
1126
+ *
1127
+ * @method static fromDict Creates a Sentence object from a dictionary-like object.
1128
+ * @param {SentenceData} data - The data to create the Sentence from.
1129
+ * @returns {Sentence}
1130
+ */
1131
+ declare class Sentence {
1132
+ /** The text of the sentence */
1133
+ text: string;
1134
+ /** The starting index of the sentence in the original text */
1135
+ startIndex: number;
1136
+ /** The ending index of the sentence in the original text */
1137
+ endIndex: number;
1138
+ /** The number of tokens in the sentence */
1139
+ tokenCount: number;
1140
+ constructor(data: SentenceData);
1141
+ /** Return a string representation of the Sentence */
1142
+ toString(): string;
1143
+ /** Return the Sentence as a dictionary-like object */
1144
+ toDict(): SentenceData;
1145
+ /** Create a Sentence object from a dictionary-like object */
1146
+ static fromDict(data: SentenceData): Sentence;
1147
+ }
1148
+ /**
1149
+ * Represents the essential data for a chunk of sentences within a text.
1150
+ *
1151
+ * @property text - The combined text of all sentences in the chunk as it appears in the source text.
1152
+ * @property startIndex - The zero-based index indicating where the chunk starts in the original text.
1153
+ * @property endIndex - The zero-based index indicating where the chunk ends in the original text (inclusive).
1154
+ * @property tokenCount - The total number of tokens (words or subwords) in the chunk, useful for NLP tasks.
1155
+ * @property sentences - An array of SentenceData objects, each representing an individual sentence within the chunk.
1156
+ */
1157
+ interface SentenceChunkData {
1158
+ text: string;
1159
+ startIndex: number;
1160
+ endIndex: number;
1161
+ tokenCount: number;
1162
+ sentences: SentenceData[];
1163
+ embedding?: number[];
1164
+ }
1165
+ /**
1166
+ * Represents a chunk of one or more sentences within a text.
1167
+ *
1168
+ * A SentenceChunk groups together multiple {@link Sentence} objects, providing their combined text, position, and token count within the original text.
1169
+ *
1170
+ * @class
1171
+ * @extends Chunk
1172
+ *
1173
+ * @param {Object} data - Data to construct a SentenceChunk instance.
1174
+ * @param {string} data.text - Combined text of all sentences in the chunk.
1175
+ * @param {number} data.startIndex - Zero-based index where the chunk starts in the original text.
1176
+ * @param {number} data.endIndex - Zero-based index where the chunk ends in the original text (inclusive).
1177
+ * @param {number} data.tokenCount - Total number of tokens in the chunk.
1178
+ * @param {Sentence[]} data.sentences - Array of {@link Sentence} objects in the chunk.
1179
+ *
1180
+ * @property {string} text - Combined text of all sentences in the chunk.
1181
+ * @property {number} startIndex - Starting index of the chunk in the original text.
1182
+ * @property {number} endIndex - Ending index of the chunk in the original text.
1183
+ * @property {number} tokenCount - Total number of tokens in the chunk.
1184
+ * @property {Sentence[]} sentences - List of {@link Sentence} objects in the chunk.
1185
+ *
1186
+ * @method toString Returns a detailed string representation of the SentenceChunk, including its text, start and end indices, token count, and a list of all contained sentences with their metadata.
1187
+ * @method toDict Returns the SentenceChunk as a plain object (see {@link SentenceChunkData}).
1188
+ * @method static fromDict Creates a SentenceChunk from a {@link SentenceChunkData} object.
1189
+ */
1190
+ declare class SentenceChunk extends Chunk {
1191
+ /** List of sentences in the chunk */
1192
+ sentences: Sentence[];
1193
+ constructor(data: {
1194
+ text: string;
1195
+ startIndex: number;
1196
+ endIndex: number;
1197
+ tokenCount: number;
1198
+ sentences: Sentence[];
1199
+ embedding?: number[];
1200
+ });
1201
+ /**
1202
+ * Returns a detailed string representation of the SentenceChunk, including its text, start and end indices, token count, and a list of all contained sentences with their metadata.
1203
+ *
1204
+ * This method overrides the base {@link Chunk} toString method to provide a more informative output, which is especially useful for debugging and logging. Each sentence in the chunk is represented using its own toString method, and all sentences are included in the output.
1205
+ *
1206
+ * @returns {string} A string describing the SentenceChunk and all its sentences, e.g.,
1207
+ * SentenceChunk(text=..., startIndex=..., endIndex=..., tokenCount=..., sentences=[Sentence(...), ...])
1208
+ */
1209
+ toString(): string;
1210
+ /**
1211
+ * Returns the SentenceChunk as a dictionary-like object.
1212
+ *
1213
+ * This method extends the base {@link Chunk} toDict method to include the sentences in the chunk.
1214
+ *
1215
+ * @returns {SentenceChunkData} A dictionary-like object containing the chunk's text, start and end indices, token count, and an array of sentence data.
1216
+ */
1217
+ toDict(): SentenceChunkData;
1218
+ /**
1219
+ * Creates a SentenceChunk object from a dictionary-like object.
1220
+ *
1221
+ * This method extends the base {@link Chunk} fromDict method to include the sentences in the chunk.
1222
+ *
1223
+ * @param {SentenceChunkData} data - A dictionary-like object containing the chunk's text, start and end indices, token count, and an array of sentence data.
1224
+ * @returns {SentenceChunk} A new SentenceChunk object created from the provided dictionary-like object.
1225
+ */
1226
+ static fromDict(data: SentenceChunkData): SentenceChunk;
1227
+ }
1228
+
1229
+ /** Module containing SentenceChunker class. */
1230
+
1231
+ /**
1232
+ * Options for creating a SentenceChunker instance.
1233
+ *
1234
+ * @property {TokenizerModelName} [tokenizer] - Name of the tokenizer model to use for token counting. Must be one of the supported {@link TokenizerModelName} values.
1235
+ * @property {number} [chunkSize] - Maximum number of tokens per chunk. Must be > 0. Default: 512.
1236
+ * @property {number} [chunkOverlap] - Number of tokens to overlap between consecutive chunks. Must be >= 0 and < chunkSize. Default: 0.
1237
+ * @property {number} [minSentencesPerChunk] - Minimum number of sentences per chunk. Must be > 0. Default: 1.
1238
+ * @property {number} [minCharactersPerSentence] - Minimum number of characters for a valid sentence. Sentences shorter than this are merged. Must be > 0. Default: 12.
1239
+ * @property {boolean} [approximate] - (Deprecated) Whether to use approximate token counting. Default: false. Will be removed in future versions.
1240
+ * @property {string[]} [delim] - List of sentence delimiters to use for splitting. Default: ['. ', '! ', '? ', '\n'].
1241
+ * @property {('prev' | 'next' | null)} [includeDelim] - Whether to include the delimiter with the previous sentence ('prev'), next sentence ('next'), or exclude it (null). Default: 'prev'.
1242
+ */
1243
+ interface SentenceChunkerOptions {
1244
+ tokenizer?: TokenizerModelName;
1245
+ chunkSize?: number;
1246
+ chunkOverlap?: number;
1247
+ minSentencesPerChunk?: number;
1248
+ minCharactersPerSentence?: number;
1249
+ approximate?: boolean;
1250
+ delim?: string[];
1251
+ includeDelim?: "prev" | "next" | null;
1252
+ }
1253
+ /**
1254
+ * Represents a SentenceChunker instance that is also directly callable.
1255
+ * This type combines the SentenceChunker class with a function interface,
1256
+ * allowing the instance to be called directly like a function.
1257
+ *
1258
+ * When called, it executes the `call` method inherited from BaseChunker,
1259
+ * which in turn calls either `chunk` (for single text) or `chunkBatch` (for multiple texts).
1260
+ *
1261
+ * @example
1262
+ * const chunker = await SentenceChunker.create();
1263
+ * // Single text processing
1264
+ * const chunks = await chunker("This is a sample text.");
1265
+ * // Batch processing
1266
+ * const batchChunks = await chunker(["Text 1", "Text 2"]);
1267
+ *
1268
+ * @type {SentenceChunker & {
1269
+ * (text: string, showProgress?: boolean): Promise<SentenceChunk[]>;
1270
+ * (texts: string[], showProgress?: boolean): Promise<SentenceChunk[][]>;
1271
+ * }}
1272
+ */
1273
+ type CallableSentenceChunker = SentenceChunker & {
1274
+ (text: string, showProgress?: boolean): Promise<SentenceChunk[]>;
1275
+ (texts: string[], showProgress?: boolean): Promise<SentenceChunk[][]>;
1276
+ };
1277
+ /**
1278
+ * SentenceChunker is a class that extends the BaseChunker base class.
1279
+ * It splits text into sentences using configurable delimiters and groups them into chunks whose size is measured with a tokenizer.
1280
+ *
1281
+ * @extends BaseChunker
1282
+ *
1283
+ * @property {number} chunkSize - Maximum number of tokens per chunk.
1284
+ * @property {number} chunkOverlap - Number of tokens to overlap between consecutive chunks.
1285
+ * @property {number} minSentencesPerChunk - Minimum number of sentences per chunk.
1286
+ * @property {number} minCharactersPerSentence - Minimum number of characters for a valid sentence.
1287
+ * @property {boolean} approximate - Whether to use approximate token counting.
1288
+ * @property {string[]} delim - List of sentence delimiters to use for splitting.
1289
+ * @property {('prev' | 'next' | null)} includeDelim - Whether to include the delimiter with the previous sentence ('prev'), next sentence ('next'), or exclude it (null).
1290
+ *
1291
+ * @method chunk - Chunk a single text string.
1292
+ * @method chunkBatch - Chunk an array of text strings.
1293
+ * @method call - (Inherited from BaseChunker) Chunk a single text string or an array of text strings.
1294
+ * @method toString - Return a string representation of the SentenceChunker.
1295
+ *
1296
+ * @example
1297
+ * const chunker = await SentenceChunker.create();
1298
+ * const chunks = await chunker("This is a sample text.");
1299
+ * const batchChunks = await chunker(["Text 1", "Text 2"]);
1300
+ *
1301
+ * @see BaseChunker
1302
+ */
1303
+ declare class SentenceChunker extends BaseChunker {
1304
+ readonly chunkSize: number;
1305
+ readonly chunkOverlap: number;
1306
+ readonly minSentencesPerChunk: number;
1307
+ readonly minCharactersPerSentence: number;
1308
+ readonly approximate: boolean;
1309
+ readonly delim: string[];
1310
+ readonly includeDelim: "prev" | "next" | null;
1311
+ readonly sep: string;
1312
+ /**
1313
+ * Private constructor. Use `SentenceChunker.create()` to instantiate.
1314
+ *
1315
+ * @param {Tokenizer} tokenizer - The tokenizer to use for token counting.
1316
+ * @param {number} chunkSize - Maximum number of tokens per chunk.
1317
+ * @param {number} chunkOverlap - Number of tokens to overlap between consecutive chunks.
1318
+ * @param {number} minSentencesPerChunk - Minimum number of sentences per chunk.
1319
+ * @param {number} minCharactersPerSentence - Minimum number of characters for a valid sentence.
1320
+ * @param {boolean} approximate - Whether to use approximate token counting.
1321
+ * @param {string[]} delim - List of sentence delimiters to use for splitting.
1322
+ * @param {('prev' | 'next' | null)} includeDelim - Whether to include the delimiter with the previous sentence ('prev'), next sentence ('next'), or exclude it (null).
1323
+ */
1324
+ private constructor();
1325
+ /**
1326
+ * Creates and initializes a SentenceChunker instance that is directly callable.
1327
+ *
1328
+ * This method is a static factory function that returns a Promise resolving to a CallableSentenceChunker instance.
1329
+ * The returned instance is a callable function that can be used to chunk text strings or arrays of text strings.
1330
+ *
1331
+ * @param {SentenceChunkerOptions} [options] - Options for configuring the SentenceChunker.
1332
+ * @returns {Promise<CallableSentenceChunker>} A promise that resolves to a callable SentenceChunker instance.
1333
+ *
1334
+ * @example
1335
+ * const chunker = await SentenceChunker.create();
1336
+ * const chunks = await chunker("This is a sample text.");
1337
+ * const batchChunks = await chunker(["Text 1", "Text 2"]);
1338
+ *
1339
+ * @see SentenceChunkerOptions
1340
+ */
1341
+ static create(options?: SentenceChunkerOptions): Promise<CallableSentenceChunker>;
1342
+ /**
1343
+ * Fast sentence splitting while maintaining accuracy.
1344
+ *
1345
+ * @param {string} text - The text to split into sentences.
1346
+ * @returns {string[]} An array of sentences.
1347
+ */
1348
+ private _splitText;
1349
+ /**
1350
+ * Split text into sentences and calculate token counts for each sentence.
1351
+ *
1352
+ * @param {string} text - The text to split into sentences.
1353
+ * @returns {Promise<Sentence[]>} An array of Sentence objects.
1354
+ */
1355
+ private _prepareSentences;
1356
+ /**
1357
+ * Create a chunk from a list of sentences.
1358
+ *
1359
+ * @param {Sentence[]} sentences - The sentences to create a chunk from.
1360
+ * @returns {Promise<SentenceChunk>} A promise that resolves to a SentenceChunk object.
1361
+ */
1362
+ private _createChunk;
1363
+ /**
1364
+ * Split text into overlapping chunks based on sentences while respecting token limits.
1365
+ *
1366
+ * @param {string} text - The text to split into chunks.
1367
+ * @returns {Promise<SentenceChunk[]>} A promise that resolves to an array of SentenceChunk objects.
1368
+ */
1369
+ chunk(text: string): Promise<SentenceChunk[]>;
1370
+ /**
1371
+ * Binary search to find the leftmost position where value should be inserted to maintain order.
1372
+ *
1373
+ * @param {number[]} arr - The array to search.
1374
+ * @param {number} value - The value to search for.
1375
+ * @param {number} [lo] - The starting index of the search.
1376
+ * @returns {number} The index of the leftmost position where value should be inserted.
1377
+ */
1378
+ private _bisectLeft;
1379
+ /**
1380
+ * Return a string representation of the SentenceChunker.
1381
+ *
1382
+ * @returns {string} A string representation of the SentenceChunker.
1383
+ */
1384
+ toString(): string;
1385
+ }
1386
+
529
1387
  type JOB_STATUS = "completed" | "failed" | "delayed" | "active" | "waiting" | "paused" | "stuck";
530
1388
  declare const JOB_STATUS_ENUM: {
531
1389
  completed: string;
@@ -549,12 +1407,10 @@ declare const ExuluJobs: {
549
1407
  };
550
1408
 
551
1409
  declare const ExuluChunkers: {
552
- chonkie: {
553
- sentence: typeof SentenceChunker;
554
- recursive: {
555
- function: typeof RecursiveChunker;
556
- rules: typeof RecursiveRules;
557
- };
1410
+ sentence: typeof SentenceChunker;
1411
+ recursive: {
1412
+ function: typeof RecursiveChunker;
1413
+ rules: typeof RecursiveRules;
558
1414
  };
559
1415
  };
560
1416
  declare const ExuluDatabase: {