@exulu/backend 0.3.4 → 0.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1850,10 +1850,10 @@ var updateStatistic = async (statistic) => {
1850
1850
  };
1851
1851
 
1852
1852
  // src/registry/index.ts
1853
- var import_express6 = require("express");
1853
+ var import_express4 = require("express");
1854
1854
 
1855
1855
  // src/registry/routes.ts
1856
- var import_express3 = require("express");
1856
+ var import_express2 = require("express");
1857
1857
 
1858
1858
  // src/registry/rate-limiter.ts
1859
1859
  var rateLimiter = async (key, windowSeconds, limit, points) => {
@@ -1891,9 +1891,25 @@ var rateLimiter = async (key, windowSeconds, limit, points) => {
1891
1891
  }
1892
1892
  };
1893
1893
 
1894
- // src/registry/route-validators/index.ts
1895
- var import_express = require("express");
1896
- var import_jwt = require("next-auth/jwt");
1894
+ // src/auth/get-token.ts
1895
+ var import_jose = require("jose");
1896
+ var getToken = async (authHeader) => {
1897
+ const token = authHeader.split(" ")[1];
1898
+ if (!token) {
1899
+ throw new Error("No token provided");
1900
+ }
1901
+ if (!process.env.NEXTAUTH_SECRET) {
1902
+ throw new Error("No NEXTAUTH_SECRET provided");
1903
+ }
1904
+ try {
1905
+ const secret = process.env.NEXTAUTH_SECRET;
1906
+ const jwk = await (0, import_jose.importJWK)({ k: secret, alg: "HS256", kty: "oct" });
1907
+ const { payload } = await (0, import_jose.jwtVerify)(token, jwk);
1908
+ return payload;
1909
+ } catch (error) {
1910
+ throw new Error("Invalid token");
1911
+ }
1912
+ };
1897
1913
 
1898
1914
  // src/auth/auth.ts
1899
1915
  var import_bcryptjs2 = __toESM(require("bcryptjs"), 1);
@@ -2031,7 +2047,8 @@ var requestValidators = {
2031
2047
  let authtoken = null;
2032
2048
  if (typeof apikey !== "string") {
2033
2049
  const secret = process.env.NEXTAUTH_SECRET;
2034
- authtoken = await (0, import_jwt.getToken)({ req, secret });
2050
+ authtoken = await getToken(req.headers["authorization"] ?? "");
2051
+ console.log("[EXULU] authtoken", authtoken);
2035
2052
  }
2036
2053
  return await authentication({
2037
2054
  authtoken,
@@ -2216,7 +2233,7 @@ var VectorMethodEnum = {
2216
2233
  };
2217
2234
 
2218
2235
  // src/registry/routes.ts
2219
- var import_express4 = __toESM(require("express"), 1);
2236
+ var import_express3 = __toESM(require("express"), 1);
2220
2237
  var import_server3 = require("@apollo/server");
2221
2238
  var Papa = __toESM(require("papaparse"), 1);
2222
2239
  var import_cors = __toESM(require("cors"), 1);
@@ -2554,9 +2571,8 @@ type PageInfo {
2554
2571
  var import_express5 = require("@as-integrations/express5");
2555
2572
 
2556
2573
  // src/registry/uppy.ts
2557
- var import_express2 = require("express");
2574
+ var import_express = require("express");
2558
2575
  var import_body_parser = __toESM(require("body-parser"), 1);
2559
- var import_jwt2 = require("next-auth/jwt");
2560
2576
  var createUppyRoutes = async (app) => {
2561
2577
  const {
2562
2578
  S3Client,
@@ -2619,11 +2635,11 @@ var createUppyRoutes = async (app) => {
2619
2635
  }
2620
2636
  app.use(import_body_parser.default.urlencoded({ extended: true }), import_body_parser.default.json());
2621
2637
  app.get("/s3/list", async (req, res, next) => {
2638
+ req.accepts;
2622
2639
  const apikey = req.headers["exulu-api-key"] || null;
2623
2640
  let authtoken = null;
2624
2641
  if (typeof apikey !== "string") {
2625
- const secret = process.env.NEXTAUTH_SECRET;
2626
- authtoken = await (0, import_jwt2.getToken)({ req, secret });
2642
+ authtoken = await getToken(req.headers.authorization ?? "");
2627
2643
  }
2628
2644
  const { db: db2 } = await postgresClient();
2629
2645
  const authenticationResult = await authentication({
@@ -2673,8 +2689,7 @@ var createUppyRoutes = async (app) => {
2673
2689
  const { db: db2 } = await postgresClient();
2674
2690
  let authtoken = null;
2675
2691
  if (typeof apikey !== "string" && typeof internalkey !== "string") {
2676
- const secret = process.env.NEXTAUTH_SECRET;
2677
- authtoken = await (0, import_jwt2.getToken)({ req, secret });
2692
+ authtoken = await getToken(req.headers.authorization ?? "");
2678
2693
  }
2679
2694
  const authenticationResult = await authentication({
2680
2695
  authtoken,
@@ -2747,8 +2762,7 @@ var createUppyRoutes = async (app) => {
2747
2762
  const { db: db2 } = await postgresClient();
2748
2763
  let authtoken = null;
2749
2764
  if (typeof apikey !== "string") {
2750
- const secret = process.env.NEXTAUTH_SECRET;
2751
- authtoken = await (0, import_jwt2.getToken)({ req, secret });
2765
+ authtoken = await getToken(req.headers.authorization ?? "");
2752
2766
  }
2753
2767
  const authenticationResult = await authentication({
2754
2768
  authtoken,
@@ -2796,8 +2810,7 @@ var createUppyRoutes = async (app) => {
2796
2810
  const { db: db2 } = await postgresClient();
2797
2811
  let authtoken = null;
2798
2812
  if (typeof apikey !== "string") {
2799
- const secret = process.env.NEXTAUTH_SECRET;
2800
- authtoken = await (0, import_jwt2.getToken)({ req, secret });
2813
+ authtoken = await getToken(req.headers.authorization ?? "");
2801
2814
  }
2802
2815
  const authenticationResult = await authentication({
2803
2816
  authtoken,
@@ -3062,7 +3075,7 @@ var createExpressRoutes = async (app, agents, embedders, tools, workflows, conte
3062
3075
  app.use(
3063
3076
  "/graphql",
3064
3077
  (0, import_cors.default)(),
3065
- import_express4.default.json(),
3078
+ import_express3.default.json(),
3066
3079
  (0, import_express5.expressMiddleware)(server, {
3067
3080
  context: async ({ req }) => {
3068
3081
  const authenticationResult = await requestValidators.authenticate(req);
@@ -4232,8 +4245,1208 @@ var ExuluApp = class {
4232
4245
  };
4233
4246
  };
4234
4247
 
4235
- // src/index.ts
4236
- var import_chonkie = require("chonkie");
4248
+ // src/chunking/types/base.ts
4249
+ var Chunk = class _Chunk {
4250
+ /** The text of the chunk. */
4251
+ text;
4252
+ /** The starting index of the chunk in the original text. */
4253
+ startIndex;
4254
+ /** The ending index of the chunk in the original text. */
4255
+ endIndex;
4256
+ /** The number of tokens in the chunk. */
4257
+ tokenCount;
4258
+ /** Optional embedding for the chunk. */
4259
+ embedding;
4260
+ /**
4261
+ * Constructs a new Chunk object.
4262
+ *
4263
+ * @param {ChunkData} data - The data to construct the Chunk from.
4264
+ */
4265
+ constructor(data) {
4266
+ this.text = data.text;
4267
+ this.startIndex = data.startIndex;
4268
+ this.endIndex = data.endIndex;
4269
+ this.tokenCount = data.tokenCount;
4270
+ this.embedding = data.embedding;
4271
+ if (this.startIndex > this.endIndex) {
4272
+ throw new Error("Start index must be less than or equal to end index.");
4273
+ }
4274
+ if (this.tokenCount < 0) {
4275
+ throw new Error("Token count must be a non-negative integer.");
4276
+ }
4277
+ }
4278
+ /** Return a string representation of the Chunk.
4279
+ *
4280
+ * @returns {string} The text of the chunk.
4281
+ */
4282
+ toString() {
4283
+ return this.text;
4284
+ }
4285
+ /** Return a detailed string representation of the Chunk.
4286
+ *
4287
+ * @returns {string} The detailed string representation of the Chunk.
4288
+ */
4289
+ toRepresentation() {
4290
+ let repr = `Chunk(text='${this.text}', tokenCount=${this.tokenCount}, startIndex=${this.startIndex}, endIndex=${this.endIndex}`;
4291
+ repr += ")";
4292
+ return repr;
4293
+ }
4294
+ /** Return a slice of the chunk's text.
4295
+ *
4296
+ * @param {number} [start] - The starting index of the slice.
4297
+ * @param {number} [end] - The ending index of the slice.
4298
+ * @returns {string} The slice of the chunk's text.
4299
+ */
4300
+ slice(start, end) {
4301
+ return this.text.slice(start, end);
4302
+ }
4303
+ /** Return the Chunk as a dictionary-like object.
4304
+ *
4305
+ * @returns {ChunkData} The dictionary-like object.
4306
+ */
4307
+ toDict() {
4308
+ return {
4309
+ text: this.text,
4310
+ startIndex: this.startIndex,
4311
+ endIndex: this.endIndex,
4312
+ tokenCount: this.tokenCount,
4313
+ embedding: this.embedding
4314
+ };
4315
+ }
4316
+ /** Create a Chunk object from a dictionary-like object.
4317
+ *
4318
+ * @param {ChunkData} data - The dictionary-like object.
4319
+ * @returns {Chunk} The Chunk object.
4320
+ */
4321
+ static fromDict(data) {
4322
+ return new _Chunk({
4323
+ text: data.text,
4324
+ startIndex: data.startIndex,
4325
+ endIndex: data.endIndex,
4326
+ tokenCount: data.tokenCount,
4327
+ embedding: data.embedding
4328
+ });
4329
+ }
4330
+ /** Return a deep copy of the chunk.
4331
+ *
4332
+ * @returns {Chunk} The deep copy of the chunk.
4333
+ */
4334
+ copy() {
4335
+ return _Chunk.fromDict(this.toDict());
4336
+ }
4337
+ };
4338
+
4339
+ // src/chunking/types/recursive.ts
4340
+ var RecursiveLevel = class _RecursiveLevel {
4341
+ /** Custom delimiters for chunking */
4342
+ delimiters;
4343
+ /** Whether to use whitespace as a delimiter */
4344
+ whitespace;
4345
+ /** Whether to include the delimiter in the previous or next chunk */
4346
+ includeDelim;
4347
+ /**
4348
+ * Constructs a new RecursiveLevel object.
4349
+ *
4350
+ * @param {RecursiveLevelData} data - The data to construct the RecursiveLevel from.
4351
+ */
4352
+ constructor(data = {}) {
4353
+ this.delimiters = data.delimiters;
4354
+ this.whitespace = data.whitespace ?? false;
4355
+ this.includeDelim = data.includeDelim ?? "prev";
4356
+ this.validate();
4357
+ }
4358
+ /**
4359
+ * Validates the RecursiveLevel object.
4360
+ *
4361
+ * @private
4362
+ */
4363
+ validate() {
4364
+ if (this.delimiters !== void 0 && this.whitespace) {
4365
+ throw new Error("Cannot use whitespace as a delimiter and also specify custom delimiters.");
4366
+ }
4367
+ if (this.delimiters !== void 0) {
4368
+ if (typeof this.delimiters === "string" && this.delimiters.length === 0) {
4369
+ throw new Error("Custom delimiters cannot be an empty string.");
4370
+ }
4371
+ if (Array.isArray(this.delimiters)) {
4372
+ if (this.delimiters.some((delim) => typeof delim !== "string" || delim.length === 0)) {
4373
+ throw new Error("Custom delimiters cannot be an empty string.");
4374
+ }
4375
+ if (this.delimiters.includes(" ")) {
4376
+ throw new Error("Custom delimiters cannot be whitespace only. Set whitespace to true instead.");
4377
+ }
4378
+ }
4379
+ }
4380
+ }
4381
+ /** Return a string representation of the RecursiveLevel
4382
+ *
4383
+ * @returns {string} The string representation of the RecursiveLevel.
4384
+ */
4385
+ toString() {
4386
+ return `RecursiveLevel(delimiters=${this.delimiters}, whitespace=${this.whitespace}, includeDelim=${this.includeDelim})`;
4387
+ }
4388
+ /** Return the RecursiveLevel as a dictionary-like object
4389
+ *
4390
+ * @returns {RecursiveLevelData} The dictionary-like object.
4391
+ */
4392
+ toDict() {
4393
+ return {
4394
+ delimiters: this.delimiters,
4395
+ whitespace: this.whitespace,
4396
+ includeDelim: this.includeDelim
4397
+ };
4398
+ }
4399
+ /** Create RecursiveLevel object from a dictionary
4400
+ *
4401
+ * @param {RecursiveLevelData} data - The dictionary-like object.
4402
+ * @returns {RecursiveLevel} The RecursiveLevel object.
4403
+ */
4404
+ static fromDict(data) {
4405
+ return new _RecursiveLevel(data);
4406
+ }
4407
+ /** Create RecursiveLevel object from a recipe
4408
+ *
4409
+ * @param {string} name - The name of the recipe.
4410
+ * @param {string} lang - The language of the recipe.
4411
+ * @returns {Promise<RecursiveLevel>} The RecursiveLevel object.
4412
+ */
4413
+ static async fromRecipe(name, lang = "en") {
4414
+ throw new Error("Not implemented");
4415
+ }
4416
+ };
4417
+ var RecursiveRules = class _RecursiveRules {
4418
+ /** List of recursive levels */
4419
+ levels;
4420
+ constructor(data = {}) {
4421
+ if (data.levels === void 0) {
4422
+ const paragraphs = new RecursiveLevel({ delimiters: ["\n\n", "\r\n", "\n", "\r"] });
4423
+ const sentences = new RecursiveLevel({ delimiters: [". ", "! ", "? "] });
4424
+ const pauses = new RecursiveLevel({
4425
+ delimiters: [
4426
+ "{",
4427
+ "}",
4428
+ '"',
4429
+ "[",
4430
+ "]",
4431
+ "<",
4432
+ ">",
4433
+ "(",
4434
+ ")",
4435
+ ":",
4436
+ ";",
4437
+ ",",
4438
+ "\u2014",
4439
+ "|",
4440
+ "~",
4441
+ "-",
4442
+ "...",
4443
+ "`",
4444
+ "'"
4445
+ ]
4446
+ });
4447
+ const word = new RecursiveLevel({ whitespace: true });
4448
+ const token = new RecursiveLevel();
4449
+ this.levels = [paragraphs, sentences, pauses, word, token];
4450
+ } else {
4451
+ this.levels = data.levels.map((level) => new RecursiveLevel(level));
4452
+ }
4453
+ }
4454
+ /** Return a string representation of the RecursiveRules
4455
+ *
4456
+ * @returns {string} The string representation of the RecursiveRules.
4457
+ */
4458
+ toString() {
4459
+ return `RecursiveRules(levels=${this.levels})`;
4460
+ }
4461
+ /** Return the number of levels
4462
+ *
4463
+ * @returns {number} The number of levels.
4464
+ */
4465
+ get length() {
4466
+ return this.levels.length;
4467
+ }
4468
+ /** Get a level by index
4469
+ *
4470
+ * @param {number} index - The index of the level.
4471
+ * @returns {RecursiveLevel | undefined} The level.
4472
+ */
4473
+ getLevel(index) {
4474
+ return this.levels[index];
4475
+ }
4476
+ /** Return an iterator over the levels
4477
+ *
4478
+ * @returns {Iterator<RecursiveLevel>} The iterator over the levels.
4479
+ */
4480
+ [Symbol.iterator]() {
4481
+ return this.levels[Symbol.iterator]();
4482
+ }
4483
+ /** Create a RecursiveRules object from a dictionary
4484
+ *
4485
+ * @param {RecursiveRulesData} data - The dictionary-like object.
4486
+ * @returns {RecursiveRules} The RecursiveRules object.
4487
+ */
4488
+ static fromDict(data) {
4489
+ return new _RecursiveRules(data);
4490
+ }
4491
+ /** Return the RecursiveRules as a dictionary-like object
4492
+ *
4493
+ * @returns {RecursiveRulesData} The dictionary-like object.
4494
+ */
4495
+ toDict() {
4496
+ return {
4497
+ levels: this.levels.map((level) => level.toDict())
4498
+ };
4499
+ }
4500
+ /** Create a RecursiveRules object from a recipe
4501
+ *
4502
+ * @param {string} name - The name of the recipe.
4503
+ * @param {string} lang - The language of the recipe.
4504
+ * @param {string} path - The path to the recipe.
4505
+ * @returns {Promise<RecursiveRules>} The RecursiveRules object.
4506
+ */
4507
+ static async fromRecipe(name = "default", lang = "en", path3) {
4508
+ throw new Error("Not implemented");
4509
+ }
4510
+ };
4511
+ var RecursiveChunk = class _RecursiveChunk extends Chunk {
4512
+ /** The level of recursion for the chunk */
4513
+ level;
4514
+ constructor(data) {
4515
+ super(data);
4516
+ this.level = data.level;
4517
+ }
4518
+ /** Return a string representation of the RecursiveChunk
4519
+ *
4520
+ * @returns {string} The string representation of the RecursiveChunk.
4521
+ */
4522
+ toString() {
4523
+ return `RecursiveChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, level=${this.level})`;
4524
+ }
4525
+ /** Return the RecursiveChunk as a dictionary-like object
4526
+ *
4527
+ * @returns {RecursiveChunkData} The dictionary-like object.
4528
+ */
4529
+ toDict() {
4530
+ const baseDict = super.toDict();
4531
+ return {
4532
+ ...baseDict,
4533
+ level: this.level
4534
+ };
4535
+ }
4536
+ /** Create a RecursiveChunk object from a dictionary
4537
+ *
4538
+ * @param {RecursiveChunkData} data - The dictionary-like object.
4539
+ * @returns {RecursiveChunk} The RecursiveChunk object.
4540
+ */
4541
+ static fromDict(data) {
4542
+ return new _RecursiveChunk(data);
4543
+ }
4544
+ };
4545
+
4546
+ // src/chunking/tokenizer.ts
4547
+ var import_lite = require("tiktoken/lite");
4548
+ var import_load = require("tiktoken/load");
4549
+ var import_registry = __toESM(require("tiktoken/registry.json"), 1);
4550
+ var import_model_to_encoding = __toESM(require("tiktoken/model_to_encoding.json"), 1);
4551
+ var ExuluTokenizer = class {
4552
+ constructor() {
4553
+ }
4554
+ encoder = null;
4555
+ async create(modelName) {
4556
+ if (this.encoder) {
4557
+ return this.encoder;
4558
+ }
4559
+ const time = performance.now();
4560
+ console.log("[EXULU] Loading tokenizer.", modelName);
4561
+ const model = await (0, import_load.load)(import_registry.default[import_model_to_encoding.default[modelName]]);
4562
+ console.log("[EXULU] Loaded tokenizer.", modelName, performance.now() - time);
4563
+ console.log("[EXULU] Model.", model.bpe_ranks);
4564
+ console.log("[EXULU] Model.", model.special_tokens);
4565
+ console.log("[EXULU] Model.", model.pat_str);
4566
+ const encoder = new import_lite.Tiktoken(
4567
+ model.bpe_ranks,
4568
+ model.special_tokens,
4569
+ model.pat_str
4570
+ );
4571
+ console.log("[EXULU] Encoder.", encoder);
4572
+ this.encoder = encoder;
4573
+ return encoder;
4574
+ }
4575
+ async decode(tokens) {
4576
+ if (!this.encoder) {
4577
+ throw new Error("Tokenizer not initialized");
4578
+ }
4579
+ const text = this.encoder.decode(tokens);
4580
+ return new TextDecoder().decode(text);
4581
+ }
4582
+ async decodeBatch(tokenSequences) {
4583
+ if (!this.encoder) {
4584
+ throw new Error("Tokenizer not initialized");
4585
+ }
4586
+ const promises2 = tokenSequences.map((tokens) => this.decode(tokens));
4587
+ return await Promise.all(promises2);
4588
+ }
4589
+ encode(text) {
4590
+ if (!this.encoder) {
4591
+ throw new Error("Tokenizer not initialized");
4592
+ }
4593
+ const time = performance.now();
4594
+ console.log("[EXULU] Encoding text.", text);
4595
+ const tokens = this.encoder.encode(text);
4596
+ console.log("[EXULU] Encoded text.", text, performance.now() - time);
4597
+ return tokens;
4598
+ }
4599
+ async countTokensBatch(texts) {
4600
+ if (!this.encoder) {
4601
+ throw new Error("Tokenizer not initialized");
4602
+ }
4603
+ const promises2 = texts.map((text) => this.countTokens(text));
4604
+ return await Promise.all(promises2);
4605
+ }
4606
+ countTokens(text) {
4607
+ if (!this.encoder) {
4608
+ throw new Error("Tokenizer not initialized");
4609
+ }
4610
+ console.log("[EXULU] Counting tokens.", text);
4611
+ const tokens = this.encoder.encode(text);
4612
+ const count = tokens.length;
4613
+ console.log("[EXULU] Token count.", count);
4614
+ return count;
4615
+ }
4616
+ async free() {
4617
+ console.log("[EXULU] Freeing tokenizer.");
4618
+ if (this.encoder) {
4619
+ this.encoder.free();
4620
+ }
4621
+ }
4622
+ };
4623
+
4624
+ // src/chunking/base.ts
4625
+ var BaseChunker = class {
4626
+ tokenizer;
4627
+ _useConcurrency = true;
4628
+ // Determines if batch processing uses Promise.all
4629
+ constructor(tokenizer) {
4630
+ this.tokenizer = tokenizer;
4631
+ }
4632
+ /**
4633
+ * Returns a string representation of the chunker instance.
4634
+ *
4635
+ * @returns {string} The class name and constructor signature.
4636
+ */
4637
+ toString() {
4638
+ return `${this.constructor.name}()`;
4639
+ }
4640
+ async call(textOrTexts, showProgress = false) {
4641
+ if (typeof textOrTexts === "string") {
4642
+ return this.chunk(textOrTexts);
4643
+ } else if (Array.isArray(textOrTexts)) {
4644
+ return this.chunkBatch(textOrTexts, showProgress);
4645
+ } else {
4646
+ throw new Error("Input must be a string or an array of strings.");
4647
+ }
4648
+ }
4649
+ /**
4650
+ * Process a batch of texts sequentially (one after another).
4651
+ *
4652
+ * @protected
4653
+ * @param {string[]} texts - The texts to chunk.
4654
+ * @param {boolean} [showProgress=false] - Whether to display progress in the console.
4655
+ * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
4656
+ */
4657
+ async _sequential_batch_processing(texts, showProgress = false) {
4658
+ const results = [];
4659
+ const total = texts.length;
4660
+ for (let i = 0; i < total; i++) {
4661
+ if (showProgress && total > 1) {
4662
+ const progress = Math.round((i + 1) / total * 100);
4663
+ process.stdout.write(`Sequential processing: Document ${i + 1}/${total} (${progress}%)\r`);
4664
+ }
4665
+ results.push(await this.chunk(texts[i]));
4666
+ }
4667
+ if (showProgress && total > 1) {
4668
+ process.stdout.write("\n");
4669
+ }
4670
+ return results;
4671
+ }
4672
+ /**
4673
+ * Process a batch of texts concurrently using Promise.all.
4674
+ *
4675
+ * @protected
4676
+ * @param {string[]} texts - The texts to chunk.
4677
+ * @param {boolean} [showProgress=false] - Whether to display progress in the console.
4678
+ * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
4679
+ */
4680
+ async _concurrent_batch_processing(texts, showProgress = false) {
4681
+ const total = texts.length;
4682
+ let completedCount = 0;
4683
+ const updateProgress = () => {
4684
+ if (showProgress && total > 1) {
4685
+ completedCount++;
4686
+ const progress = Math.round(completedCount / total * 100);
4687
+ process.stdout.write(`Concurrent processing: Document ${completedCount}/${total} (${progress}%)\r`);
4688
+ }
4689
+ };
4690
+ const chunkPromises = texts.map(
4691
+ (text) => this.chunk(text).then((result) => {
4692
+ updateProgress();
4693
+ return result;
4694
+ })
4695
+ );
4696
+ const results = await Promise.all(chunkPromises);
4697
+ if (showProgress && total > 1 && completedCount > 0) {
4698
+ process.stdout.write("\n");
4699
+ }
4700
+ return results;
4701
+ }
4702
+ /**
4703
+ * Chunk a batch of texts, using either concurrent or sequential processing.
4704
+ *
4705
+ * If only one text is provided, processes it directly without batch overhead.
4706
+ *
4707
+ * @param {string[]} texts - The texts to chunk.
4708
+ * @param {boolean} [showProgress=true] - Whether to display progress in the console.
4709
+ * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
4710
+ */
4711
+ async chunkBatch(texts, showProgress = true) {
4712
+ if (texts.length === 0) {
4713
+ return [];
4714
+ }
4715
+ if (texts.length === 1) {
4716
+ return [await this.chunk(texts[0])];
4717
+ }
4718
+ if (this._useConcurrency) {
4719
+ return this._concurrent_batch_processing(texts, showProgress);
4720
+ } else {
4721
+ return this._sequential_batch_processing(texts, showProgress);
4722
+ }
4723
+ }
4724
+ };
4725
+
4726
+ // src/chunking/recursive.ts
4727
+ var RecursiveChunker = class _RecursiveChunker extends BaseChunker {
4728
+ chunkSize;
4729
+ minCharactersPerChunk;
4730
+ rules;
4731
+ sep;
4732
+ _CHARS_PER_TOKEN = 6.5;
4733
+ /**
4734
+ * Private constructor. Use `RecursiveChunker.create()` to instantiate.
4735
+ */
4736
+ constructor(tokenizer, chunkSize, rules, minCharactersPerChunk) {
4737
+ super(tokenizer);
4738
+ if (chunkSize <= 0) {
4739
+ throw new Error("chunkSize must be greater than 0");
4740
+ }
4741
+ if (minCharactersPerChunk <= 0) {
4742
+ throw new Error("minCharactersPerChunk must be greater than 0");
4743
+ }
4744
+ if (!(rules instanceof RecursiveRules)) {
4745
+ throw new Error("rules must be a RecursiveRules object");
4746
+ }
4747
+ this.chunkSize = chunkSize;
4748
+ this.minCharactersPerChunk = minCharactersPerChunk;
4749
+ this.rules = rules;
4750
+ this.sep = "\u2704";
4751
+ }
4752
+ /**
4753
+ * Creates and initializes a directly callable RecursiveChunker instance.
4754
+ *
4755
+ * This static factory method constructs a RecursiveChunker with the provided options and returns a callable function object.
4756
+ * The returned instance can be used as both a function (to chunk text(s)) and as an object (with all RecursiveChunker methods and properties).
4757
+ *
4758
+ * @param {RecursiveChunkerOptions} [options] - Configuration options for the chunker. All options are optional:
4759
+ * @param {string|Tokenizer} [options.tokenizer="Xenova/gpt2"] - Tokenizer to use for text processing. Can be a string identifier (e.g., "Xenova/gpt2") or a Tokenizer instance. If a string is provided, Tokenizer.create() is called internally.
4760
+ * @param {number} [options.chunkSize=512] - Maximum number of tokens per chunk. Must be > 0.
4761
+ * @param {RecursiveRules} [options.rules=new RecursiveRules()] - Rules for recursive chunking. See {@link RecursiveRules} for customization.
4762
+ * @param {number} [options.minCharactersPerChunk=24] - Minimum number of characters per chunk. Must be > 0.
4763
+ *
4764
+ * @returns {Promise<CallableRecursiveChunker>} Promise resolving to a callable RecursiveChunker instance.
4765
+ *
4766
+ * @throws {Error} If any option is invalid (e.g., chunkSize <= 0).
4767
+ *
4768
+ * @see CallableRecursiveChunker for the callable interface and available properties/methods.
4769
+ *
4770
+ * @example <caption>Basic usage with default options</caption>
4771
+ * const chunker = await RecursiveChunker.create();
4772
+ * const chunks = await chunker("Some text to chunk");
4773
+ *
4774
+ * @example <caption>Custom options and batch chunking</caption>
4775
+ * const chunker = await RecursiveChunker.create({ chunkSize: 256 });
4776
+ * const batchChunks = await chunker(["Text 1", "Text 2"]);
4777
+ *
4778
+ * @example <caption>Accessing properties and methods</caption>
4779
+ * const chunker = await RecursiveChunker.create();
4780
+ * console.log(chunker.chunkSize); // 512
4781
+ * console.log(chunker.rules); // RecursiveRules instance
4782
+ * const chunks = await chunker.chunk("Some text"); // Use as object method
4783
+ *
4784
+ * @note
4785
+ * The returned instance is both callable (like a function) and has all properties/methods of RecursiveChunker.
4786
+ * You can use it as a drop-in replacement for a function or a class instance.
4787
+ *
4788
+ * @note
4789
+ * For advanced customization, pass a custom RecursiveRules object to the rules option.
4790
+ * See {@link RecursiveRules} and {@link RecursiveLevel} for rule structure.
4791
+ */
4792
+ static async create(options = {}) {
4793
+ const {
4794
+ tokenizer = "gpt-3.5-turbo",
4795
+ chunkSize = 512,
4796
+ rules = new RecursiveRules(),
4797
+ minCharactersPerChunk = 24
4798
+ } = options;
4799
+ const tokenizerInstance = await new ExuluTokenizer();
4800
+ await tokenizerInstance.create(tokenizer);
4801
+ const plainInstance = new _RecursiveChunker(
4802
+ tokenizerInstance,
4803
+ chunkSize,
4804
+ rules,
4805
+ minCharactersPerChunk
4806
+ );
4807
+ const callableFn = function(textOrTexts, showProgress) {
4808
+ if (typeof textOrTexts === "string") {
4809
+ return plainInstance.call(textOrTexts, showProgress);
4810
+ } else {
4811
+ return plainInstance.call(textOrTexts, showProgress);
4812
+ }
4813
+ };
4814
+ Object.setPrototypeOf(callableFn, _RecursiveChunker.prototype);
4815
+ Object.assign(callableFn, plainInstance);
4816
+ return callableFn;
4817
+ }
4818
+ /**
4819
+ * Estimates the number of tokens in a given text.
4820
+ *
4821
+ * This method uses a character-to-token ratio (default: 6.5 characters per token) for quick estimation.
4822
+ * If the estimated token count exceeds the chunk size, it performs an actual token count.
4823
+ *
4824
+ * @param {string} text - The text to estimate token count for
4825
+ * @returns {Promise<number>} A promise that resolves to the estimated number of tokens
4826
+ * @private
4827
+ */
4828
+ async _estimateTokenCount(text) {
4829
+ const estimate = Math.max(1, Math.floor(text.length / this._CHARS_PER_TOKEN));
4830
+ if (estimate > this.chunkSize) {
4831
+ return this.chunkSize + 1;
4832
+ }
4833
+ return this.tokenizer.countTokens(text);
4834
+ }
4835
+ /**
4836
+ * Split the text into chunks based on the provided recursive level rules.
4837
+ *
4838
+ * This method handles three different splitting strategies:
4839
+ * 1. Whitespace-based splitting: Splits text on spaces
4840
+ * 2. Delimiter-based splitting: Splits text on specified delimiters with options to include delimiters
4841
+ * 3. Token-based splitting: Splits text into chunks of maximum token size
4842
+ *
4843
+ * @param {string} text - The text to be split into chunks
4844
+ * @param {RecursiveLevel} recursiveLevel - The rules defining how to split the text
4845
+ * @returns {Promise<string[]>} A promise that resolves to an array of text chunks
4846
+ * @private
4847
+ */
4848
+ async _splitText(text, recursiveLevel) {
4849
+ if (recursiveLevel.whitespace) {
4850
+ return text.split(" ");
4851
+ } else if (recursiveLevel.delimiters) {
4852
+ let t = text;
4853
+ if (recursiveLevel.includeDelim === "prev") {
4854
+ for (const delimiter of Array.isArray(recursiveLevel.delimiters) ? recursiveLevel.delimiters : [recursiveLevel.delimiters]) {
4855
+ t = t.replace(delimiter, delimiter + this.sep);
4856
+ }
4857
+ } else if (recursiveLevel.includeDelim === "next") {
4858
+ for (const delimiter of Array.isArray(recursiveLevel.delimiters) ? recursiveLevel.delimiters : [recursiveLevel.delimiters]) {
4859
+ t = t.replace(delimiter, this.sep + delimiter);
4860
+ }
4861
+ } else {
4862
+ for (const delimiter of Array.isArray(recursiveLevel.delimiters) ? recursiveLevel.delimiters : [recursiveLevel.delimiters]) {
4863
+ t = t.replace(delimiter, this.sep);
4864
+ }
4865
+ }
4866
+ const splits = t.split(this.sep).filter((split) => split !== "");
4867
+ let current = "";
4868
+ const merged = [];
4869
+ for (const split of splits) {
4870
+ if (split.length < this.minCharactersPerChunk) {
4871
+ current += split;
4872
+ } else if (current) {
4873
+ current += split;
4874
+ merged.push(current);
4875
+ current = "";
4876
+ } else {
4877
+ merged.push(split);
4878
+ }
4879
+ if (current.length >= this.minCharactersPerChunk) {
4880
+ merged.push(current);
4881
+ current = "";
4882
+ }
4883
+ }
4884
+ if (current) {
4885
+ merged.push(current);
4886
+ }
4887
+ return merged;
4888
+ } else {
4889
+ const encoded = await this.tokenizer.encode(text);
4890
+ const tokenSplits = [];
4891
+ for (let i = 0; i < encoded.length; i += this.chunkSize) {
4892
+ tokenSplits.push(encoded.slice(i, i + this.chunkSize));
4893
+ }
4894
+ return await this.tokenizer.decodeBatch(tokenSplits);
4895
+ }
4896
+ }
4897
+ /**
4898
+ * Create a RecursiveChunk object with indices based on the current offset.
4899
+ *
4900
+ * This method constructs a RecursiveChunk object that contains metadata about the chunk,
4901
+ * including the text content, its start and end indices, token count, and the level of recursion.
4902
+ *
4903
+ * @param {string} text - The text content of the chunk
4904
+ * @param {number} tokenCount - The number of tokens in the chunk
4905
+ */
4906
+ _makeChunks(text, tokenCount, level, startOffset) {
4907
+ return new RecursiveChunk({
4908
+ text,
4909
+ startIndex: startOffset,
4910
+ endIndex: startOffset + text.length,
4911
+ tokenCount,
4912
+ level
4913
+ });
4914
+ }
4915
  /**
   * Merge short splits into larger pieces of up to `this.chunkSize` tokens.
   *
   * Token counts are accumulated into a prefix-sum array so that, for each
   * starting split, the furthest split that still fits in the chunk budget
   * is found with a binary search instead of a linear scan.
   *
   * @param {string[]} splits - The text splits to merge.
   * @param {number[]} tokenCounts - Per-split token counts (same length as `splits`).
   * @param {boolean} [combineWhitespace=false] - When true, merged splits are
   *   joined with a space and one extra token per join is budgeted.
   * @returns {[string[], number[]]} The merged splits and their combined token counts.
   * @throws {Error} If `splits` and `tokenCounts` differ in length.
   */
  _mergeSplits(splits, tokenCounts, combineWhitespace = false) {
    if (!splits.length || !tokenCounts.length) {
      return [[], []];
    }
    if (splits.length !== tokenCounts.length) {
      throw new Error(
        `Number of splits ${splits.length} does not match number of token counts ${tokenCounts.length}`
      );
    }
    // Nothing can be merged: every split already exceeds the chunk size.
    if (tokenCounts.every((count) => count > this.chunkSize)) {
      return [splits, tokenCounts];
    }
    const merged = [];
    // Prefix sums of token counts; cumulativeTokenCounts[i] = tokens before split i.
    const cumulativeTokenCounts = [];
    let sum = 0;
    if (combineWhitespace) {
      cumulativeTokenCounts.push(0);
      for (const count of tokenCounts) {
        // +1 budgets the joining space inserted between combined splits.
        sum += count + 1;
        cumulativeTokenCounts.push(sum);
      }
    } else {
      cumulativeTokenCounts.push(0);
      for (const count of tokenCounts) {
        sum += count;
        cumulativeTokenCounts.push(sum);
      }
    }
    let currentIndex = 0;
    const combinedTokenCounts = [];
    while (currentIndex < splits.length) {
      const currentTokenCount = cumulativeTokenCounts[currentIndex] ?? 0;
      const requiredTokenCount = currentTokenCount + this.chunkSize;
      // Last split index whose cumulative count still fits in the budget.
      let index = this._bisectLeft(
        cumulativeTokenCounts,
        requiredTokenCount,
        currentIndex
      ) - 1;
      index = Math.min(index, splits.length);
      // Always make progress, even when a single split exceeds the budget.
      if (index === currentIndex) {
        index += 1;
      }
      if (combineWhitespace) {
        merged.push(splits.slice(currentIndex, index).join(" "));
      } else {
        merged.push(splits.slice(currentIndex, index).join(""));
      }
      combinedTokenCounts.push(
        (cumulativeTokenCounts[Math.min(index, splits.length)] ?? 0) - currentTokenCount
      );
      currentIndex = index;
    }
    return [merged, combinedTokenCounts];
  }
4972
+ /**
4973
+ * Binary search to find the leftmost position where value should be inserted to maintain order.
4974
+ *
4975
+ * @param {number[]} arr - The array to search
4976
+ * @param {number} value - The value to insert
4977
+ * @param {number} [lo=0] - The starting index for the search
4978
+ * @returns {number} The index where the value should be inserted
4979
+ * @private
4980
+ */
4981
+ _bisectLeft(arr, value, lo = 0) {
4982
+ let hi = arr.length;
4983
+ while (lo < hi) {
4984
+ const mid = lo + hi >>> 1;
4985
+ if (arr[mid] < value) {
4986
+ lo = mid + 1;
4987
+ } else {
4988
+ hi = mid;
4989
+ }
4990
+ }
4991
+ return lo;
4992
+ }
4993
+ /**
4994
+ * Recursive helper for core chunking.
4995
+ */
4996
+ async _recursiveChunk(text, level = 0, startOffset = 0) {
4997
+ if (!text) {
4998
+ return [];
4999
+ }
5000
+ console.log("[EXULU] Rule.", this.rules.length);
5001
+ console.log("[EXULU] Level.", level);
5002
+ if (level >= this.rules.length) {
5003
+ const tokenCount = await this._estimateTokenCount(text);
5004
+ return [
5005
+ this._makeChunks(
5006
+ text,
5007
+ tokenCount,
5008
+ level,
5009
+ startOffset
5010
+ )
5011
+ ];
5012
+ }
5013
+ const currRule = this.rules.getLevel(level);
5014
+ if (!currRule) {
5015
+ throw new Error(`No rule found at level ${level}`);
5016
+ }
5017
+ const splits = await this._splitText(text, currRule);
5018
+ const tokenCounts = await Promise.all(splits.map((split) => this._estimateTokenCount(split)));
5019
+ let merged;
5020
+ let combinedTokenCounts;
5021
+ if (currRule.delimiters === void 0 && !currRule.whitespace) {
5022
+ [merged, combinedTokenCounts] = [splits, tokenCounts];
5023
+ } else if (currRule.delimiters === void 0 && currRule.whitespace) {
5024
+ [merged, combinedTokenCounts] = this._mergeSplits(
5025
+ splits,
5026
+ tokenCounts,
5027
+ true
5028
+ );
5029
+ merged = merged.slice(0, 1).concat(
5030
+ merged.slice(1).map((text2) => " " + text2)
5031
+ );
5032
+ } else {
5033
+ [merged, combinedTokenCounts] = this._mergeSplits(
5034
+ splits,
5035
+ tokenCounts,
5036
+ false
5037
+ );
5038
+ }
5039
+ const chunks = [];
5040
+ let currentOffset = startOffset;
5041
+ for (let i = 0; i < merged.length; i++) {
5042
+ const split = merged[i];
5043
+ const tokenCount = combinedTokenCounts[i];
5044
+ if (tokenCount && tokenCount > this.chunkSize) {
5045
+ chunks.push(...await this._recursiveChunk(split ?? "", level + 1, currentOffset));
5046
+ } else {
5047
+ chunks.push(
5048
+ this._makeChunks(split ?? "", tokenCount ?? 0, level, currentOffset)
5049
+ );
5050
+ }
5051
+ currentOffset += split?.length ?? 0;
5052
+ }
5053
+ return chunks;
5054
+ }
5055
+ /**
5056
+ * Recursively chunk text.
5057
+ *
5058
+ * This method is the main entry point for chunking text using the RecursiveChunker.
5059
+ * It takes a single text string and returns an array of RecursiveChunk objects.
5060
+ *
5061
+ * @param {string} text - The text to be chunked
5062
+ * @returns {Promise<RecursiveChunk[]>} A promise that resolves to an array of RecursiveChunk objects
5063
+ */
5064
+ async chunk(text) {
5065
+ console.log("[EXULU] Chunking text.", text);
5066
+ const result = await this._recursiveChunk(text, 0, 0);
5067
+ await this.tokenizer.free();
5068
+ return result;
5069
+ }
5070
  /**
   * Return a string representation of the RecursiveChunker.
   *
   * Includes the instance's tokenizer, rules, chunk size, and minimum
   * characters per chunk.
   *
   * @returns {string} A string representation of the RecursiveChunker
   */
  toString() {
    return `RecursiveChunker(tokenizer=${this.tokenizer}, rules=${this.rules}, chunkSize=${this.chunkSize}, minCharactersPerChunk=${this.minCharactersPerChunk})`;
  }
5081
+ };
5082
+
5083
+ // src/chunking/types/sentence.ts
5084
var Sentence = class _Sentence {
  /** The text of the sentence */
  text;
  /** Start index of the sentence within the original text */
  startIndex;
  /** End index of the sentence within the original text */
  endIndex;
  /** Number of tokens the sentence contains */
  tokenCount;
  /**
   * @param {{text: string, startIndex: number, endIndex: number, tokenCount: number}} data
   *   Sentence fields to copy onto the instance.
   */
  constructor({ text, startIndex, endIndex, tokenCount }) {
    this.text = text;
    this.startIndex = startIndex;
    this.endIndex = endIndex;
    this.tokenCount = tokenCount;
  }
  /** Human-readable representation of this sentence. */
  toString() {
    return `Sentence(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount})`;
  }
  /** Plain-object (dictionary-like) view of this sentence. */
  toDict() {
    const { text, startIndex, endIndex, tokenCount } = this;
    return { text, startIndex, endIndex, tokenCount };
  }
  /** Build a Sentence from a dictionary-like object. */
  static fromDict(data) {
    return new _Sentence(data);
  }
};
5117
var SentenceChunk = class _SentenceChunk extends Chunk {
  /** List of sentences in the chunk */
  sentences;
  /**
   * @param {SentenceChunkData} data - Chunk fields plus the sentences it contains
   *   and an optional embedding.
   */
  constructor(data) {
    super(data);
    this.sentences = data.sentences;
    this.embedding = data.embedding ?? void 0;
  }
  /**
   * Returns a detailed string representation of the SentenceChunk, including its text, start and end indices, token count, and a list of all contained sentences with their metadata.
   *
   * This method overrides the base {@link Chunk} toString method to provide a more informative output, which is especially useful for debugging and logging. Each sentence in the chunk is represented using its own toString method, and all sentences are included in the output.
   *
   * @returns {string} A string describing the SentenceChunk and all its sentences, e.g.,
   * SentenceChunk(text=..., startIndex=..., endIndex=..., tokenCount=..., sentences=[Sentence(...), ...])
   */
  toString() {
    const sentencesStr = this.sentences.map((s) => s.toString()).join(", ");
    return `SentenceChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, sentences=[${sentencesStr}])`;
  }
  /**
   * Returns the SentenceChunk as a dictionary-like object.
   *
   * This method extends the base {@link Chunk} toDict method to include the sentences in the chunk.
   *
   * @returns {SentenceChunkData} A dictionary-like object containing the chunk's text, start and end indices, token count, and an array of sentence data.
   */
  toDict() {
    const baseDict = super.toDict();
    return {
      ...baseDict,
      sentences: this.sentences.map((sentence) => sentence.toDict())
    };
  }
  /**
   * Creates a SentenceChunk object from a dictionary-like object.
   *
   * This method extends the base {@link Chunk} fromDict method to include the sentences in the chunk.
   *
   * @param {SentenceChunkData} data - A dictionary-like object containing the chunk's text, start and end indices, token count, and an array of sentence data.
   * @returns {SentenceChunk} A new SentenceChunk object created from the provided dictionary-like object.
   */
  static fromDict(data) {
    const sentences = data.sentences.map((sentence) => Sentence.fromDict(sentence));
    return new _SentenceChunk({
      text: data.text,
      startIndex: data.startIndex,
      endIndex: data.endIndex,
      tokenCount: data.tokenCount,
      sentences,
      embedding: data.embedding ?? void 0
    });
  }
};
5171
+
5172
+ // src/chunking/sentence.ts
5173
var SentenceChunker = class _SentenceChunker extends BaseChunker {
  chunkSize;
  chunkOverlap;
  minSentencesPerChunk;
  minCharactersPerSentence;
  approximate;
  delim;
  includeDelim;
  sep;
  /**
   * Private constructor. Use `SentenceChunker.create()` to instantiate.
   *
   * @param {Tokenizer} tokenizer - The tokenizer to use for token counting.
   * @param {number} chunkSize - Maximum number of tokens per chunk.
   * @param {number} chunkOverlap - Number of tokens to overlap between consecutive chunks.
   * @param {number} minSentencesPerChunk - Minimum number of sentences per chunk.
   * @param {number} minCharactersPerSentence - Minimum number of characters for a valid sentence.
   * @param {boolean} approximate - Whether to use approximate token counting (deprecated).
   * @param {string[]} delim - List of sentence delimiters to use for splitting.
   * @param {('prev' | 'next' | null)} includeDelim - Whether to include the delimiter with the previous sentence ('prev'), next sentence ('next'), or exclude it (null).
   * @throws {Error} If any option fails validation.
   */
  constructor(tokenizer, chunkSize, chunkOverlap, minSentencesPerChunk, minCharactersPerSentence, approximate, delim, includeDelim) {
    super(tokenizer);
    if (chunkSize <= 0) {
      throw new Error("chunkSize must be greater than 0");
    }
    if (chunkOverlap < 0) {
      throw new Error("chunkOverlap must be non-negative");
    }
    if (chunkOverlap >= chunkSize) {
      throw new Error("chunkOverlap must be less than chunkSize");
    }
    if (minSentencesPerChunk <= 0) {
      throw new Error("minSentencesPerChunk must be greater than 0");
    }
    if (minCharactersPerSentence <= 0) {
      throw new Error("minCharactersPerSentence must be greater than 0");
    }
    if (!delim) {
      throw new Error("delim must be a list of strings or a string");
    }
    if (includeDelim !== "prev" && includeDelim !== "next" && includeDelim !== null) {
      throw new Error("includeDelim must be 'prev', 'next' or null");
    }
    if (approximate) {
      console.warn("Approximate has been deprecated and will be removed from next version onwards!");
    }
    this.chunkSize = chunkSize;
    this.chunkOverlap = chunkOverlap;
    this.minSentencesPerChunk = minSentencesPerChunk;
    this.minCharactersPerSentence = minCharactersPerSentence;
    this.approximate = approximate;
    this.delim = delim;
    this.includeDelim = includeDelim;
    // Sentinel used internally to mark split points; chosen as an unlikely character (✄).
    this.sep = "\u2704";
  }
  /**
   * Creates and initializes a SentenceChunker instance that is directly callable.
   *
   * This method is a static factory function that returns a Promise resolving to a CallableSentenceChunker instance.
   * The returned instance is a callable function that can be used to chunk text strings or arrays of text strings.
   *
   * @param {SentenceChunkerOptions} [options] - Options for configuring the SentenceChunker.
   * @returns {Promise<CallableSentenceChunker>} A promise that resolves to a callable SentenceChunker instance.
   *
   * @example
   * const chunker = await SentenceChunker.create();
   * const chunks = await chunker("This is a sample text.");
   * const batchChunks = await chunker(["Text 1", "Text 2"]);
   *
   * @see SentenceChunkerOptions
   */
  static async create(options = {}) {
    const {
      tokenizer = "gpt-3.5-turbo",
      chunkSize = 512,
      chunkOverlap = 0,
      minSentencesPerChunk = 1,
      minCharactersPerSentence = 12,
      approximate = false,
      delim = [". ", "! ", "? ", "\n"],
      includeDelim = "prev"
    } = options;
    // The constructor is synchronous; only `create()` below is async.
    const tokenizerInstance = new ExuluTokenizer();
    await tokenizerInstance.create(tokenizer);
    const plainInstance = new _SentenceChunker(
      tokenizerInstance,
      chunkSize,
      chunkOverlap,
      minSentencesPerChunk,
      minCharactersPerSentence,
      approximate,
      delim,
      includeDelim
    );
    // Both string and string[] inputs route through the same call,
    // which dispatches on the input type internally.
    const callableFn = function(textOrTexts, showProgress) {
      return plainInstance.call(textOrTexts, showProgress);
    };
    // Make the function behave like a SentenceChunker instance.
    Object.setPrototypeOf(callableFn, _SentenceChunker.prototype);
    Object.assign(callableFn, plainInstance);
    return callableFn;
  }
  // NOTE: The replace + split method is not the best/most efficient way in general to be doing this. It works well in python because python implements .replace and .split in C while the re library is much slower in python.
  // NOTE: The new split -> join -> split is so weird, but it works. I don't quite like it however.
  // TODO: Implement a more efficient method for splitting text into sentences.
  /**
   * Fast sentence splitting while maintaining accuracy.
   *
   * Marks every delimiter occurrence with the internal sentinel, then splits
   * on the sentinel; fragments shorter than `minCharactersPerSentence` are
   * folded into their neighbor.
   *
   * @param {string} text - The text to split into sentences.
   * @returns {string[]} An array of sentences.
   */
  _splitText(text) {
    let t = text;
    for (const c of this.delim) {
      if (this.includeDelim === "prev") {
        t = t.split(c).join(c + this.sep);
      } else if (this.includeDelim === "next") {
        t = t.split(c).join(this.sep + c);
      } else {
        t = t.split(c).join(this.sep);
      }
    }
    const splits = t.split(this.sep);
    const sentences = [];
    let current = "";
    for (const s of splits) {
      if (!current) {
        current = s;
      } else {
        if (current.length >= this.minCharactersPerSentence) {
          sentences.push(current);
          current = s;
        } else {
          // Too short to stand alone: merge into the accumulating sentence.
          current += s;
        }
      }
    }
    if (current) {
      sentences.push(current);
    }
    return sentences;
  }
  /**
   * Split text into sentences and calculate token counts for each sentence.
   *
   * @param {string} text - The text to split into sentences.
   * @returns {Promise<Sentence[]>} An array of Sentence objects.
   */
  async _prepareSentences(text) {
    const sentenceTexts = this._splitText(text);
    if (!sentenceTexts.length) {
      return [];
    }
    // Character positions of each sentence within the original text.
    const positions = [];
    let currentPos = 0;
    for (const sent of sentenceTexts) {
      positions.push(currentPos);
      currentPos += sent.length;
    }
    const tokenCounts = await this.tokenizer.countTokensBatch(sentenceTexts);
    return sentenceTexts.map((sent, i) => new Sentence({
      text: sent,
      startIndex: positions[i],
      endIndex: positions[i] + sent.length,
      tokenCount: tokenCounts[i]
    }));
  }
  /**
   * Create a chunk from a list of sentences.
   *
   * @param {Sentence[]} sentences - The sentences to create a chunk from.
   * @returns {Promise<SentenceChunk>} A promise that resolves to a SentenceChunk object.
   */
  async _createChunk(sentences) {
    const chunkText = sentences.map((sentence) => sentence.text).join("");
    const tokenCount = await this.tokenizer.countTokens(chunkText);
    return new SentenceChunk({
      text: chunkText,
      startIndex: sentences[0].startIndex,
      endIndex: sentences[sentences.length - 1].endIndex,
      tokenCount,
      sentences
    });
  }
  /**
   * Split text into overlapping chunks based on sentences while respecting token limits.
   *
   * @param {string} text - The text to split into chunks.
   * @returns {Promise<SentenceChunk[]>} A promise that resolves to an array of SentenceChunk objects.
   */
  async chunk(text) {
    if (!text.trim()) {
      return [];
    }
    const sentences = await this._prepareSentences(text);
    if (!sentences.length) {
      return [];
    }
    // Prefix sums of sentence token counts for binary-searching chunk ends.
    const tokenSums = [];
    let sum = 0;
    for (const sentence of sentences) {
      tokenSums.push(sum);
      sum += sentence.tokenCount;
    }
    tokenSums.push(sum);
    const chunks = [];
    let pos = 0;
    while (pos < sentences.length) {
      const targetTokens = tokenSums[pos] + this.chunkSize;
      let splitIdx = this._bisectLeft(tokenSums, targetTokens, pos) - 1;
      splitIdx = Math.min(splitIdx, sentences.length);
      // Always take at least one sentence per chunk.
      splitIdx = Math.max(splitIdx, pos + 1);
      if (splitIdx - pos < this.minSentencesPerChunk) {
        if (pos + this.minSentencesPerChunk <= sentences.length) {
          splitIdx = pos + this.minSentencesPerChunk;
        } else {
          console.warn(
            `Minimum sentences per chunk as ${this.minSentencesPerChunk} could not be met for all chunks. Last chunk of the text will have only ${sentences.length - pos} sentences. Consider increasing the chunk_size or decreasing the min_sentences_per_chunk.`
          );
          splitIdx = sentences.length;
        }
      }
      const chunkSentences = sentences.slice(pos, splitIdx);
      chunks.push(await this._createChunk(chunkSentences));
      if (this.chunkOverlap > 0 && splitIdx < sentences.length) {
        // Walk backwards from the split point to collect overlap sentences
        // without exceeding the overlap token budget.
        let overlapTokens = 0;
        let overlapIdx = splitIdx - 1;
        while (overlapIdx > pos && overlapTokens < this.chunkOverlap) {
          const sent = sentences[overlapIdx];
          const nextTokens = overlapTokens + sent.tokenCount + 1;
          if (nextTokens > this.chunkOverlap) {
            break;
          }
          overlapTokens = nextTokens;
          overlapIdx--;
        }
        pos = overlapIdx + 1;
      } else {
        pos = splitIdx;
      }
    }
    await this.tokenizer.free();
    return chunks;
  }
  /**
   * Binary search to find the leftmost position where value should be inserted to maintain order.
   *
   * @param {number[]} arr - The array to search.
   * @param {number} value - The value to search for.
   * @param {number} [lo] - The starting index of the search.
   * @returns {number} The index of the leftmost position where value should be inserted.
   */
  _bisectLeft(arr, value, lo = 0) {
    let hi = arr.length;
    while (lo < hi) {
      const mid = lo + hi >>> 1;
      if (arr[mid] < value) {
        lo = mid + 1;
      } else {
        hi = mid;
      }
    }
    return lo;
  }
  /**
   * Return a string representation of the SentenceChunker.
   *
   * @returns {string} A string representation of the SentenceChunker.
   */
  toString() {
    return `SentenceChunker(tokenizer=${this.tokenizer}, chunkSize=${this.chunkSize}, chunkOverlap=${this.chunkOverlap}, minSentencesPerChunk=${this.minSentencesPerChunk}, minCharactersPerSentence=${this.minCharactersPerSentence}, approximate=${this.approximate}, delim=${this.delim}, includeDelim=${this.includeDelim})`;
  }
};
4237
5450
 
4238
5451
  // src/cli/index.tsx
4239
5452
  var import_react2 = require("react");
@@ -4433,12 +5646,10 @@ var ExuluJobs = {
4433
5646
  }
4434
5647
  };
4435
5648
  var ExuluChunkers = {
4436
- chonkie: {
4437
- sentence: import_chonkie.SentenceChunker,
4438
- recursive: {
4439
- function: import_chonkie.RecursiveChunker,
4440
- rules: import_chonkie.RecursiveRules
4441
- }
5649
+ sentence: SentenceChunker,
5650
+ recursive: {
5651
+ function: RecursiveChunker,
5652
+ rules: RecursiveRules
4442
5653
  }
4443
5654
  };
4444
5655
  var ExuluDatabase = {