@exulu/backend 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1237 -26
- package/dist/index.d.cts +863 -7
- package/dist/index.d.ts +863 -7
- package/dist/index.js +1232 -21
- package/package.json +4 -3
package/dist/index.js
CHANGED
|
@@ -1847,9 +1847,25 @@ var rateLimiter = async (key, windowSeconds, limit, points) => {
|
|
|
1847
1847
|
}
|
|
1848
1848
|
};
|
|
1849
1849
|
|
|
1850
|
-
// src/
|
|
1851
|
-
import "
|
|
1852
|
-
|
|
1850
|
+
// src/auth/get-token.ts
|
|
1851
|
+
import { jwtVerify, importJWK } from "jose";
|
|
1852
|
+
var getToken = async (authHeader) => {
  // Expects an "Authorization: Bearer <jwt>" style header; the token is the
  // second whitespace-separated part of the header value.
  const token = authHeader.split(" ")[1];
  if (!token) {
    throw new Error("No token provided");
  }
  if (!process.env.NEXTAUTH_SECRET) {
    throw new Error("No NEXTAUTH_SECRET provided");
  }
  try {
    const secret = process.env.NEXTAUTH_SECRET;
    // The secret is wrapped as a symmetric (oct) JWK and the JWT is verified
    // with HS256 — mirrors how next-auth signs its session tokens.
    const jwk = await importJWK({ k: secret, alg: "HS256", kty: "oct" });
    const { payload } = await jwtVerify(token, jwk);
    return payload;
  } catch (error) {
    // Fix: previously the verification failure was swallowed entirely.
    // Preserve it as the cause for diagnostics while keeping the exact
    // "Invalid token" message that callers may match on.
    throw new Error("Invalid token", { cause: error });
  }
};
|
|
1853
1869
|
|
|
1854
1870
|
// src/auth/auth.ts
|
|
1855
1871
|
import bcrypt2 from "bcryptjs";
|
|
@@ -1987,7 +2003,8 @@ var requestValidators = {
|
|
|
1987
2003
|
let authtoken = null;
|
|
1988
2004
|
if (typeof apikey !== "string") {
|
|
1989
2005
|
const secret = process.env.NEXTAUTH_SECRET;
|
|
1990
|
-
authtoken = await getToken(
|
|
2006
|
+
authtoken = await getToken(req.headers["authorization"] ?? "");
|
|
2007
|
+
console.log("[EXULU] authtoken", authtoken);
|
|
1991
2008
|
}
|
|
1992
2009
|
return await authentication({
|
|
1993
2010
|
authtoken,
|
|
@@ -2512,7 +2529,6 @@ import { expressMiddleware } from "@as-integrations/express5";
|
|
|
2512
2529
|
// src/registry/uppy.ts
|
|
2513
2530
|
import "express";
|
|
2514
2531
|
import bodyParser from "body-parser";
|
|
2515
|
-
import { getToken as getToken2 } from "next-auth/jwt";
|
|
2516
2532
|
var createUppyRoutes = async (app) => {
|
|
2517
2533
|
const {
|
|
2518
2534
|
S3Client,
|
|
@@ -2575,11 +2591,11 @@ var createUppyRoutes = async (app) => {
|
|
|
2575
2591
|
}
|
|
2576
2592
|
app.use(bodyParser.urlencoded({ extended: true }), bodyParser.json());
|
|
2577
2593
|
app.get("/s3/list", async (req, res, next) => {
|
|
2594
|
+
req.accepts;
|
|
2578
2595
|
const apikey = req.headers["exulu-api-key"] || null;
|
|
2579
2596
|
let authtoken = null;
|
|
2580
2597
|
if (typeof apikey !== "string") {
|
|
2581
|
-
|
|
2582
|
-
authtoken = await getToken2({ req, secret });
|
|
2598
|
+
authtoken = await getToken(req.headers.authorization ?? "");
|
|
2583
2599
|
}
|
|
2584
2600
|
const { db: db2 } = await postgresClient();
|
|
2585
2601
|
const authenticationResult = await authentication({
|
|
@@ -2629,8 +2645,7 @@ var createUppyRoutes = async (app) => {
|
|
|
2629
2645
|
const { db: db2 } = await postgresClient();
|
|
2630
2646
|
let authtoken = null;
|
|
2631
2647
|
if (typeof apikey !== "string" && typeof internalkey !== "string") {
|
|
2632
|
-
|
|
2633
|
-
authtoken = await getToken2({ req, secret });
|
|
2648
|
+
authtoken = await getToken(req.headers.authorization ?? "");
|
|
2634
2649
|
}
|
|
2635
2650
|
const authenticationResult = await authentication({
|
|
2636
2651
|
authtoken,
|
|
@@ -2703,8 +2718,7 @@ var createUppyRoutes = async (app) => {
|
|
|
2703
2718
|
const { db: db2 } = await postgresClient();
|
|
2704
2719
|
let authtoken = null;
|
|
2705
2720
|
if (typeof apikey !== "string") {
|
|
2706
|
-
|
|
2707
|
-
authtoken = await getToken2({ req, secret });
|
|
2721
|
+
authtoken = await getToken(req.headers.authorization ?? "");
|
|
2708
2722
|
}
|
|
2709
2723
|
const authenticationResult = await authentication({
|
|
2710
2724
|
authtoken,
|
|
@@ -2752,8 +2766,7 @@ var createUppyRoutes = async (app) => {
|
|
|
2752
2766
|
const { db: db2 } = await postgresClient();
|
|
2753
2767
|
let authtoken = null;
|
|
2754
2768
|
if (typeof apikey !== "string") {
|
|
2755
|
-
|
|
2756
|
-
authtoken = await getToken2({ req, secret });
|
|
2769
|
+
authtoken = await getToken(req.headers.authorization ?? "");
|
|
2757
2770
|
}
|
|
2758
2771
|
const authenticationResult = await authentication({
|
|
2759
2772
|
authtoken,
|
|
@@ -4188,8 +4201,1208 @@ var ExuluApp = class {
|
|
|
4188
4201
|
};
|
|
4189
4202
|
};
|
|
4190
4203
|
|
|
4191
|
-
// src/
|
|
4192
|
-
|
|
4204
|
+
// src/chunking/types/base.ts
|
|
4205
|
+
var Chunk = class _Chunk {
  /** The text of the chunk. */
  text;
  /** The starting index of the chunk in the original text. */
  startIndex;
  /** The ending index of the chunk in the original text. */
  endIndex;
  /** The number of tokens in the chunk. */
  tokenCount;
  /** Optional embedding for the chunk. */
  embedding;
  /**
   * Builds a Chunk from the given data and validates the index range and
   * token count.
   *
   * @param {ChunkData} data - The data to construct the Chunk from.
   */
  constructor(data) {
    const { text, startIndex, endIndex, tokenCount, embedding } = data;
    this.text = text;
    this.startIndex = startIndex;
    this.endIndex = endIndex;
    this.tokenCount = tokenCount;
    this.embedding = embedding;
    if (startIndex > endIndex) {
      throw new Error("Start index must be less than or equal to end index.");
    }
    if (tokenCount < 0) {
      throw new Error("Token count must be a non-negative integer.");
    }
  }
  /** Return a string representation of the Chunk.
   *
   * @returns {string} The text of the chunk.
   */
  toString() {
    return this.text;
  }
  /** Return a detailed string representation of the Chunk.
   *
   * @returns {string} The detailed string representation of the Chunk.
   */
  toRepresentation() {
    return `Chunk(text='${this.text}', tokenCount=${this.tokenCount}, startIndex=${this.startIndex}, endIndex=${this.endIndex})`;
  }
  /** Return a slice of the chunk's text.
   *
   * @param {number} [start] - The starting index of the slice.
   * @param {number} [end] - The ending index of the slice.
   * @returns {string} The slice of the chunk's text.
   */
  slice(start, end) {
    return this.text.slice(start, end);
  }
  /** Return the Chunk as a dictionary-like object.
   *
   * @returns {ChunkData} The dictionary-like object.
   */
  toDict() {
    const { text, startIndex, endIndex, tokenCount, embedding } = this;
    return { text, startIndex, endIndex, tokenCount, embedding };
  }
  /** Create a Chunk object from a dictionary-like object.
   *
   * @param {ChunkData} data - The dictionary-like object.
   * @returns {Chunk} The Chunk object.
   */
  static fromDict(data) {
    const { text, startIndex, endIndex, tokenCount, embedding } = data;
    return new _Chunk({ text, startIndex, endIndex, tokenCount, embedding });
  }
  /** Return a deep copy of the chunk.
   *
   * @returns {Chunk} The deep copy of the chunk.
   */
  copy() {
    return _Chunk.fromDict(this.toDict());
  }
};
|
|
4294
|
+
|
|
4295
|
+
// src/chunking/types/recursive.ts
|
|
4296
|
+
var RecursiveLevel = class _RecursiveLevel {
  /** Custom delimiters for chunking */
  delimiters;
  /** Whether to use whitespace as a delimiter */
  whitespace;
  /** Whether to include the delimiter in the previous or next chunk */
  includeDelim;
  /**
   * Constructs a new RecursiveLevel and validates the delimiter
   * configuration.
   *
   * @param {RecursiveLevelData} data - The data to construct the RecursiveLevel from.
   */
  constructor(data = {}) {
    this.delimiters = data.delimiters;
    this.whitespace = data.whitespace ?? false;
    this.includeDelim = data.includeDelim ?? "prev";
    this.validate();
  }
  /**
   * Validates the RecursiveLevel object.
   *
   * @private
   */
  validate() {
    const delims = this.delimiters;
    if (delims === void 0) {
      return;
    }
    if (this.whitespace) {
      throw new Error("Cannot use whitespace as a delimiter and also specify custom delimiters.");
    }
    if (typeof delims === "string" && delims.length === 0) {
      throw new Error("Custom delimiters cannot be an empty string.");
    }
    if (Array.isArray(delims)) {
      const hasInvalidEntry = delims.some(
        (delim) => typeof delim !== "string" || delim.length === 0
      );
      if (hasInvalidEntry) {
        throw new Error("Custom delimiters cannot be an empty string.");
      }
      if (delims.includes(" ")) {
        throw new Error("Custom delimiters cannot be whitespace only. Set whitespace to true instead.");
      }
    }
  }
  /** Return a string representation of the RecursiveLevel
   *
   * @returns {string} The string representation of the RecursiveLevel.
   */
  toString() {
    return `RecursiveLevel(delimiters=${this.delimiters}, whitespace=${this.whitespace}, includeDelim=${this.includeDelim})`;
  }
  /** Return the RecursiveLevel as a dictionary-like object
   *
   * @returns {RecursiveLevelData} The dictionary-like object.
   */
  toDict() {
    const { delimiters, whitespace, includeDelim } = this;
    return { delimiters, whitespace, includeDelim };
  }
  /** Create RecursiveLevel object from a dictionary
   *
   * @param {RecursiveLevelData} data - The dictionary-like object.
   * @returns {RecursiveLevel} The RecursiveLevel object.
   */
  static fromDict(data) {
    return new _RecursiveLevel(data);
  }
  /** Create RecursiveLevel object from a recipe
   *
   * @param {string} name - The name of the recipe.
   * @param {string} lang - The language of the recipe.
   * @returns {Promise<RecursiveLevel>} The RecursiveLevel object.
   */
  static async fromRecipe(name, lang = "en") {
    throw new Error("Not implemented");
  }
};
|
|
4373
|
+
var RecursiveRules = class _RecursiveRules {
  /** List of recursive levels */
  levels;
  /**
   * Constructs a RecursiveRules object. When no levels are supplied, the
   * default paragraph -> sentence -> pause -> word -> token hierarchy is
   * installed.
   *
   * @param {RecursiveRulesData} [data] - Optional level definitions.
   */
  constructor(data = {}) {
    this.levels = data.levels === void 0
      ? _RecursiveRules.#defaultLevels()
      : data.levels.map((levelData) => new RecursiveLevel(levelData));
  }
  /** Build the default five-level chunking hierarchy. */
  static #defaultLevels() {
    const pauseDelimiters = [
      "{",
      "}",
      '"',
      "[",
      "]",
      "<",
      ">",
      "(",
      ")",
      ":",
      ";",
      ",",
      "\u2014",
      "|",
      "~",
      "-",
      "...",
      "`",
      "'"
    ];
    return [
      new RecursiveLevel({ delimiters: ["\n\n", "\r\n", "\n", "\r"] }),
      new RecursiveLevel({ delimiters: [". ", "! ", "? "] }),
      new RecursiveLevel({ delimiters: pauseDelimiters }),
      new RecursiveLevel({ whitespace: true }),
      new RecursiveLevel()
    ];
  }
  /** Return a string representation of the RecursiveRules
   *
   * @returns {string} The string representation of the RecursiveRules.
   */
  toString() {
    return `RecursiveRules(levels=${this.levels})`;
  }
  /** Return the number of levels
   *
   * @returns {number} The number of levels.
   */
  get length() {
    return this.levels.length;
  }
  /** Get a level by index
   *
   * @param {number} index - The index of the level.
   * @returns {RecursiveLevel | undefined} The level.
   */
  getLevel(index) {
    return this.levels[index];
  }
  /** Return an iterator over the levels
   *
   * @returns {Iterator<RecursiveLevel>} The iterator over the levels.
   */
  [Symbol.iterator]() {
    return this.levels[Symbol.iterator]();
  }
  /** Create a RecursiveRules object from a dictionary
   *
   * @param {RecursiveRulesData} data - The dictionary-like object.
   * @returns {RecursiveRules} The RecursiveRules object.
   */
  static fromDict(data) {
    return new _RecursiveRules(data);
  }
  /** Return the RecursiveRules as a dictionary-like object
   *
   * @returns {RecursiveRulesData} The dictionary-like object.
   */
  toDict() {
    return {
      levels: this.levels.map((level) => level.toDict())
    };
  }
  /** Create a RecursiveRules object from a recipe
   *
   * @param {string} name - The name of the recipe.
   * @param {string} lang - The language of the recipe.
   * @param {string} path - The path to the recipe.
   * @returns {Promise<RecursiveRules>} The RecursiveRules object.
   */
  static async fromRecipe(name = "default", lang = "en", path3) {
    throw new Error("Not implemented");
  }
};
|
|
4467
|
+
var RecursiveChunk = class _RecursiveChunk extends Chunk {
  /** The level of recursion for the chunk */
  level;
  /**
   * Constructs a RecursiveChunk: ordinary Chunk data plus the recursion
   * level it was produced at.
   *
   * @param {RecursiveChunkData} data - The data to construct the RecursiveChunk from.
   */
  constructor(data) {
    super(data);
    this.level = data.level;
  }
  /** Return a string representation of the RecursiveChunk
   *
   * @returns {string} The string representation of the RecursiveChunk.
   */
  toString() {
    return `RecursiveChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, level=${this.level})`;
  }
  /** Return the RecursiveChunk as a dictionary-like object
   *
   * @returns {RecursiveChunkData} The dictionary-like object.
   */
  toDict() {
    return { ...super.toDict(), level: this.level };
  }
  /** Create a RecursiveChunk object from a dictionary
   *
   * @param {RecursiveChunkData} data - The dictionary-like object.
   * @returns {RecursiveChunk} The RecursiveChunk object.
   */
  static fromDict(data) {
    return new _RecursiveChunk(data);
  }
};
|
|
4501
|
+
|
|
4502
|
+
// src/chunking/tokenizer.ts
|
|
4503
|
+
import { Tiktoken } from "tiktoken/lite";
|
|
4504
|
+
import { load } from "tiktoken/load";
|
|
4505
|
+
import registry from "tiktoken/registry.json";
|
|
4506
|
+
import models from "tiktoken/model_to_encoding.json";
|
|
4507
|
+
var ExuluTokenizer = class {
  constructor() {
  }
  // Lazily-created Tiktoken instance; null until create() succeeds.
  encoder = null;
  // Name of the model the cached encoder was built for.
  modelName = null;
  /**
   * Load (or reuse) the Tiktoken encoder for the given model.
   *
   * Fix: the previous implementation returned any cached encoder regardless
   * of which model was requested, so asking for a different model silently
   * reused the wrong encoding. The cache is now keyed on the model name and
   * the stale encoder is freed before a new one is loaded.
   *
   * @param {string} modelName - Model identifier, e.g. "gpt-3.5-turbo".
   * @returns {Promise<Tiktoken>} The ready-to-use encoder.
   */
  async create(modelName) {
    if (this.encoder && this.modelName === modelName) {
      return this.encoder;
    }
    if (this.encoder) {
      // A different model was requested: release the old encoder first.
      this.encoder.free();
      this.encoder = null;
    }
    const time = performance.now();
    console.log("[EXULU] Loading tokenizer.", modelName);
    const model = await load(registry[models[modelName]]);
    console.log("[EXULU] Loaded tokenizer.", modelName, performance.now() - time);
    // NOTE: the previous version dumped model.bpe_ranks / special_tokens /
    // pat_str and the whole encoder to the console on every load; those
    // dumps are enormous and were removed.
    const encoder = new Tiktoken(
      model.bpe_ranks,
      model.special_tokens,
      model.pat_str
    );
    this.encoder = encoder;
    this.modelName = modelName;
    return encoder;
  }
  /**
   * Decode a token sequence back to a UTF-8 string.
   *
   * @param {number[]} tokens - The token ids to decode.
   * @returns {Promise<string>} The decoded text.
   * @throws {Error} If create() has not been called yet.
   */
  async decode(tokens) {
    if (!this.encoder) {
      throw new Error("Tokenizer not initialized");
    }
    // Tiktoken.decode yields raw bytes; convert them to a string.
    const bytes = this.encoder.decode(tokens);
    return new TextDecoder().decode(bytes);
  }
  /**
   * Decode multiple token sequences concurrently.
   *
   * @param {number[][]} tokenSequences - The token sequences to decode.
   * @returns {Promise<string[]>} The decoded texts, in input order.
   * @throws {Error} If create() has not been called yet.
   */
  async decodeBatch(tokenSequences) {
    if (!this.encoder) {
      throw new Error("Tokenizer not initialized");
    }
    const promises2 = tokenSequences.map((tokens) => this.decode(tokens));
    return await Promise.all(promises2);
  }
  /**
   * Encode text into token ids.
   *
   * Fix: no longer logs every input text (this runs once per chunk in the
   * chunking hot path and leaked full document text into the logs).
   *
   * @param {string} text - The text to encode.
   * @returns {Uint32Array} The token ids.
   * @throws {Error} If create() has not been called yet.
   */
  encode(text) {
    if (!this.encoder) {
      throw new Error("Tokenizer not initialized");
    }
    return this.encoder.encode(text);
  }
  /**
   * Count tokens for multiple texts concurrently.
   *
   * @param {string[]} texts - The texts to count tokens for.
   * @returns {Promise<number[]>} Token counts, in input order.
   * @throws {Error} If create() has not been called yet.
   */
  async countTokensBatch(texts) {
    if (!this.encoder) {
      throw new Error("Tokenizer not initialized");
    }
    const promises2 = texts.map((text) => this.countTokens(text));
    return await Promise.all(promises2);
  }
  /**
   * Count the number of tokens in a text.
   *
   * @param {string} text - The text to count tokens for.
   * @returns {number} The token count.
   * @throws {Error} If create() has not been called yet.
   */
  countTokens(text) {
    if (!this.encoder) {
      throw new Error("Tokenizer not initialized");
    }
    return this.encoder.encode(text).length;
  }
  /**
   * Release the native encoder resources, if any were allocated.
   */
  async free() {
    console.log("[EXULU] Freeing tokenizer.");
    if (this.encoder) {
      this.encoder.free();
      this.encoder = null;
      this.modelName = null;
    }
  }
};
|
|
4579
|
+
|
|
4580
|
+
// src/chunking/base.ts
|
|
4581
|
+
var BaseChunker = class {
  tokenizer;
  _useConcurrency = true;
  // Determines if batch processing uses Promise.all
  /**
   * @param {ExuluTokenizer} tokenizer - Tokenizer shared by the chunker.
   */
  constructor(tokenizer) {
    this.tokenizer = tokenizer;
  }
  /**
   * Returns a string representation of the chunker instance.
   *
   * @returns {string} The class name and constructor signature.
   */
  toString() {
    return `${this.constructor.name}()`;
  }
  /**
   * Dispatch on input type: a single string is chunked directly, an array
   * of strings is chunked as a batch.
   *
   * @param {string | string[]} textOrTexts - Text(s) to chunk.
   * @param {boolean} [showProgress=false] - Whether to display progress.
   * @returns {Promise<Chunk[] | Chunk[][]>} The chunking result(s).
   * @throws {Error} If the input is neither a string nor an array.
   */
  async call(textOrTexts, showProgress = false) {
    if (typeof textOrTexts === "string") {
      return this.chunk(textOrTexts);
    }
    if (Array.isArray(textOrTexts)) {
      return this.chunkBatch(textOrTexts, showProgress);
    }
    throw new Error("Input must be a string or an array of strings.");
  }
  /**
   * Process a batch of texts sequentially (one after another).
   *
   * @protected
   * @param {string[]} texts - The texts to chunk.
   * @param {boolean} [showProgress=false] - Whether to display progress in the console.
   * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
   */
  async _sequential_batch_processing(texts, showProgress = false) {
    const total = texts.length;
    const showBar = showProgress && total > 1;
    const results = [];
    let index = 0;
    for (const text of texts) {
      index += 1;
      if (showBar) {
        const progress = Math.round(index / total * 100);
        process.stdout.write(`Sequential processing: Document ${index}/${total} (${progress}%)\r`);
      }
      // Awaiting inside the loop is intentional: sequential mode processes
      // one document at a time.
      results.push(await this.chunk(text));
    }
    if (showBar) {
      process.stdout.write("\n");
    }
    return results;
  }
  /**
   * Process a batch of texts concurrently using Promise.all.
   *
   * @protected
   * @param {string[]} texts - The texts to chunk.
   * @param {boolean} [showProgress=false] - Whether to display progress in the console.
   * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
   */
  async _concurrent_batch_processing(texts, showProgress = false) {
    const total = texts.length;
    const showBar = showProgress && total > 1;
    let completedCount = 0;
    // Passes each result through while updating the progress counter.
    const trackCompletion = (result) => {
      if (showBar) {
        completedCount += 1;
        const progress = Math.round(completedCount / total * 100);
        process.stdout.write(`Concurrent processing: Document ${completedCount}/${total} (${progress}%)\r`);
      }
      return result;
    };
    const results = await Promise.all(
      texts.map((text) => this.chunk(text).then(trackCompletion))
    );
    if (showBar && completedCount > 0) {
      process.stdout.write("\n");
    }
    return results;
  }
  /**
   * Chunk a batch of texts, using either concurrent or sequential processing.
   *
   * If only one text is provided, processes it directly without batch overhead.
   *
   * @param {string[]} texts - The texts to chunk.
   * @param {boolean} [showProgress=true] - Whether to display progress in the console.
   * @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
   */
  async chunkBatch(texts, showProgress = true) {
    if (texts.length === 0) {
      return [];
    }
    if (texts.length === 1) {
      return [await this.chunk(texts[0])];
    }
    return this._useConcurrency
      ? this._concurrent_batch_processing(texts, showProgress)
      : this._sequential_batch_processing(texts, showProgress);
  }
};
|
|
4681
|
+
|
|
4682
|
+
// src/chunking/recursive.ts
|
|
4683
|
+
var RecursiveChunker = class _RecursiveChunker extends BaseChunker {
|
|
4684
|
+
chunkSize;
|
|
4685
|
+
minCharactersPerChunk;
|
|
4686
|
+
rules;
|
|
4687
|
+
sep;
|
|
4688
|
+
_CHARS_PER_TOKEN = 6.5;
|
|
4689
|
+
/**
|
|
4690
|
+
* Private constructor. Use `RecursiveChunker.create()` to instantiate.
|
|
4691
|
+
*/
|
|
4692
|
+
constructor(tokenizer, chunkSize, rules, minCharactersPerChunk) {
|
|
4693
|
+
super(tokenizer);
|
|
4694
|
+
if (chunkSize <= 0) {
|
|
4695
|
+
throw new Error("chunkSize must be greater than 0");
|
|
4696
|
+
}
|
|
4697
|
+
if (minCharactersPerChunk <= 0) {
|
|
4698
|
+
throw new Error("minCharactersPerChunk must be greater than 0");
|
|
4699
|
+
}
|
|
4700
|
+
if (!(rules instanceof RecursiveRules)) {
|
|
4701
|
+
throw new Error("rules must be a RecursiveRules object");
|
|
4702
|
+
}
|
|
4703
|
+
this.chunkSize = chunkSize;
|
|
4704
|
+
this.minCharactersPerChunk = minCharactersPerChunk;
|
|
4705
|
+
this.rules = rules;
|
|
4706
|
+
this.sep = "\u2704";
|
|
4707
|
+
}
|
|
4708
|
+
/**
|
|
4709
|
+
* Creates and initializes a directly callable RecursiveChunker instance.
|
|
4710
|
+
*
|
|
4711
|
+
* This static factory method constructs a RecursiveChunker with the provided options and returns a callable function object.
|
|
4712
|
+
* The returned instance can be used as both a function (to chunk text(s)) and as an object (with all RecursiveChunker methods and properties).
|
|
4713
|
+
*
|
|
4714
|
+
* @param {RecursiveChunkerOptions} [options] - Configuration options for the chunker. All options are optional:
|
|
4715
|
+
* @param {string|Tokenizer} [options.tokenizer="Xenova/gpt2"] - Tokenizer to use for text processing. Can be a string identifier (e.g., "Xenova/gpt2") or a Tokenizer instance. If a string is provided, Tokenizer.create() is called internally.
|
|
4716
|
+
* @param {number} [options.chunkSize=512] - Maximum number of tokens per chunk. Must be > 0.
|
|
4717
|
+
* @param {RecursiveRules} [options.rules=new RecursiveRules()] - Rules for recursive chunking. See {@link RecursiveRules} for customization.
|
|
4718
|
+
* @param {number} [options.minCharactersPerChunk=24] - Minimum number of characters per chunk. Must be > 0.
|
|
4719
|
+
*
|
|
4720
|
+
* @returns {Promise<CallableRecursiveChunker>} Promise resolving to a callable RecursiveChunker instance.
|
|
4721
|
+
*
|
|
4722
|
+
* @throws {Error} If any option is invalid (e.g., chunkSize <= 0).
|
|
4723
|
+
*
|
|
4724
|
+
* @see CallableRecursiveChunker for the callable interface and available properties/methods.
|
|
4725
|
+
*
|
|
4726
|
+
* @example <caption>Basic usage with default options</caption>
|
|
4727
|
+
* const chunker = await RecursiveChunker.create();
|
|
4728
|
+
* const chunks = await chunker("Some text to chunk");
|
|
4729
|
+
*
|
|
4730
|
+
* @example <caption>Custom options and batch chunking</caption>
|
|
4731
|
+
* const chunker = await RecursiveChunker.create({ chunkSize: 256 });
|
|
4732
|
+
* const batchChunks = await chunker(["Text 1", "Text 2"]);
|
|
4733
|
+
*
|
|
4734
|
+
* @example <caption>Accessing properties and methods</caption>
|
|
4735
|
+
* const chunker = await RecursiveChunker.create();
|
|
4736
|
+
* console.log(chunker.chunkSize); // 512
|
|
4737
|
+
* console.log(chunker.rules); // RecursiveRules instance
|
|
4738
|
+
* const chunks = await chunker.chunk("Some text"); // Use as object method
|
|
4739
|
+
*
|
|
4740
|
+
* @note
|
|
4741
|
+
* The returned instance is both callable (like a function) and has all properties/methods of RecursiveChunker.
|
|
4742
|
+
* You can use it as a drop-in replacement for a function or a class instance.
|
|
4743
|
+
*
|
|
4744
|
+
* @note
|
|
4745
|
+
* For advanced customization, pass a custom RecursiveRules object to the rules option.
|
|
4746
|
+
* See {@link RecursiveRules} and {@link RecursiveLevel} for rule structure.
|
|
4747
|
+
*/
|
|
4748
|
+
static async create(options = {}) {
|
|
4749
|
+
const {
|
|
4750
|
+
tokenizer = "gpt-3.5-turbo",
|
|
4751
|
+
chunkSize = 512,
|
|
4752
|
+
rules = new RecursiveRules(),
|
|
4753
|
+
minCharactersPerChunk = 24
|
|
4754
|
+
} = options;
|
|
4755
|
+
const tokenizerInstance = await new ExuluTokenizer();
|
|
4756
|
+
await tokenizerInstance.create(tokenizer);
|
|
4757
|
+
const plainInstance = new _RecursiveChunker(
|
|
4758
|
+
tokenizerInstance,
|
|
4759
|
+
chunkSize,
|
|
4760
|
+
rules,
|
|
4761
|
+
minCharactersPerChunk
|
|
4762
|
+
);
|
|
4763
|
+
const callableFn = function(textOrTexts, showProgress) {
|
|
4764
|
+
if (typeof textOrTexts === "string") {
|
|
4765
|
+
return plainInstance.call(textOrTexts, showProgress);
|
|
4766
|
+
} else {
|
|
4767
|
+
return plainInstance.call(textOrTexts, showProgress);
|
|
4768
|
+
}
|
|
4769
|
+
};
|
|
4770
|
+
Object.setPrototypeOf(callableFn, _RecursiveChunker.prototype);
|
|
4771
|
+
Object.assign(callableFn, plainInstance);
|
|
4772
|
+
return callableFn;
|
|
4773
|
+
}
|
|
4774
|
+
/**
|
|
4775
|
+
* Estimates the number of tokens in a given text.
|
|
4776
|
+
*
|
|
4777
|
+
* This method uses a character-to-token ratio (default: 6.5 characters per token) for quick estimation.
|
|
4778
|
+
* If the estimated token count exceeds the chunk size, it performs an actual token count.
|
|
4779
|
+
*
|
|
4780
|
+
* @param {string} text - The text to estimate token count for
|
|
4781
|
+
* @returns {Promise<number>} A promise that resolves to the estimated number of tokens
|
|
4782
|
+
* @private
|
|
4783
|
+
*/
|
|
4784
|
+
async _estimateTokenCount(text) {
|
|
4785
|
+
const estimate = Math.max(1, Math.floor(text.length / this._CHARS_PER_TOKEN));
|
|
4786
|
+
if (estimate > this.chunkSize) {
|
|
4787
|
+
return this.chunkSize + 1;
|
|
4788
|
+
}
|
|
4789
|
+
return this.tokenizer.countTokens(text);
|
|
4790
|
+
}
|
|
4791
|
+
/**
|
|
4792
|
+
* Split the text into chunks based on the provided recursive level rules.
|
|
4793
|
+
*
|
|
4794
|
+
* This method handles three different splitting strategies:
|
|
4795
|
+
* 1. Whitespace-based splitting: Splits text on spaces
|
|
4796
|
+
* 2. Delimiter-based splitting: Splits text on specified delimiters with options to include delimiters
|
|
4797
|
+
* 3. Token-based splitting: Splits text into chunks of maximum token size
|
|
4798
|
+
*
|
|
4799
|
+
* @param {string} text - The text to be split into chunks
|
|
4800
|
+
* @param {RecursiveLevel} recursiveLevel - The rules defining how to split the text
|
|
4801
|
+
* @returns {Promise<string[]>} A promise that resolves to an array of text chunks
|
|
4802
|
+
* @private
|
|
4803
|
+
*/
|
|
4804
|
+
async _splitText(text, recursiveLevel) {
|
|
4805
|
+
if (recursiveLevel.whitespace) {
|
|
4806
|
+
return text.split(" ");
|
|
4807
|
+
} else if (recursiveLevel.delimiters) {
|
|
4808
|
+
let t = text;
|
|
4809
|
+
if (recursiveLevel.includeDelim === "prev") {
|
|
4810
|
+
for (const delimiter of Array.isArray(recursiveLevel.delimiters) ? recursiveLevel.delimiters : [recursiveLevel.delimiters]) {
|
|
4811
|
+
t = t.replace(delimiter, delimiter + this.sep);
|
|
4812
|
+
}
|
|
4813
|
+
} else if (recursiveLevel.includeDelim === "next") {
|
|
4814
|
+
for (const delimiter of Array.isArray(recursiveLevel.delimiters) ? recursiveLevel.delimiters : [recursiveLevel.delimiters]) {
|
|
4815
|
+
t = t.replace(delimiter, this.sep + delimiter);
|
|
4816
|
+
}
|
|
4817
|
+
} else {
|
|
4818
|
+
for (const delimiter of Array.isArray(recursiveLevel.delimiters) ? recursiveLevel.delimiters : [recursiveLevel.delimiters]) {
|
|
4819
|
+
t = t.replace(delimiter, this.sep);
|
|
4820
|
+
}
|
|
4821
|
+
}
|
|
4822
|
+
const splits = t.split(this.sep).filter((split) => split !== "");
|
|
4823
|
+
let current = "";
|
|
4824
|
+
const merged = [];
|
|
4825
|
+
for (const split of splits) {
|
|
4826
|
+
if (split.length < this.minCharactersPerChunk) {
|
|
4827
|
+
current += split;
|
|
4828
|
+
} else if (current) {
|
|
4829
|
+
current += split;
|
|
4830
|
+
merged.push(current);
|
|
4831
|
+
current = "";
|
|
4832
|
+
} else {
|
|
4833
|
+
merged.push(split);
|
|
4834
|
+
}
|
|
4835
|
+
if (current.length >= this.minCharactersPerChunk) {
|
|
4836
|
+
merged.push(current);
|
|
4837
|
+
current = "";
|
|
4838
|
+
}
|
|
4839
|
+
}
|
|
4840
|
+
if (current) {
|
|
4841
|
+
merged.push(current);
|
|
4842
|
+
}
|
|
4843
|
+
return merged;
|
|
4844
|
+
} else {
|
|
4845
|
+
const encoded = await this.tokenizer.encode(text);
|
|
4846
|
+
const tokenSplits = [];
|
|
4847
|
+
for (let i = 0; i < encoded.length; i += this.chunkSize) {
|
|
4848
|
+
tokenSplits.push(encoded.slice(i, i + this.chunkSize));
|
|
4849
|
+
}
|
|
4850
|
+
return await this.tokenizer.decodeBatch(tokenSplits);
|
|
4851
|
+
}
|
|
4852
|
+
}
|
|
4853
|
+
/**
|
|
4854
|
+
* Create a RecursiveChunk object with indices based on the current offset.
|
|
4855
|
+
*
|
|
4856
|
+
* This method constructs a RecursiveChunk object that contains metadata about the chunk,
|
|
4857
|
+
* including the text content, its start and end indices, token count, and the level of recursion.
|
|
4858
|
+
*
|
|
4859
|
+
* @param {string} text - The text content of the chunk
|
|
4860
|
+
* @param {number} tokenCount - The number of tokens in the chunk
|
|
4861
|
+
*/
|
|
4862
|
+
_makeChunks(text, tokenCount, level, startOffset) {
|
|
4863
|
+
return new RecursiveChunk({
|
|
4864
|
+
text,
|
|
4865
|
+
startIndex: startOffset,
|
|
4866
|
+
endIndex: startOffset + text.length,
|
|
4867
|
+
tokenCount,
|
|
4868
|
+
level
|
|
4869
|
+
});
|
|
4870
|
+
}
|
|
4871
|
+
/**
 * Merge short splits.
 *
 * Greedily packs consecutive splits into merged pieces of at most
 * `this.chunkSize` tokens. A prefix-sum array over the per-split token counts
 * plus a binary search (`_bisectLeft`) decides how many splits fit into each
 * merged piece.
 *
 * @param {string[]} splits - The text splits to merge
 * @param {number[]} tokenCounts - Token count per split (same length as splits)
 * @param {boolean} [combineWhitespace=false] - When true, join merged pieces
 *   with a single space and budget one extra token per split for it
 * @returns {[string[], number[]]} The merged pieces and their combined token counts
 * @private
 */
_mergeSplits(splits, tokenCounts, combineWhitespace = false) {
  if (!splits.length || !tokenCounts.length) {
    return [[], []];
  }
  if (splits.length !== tokenCounts.length) {
    throw new Error(
      `Number of splits ${splits.length} does not match number of token counts ${tokenCounts.length}`
    );
  }
  // If every split already exceeds the chunk size there is nothing to pack.
  // NOTE(review): with `every`, a single small split disables this shortcut —
  // confirm whether `some` was intended here.
  if (tokenCounts.every((count) => count > this.chunkSize)) {
    return [splits, tokenCounts];
  }
  const merged = [];
  // Prefix sums: cumulativeTokenCounts[i] = tokens in splits[0..i-1]
  // (plus one joiner token per split when combineWhitespace is true).
  const cumulativeTokenCounts = [];
  let sum = 0;
  if (combineWhitespace) {
    cumulativeTokenCounts.push(0);
    for (const count of tokenCounts) {
      sum += count + 1;
      cumulativeTokenCounts.push(sum);
    }
  } else {
    cumulativeTokenCounts.push(0);
    for (const count of tokenCounts) {
      sum += count;
      cumulativeTokenCounts.push(sum);
    }
  }
  let currentIndex = 0;
  const combinedTokenCounts = [];
  while (currentIndex < splits.length) {
    const currentTokenCount = cumulativeTokenCounts[currentIndex] ?? 0;
    // Largest prefix-sum value we may reach without exceeding chunkSize.
    const requiredTokenCount = currentTokenCount + this.chunkSize;
    // bisectLeft returns the first index whose cumulative count >= required;
    // step back one so the packed window stays within budget.
    let index = this._bisectLeft(
      cumulativeTokenCounts,
      requiredTokenCount,
      currentIndex
    ) - 1;
    index = Math.min(index, splits.length);
    // Always make progress: take at least one split even if it overflows.
    if (index === currentIndex) {
      index += 1;
    }
    if (combineWhitespace) {
      merged.push(splits.slice(currentIndex, index).join(" "));
    } else {
      merged.push(splits.slice(currentIndex, index).join(""));
    }
    combinedTokenCounts.push(
      (cumulativeTokenCounts[Math.min(index, splits.length)] ?? 0) - currentTokenCount
    );
    currentIndex = index;
  }
  return [merged, combinedTokenCounts];
}
|
|
4928
|
+
/**
|
|
4929
|
+
* Binary search to find the leftmost position where value should be inserted to maintain order.
|
|
4930
|
+
*
|
|
4931
|
+
* @param {number[]} arr - The array to search
|
|
4932
|
+
* @param {number} value - The value to insert
|
|
4933
|
+
* @param {number} [lo=0] - The starting index for the search
|
|
4934
|
+
* @returns {number} The index where the value should be inserted
|
|
4935
|
+
* @private
|
|
4936
|
+
*/
|
|
4937
|
+
_bisectLeft(arr, value, lo = 0) {
|
|
4938
|
+
let hi = arr.length;
|
|
4939
|
+
while (lo < hi) {
|
|
4940
|
+
const mid = lo + hi >>> 1;
|
|
4941
|
+
if (arr[mid] < value) {
|
|
4942
|
+
lo = mid + 1;
|
|
4943
|
+
} else {
|
|
4944
|
+
hi = mid;
|
|
4945
|
+
}
|
|
4946
|
+
}
|
|
4947
|
+
return lo;
|
|
4948
|
+
}
|
|
4949
|
+
/**
|
|
4950
|
+
* Recursive helper for core chunking.
|
|
4951
|
+
*/
|
|
4952
|
+
async _recursiveChunk(text, level = 0, startOffset = 0) {
|
|
4953
|
+
if (!text) {
|
|
4954
|
+
return [];
|
|
4955
|
+
}
|
|
4956
|
+
console.log("[EXULU] Rule.", this.rules.length);
|
|
4957
|
+
console.log("[EXULU] Level.", level);
|
|
4958
|
+
if (level >= this.rules.length) {
|
|
4959
|
+
const tokenCount = await this._estimateTokenCount(text);
|
|
4960
|
+
return [
|
|
4961
|
+
this._makeChunks(
|
|
4962
|
+
text,
|
|
4963
|
+
tokenCount,
|
|
4964
|
+
level,
|
|
4965
|
+
startOffset
|
|
4966
|
+
)
|
|
4967
|
+
];
|
|
4968
|
+
}
|
|
4969
|
+
const currRule = this.rules.getLevel(level);
|
|
4970
|
+
if (!currRule) {
|
|
4971
|
+
throw new Error(`No rule found at level ${level}`);
|
|
4972
|
+
}
|
|
4973
|
+
const splits = await this._splitText(text, currRule);
|
|
4974
|
+
const tokenCounts = await Promise.all(splits.map((split) => this._estimateTokenCount(split)));
|
|
4975
|
+
let merged;
|
|
4976
|
+
let combinedTokenCounts;
|
|
4977
|
+
if (currRule.delimiters === void 0 && !currRule.whitespace) {
|
|
4978
|
+
[merged, combinedTokenCounts] = [splits, tokenCounts];
|
|
4979
|
+
} else if (currRule.delimiters === void 0 && currRule.whitespace) {
|
|
4980
|
+
[merged, combinedTokenCounts] = this._mergeSplits(
|
|
4981
|
+
splits,
|
|
4982
|
+
tokenCounts,
|
|
4983
|
+
true
|
|
4984
|
+
);
|
|
4985
|
+
merged = merged.slice(0, 1).concat(
|
|
4986
|
+
merged.slice(1).map((text2) => " " + text2)
|
|
4987
|
+
);
|
|
4988
|
+
} else {
|
|
4989
|
+
[merged, combinedTokenCounts] = this._mergeSplits(
|
|
4990
|
+
splits,
|
|
4991
|
+
tokenCounts,
|
|
4992
|
+
false
|
|
4993
|
+
);
|
|
4994
|
+
}
|
|
4995
|
+
const chunks = [];
|
|
4996
|
+
let currentOffset = startOffset;
|
|
4997
|
+
for (let i = 0; i < merged.length; i++) {
|
|
4998
|
+
const split = merged[i];
|
|
4999
|
+
const tokenCount = combinedTokenCounts[i];
|
|
5000
|
+
if (tokenCount && tokenCount > this.chunkSize) {
|
|
5001
|
+
chunks.push(...await this._recursiveChunk(split ?? "", level + 1, currentOffset));
|
|
5002
|
+
} else {
|
|
5003
|
+
chunks.push(
|
|
5004
|
+
this._makeChunks(split ?? "", tokenCount ?? 0, level, currentOffset)
|
|
5005
|
+
);
|
|
5006
|
+
}
|
|
5007
|
+
currentOffset += split?.length ?? 0;
|
|
5008
|
+
}
|
|
5009
|
+
return chunks;
|
|
5010
|
+
}
|
|
5011
|
+
/**
|
|
5012
|
+
* Recursively chunk text.
|
|
5013
|
+
*
|
|
5014
|
+
* This method is the main entry point for chunking text using the RecursiveChunker.
|
|
5015
|
+
* It takes a single text string and returns an array of RecursiveChunk objects.
|
|
5016
|
+
*
|
|
5017
|
+
* @param {string} text - The text to be chunked
|
|
5018
|
+
* @returns {Promise<RecursiveChunk[]>} A promise that resolves to an array of RecursiveChunk objects
|
|
5019
|
+
*/
|
|
5020
|
+
async chunk(text) {
|
|
5021
|
+
console.log("[EXULU] Chunking text.", text);
|
|
5022
|
+
const result = await this._recursiveChunk(text, 0, 0);
|
|
5023
|
+
await this.tokenizer.free();
|
|
5024
|
+
return result;
|
|
5025
|
+
}
|
|
5026
|
+
/**
|
|
5027
|
+
* Return a string representation of the RecursiveChunker.
|
|
5028
|
+
*
|
|
5029
|
+
* This method provides a string representation of the RecursiveChunker instance,
|
|
5030
|
+
* including its tokenizer, rules, chunk size, minimum characters per chunk, and return type.
|
|
5031
|
+
*
|
|
5032
|
+
* @returns {string} A string representation of the RecursiveChunker
|
|
5033
|
+
*/
|
|
5034
|
+
toString() {
|
|
5035
|
+
return `RecursiveChunker(tokenizer=${this.tokenizer}, rules=${this.rules}, chunkSize=${this.chunkSize}, minCharactersPerChunk=${this.minCharactersPerChunk})`;
|
|
5036
|
+
}
|
|
5037
|
+
};
|
|
5038
|
+
|
|
5039
|
+
// src/chunking/types/sentence.ts
|
|
5040
|
+
var Sentence = class _Sentence {
  /** The text of the sentence */
  text;
  /** The starting index of the sentence in the original text */
  startIndex;
  /** The ending index of the sentence in the original text */
  endIndex;
  /** The number of tokens in the sentence */
  tokenCount;
  /**
   * @param {{text: string, startIndex: number, endIndex: number, tokenCount: number}} data
   *   Plain object carrying the sentence's text and positional metadata.
   */
  constructor({ text, startIndex, endIndex, tokenCount }) {
    this.text = text;
    this.startIndex = startIndex;
    this.endIndex = endIndex;
    this.tokenCount = tokenCount;
  }
  /** Return a string representation of the Sentence */
  toString() {
    const { text, startIndex, endIndex, tokenCount } = this;
    return `Sentence(text=${text}, startIndex=${startIndex}, endIndex=${endIndex}, tokenCount=${tokenCount})`;
  }
  /** Return the Sentence as a dictionary-like object */
  toDict() {
    const { text, startIndex, endIndex, tokenCount } = this;
    return { text, startIndex, endIndex, tokenCount };
  }
  /** Create a Sentence object from a dictionary-like object */
  static fromDict(data) {
    return new _Sentence(data);
  }
};
|
|
5073
|
+
var SentenceChunk = class _SentenceChunk extends Chunk {
  /** List of sentences in the chunk */
  sentences;
  /**
   * @param {SentenceChunkData} data - Chunk data; must include `sentences`.
   *   `embedding` is optional and defaults to undefined when absent.
   */
  constructor(data) {
    super(data);
    this.sentences = data.sentences;
    this.embedding = data.embedding ?? void 0;
  }
  /**
   * Returns a detailed string representation of the SentenceChunk, including its text, start and end indices, token count, and a list of all contained sentences with their metadata.
   *
   * This method overrides the base {@link Chunk} toString method to provide a more informative output, which is especially useful for debugging and logging. Each sentence in the chunk is represented using its own toString method, and all sentences are included in the output.
   *
   * @returns {string} A string describing the SentenceChunk and all its sentences, e.g.,
   * SentenceChunk(text=..., startIndex=..., endIndex=..., tokenCount=..., sentences=[Sentence(...), ...])
   */
  toString() {
    const sentencesStr = this.sentences.map((s) => s.toString()).join(", ");
    return `SentenceChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, sentences=[${sentencesStr}])`;
  }
  /**
   * Returns the SentenceChunk as a dictionary-like object.
   *
   * This method extends the base {@link Chunk} toDict method to include the sentences in the chunk.
   *
   * @returns {SentenceChunkData} A dictionary-like object containing the chunk's text, start and end indices, token count, and an array of sentence data.
   */
  toDict() {
    const baseDict = super.toDict();
    return {
      ...baseDict,
      sentences: this.sentences.map((sentence) => sentence.toDict())
    };
  }
  /**
   * Creates a SentenceChunk object from a dictionary-like object.
   *
   * This method extends the base {@link Chunk} fromDict method to include the sentences in the chunk.
   *
   * @param {SentenceChunkData} data - A dictionary-like object containing the chunk's text, start and end indices, token count, and an array of sentence data.
   * @returns {SentenceChunk} A new SentenceChunk object created from the provided dictionary-like object.
   */
  static fromDict(data) {
    const sentences = data.sentences.map((sentence) => Sentence.fromDict(sentence));
    return new _SentenceChunk({
      text: data.text,
      startIndex: data.startIndex,
      endIndex: data.endIndex,
      tokenCount: data.tokenCount,
      sentences,
      embedding: data.embedding ?? void 0
    });
  }
};
|
|
5127
|
+
|
|
5128
|
+
// src/chunking/sentence.ts
|
|
5129
|
+
var SentenceChunker = class _SentenceChunker extends BaseChunker {
  chunkSize;
  chunkOverlap;
  minSentencesPerChunk;
  minCharactersPerSentence;
  approximate;
  delim;
  includeDelim;
  sep;
  /**
   * Private constructor. Use `SentenceChunker.create()` to instantiate.
   *
   * @param {Tokenizer} tokenizer - The tokenizer to use for token counting.
   * @param {number} chunkSize - Maximum number of tokens per chunk.
   * @param {number} chunkOverlap - Number of tokens to overlap between consecutive chunks.
   * @param {number} minSentencesPerChunk - Minimum number of sentences per chunk.
   * @param {number} minCharactersPerSentence - Minimum number of characters for a valid sentence.
   * @param {boolean} approximate - Whether to use approximate token counting (deprecated).
   * @param {string[]} delim - List of sentence delimiters to use for splitting.
   * @param {('prev' | 'next' | null)} includeDelim - Whether to include the delimiter with the previous sentence ('prev'), next sentence ('next'), or exclude it (null).
   * @throws {Error} When any of the numeric/option invariants below are violated.
   */
  constructor(tokenizer, chunkSize, chunkOverlap, minSentencesPerChunk, minCharactersPerSentence, approximate, delim, includeDelim) {
    super(tokenizer);
    if (chunkSize <= 0) {
      throw new Error("chunkSize must be greater than 0");
    }
    if (chunkOverlap < 0) {
      throw new Error("chunkOverlap must be non-negative");
    }
    if (chunkOverlap >= chunkSize) {
      throw new Error("chunkOverlap must be less than chunkSize");
    }
    if (minSentencesPerChunk <= 0) {
      throw new Error("minSentencesPerChunk must be greater than 0");
    }
    if (minCharactersPerSentence <= 0) {
      throw new Error("minCharactersPerSentence must be greater than 0");
    }
    if (!delim) {
      throw new Error("delim must be a list of strings or a string");
    }
    if (includeDelim !== "prev" && includeDelim !== "next" && includeDelim !== null) {
      throw new Error("includeDelim must be 'prev', 'next' or null");
    }
    if (approximate) {
      console.warn("Approximate has been deprecated and will be removed from next version onwards!");
    }
    this.chunkSize = chunkSize;
    this.chunkOverlap = chunkOverlap;
    this.minSentencesPerChunk = minSentencesPerChunk;
    this.minCharactersPerSentence = minCharactersPerSentence;
    this.approximate = approximate;
    this.delim = delim;
    this.includeDelim = includeDelim;
    // Internal marker inserted at sentence boundaries; U+2704 (scissors) is
    // assumed not to occur in input text.
    this.sep = "\u2704";
  }
  /**
   * Creates and initializes a SentenceChunker instance that is directly callable.
   *
   * This method is a static factory function that returns a Promise resolving to a CallableSentenceChunker instance.
   * The returned instance is a callable function that can be used to chunk text strings or arrays of text strings.
   *
   * @param {SentenceChunkerOptions} [options] - Options for configuring the SentenceChunker.
   * @returns {Promise<CallableSentenceChunker>} A promise that resolves to a callable SentenceChunker instance.
   *
   * @example
   * const chunker = await SentenceChunker.create();
   * const chunks = await chunker("This is a sample text.");
   * const batchChunks = await chunker(["Text 1", "Text 2"]);
   *
   * @see SentenceChunkerOptions
   */
  static async create(options = {}) {
    const {
      tokenizer = "gpt-3.5-turbo",
      chunkSize = 512,
      chunkOverlap = 0,
      minSentencesPerChunk = 1,
      minCharactersPerSentence = 12,
      approximate = false,
      delim = [". ", "! ", "? ", "\n"],
      includeDelim = "prev"
    } = options;
    // NOTE(review): `await` on a constructor result is a no-op unless the
    // instance is thenable — kept for safety; confirm ExuluTokenizer's contract.
    const tokenizerInstance = await new ExuluTokenizer();
    await tokenizerInstance.create(tokenizer);
    const plainInstance = new _SentenceChunker(
      tokenizerInstance,
      chunkSize,
      chunkOverlap,
      minSentencesPerChunk,
      minCharactersPerSentence,
      approximate,
      delim,
      includeDelim
    );
    // `call` handles both a single string and an array; the previous
    // string/array branches here were byte-identical and have been collapsed.
    const callableFn = function(textOrTexts, showProgress) {
      return plainInstance.call(textOrTexts, showProgress);
    };
    Object.setPrototypeOf(callableFn, _SentenceChunker.prototype);
    Object.assign(callableFn, plainInstance);
    return callableFn;
  }
  // NOTE: The replace + split method is not the best/most efficient way in general to be doing this. It works well in python because python implements .replace and .split in C while the re library is much slower in python.
  // NOTE: The new split -> join -> split is so weird, but it works. I don't quite like it however.
  // TODO: Implement a more efficient method for splitting text into sentences.
  /**
   * Fast sentence splitting while maintaining accuracy.
   *
   * @param {string} text - The text to split into sentences.
   * @returns {string[]} An array of sentences.
   */
  _splitText(text) {
    let t = text;
    // Mark every delimiter occurrence with the internal separator.
    for (const c of this.delim) {
      if (this.includeDelim === "prev") {
        t = t.split(c).join(c + this.sep);
      } else if (this.includeDelim === "next") {
        t = t.split(c).join(this.sep + c);
      } else {
        t = t.split(c).join(this.sep);
      }
    }
    const splits = t.split(this.sep);
    // Merge fragments shorter than minCharactersPerSentence into the next split.
    const sentences = [];
    let current = "";
    for (const s of splits) {
      if (!current) {
        current = s;
      } else {
        if (current.length >= this.minCharactersPerSentence) {
          sentences.push(current);
          current = s;
        } else {
          current += s;
        }
      }
    }
    if (current) {
      sentences.push(current);
    }
    return sentences;
  }
  /**
   * Split text into sentences and calculate token counts for each sentence.
   *
   * @param {string} text - The text to split into sentences.
   * @returns {Promise<Sentence[]>} An array of Sentence objects.
   */
  async _prepareSentences(text) {
    const sentenceTexts = this._splitText(text);
    if (!sentenceTexts.length) {
      return [];
    }
    // Start positions follow from cumulative sentence lengths (splitting is lossless).
    const positions = [];
    let currentPos = 0;
    for (const sent of sentenceTexts) {
      positions.push(currentPos);
      currentPos += sent.length;
    }
    const tokenCounts = await this.tokenizer.countTokensBatch(sentenceTexts);
    return sentenceTexts.map((sent, i) => new Sentence({
      text: sent,
      startIndex: positions[i],
      endIndex: positions[i] + sent.length,
      tokenCount: tokenCounts[i]
    }));
  }
  /**
   * Create a chunk from a list of sentences.
   *
   * @param {Sentence[]} sentences - The sentences to create a chunk from (must be non-empty).
   * @returns {Promise<SentenceChunk>} A promise that resolves to a SentenceChunk object.
   */
  async _createChunk(sentences) {
    const chunkText = sentences.map((sentence) => sentence.text).join("");
    const tokenCount = await this.tokenizer.countTokens(chunkText);
    return new SentenceChunk({
      text: chunkText,
      startIndex: sentences[0].startIndex,
      endIndex: sentences[sentences.length - 1].endIndex,
      tokenCount,
      sentences
    });
  }
  /**
   * Split text into overlapping chunks based on sentences while respecting token limits.
   *
   * @param {string} text - The text to split into chunks.
   * @returns {Promise<SentenceChunk[]>} A promise that resolves to an array of SentenceChunk objects.
   */
  async chunk(text) {
    if (!text.trim()) {
      return [];
    }
    const sentences = await this._prepareSentences(text);
    if (!sentences.length) {
      return [];
    }
    // tokenSums[i] = total tokens of sentences[0..i-1] (length = sentences + 1).
    const tokenSums = [];
    let sum = 0;
    for (const sentence of sentences) {
      tokenSums.push(sum);
      sum += sentence.tokenCount;
    }
    tokenSums.push(sum);
    const chunks = [];
    let pos = 0;
    while (pos < sentences.length) {
      // Find how many sentences fit within chunkSize tokens starting at pos.
      const targetTokens = tokenSums[pos] + this.chunkSize;
      let splitIdx = this._bisectLeft(tokenSums, targetTokens, pos) - 1;
      splitIdx = Math.min(splitIdx, sentences.length);
      // Always take at least one sentence to guarantee progress.
      splitIdx = Math.max(splitIdx, pos + 1);
      if (splitIdx - pos < this.minSentencesPerChunk) {
        if (pos + this.minSentencesPerChunk <= sentences.length) {
          splitIdx = pos + this.minSentencesPerChunk;
        } else {
          console.warn(
            `Minimum sentences per chunk as ${this.minSentencesPerChunk} could not be met for all chunks. Last chunk of the text will have only ${sentences.length - pos} sentences. Consider increasing the chunk_size or decreasing the min_sentences_per_chunk.`
          );
          splitIdx = sentences.length;
        }
      }
      const chunkSentences = sentences.slice(pos, splitIdx);
      chunks.push(await this._createChunk(chunkSentences));
      if (this.chunkOverlap > 0 && splitIdx < sentences.length) {
        // Walk backwards from the split point, accumulating whole sentences
        // (plus one joiner token each) until the overlap budget is reached.
        let overlapTokens = 0;
        let overlapIdx = splitIdx - 1;
        while (overlapIdx > pos && overlapTokens < this.chunkOverlap) {
          const sent = sentences[overlapIdx];
          const nextTokens = overlapTokens + sent.tokenCount + 1;
          if (nextTokens > this.chunkOverlap) {
            break;
          }
          overlapTokens = nextTokens;
          overlapIdx--;
        }
        pos = overlapIdx + 1;
      } else {
        pos = splitIdx;
      }
    }
    // NOTE(review): the tokenizer is released after every call — confirm the
    // chunker instance is not meant to be reused after chunk() resolves.
    await this.tokenizer.free();
    return chunks;
  }
  /**
   * Binary search to find the leftmost position where value should be inserted to maintain order.
   *
   * @param {number[]} arr - The array to search.
   * @param {number} value - The value to search for.
   * @param {number} [lo] - The starting index of the search.
   * @returns {number} The index of the leftmost position where value should be inserted.
   */
  _bisectLeft(arr, value, lo = 0) {
    let hi = arr.length;
    while (lo < hi) {
      const mid = lo + hi >>> 1;
      if (arr[mid] < value) {
        lo = mid + 1;
      } else {
        hi = mid;
      }
    }
    return lo;
  }
  /**
   * Return a string representation of the SentenceChunker.
   *
   * @returns {string} A string representation of the SentenceChunker.
   */
  toString() {
    return `SentenceChunker(tokenizer=${this.tokenizer}, chunkSize=${this.chunkSize}, chunkOverlap=${this.chunkOverlap}, minSentencesPerChunk=${this.minSentencesPerChunk}, minCharactersPerSentence=${this.minCharactersPerSentence}, approximate=${this.approximate}, delim=${this.delim}, includeDelim=${this.includeDelim})`;
  }
};
|
|
4193
5406
|
|
|
4194
5407
|
// src/cli/index.tsx
|
|
4195
5408
|
import { useState as useState2 } from "react";
|
|
@@ -4389,12 +5602,10 @@ var ExuluJobs = {
|
|
|
4389
5602
|
}
|
|
4390
5603
|
};
|
|
4391
5604
|
var ExuluChunkers = {
|
|
4392
|
-
|
|
4393
|
-
|
|
4394
|
-
|
|
4395
|
-
|
|
4396
|
-
rules: RecursiveRules
|
|
4397
|
-
}
|
|
5605
|
+
sentence: SentenceChunker,
|
|
5606
|
+
recursive: {
|
|
5607
|
+
function: RecursiveChunker,
|
|
5608
|
+
rules: RecursiveRules
|
|
4398
5609
|
}
|
|
4399
5610
|
};
|
|
4400
5611
|
var ExuluDatabase = {
|