@exulu/backend 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1237 -26
- package/dist/index.d.cts +863 -7
- package/dist/index.d.ts +863 -7
- package/dist/index.js +1232 -21
- package/package.json +4 -3
package/dist/index.cjs
CHANGED
|
@@ -1850,10 +1850,10 @@ var updateStatistic = async (statistic) => {
|
|
|
1850
1850
|
};
|
|
1851
1851
|
|
|
1852
1852
|
// src/registry/index.ts
|
|
1853
|
-
var
|
|
1853
|
+
var import_express4 = require("express");
|
|
1854
1854
|
|
|
1855
1855
|
// src/registry/routes.ts
|
|
1856
|
-
var
|
|
1856
|
+
var import_express2 = require("express");
|
|
1857
1857
|
|
|
1858
1858
|
// src/registry/rate-limiter.ts
|
|
1859
1859
|
var rateLimiter = async (key, windowSeconds, limit, points) => {
|
|
@@ -1891,9 +1891,25 @@ var rateLimiter = async (key, windowSeconds, limit, points) => {
|
|
|
1891
1891
|
}
|
|
1892
1892
|
};
|
|
1893
1893
|
|
|
1894
|
-
// src/
|
|
1895
|
-
var
|
|
1896
|
-
var
|
|
1894
|
+
// src/auth/get-token.ts
|
|
1895
|
+
var import_jose = require("jose");
|
|
1896
|
+
var getToken = async (authHeader) => {
|
|
1897
|
+
const token = authHeader.split(" ")[1];
|
|
1898
|
+
if (!token) {
|
|
1899
|
+
throw new Error("No token provided");
|
|
1900
|
+
}
|
|
1901
|
+
if (!process.env.NEXTAUTH_SECRET) {
|
|
1902
|
+
throw new Error("No NEXTAUTH_SECRET provided");
|
|
1903
|
+
}
|
|
1904
|
+
try {
|
|
1905
|
+
const secret = process.env.NEXTAUTH_SECRET;
|
|
1906
|
+
const jwk = await (0, import_jose.importJWK)({ k: secret, alg: "HS256", kty: "oct" });
|
|
1907
|
+
const { payload } = await (0, import_jose.jwtVerify)(token, jwk);
|
|
1908
|
+
return payload;
|
|
1909
|
+
} catch (error) {
|
|
1910
|
+
throw new Error("Invalid token");
|
|
1911
|
+
}
|
|
1912
|
+
};
|
|
1897
1913
|
|
|
1898
1914
|
// src/auth/auth.ts
|
|
1899
1915
|
var import_bcryptjs2 = __toESM(require("bcryptjs"), 1);
|
|
@@ -2031,7 +2047,8 @@ var requestValidators = {
|
|
|
2031
2047
|
let authtoken = null;
|
|
2032
2048
|
if (typeof apikey !== "string") {
|
|
2033
2049
|
const secret = process.env.NEXTAUTH_SECRET;
|
|
2034
|
-
authtoken = await
|
|
2050
|
+
authtoken = await getToken(req.headers["authorization"] ?? "");
|
|
2051
|
+
console.log("[EXULU] authtoken", authtoken);
|
|
2035
2052
|
}
|
|
2036
2053
|
return await authentication({
|
|
2037
2054
|
authtoken,
|
|
@@ -2216,7 +2233,7 @@ var VectorMethodEnum = {
|
|
|
2216
2233
|
};
|
|
2217
2234
|
|
|
2218
2235
|
// src/registry/routes.ts
|
|
2219
|
-
var
|
|
2236
|
+
var import_express3 = __toESM(require("express"), 1);
|
|
2220
2237
|
var import_server3 = require("@apollo/server");
|
|
2221
2238
|
var Papa = __toESM(require("papaparse"), 1);
|
|
2222
2239
|
var import_cors = __toESM(require("cors"), 1);
|
|
@@ -2554,9 +2571,8 @@ type PageInfo {
|
|
|
2554
2571
|
var import_express5 = require("@as-integrations/express5");
|
|
2555
2572
|
|
|
2556
2573
|
// src/registry/uppy.ts
|
|
2557
|
-
var
|
|
2574
|
+
var import_express = require("express");
|
|
2558
2575
|
var import_body_parser = __toESM(require("body-parser"), 1);
|
|
2559
|
-
var import_jwt2 = require("next-auth/jwt");
|
|
2560
2576
|
var createUppyRoutes = async (app) => {
|
|
2561
2577
|
const {
|
|
2562
2578
|
S3Client,
|
|
@@ -2619,11 +2635,11 @@ var createUppyRoutes = async (app) => {
|
|
|
2619
2635
|
}
|
|
2620
2636
|
app.use(import_body_parser.default.urlencoded({ extended: true }), import_body_parser.default.json());
|
|
2621
2637
|
app.get("/s3/list", async (req, res, next) => {
|
|
2638
|
+
req.accepts;
|
|
2622
2639
|
const apikey = req.headers["exulu-api-key"] || null;
|
|
2623
2640
|
let authtoken = null;
|
|
2624
2641
|
if (typeof apikey !== "string") {
|
|
2625
|
-
|
|
2626
|
-
authtoken = await (0, import_jwt2.getToken)({ req, secret });
|
|
2642
|
+
authtoken = await getToken(req.headers.authorization ?? "");
|
|
2627
2643
|
}
|
|
2628
2644
|
const { db: db2 } = await postgresClient();
|
|
2629
2645
|
const authenticationResult = await authentication({
|
|
@@ -2673,8 +2689,7 @@ var createUppyRoutes = async (app) => {
|
|
|
2673
2689
|
const { db: db2 } = await postgresClient();
|
|
2674
2690
|
let authtoken = null;
|
|
2675
2691
|
if (typeof apikey !== "string" && typeof internalkey !== "string") {
|
|
2676
|
-
|
|
2677
|
-
authtoken = await (0, import_jwt2.getToken)({ req, secret });
|
|
2692
|
+
authtoken = await getToken(req.headers.authorization ?? "");
|
|
2678
2693
|
}
|
|
2679
2694
|
const authenticationResult = await authentication({
|
|
2680
2695
|
authtoken,
|
|
@@ -2747,8 +2762,7 @@ var createUppyRoutes = async (app) => {
|
|
|
2747
2762
|
const { db: db2 } = await postgresClient();
|
|
2748
2763
|
let authtoken = null;
|
|
2749
2764
|
if (typeof apikey !== "string") {
|
|
2750
|
-
|
|
2751
|
-
authtoken = await (0, import_jwt2.getToken)({ req, secret });
|
|
2765
|
+
authtoken = await getToken(req.headers.authorization ?? "");
|
|
2752
2766
|
}
|
|
2753
2767
|
const authenticationResult = await authentication({
|
|
2754
2768
|
authtoken,
|
|
@@ -2796,8 +2810,7 @@ var createUppyRoutes = async (app) => {
|
|
|
2796
2810
|
const { db: db2 } = await postgresClient();
|
|
2797
2811
|
let authtoken = null;
|
|
2798
2812
|
if (typeof apikey !== "string") {
|
|
2799
|
-
|
|
2800
|
-
authtoken = await (0, import_jwt2.getToken)({ req, secret });
|
|
2813
|
+
authtoken = await getToken(req.headers.authorization ?? "");
|
|
2801
2814
|
}
|
|
2802
2815
|
const authenticationResult = await authentication({
|
|
2803
2816
|
authtoken,
|
|
@@ -3062,7 +3075,7 @@ var createExpressRoutes = async (app, agents, embedders, tools, workflows, conte
|
|
|
3062
3075
|
app.use(
|
|
3063
3076
|
"/graphql",
|
|
3064
3077
|
(0, import_cors.default)(),
|
|
3065
|
-
|
|
3078
|
+
import_express3.default.json(),
|
|
3066
3079
|
(0, import_express5.expressMiddleware)(server, {
|
|
3067
3080
|
context: async ({ req }) => {
|
|
3068
3081
|
const authenticationResult = await requestValidators.authenticate(req);
|
|
@@ -4232,8 +4245,1208 @@ var ExuluApp = class {
|
|
|
4232
4245
|
};
|
|
4233
4246
|
};
|
|
4234
4247
|
|
|
4235
|
-
// src/
|
|
4236
|
-
var
|
|
4248
|
+
// src/chunking/types/base.ts
|
|
4249
|
+
var Chunk = class _Chunk {
|
|
4250
|
+
/** The text of the chunk. */
|
|
4251
|
+
text;
|
|
4252
|
+
/** The starting index of the chunk in the original text. */
|
|
4253
|
+
startIndex;
|
|
4254
|
+
/** The ending index of the chunk in the original text. */
|
|
4255
|
+
endIndex;
|
|
4256
|
+
/** The number of tokens in the chunk. */
|
|
4257
|
+
tokenCount;
|
|
4258
|
+
/** Optional embedding for the chunk. */
|
|
4259
|
+
embedding;
|
|
4260
|
+
/**
|
|
4261
|
+
* Constructs a new Chunk object.
|
|
4262
|
+
*
|
|
4263
|
+
* @param {ChunkData} data - The data to construct the Chunk from.
|
|
4264
|
+
*/
|
|
4265
|
+
constructor(data) {
|
|
4266
|
+
this.text = data.text;
|
|
4267
|
+
this.startIndex = data.startIndex;
|
|
4268
|
+
this.endIndex = data.endIndex;
|
|
4269
|
+
this.tokenCount = data.tokenCount;
|
|
4270
|
+
this.embedding = data.embedding;
|
|
4271
|
+
if (this.startIndex > this.endIndex) {
|
|
4272
|
+
throw new Error("Start index must be less than or equal to end index.");
|
|
4273
|
+
}
|
|
4274
|
+
if (this.tokenCount < 0) {
|
|
4275
|
+
throw new Error("Token count must be a non-negative integer.");
|
|
4276
|
+
}
|
|
4277
|
+
}
|
|
4278
|
+
/** Return a string representation of the Chunk.
|
|
4279
|
+
*
|
|
4280
|
+
* @returns {string} The text of the chunk.
|
|
4281
|
+
*/
|
|
4282
|
+
toString() {
|
|
4283
|
+
return this.text;
|
|
4284
|
+
}
|
|
4285
|
+
/** Return a detailed string representation of the Chunk.
|
|
4286
|
+
*
|
|
4287
|
+
* @returns {string} The detailed string representation of the Chunk.
|
|
4288
|
+
*/
|
|
4289
|
+
toRepresentation() {
|
|
4290
|
+
let repr = `Chunk(text='${this.text}', tokenCount=${this.tokenCount}, startIndex=${this.startIndex}, endIndex=${this.endIndex}`;
|
|
4291
|
+
repr += ")";
|
|
4292
|
+
return repr;
|
|
4293
|
+
}
|
|
4294
|
+
/** Return a slice of the chunk's text.
|
|
4295
|
+
*
|
|
4296
|
+
* @param {number} [start] - The starting index of the slice.
|
|
4297
|
+
* @param {number} [end] - The ending index of the slice.
|
|
4298
|
+
* @returns {string} The slice of the chunk's text.
|
|
4299
|
+
*/
|
|
4300
|
+
slice(start, end) {
|
|
4301
|
+
return this.text.slice(start, end);
|
|
4302
|
+
}
|
|
4303
|
+
/** Return the Chunk as a dictionary-like object.
|
|
4304
|
+
*
|
|
4305
|
+
* @returns {ChunkData} The dictionary-like object.
|
|
4306
|
+
*/
|
|
4307
|
+
toDict() {
|
|
4308
|
+
return {
|
|
4309
|
+
text: this.text,
|
|
4310
|
+
startIndex: this.startIndex,
|
|
4311
|
+
endIndex: this.endIndex,
|
|
4312
|
+
tokenCount: this.tokenCount,
|
|
4313
|
+
embedding: this.embedding
|
|
4314
|
+
};
|
|
4315
|
+
}
|
|
4316
|
+
/** Create a Chunk object from a dictionary-like object.
|
|
4317
|
+
*
|
|
4318
|
+
* @param {ChunkData} data - The dictionary-like object.
|
|
4319
|
+
* @returns {Chunk} The Chunk object.
|
|
4320
|
+
*/
|
|
4321
|
+
static fromDict(data) {
|
|
4322
|
+
return new _Chunk({
|
|
4323
|
+
text: data.text,
|
|
4324
|
+
startIndex: data.startIndex,
|
|
4325
|
+
endIndex: data.endIndex,
|
|
4326
|
+
tokenCount: data.tokenCount,
|
|
4327
|
+
embedding: data.embedding
|
|
4328
|
+
});
|
|
4329
|
+
}
|
|
4330
|
+
/** Return a deep copy of the chunk.
|
|
4331
|
+
*
|
|
4332
|
+
* @returns {Chunk} The deep copy of the chunk.
|
|
4333
|
+
*/
|
|
4334
|
+
copy() {
|
|
4335
|
+
return _Chunk.fromDict(this.toDict());
|
|
4336
|
+
}
|
|
4337
|
+
};
|
|
4338
|
+
|
|
4339
|
+
// src/chunking/types/recursive.ts
|
|
4340
|
+
var RecursiveLevel = class _RecursiveLevel {
|
|
4341
|
+
/** Custom delimiters for chunking */
|
|
4342
|
+
delimiters;
|
|
4343
|
+
/** Whether to use whitespace as a delimiter */
|
|
4344
|
+
whitespace;
|
|
4345
|
+
/** Whether to include the delimiter in the previous or next chunk */
|
|
4346
|
+
includeDelim;
|
|
4347
|
+
/**
|
|
4348
|
+
* Constructs a new RecursiveLevel object.
|
|
4349
|
+
*
|
|
4350
|
+
* @param {RecursiveLevelData} data - The data to construct the RecursiveLevel from.
|
|
4351
|
+
*/
|
|
4352
|
+
constructor(data = {}) {
|
|
4353
|
+
this.delimiters = data.delimiters;
|
|
4354
|
+
this.whitespace = data.whitespace ?? false;
|
|
4355
|
+
this.includeDelim = data.includeDelim ?? "prev";
|
|
4356
|
+
this.validate();
|
|
4357
|
+
}
|
|
4358
|
+
/**
|
|
4359
|
+
* Validates the RecursiveLevel object.
|
|
4360
|
+
*
|
|
4361
|
+
* @private
|
|
4362
|
+
*/
|
|
4363
|
+
validate() {
|
|
4364
|
+
if (this.delimiters !== void 0 && this.whitespace) {
|
|
4365
|
+
throw new Error("Cannot use whitespace as a delimiter and also specify custom delimiters.");
|
|
4366
|
+
}
|
|
4367
|
+
if (this.delimiters !== void 0) {
|
|
4368
|
+
if (typeof this.delimiters === "string" && this.delimiters.length === 0) {
|
|
4369
|
+
throw new Error("Custom delimiters cannot be an empty string.");
|
|
4370
|
+
}
|
|
4371
|
+
if (Array.isArray(this.delimiters)) {
|
|
4372
|
+
if (this.delimiters.some((delim) => typeof delim !== "string" || delim.length === 0)) {
|
|
4373
|
+
throw new Error("Custom delimiters cannot be an empty string.");
|
|
4374
|
+
}
|
|
4375
|
+
if (this.delimiters.includes(" ")) {
|
|
4376
|
+
throw new Error("Custom delimiters cannot be whitespace only. Set whitespace to true instead.");
|
|
4377
|
+
}
|
|
4378
|
+
}
|
|
4379
|
+
}
|
|
4380
|
+
}
|
|
4381
|
+
/** Return a string representation of the RecursiveLevel
|
|
4382
|
+
*
|
|
4383
|
+
* @returns {string} The string representation of the RecursiveLevel.
|
|
4384
|
+
*/
|
|
4385
|
+
toString() {
|
|
4386
|
+
return `RecursiveLevel(delimiters=${this.delimiters}, whitespace=${this.whitespace}, includeDelim=${this.includeDelim})`;
|
|
4387
|
+
}
|
|
4388
|
+
/** Return the RecursiveLevel as a dictionary-like object
|
|
4389
|
+
*
|
|
4390
|
+
* @returns {RecursiveLevelData} The dictionary-like object.
|
|
4391
|
+
*/
|
|
4392
|
+
toDict() {
|
|
4393
|
+
return {
|
|
4394
|
+
delimiters: this.delimiters,
|
|
4395
|
+
whitespace: this.whitespace,
|
|
4396
|
+
includeDelim: this.includeDelim
|
|
4397
|
+
};
|
|
4398
|
+
}
|
|
4399
|
+
/** Create RecursiveLevel object from a dictionary
|
|
4400
|
+
*
|
|
4401
|
+
* @param {RecursiveLevelData} data - The dictionary-like object.
|
|
4402
|
+
* @returns {RecursiveLevel} The RecursiveLevel object.
|
|
4403
|
+
*/
|
|
4404
|
+
static fromDict(data) {
|
|
4405
|
+
return new _RecursiveLevel(data);
|
|
4406
|
+
}
|
|
4407
|
+
/** Create RecursiveLevel object from a recipe
|
|
4408
|
+
*
|
|
4409
|
+
* @param {string} name - The name of the recipe.
|
|
4410
|
+
* @param {string} lang - The language of the recipe.
|
|
4411
|
+
* @returns {Promise<RecursiveLevel>} The RecursiveLevel object.
|
|
4412
|
+
*/
|
|
4413
|
+
static async fromRecipe(name, lang = "en") {
|
|
4414
|
+
throw new Error("Not implemented");
|
|
4415
|
+
}
|
|
4416
|
+
};
|
|
4417
|
+
var RecursiveRules = class _RecursiveRules {
|
|
4418
|
+
/** List of recursive levels */
|
|
4419
|
+
levels;
|
|
4420
|
+
constructor(data = {}) {
|
|
4421
|
+
if (data.levels === void 0) {
|
|
4422
|
+
const paragraphs = new RecursiveLevel({ delimiters: ["\n\n", "\r\n", "\n", "\r"] });
|
|
4423
|
+
const sentences = new RecursiveLevel({ delimiters: [". ", "! ", "? "] });
|
|
4424
|
+
const pauses = new RecursiveLevel({
|
|
4425
|
+
delimiters: [
|
|
4426
|
+
"{",
|
|
4427
|
+
"}",
|
|
4428
|
+
'"',
|
|
4429
|
+
"[",
|
|
4430
|
+
"]",
|
|
4431
|
+
"<",
|
|
4432
|
+
">",
|
|
4433
|
+
"(",
|
|
4434
|
+
")",
|
|
4435
|
+
":",
|
|
4436
|
+
";",
|
|
4437
|
+
",",
|
|
4438
|
+
"\u2014",
|
|
4439
|
+
"|",
|
|
4440
|
+
"~",
|
|
4441
|
+
"-",
|
|
4442
|
+
"...",
|
|
4443
|
+
"`",
|
|
4444
|
+
"'"
|
|
4445
|
+
]
|
|
4446
|
+
});
|
|
4447
|
+
const word = new RecursiveLevel({ whitespace: true });
|
|
4448
|
+
const token = new RecursiveLevel();
|
|
4449
|
+
this.levels = [paragraphs, sentences, pauses, word, token];
|
|
4450
|
+
} else {
|
|
4451
|
+
this.levels = data.levels.map((level) => new RecursiveLevel(level));
|
|
4452
|
+
}
|
|
4453
|
+
}
|
|
4454
|
+
/** Return a string representation of the RecursiveRules
|
|
4455
|
+
*
|
|
4456
|
+
* @returns {string} The string representation of the RecursiveRules.
|
|
4457
|
+
*/
|
|
4458
|
+
toString() {
|
|
4459
|
+
return `RecursiveRules(levels=${this.levels})`;
|
|
4460
|
+
}
|
|
4461
|
+
/** Return the number of levels
|
|
4462
|
+
*
|
|
4463
|
+
* @returns {number} The number of levels.
|
|
4464
|
+
*/
|
|
4465
|
+
get length() {
|
|
4466
|
+
return this.levels.length;
|
|
4467
|
+
}
|
|
4468
|
+
/** Get a level by index
|
|
4469
|
+
*
|
|
4470
|
+
* @param {number} index - The index of the level.
|
|
4471
|
+
* @returns {RecursiveLevel | undefined} The level.
|
|
4472
|
+
*/
|
|
4473
|
+
getLevel(index) {
|
|
4474
|
+
return this.levels[index];
|
|
4475
|
+
}
|
|
4476
|
+
/** Return an iterator over the levels
|
|
4477
|
+
*
|
|
4478
|
+
* @returns {Iterator<RecursiveLevel>} The iterator over the levels.
|
|
4479
|
+
*/
|
|
4480
|
+
[Symbol.iterator]() {
|
|
4481
|
+
return this.levels[Symbol.iterator]();
|
|
4482
|
+
}
|
|
4483
|
+
/** Create a RecursiveRules object from a dictionary
|
|
4484
|
+
*
|
|
4485
|
+
* @param {RecursiveRulesData} data - The dictionary-like object.
|
|
4486
|
+
* @returns {RecursiveRules} The RecursiveRules object.
|
|
4487
|
+
*/
|
|
4488
|
+
static fromDict(data) {
|
|
4489
|
+
return new _RecursiveRules(data);
|
|
4490
|
+
}
|
|
4491
|
+
/** Return the RecursiveRules as a dictionary-like object
|
|
4492
|
+
*
|
|
4493
|
+
* @returns {RecursiveRulesData} The dictionary-like object.
|
|
4494
|
+
*/
|
|
4495
|
+
toDict() {
|
|
4496
|
+
return {
|
|
4497
|
+
levels: this.levels.map((level) => level.toDict())
|
|
4498
|
+
};
|
|
4499
|
+
}
|
|
4500
|
+
/** Create a RecursiveRules object from a recipe
|
|
4501
|
+
*
|
|
4502
|
+
* @param {string} name - The name of the recipe.
|
|
4503
|
+
* @param {string} lang - The language of the recipe.
|
|
4504
|
+
* @param {string} path - The path to the recipe.
|
|
4505
|
+
* @returns {Promise<RecursiveRules>} The RecursiveRules object.
|
|
4506
|
+
*/
|
|
4507
|
+
static async fromRecipe(name = "default", lang = "en", path3) {
|
|
4508
|
+
throw new Error("Not implemented");
|
|
4509
|
+
}
|
|
4510
|
+
};
|
|
4511
|
+
var RecursiveChunk = class _RecursiveChunk extends Chunk {
|
|
4512
|
+
/** The level of recursion for the chunk */
|
|
4513
|
+
level;
|
|
4514
|
+
constructor(data) {
|
|
4515
|
+
super(data);
|
|
4516
|
+
this.level = data.level;
|
|
4517
|
+
}
|
|
4518
|
+
/** Return a string representation of the RecursiveChunk
|
|
4519
|
+
*
|
|
4520
|
+
* @returns {string} The string representation of the RecursiveChunk.
|
|
4521
|
+
*/
|
|
4522
|
+
toString() {
|
|
4523
|
+
return `RecursiveChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, level=${this.level})`;
|
|
4524
|
+
}
|
|
4525
|
+
/** Return the RecursiveChunk as a dictionary-like object
|
|
4526
|
+
*
|
|
4527
|
+
* @returns {RecursiveChunkData} The dictionary-like object.
|
|
4528
|
+
*/
|
|
4529
|
+
toDict() {
|
|
4530
|
+
const baseDict = super.toDict();
|
|
4531
|
+
return {
|
|
4532
|
+
...baseDict,
|
|
4533
|
+
level: this.level
|
|
4534
|
+
};
|
|
4535
|
+
}
|
|
4536
|
+
/** Create a RecursiveChunk object from a dictionary
|
|
4537
|
+
*
|
|
4538
|
+
* @param {RecursiveChunkData} data - The dictionary-like object.
|
|
4539
|
+
* @returns {RecursiveChunk} The RecursiveChunk object.
|
|
4540
|
+
*/
|
|
4541
|
+
static fromDict(data) {
|
|
4542
|
+
return new _RecursiveChunk(data);
|
|
4543
|
+
}
|
|
4544
|
+
};
|
|
4545
|
+
|
|
4546
|
+
// src/chunking/tokenizer.ts
|
|
4547
|
+
var import_lite = require("tiktoken/lite");
|
|
4548
|
+
var import_load = require("tiktoken/load");
|
|
4549
|
+
var import_registry = __toESM(require("tiktoken/registry.json"), 1);
|
|
4550
|
+
var import_model_to_encoding = __toESM(require("tiktoken/model_to_encoding.json"), 1);
|
|
4551
|
+
var ExuluTokenizer = class {
|
|
4552
|
+
constructor() {
|
|
4553
|
+
}
|
|
4554
|
+
encoder = null;
|
|
4555
|
+
async create(modelName) {
|
|
4556
|
+
if (this.encoder) {
|
|
4557
|
+
return this.encoder;
|
|
4558
|
+
}
|
|
4559
|
+
const time = performance.now();
|
|
4560
|
+
console.log("[EXULU] Loading tokenizer.", modelName);
|
|
4561
|
+
const model = await (0, import_load.load)(import_registry.default[import_model_to_encoding.default[modelName]]);
|
|
4562
|
+
console.log("[EXULU] Loaded tokenizer.", modelName, performance.now() - time);
|
|
4563
|
+
console.log("[EXULU] Model.", model.bpe_ranks);
|
|
4564
|
+
console.log("[EXULU] Model.", model.special_tokens);
|
|
4565
|
+
console.log("[EXULU] Model.", model.pat_str);
|
|
4566
|
+
const encoder = new import_lite.Tiktoken(
|
|
4567
|
+
model.bpe_ranks,
|
|
4568
|
+
model.special_tokens,
|
|
4569
|
+
model.pat_str
|
|
4570
|
+
);
|
|
4571
|
+
console.log("[EXULU] Encoder.", encoder);
|
|
4572
|
+
this.encoder = encoder;
|
|
4573
|
+
return encoder;
|
|
4574
|
+
}
|
|
4575
|
+
async decode(tokens) {
|
|
4576
|
+
if (!this.encoder) {
|
|
4577
|
+
throw new Error("Tokenizer not initialized");
|
|
4578
|
+
}
|
|
4579
|
+
const text = this.encoder.decode(tokens);
|
|
4580
|
+
return new TextDecoder().decode(text);
|
|
4581
|
+
}
|
|
4582
|
+
async decodeBatch(tokenSequences) {
|
|
4583
|
+
if (!this.encoder) {
|
|
4584
|
+
throw new Error("Tokenizer not initialized");
|
|
4585
|
+
}
|
|
4586
|
+
const promises2 = tokenSequences.map((tokens) => this.decode(tokens));
|
|
4587
|
+
return await Promise.all(promises2);
|
|
4588
|
+
}
|
|
4589
|
+
encode(text) {
|
|
4590
|
+
if (!this.encoder) {
|
|
4591
|
+
throw new Error("Tokenizer not initialized");
|
|
4592
|
+
}
|
|
4593
|
+
const time = performance.now();
|
|
4594
|
+
console.log("[EXULU] Encoding text.", text);
|
|
4595
|
+
const tokens = this.encoder.encode(text);
|
|
4596
|
+
console.log("[EXULU] Encoded text.", text, performance.now() - time);
|
|
4597
|
+
return tokens;
|
|
4598
|
+
}
|
|
4599
|
+
async countTokensBatch(texts) {
|
|
4600
|
+
if (!this.encoder) {
|
|
4601
|
+
throw new Error("Tokenizer not initialized");
|
|
4602
|
+
}
|
|
4603
|
+
const promises2 = texts.map((text) => this.countTokens(text));
|
|
4604
|
+
return await Promise.all(promises2);
|
|
4605
|
+
}
|
|
4606
|
+
countTokens(text) {
|
|
4607
|
+
if (!this.encoder) {
|
|
4608
|
+
throw new Error("Tokenizer not initialized");
|
|
4609
|
+
}
|
|
4610
|
+
console.log("[EXULU] Counting tokens.", text);
|
|
4611
|
+
const tokens = this.encoder.encode(text);
|
|
4612
|
+
const count = tokens.length;
|
|
4613
|
+
console.log("[EXULU] Token count.", count);
|
|
4614
|
+
return count;
|
|
4615
|
+
}
|
|
4616
|
+
async free() {
|
|
4617
|
+
console.log("[EXULU] Freeing tokenizer.");
|
|
4618
|
+
if (this.encoder) {
|
|
4619
|
+
this.encoder.free();
|
|
4620
|
+
}
|
|
4621
|
+
}
|
|
4622
|
+
};
|
|
4623
|
+
|
|
4624
|
+
// src/chunking/base.ts
|
|
4625
|
+
var BaseChunker = class {
|
|
4626
|
+
tokenizer;
|
|
4627
|
+
_useConcurrency = true;
|
|
4628
|
+
// Determines if batch processing uses Promise.all
|
|
4629
|
+
constructor(tokenizer) {
|
|
4630
|
+
this.tokenizer = tokenizer;
|
|
4631
|
+
}
|
|
4632
|
+
/**
|
|
4633
|
+
* Returns a string representation of the chunker instance.
|
|
4634
|
+
*
|
|
4635
|
+
* @returns {string} The class name and constructor signature.
|
|
4636
|
+
*/
|
|
4637
|
+
toString() {
|
|
4638
|
+
return `${this.constructor.name}()`;
|
|
4639
|
+
}
|
|
4640
|
+
async call(textOrTexts, showProgress = false) {
|
|
4641
|
+
if (typeof textOrTexts === "string") {
|
|
4642
|
+
return this.chunk(textOrTexts);
|
|
4643
|
+
} else if (Array.isArray(textOrTexts)) {
|
|
4644
|
+
return this.chunkBatch(textOrTexts, showProgress);
|
|
4645
|
+
} else {
|
|
4646
|
+
throw new Error("Input must be a string or an array of strings.");
|
|
4647
|
+
}
|
|
4648
|
+
}
|
|
4649
|
+
/**
|
|
4650
|
+
* Process a batch of texts sequentially (one after another).
|
|
4651
|
+
*
|
|
4652
|
+
* @protected
|
|
4653
|
+
* @param {string[]} texts - The texts to chunk.
|
|
4654
|
+
* @param {boolean} [showProgress=false] - Whether to display progress in the console.
|
|
4655
|
+
* @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
|
|
4656
|
+
*/
|
|
4657
|
+
async _sequential_batch_processing(texts, showProgress = false) {
|
|
4658
|
+
const results = [];
|
|
4659
|
+
const total = texts.length;
|
|
4660
|
+
for (let i = 0; i < total; i++) {
|
|
4661
|
+
if (showProgress && total > 1) {
|
|
4662
|
+
const progress = Math.round((i + 1) / total * 100);
|
|
4663
|
+
process.stdout.write(`Sequential processing: Document ${i + 1}/${total} (${progress}%)\r`);
|
|
4664
|
+
}
|
|
4665
|
+
results.push(await this.chunk(texts[i]));
|
|
4666
|
+
}
|
|
4667
|
+
if (showProgress && total > 1) {
|
|
4668
|
+
process.stdout.write("\n");
|
|
4669
|
+
}
|
|
4670
|
+
return results;
|
|
4671
|
+
}
|
|
4672
|
+
/**
|
|
4673
|
+
* Process a batch of texts concurrently using Promise.all.
|
|
4674
|
+
*
|
|
4675
|
+
* @protected
|
|
4676
|
+
* @param {string[]} texts - The texts to chunk.
|
|
4677
|
+
* @param {boolean} [showProgress=false] - Whether to display progress in the console.
|
|
4678
|
+
* @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
|
|
4679
|
+
*/
|
|
4680
|
+
async _concurrent_batch_processing(texts, showProgress = false) {
|
|
4681
|
+
const total = texts.length;
|
|
4682
|
+
let completedCount = 0;
|
|
4683
|
+
const updateProgress = () => {
|
|
4684
|
+
if (showProgress && total > 1) {
|
|
4685
|
+
completedCount++;
|
|
4686
|
+
const progress = Math.round(completedCount / total * 100);
|
|
4687
|
+
process.stdout.write(`Concurrent processing: Document ${completedCount}/${total} (${progress}%)\r`);
|
|
4688
|
+
}
|
|
4689
|
+
};
|
|
4690
|
+
const chunkPromises = texts.map(
|
|
4691
|
+
(text) => this.chunk(text).then((result) => {
|
|
4692
|
+
updateProgress();
|
|
4693
|
+
return result;
|
|
4694
|
+
})
|
|
4695
|
+
);
|
|
4696
|
+
const results = await Promise.all(chunkPromises);
|
|
4697
|
+
if (showProgress && total > 1 && completedCount > 0) {
|
|
4698
|
+
process.stdout.write("\n");
|
|
4699
|
+
}
|
|
4700
|
+
return results;
|
|
4701
|
+
}
|
|
4702
|
+
/**
|
|
4703
|
+
* Chunk a batch of texts, using either concurrent or sequential processing.
|
|
4704
|
+
*
|
|
4705
|
+
* If only one text is provided, processes it directly without batch overhead.
|
|
4706
|
+
*
|
|
4707
|
+
* @param {string[]} texts - The texts to chunk.
|
|
4708
|
+
* @param {boolean} [showProgress=true] - Whether to display progress in the console.
|
|
4709
|
+
* @returns {Promise<Chunk[][]>} An array of chunked results for each input text.
|
|
4710
|
+
*/
|
|
4711
|
+
async chunkBatch(texts, showProgress = true) {
|
|
4712
|
+
if (texts.length === 0) {
|
|
4713
|
+
return [];
|
|
4714
|
+
}
|
|
4715
|
+
if (texts.length === 1) {
|
|
4716
|
+
return [await this.chunk(texts[0])];
|
|
4717
|
+
}
|
|
4718
|
+
if (this._useConcurrency) {
|
|
4719
|
+
return this._concurrent_batch_processing(texts, showProgress);
|
|
4720
|
+
} else {
|
|
4721
|
+
return this._sequential_batch_processing(texts, showProgress);
|
|
4722
|
+
}
|
|
4723
|
+
}
|
|
4724
|
+
};
|
|
4725
|
+
|
|
4726
|
+
// src/chunking/recursive.ts
|
|
4727
|
+
var RecursiveChunker = class _RecursiveChunker extends BaseChunker {
|
|
4728
|
+
chunkSize;
|
|
4729
|
+
minCharactersPerChunk;
|
|
4730
|
+
rules;
|
|
4731
|
+
sep;
|
|
4732
|
+
_CHARS_PER_TOKEN = 6.5;
|
|
4733
|
+
/**
|
|
4734
|
+
* Private constructor. Use `RecursiveChunker.create()` to instantiate.
|
|
4735
|
+
*/
|
|
4736
|
+
constructor(tokenizer, chunkSize, rules, minCharactersPerChunk) {
|
|
4737
|
+
super(tokenizer);
|
|
4738
|
+
if (chunkSize <= 0) {
|
|
4739
|
+
throw new Error("chunkSize must be greater than 0");
|
|
4740
|
+
}
|
|
4741
|
+
if (minCharactersPerChunk <= 0) {
|
|
4742
|
+
throw new Error("minCharactersPerChunk must be greater than 0");
|
|
4743
|
+
}
|
|
4744
|
+
if (!(rules instanceof RecursiveRules)) {
|
|
4745
|
+
throw new Error("rules must be a RecursiveRules object");
|
|
4746
|
+
}
|
|
4747
|
+
this.chunkSize = chunkSize;
|
|
4748
|
+
this.minCharactersPerChunk = minCharactersPerChunk;
|
|
4749
|
+
this.rules = rules;
|
|
4750
|
+
this.sep = "\u2704";
|
|
4751
|
+
}
|
|
4752
|
+
/**
|
|
4753
|
+
* Creates and initializes a directly callable RecursiveChunker instance.
|
|
4754
|
+
*
|
|
4755
|
+
* This static factory method constructs a RecursiveChunker with the provided options and returns a callable function object.
|
|
4756
|
+
* The returned instance can be used as both a function (to chunk text(s)) and as an object (with all RecursiveChunker methods and properties).
|
|
4757
|
+
*
|
|
4758
|
+
* @param {RecursiveChunkerOptions} [options] - Configuration options for the chunker. All options are optional:
|
|
4759
|
+
* @param {string|Tokenizer} [options.tokenizer="Xenova/gpt2"] - Tokenizer to use for text processing. Can be a string identifier (e.g., "Xenova/gpt2") or a Tokenizer instance. If a string is provided, Tokenizer.create() is called internally.
|
|
4760
|
+
* @param {number} [options.chunkSize=512] - Maximum number of tokens per chunk. Must be > 0.
|
|
4761
|
+
* @param {RecursiveRules} [options.rules=new RecursiveRules()] - Rules for recursive chunking. See {@link RecursiveRules} for customization.
|
|
4762
|
+
* @param {number} [options.minCharactersPerChunk=24] - Minimum number of characters per chunk. Must be > 0.
|
|
4763
|
+
*
|
|
4764
|
+
* @returns {Promise<CallableRecursiveChunker>} Promise resolving to a callable RecursiveChunker instance.
|
|
4765
|
+
*
|
|
4766
|
+
* @throws {Error} If any option is invalid (e.g., chunkSize <= 0).
|
|
4767
|
+
*
|
|
4768
|
+
* @see CallableRecursiveChunker for the callable interface and available properties/methods.
|
|
4769
|
+
*
|
|
4770
|
+
* @example <caption>Basic usage with default options</caption>
|
|
4771
|
+
* const chunker = await RecursiveChunker.create();
|
|
4772
|
+
* const chunks = await chunker("Some text to chunk");
|
|
4773
|
+
*
|
|
4774
|
+
* @example <caption>Custom options and batch chunking</caption>
|
|
4775
|
+
* const chunker = await RecursiveChunker.create({ chunkSize: 256 });
|
|
4776
|
+
* const batchChunks = await chunker(["Text 1", "Text 2"]);
|
|
4777
|
+
*
|
|
4778
|
+
* @example <caption>Accessing properties and methods</caption>
|
|
4779
|
+
* const chunker = await RecursiveChunker.create();
|
|
4780
|
+
* console.log(chunker.chunkSize); // 512
|
|
4781
|
+
* console.log(chunker.rules); // RecursiveRules instance
|
|
4782
|
+
* const chunks = await chunker.chunk("Some text"); // Use as object method
|
|
4783
|
+
*
|
|
4784
|
+
* @note
|
|
4785
|
+
* The returned instance is both callable (like a function) and has all properties/methods of RecursiveChunker.
|
|
4786
|
+
* You can use it as a drop-in replacement for a function or a class instance.
|
|
4787
|
+
*
|
|
4788
|
+
* @note
|
|
4789
|
+
* For advanced customization, pass a custom RecursiveRules object to the rules option.
|
|
4790
|
+
* See {@link RecursiveRules} and {@link RecursiveLevel} for rule structure.
|
|
4791
|
+
*/
|
|
4792
|
+
static async create(options = {}) {
|
|
4793
|
+
const {
|
|
4794
|
+
tokenizer = "gpt-3.5-turbo",
|
|
4795
|
+
chunkSize = 512,
|
|
4796
|
+
rules = new RecursiveRules(),
|
|
4797
|
+
minCharactersPerChunk = 24
|
|
4798
|
+
} = options;
|
|
4799
|
+
const tokenizerInstance = await new ExuluTokenizer();
|
|
4800
|
+
await tokenizerInstance.create(tokenizer);
|
|
4801
|
+
const plainInstance = new _RecursiveChunker(
|
|
4802
|
+
tokenizerInstance,
|
|
4803
|
+
chunkSize,
|
|
4804
|
+
rules,
|
|
4805
|
+
minCharactersPerChunk
|
|
4806
|
+
);
|
|
4807
|
+
const callableFn = function(textOrTexts, showProgress) {
|
|
4808
|
+
if (typeof textOrTexts === "string") {
|
|
4809
|
+
return plainInstance.call(textOrTexts, showProgress);
|
|
4810
|
+
} else {
|
|
4811
|
+
return plainInstance.call(textOrTexts, showProgress);
|
|
4812
|
+
}
|
|
4813
|
+
};
|
|
4814
|
+
Object.setPrototypeOf(callableFn, _RecursiveChunker.prototype);
|
|
4815
|
+
Object.assign(callableFn, plainInstance);
|
|
4816
|
+
return callableFn;
|
|
4817
|
+
}
|
|
4818
|
+
/**
|
|
4819
|
+
* Estimates the number of tokens in a given text.
|
|
4820
|
+
*
|
|
4821
|
+
* This method uses a character-to-token ratio (default: 6.5 characters per token) for quick estimation.
|
|
4822
|
+
* If the estimated token count exceeds the chunk size, it performs an actual token count.
|
|
4823
|
+
*
|
|
4824
|
+
* @param {string} text - The text to estimate token count for
|
|
4825
|
+
* @returns {Promise<number>} A promise that resolves to the estimated number of tokens
|
|
4826
|
+
* @private
|
|
4827
|
+
*/
|
|
4828
|
+
async _estimateTokenCount(text) {
|
|
4829
|
+
const estimate = Math.max(1, Math.floor(text.length / this._CHARS_PER_TOKEN));
|
|
4830
|
+
if (estimate > this.chunkSize) {
|
|
4831
|
+
return this.chunkSize + 1;
|
|
4832
|
+
}
|
|
4833
|
+
return this.tokenizer.countTokens(text);
|
|
4834
|
+
}
|
|
4835
|
+
/**
|
|
4836
|
+
* Split the text into chunks based on the provided recursive level rules.
|
|
4837
|
+
*
|
|
4838
|
+
* This method handles three different splitting strategies:
|
|
4839
|
+
* 1. Whitespace-based splitting: Splits text on spaces
|
|
4840
|
+
* 2. Delimiter-based splitting: Splits text on specified delimiters with options to include delimiters
|
|
4841
|
+
* 3. Token-based splitting: Splits text into chunks of maximum token size
|
|
4842
|
+
*
|
|
4843
|
+
* @param {string} text - The text to be split into chunks
|
|
4844
|
+
* @param {RecursiveLevel} recursiveLevel - The rules defining how to split the text
|
|
4845
|
+
* @returns {Promise<string[]>} A promise that resolves to an array of text chunks
|
|
4846
|
+
* @private
|
|
4847
|
+
*/
|
|
4848
|
+
async _splitText(text, recursiveLevel) {
|
|
4849
|
+
if (recursiveLevel.whitespace) {
|
|
4850
|
+
return text.split(" ");
|
|
4851
|
+
} else if (recursiveLevel.delimiters) {
|
|
4852
|
+
let t = text;
|
|
4853
|
+
if (recursiveLevel.includeDelim === "prev") {
|
|
4854
|
+
for (const delimiter of Array.isArray(recursiveLevel.delimiters) ? recursiveLevel.delimiters : [recursiveLevel.delimiters]) {
|
|
4855
|
+
t = t.replace(delimiter, delimiter + this.sep);
|
|
4856
|
+
}
|
|
4857
|
+
} else if (recursiveLevel.includeDelim === "next") {
|
|
4858
|
+
for (const delimiter of Array.isArray(recursiveLevel.delimiters) ? recursiveLevel.delimiters : [recursiveLevel.delimiters]) {
|
|
4859
|
+
t = t.replace(delimiter, this.sep + delimiter);
|
|
4860
|
+
}
|
|
4861
|
+
} else {
|
|
4862
|
+
for (const delimiter of Array.isArray(recursiveLevel.delimiters) ? recursiveLevel.delimiters : [recursiveLevel.delimiters]) {
|
|
4863
|
+
t = t.replace(delimiter, this.sep);
|
|
4864
|
+
}
|
|
4865
|
+
}
|
|
4866
|
+
const splits = t.split(this.sep).filter((split) => split !== "");
|
|
4867
|
+
let current = "";
|
|
4868
|
+
const merged = [];
|
|
4869
|
+
for (const split of splits) {
|
|
4870
|
+
if (split.length < this.minCharactersPerChunk) {
|
|
4871
|
+
current += split;
|
|
4872
|
+
} else if (current) {
|
|
4873
|
+
current += split;
|
|
4874
|
+
merged.push(current);
|
|
4875
|
+
current = "";
|
|
4876
|
+
} else {
|
|
4877
|
+
merged.push(split);
|
|
4878
|
+
}
|
|
4879
|
+
if (current.length >= this.minCharactersPerChunk) {
|
|
4880
|
+
merged.push(current);
|
|
4881
|
+
current = "";
|
|
4882
|
+
}
|
|
4883
|
+
}
|
|
4884
|
+
if (current) {
|
|
4885
|
+
merged.push(current);
|
|
4886
|
+
}
|
|
4887
|
+
return merged;
|
|
4888
|
+
} else {
|
|
4889
|
+
const encoded = await this.tokenizer.encode(text);
|
|
4890
|
+
const tokenSplits = [];
|
|
4891
|
+
for (let i = 0; i < encoded.length; i += this.chunkSize) {
|
|
4892
|
+
tokenSplits.push(encoded.slice(i, i + this.chunkSize));
|
|
4893
|
+
}
|
|
4894
|
+
return await this.tokenizer.decodeBatch(tokenSplits);
|
|
4895
|
+
}
|
|
4896
|
+
}
|
|
4897
|
+
/**
|
|
4898
|
+
* Create a RecursiveChunk object with indices based on the current offset.
|
|
4899
|
+
*
|
|
4900
|
+
* This method constructs a RecursiveChunk object that contains metadata about the chunk,
|
|
4901
|
+
* including the text content, its start and end indices, token count, and the level of recursion.
|
|
4902
|
+
*
|
|
4903
|
+
* @param {string} text - The text content of the chunk
|
|
4904
|
+
* @param {number} tokenCount - The number of tokens in the chunk
|
|
4905
|
+
*/
|
|
4906
|
+
_makeChunks(text, tokenCount, level, startOffset) {
|
|
4907
|
+
return new RecursiveChunk({
|
|
4908
|
+
text,
|
|
4909
|
+
startIndex: startOffset,
|
|
4910
|
+
endIndex: startOffset + text.length,
|
|
4911
|
+
tokenCount,
|
|
4912
|
+
level
|
|
4913
|
+
});
|
|
4914
|
+
}
|
|
4915
|
+
/**
|
|
4916
|
+
* Merge short splits.
|
|
4917
|
+
*/
|
|
4918
|
+
_mergeSplits(splits, tokenCounts, combineWhitespace = false) {
|
|
4919
|
+
if (!splits.length || !tokenCounts.length) {
|
|
4920
|
+
return [[], []];
|
|
4921
|
+
}
|
|
4922
|
+
if (splits.length !== tokenCounts.length) {
|
|
4923
|
+
throw new Error(
|
|
4924
|
+
`Number of splits ${splits.length} does not match number of token counts ${tokenCounts.length}`
|
|
4925
|
+
);
|
|
4926
|
+
}
|
|
4927
|
+
if (tokenCounts.every((count) => count > this.chunkSize)) {
|
|
4928
|
+
return [splits, tokenCounts];
|
|
4929
|
+
}
|
|
4930
|
+
const merged = [];
|
|
4931
|
+
const cumulativeTokenCounts = [];
|
|
4932
|
+
let sum = 0;
|
|
4933
|
+
if (combineWhitespace) {
|
|
4934
|
+
cumulativeTokenCounts.push(0);
|
|
4935
|
+
for (const count of tokenCounts) {
|
|
4936
|
+
sum += count + 1;
|
|
4937
|
+
cumulativeTokenCounts.push(sum);
|
|
4938
|
+
}
|
|
4939
|
+
} else {
|
|
4940
|
+
cumulativeTokenCounts.push(0);
|
|
4941
|
+
for (const count of tokenCounts) {
|
|
4942
|
+
sum += count;
|
|
4943
|
+
cumulativeTokenCounts.push(sum);
|
|
4944
|
+
}
|
|
4945
|
+
}
|
|
4946
|
+
let currentIndex = 0;
|
|
4947
|
+
const combinedTokenCounts = [];
|
|
4948
|
+
while (currentIndex < splits.length) {
|
|
4949
|
+
const currentTokenCount = cumulativeTokenCounts[currentIndex] ?? 0;
|
|
4950
|
+
const requiredTokenCount = currentTokenCount + this.chunkSize;
|
|
4951
|
+
let index = this._bisectLeft(
|
|
4952
|
+
cumulativeTokenCounts,
|
|
4953
|
+
requiredTokenCount,
|
|
4954
|
+
currentIndex
|
|
4955
|
+
) - 1;
|
|
4956
|
+
index = Math.min(index, splits.length);
|
|
4957
|
+
if (index === currentIndex) {
|
|
4958
|
+
index += 1;
|
|
4959
|
+
}
|
|
4960
|
+
if (combineWhitespace) {
|
|
4961
|
+
merged.push(splits.slice(currentIndex, index).join(" "));
|
|
4962
|
+
} else {
|
|
4963
|
+
merged.push(splits.slice(currentIndex, index).join(""));
|
|
4964
|
+
}
|
|
4965
|
+
combinedTokenCounts.push(
|
|
4966
|
+
(cumulativeTokenCounts[Math.min(index, splits.length)] ?? 0) - currentTokenCount
|
|
4967
|
+
);
|
|
4968
|
+
currentIndex = index;
|
|
4969
|
+
}
|
|
4970
|
+
return [merged, combinedTokenCounts];
|
|
4971
|
+
}
|
|
4972
|
+
/**
|
|
4973
|
+
* Binary search to find the leftmost position where value should be inserted to maintain order.
|
|
4974
|
+
*
|
|
4975
|
+
* @param {number[]} arr - The array to search
|
|
4976
|
+
* @param {number} value - The value to insert
|
|
4977
|
+
* @param {number} [lo=0] - The starting index for the search
|
|
4978
|
+
* @returns {number} The index where the value should be inserted
|
|
4979
|
+
* @private
|
|
4980
|
+
*/
|
|
4981
|
+
_bisectLeft(arr, value, lo = 0) {
|
|
4982
|
+
let hi = arr.length;
|
|
4983
|
+
while (lo < hi) {
|
|
4984
|
+
const mid = lo + hi >>> 1;
|
|
4985
|
+
if (arr[mid] < value) {
|
|
4986
|
+
lo = mid + 1;
|
|
4987
|
+
} else {
|
|
4988
|
+
hi = mid;
|
|
4989
|
+
}
|
|
4990
|
+
}
|
|
4991
|
+
return lo;
|
|
4992
|
+
}
|
|
4993
|
+
/**
|
|
4994
|
+
* Recursive helper for core chunking.
|
|
4995
|
+
*/
|
|
4996
|
+
async _recursiveChunk(text, level = 0, startOffset = 0) {
|
|
4997
|
+
if (!text) {
|
|
4998
|
+
return [];
|
|
4999
|
+
}
|
|
5000
|
+
console.log("[EXULU] Rule.", this.rules.length);
|
|
5001
|
+
console.log("[EXULU] Level.", level);
|
|
5002
|
+
if (level >= this.rules.length) {
|
|
5003
|
+
const tokenCount = await this._estimateTokenCount(text);
|
|
5004
|
+
return [
|
|
5005
|
+
this._makeChunks(
|
|
5006
|
+
text,
|
|
5007
|
+
tokenCount,
|
|
5008
|
+
level,
|
|
5009
|
+
startOffset
|
|
5010
|
+
)
|
|
5011
|
+
];
|
|
5012
|
+
}
|
|
5013
|
+
const currRule = this.rules.getLevel(level);
|
|
5014
|
+
if (!currRule) {
|
|
5015
|
+
throw new Error(`No rule found at level ${level}`);
|
|
5016
|
+
}
|
|
5017
|
+
const splits = await this._splitText(text, currRule);
|
|
5018
|
+
const tokenCounts = await Promise.all(splits.map((split) => this._estimateTokenCount(split)));
|
|
5019
|
+
let merged;
|
|
5020
|
+
let combinedTokenCounts;
|
|
5021
|
+
if (currRule.delimiters === void 0 && !currRule.whitespace) {
|
|
5022
|
+
[merged, combinedTokenCounts] = [splits, tokenCounts];
|
|
5023
|
+
} else if (currRule.delimiters === void 0 && currRule.whitespace) {
|
|
5024
|
+
[merged, combinedTokenCounts] = this._mergeSplits(
|
|
5025
|
+
splits,
|
|
5026
|
+
tokenCounts,
|
|
5027
|
+
true
|
|
5028
|
+
);
|
|
5029
|
+
merged = merged.slice(0, 1).concat(
|
|
5030
|
+
merged.slice(1).map((text2) => " " + text2)
|
|
5031
|
+
);
|
|
5032
|
+
} else {
|
|
5033
|
+
[merged, combinedTokenCounts] = this._mergeSplits(
|
|
5034
|
+
splits,
|
|
5035
|
+
tokenCounts,
|
|
5036
|
+
false
|
|
5037
|
+
);
|
|
5038
|
+
}
|
|
5039
|
+
const chunks = [];
|
|
5040
|
+
let currentOffset = startOffset;
|
|
5041
|
+
for (let i = 0; i < merged.length; i++) {
|
|
5042
|
+
const split = merged[i];
|
|
5043
|
+
const tokenCount = combinedTokenCounts[i];
|
|
5044
|
+
if (tokenCount && tokenCount > this.chunkSize) {
|
|
5045
|
+
chunks.push(...await this._recursiveChunk(split ?? "", level + 1, currentOffset));
|
|
5046
|
+
} else {
|
|
5047
|
+
chunks.push(
|
|
5048
|
+
this._makeChunks(split ?? "", tokenCount ?? 0, level, currentOffset)
|
|
5049
|
+
);
|
|
5050
|
+
}
|
|
5051
|
+
currentOffset += split?.length ?? 0;
|
|
5052
|
+
}
|
|
5053
|
+
return chunks;
|
|
5054
|
+
}
|
|
5055
|
+
/**
|
|
5056
|
+
* Recursively chunk text.
|
|
5057
|
+
*
|
|
5058
|
+
* This method is the main entry point for chunking text using the RecursiveChunker.
|
|
5059
|
+
* It takes a single text string and returns an array of RecursiveChunk objects.
|
|
5060
|
+
*
|
|
5061
|
+
* @param {string} text - The text to be chunked
|
|
5062
|
+
* @returns {Promise<RecursiveChunk[]>} A promise that resolves to an array of RecursiveChunk objects
|
|
5063
|
+
*/
|
|
5064
|
+
async chunk(text) {
|
|
5065
|
+
console.log("[EXULU] Chunking text.", text);
|
|
5066
|
+
const result = await this._recursiveChunk(text, 0, 0);
|
|
5067
|
+
await this.tokenizer.free();
|
|
5068
|
+
return result;
|
|
5069
|
+
}
|
|
5070
|
+
/**
|
|
5071
|
+
* Return a string representation of the RecursiveChunker.
|
|
5072
|
+
*
|
|
5073
|
+
* This method provides a string representation of the RecursiveChunker instance,
|
|
5074
|
+
* including its tokenizer, rules, chunk size, minimum characters per chunk, and return type.
|
|
5075
|
+
*
|
|
5076
|
+
* @returns {string} A string representation of the RecursiveChunker
|
|
5077
|
+
*/
|
|
5078
|
+
toString() {
|
|
5079
|
+
return `RecursiveChunker(tokenizer=${this.tokenizer}, rules=${this.rules}, chunkSize=${this.chunkSize}, minCharactersPerChunk=${this.minCharactersPerChunk})`;
|
|
5080
|
+
}
|
|
5081
|
+
};
|
|
5082
|
+
|
|
5083
|
+
// src/chunking/types/sentence.ts
|
|
5084
|
+
var Sentence = class _Sentence {
  /** The text of the sentence */
  text;
  /** The starting index of the sentence in the original text */
  startIndex;
  /** The ending index of the sentence in the original text */
  endIndex;
  /** The number of tokens in the sentence */
  tokenCount;
  /**
   * Build a Sentence from a plain data object.
   * @param {{text: string, startIndex: number, endIndex: number, tokenCount: number}} data
   */
  constructor(data) {
    const { text, startIndex, endIndex, tokenCount } = data;
    this.text = text;
    this.startIndex = startIndex;
    this.endIndex = endIndex;
    this.tokenCount = tokenCount;
  }
  /** Human-readable representation of the Sentence. */
  toString() {
    return `Sentence(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount})`;
  }
  /** Return the Sentence as a dictionary-like plain object. */
  toDict() {
    const { text, startIndex, endIndex, tokenCount } = this;
    return { text, startIndex, endIndex, tokenCount };
  }
  /** Rebuild a Sentence from a dictionary-like plain object. */
  static fromDict(data) {
    return new _Sentence(data);
  }
};
|
|
5117
|
+
var SentenceChunk = class _SentenceChunk extends Chunk {
  /** List of sentences contained in the chunk */
  sentences;
  /**
   * Build a SentenceChunk from chunk data plus its sentence list.
   * @param {SentenceChunkData} data - Chunk fields, sentences, and optional embedding
   */
  constructor(data) {
    super(data);
    this.sentences = data.sentences;
    this.embedding = data.embedding ?? void 0;
  }
  /**
   * Detailed string representation of the SentenceChunk, including its text,
   * indices, token count, and every contained sentence rendered via its own
   * toString. Useful for debugging and logging.
   *
   * @returns {string} e.g. SentenceChunk(text=..., startIndex=..., endIndex=..., tokenCount=..., sentences=[Sentence(...), ...])
   */
  toString() {
    const renderedSentences = this.sentences.map((sentence) => sentence.toString()).join(", ");
    return `SentenceChunk(text=${this.text}, startIndex=${this.startIndex}, endIndex=${this.endIndex}, tokenCount=${this.tokenCount}, sentences=[${renderedSentences}])`;
  }
  /**
   * Return the SentenceChunk as a dictionary-like object, extending the base
   * Chunk toDict output with serialized sentence data.
   * (Also fixes the previously unterminated JSDoc block here.)
   *
   * @returns {SentenceChunkData} Chunk fields plus an array of sentence data
   */
  toDict() {
    return {
      ...super.toDict(),
      sentences: this.sentences.map((sentence) => sentence.toDict())
    };
  }
  /**
   * Rebuild a SentenceChunk (and its Sentence children) from a
   * dictionary-like object produced by toDict.
   *
   * @param {SentenceChunkData} data - Serialized chunk data
   * @returns {SentenceChunk} A new SentenceChunk instance
   */
  static fromDict(data) {
    return new _SentenceChunk({
      text: data.text,
      startIndex: data.startIndex,
      endIndex: data.endIndex,
      tokenCount: data.tokenCount,
      sentences: data.sentences.map((sentence) => Sentence.fromDict(sentence)),
      embedding: data.embedding ?? void 0
    });
  }
};
|
|
5171
|
+
|
|
5172
|
+
// src/chunking/sentence.ts
|
|
5173
|
+
var SentenceChunker = class _SentenceChunker extends BaseChunker {
  /** Maximum number of tokens per chunk */
  chunkSize;
  /** Number of tokens to overlap between consecutive chunks */
  chunkOverlap;
  /** Minimum number of sentences per chunk */
  minSentencesPerChunk;
  /** Minimum characters for a piece to count as a full sentence */
  minCharactersPerSentence;
  /** Deprecated approximate-counting flag (kept for interface compatibility) */
  approximate;
  /** Sentence delimiters used for splitting */
  delim;
  /** Whether delimiters attach to the previous piece, next piece, or are dropped */
  includeDelim;
  /** Internal separator marker injected during splitting */
  sep;
  /**
   * Private constructor. Use `SentenceChunker.create()` to instantiate.
   *
   * @param {Tokenizer} tokenizer - The tokenizer to use for token counting.
   * @param {number} chunkSize - Maximum number of tokens per chunk.
   * @param {number} chunkOverlap - Number of tokens to overlap between consecutive chunks.
   * @param {number} minSentencesPerChunk - Minimum number of sentences per chunk.
   * @param {number} minCharactersPerSentence - Minimum number of characters for a valid sentence.
   * @param {boolean} approximate - Whether to use approximate token counting (deprecated).
   * @param {string[]} delim - List of sentence delimiters to use for splitting.
   * @param {('prev' | 'next' | null)} includeDelim - Include the delimiter with the previous sentence ('prev'), next sentence ('next'), or exclude it (null).
   * @throws {Error} If any numeric option is out of range, delim is missing, or includeDelim is invalid.
   */
  constructor(tokenizer, chunkSize, chunkOverlap, minSentencesPerChunk, minCharactersPerSentence, approximate, delim, includeDelim) {
    super(tokenizer);
    if (chunkSize <= 0) {
      throw new Error("chunkSize must be greater than 0");
    }
    if (chunkOverlap < 0) {
      throw new Error("chunkOverlap must be non-negative");
    }
    if (chunkOverlap >= chunkSize) {
      throw new Error("chunkOverlap must be less than chunkSize");
    }
    if (minSentencesPerChunk <= 0) {
      throw new Error("minSentencesPerChunk must be greater than 0");
    }
    if (minCharactersPerSentence <= 0) {
      throw new Error("minCharactersPerSentence must be greater than 0");
    }
    if (!delim) {
      throw new Error("delim must be a list of strings or a string");
    }
    if (includeDelim !== "prev" && includeDelim !== "next" && includeDelim !== null) {
      throw new Error("includeDelim must be 'prev', 'next' or null");
    }
    if (approximate) {
      console.warn("Approximate has been deprecated and will be removed from next version onwards!");
    }
    this.chunkSize = chunkSize;
    this.chunkOverlap = chunkOverlap;
    this.minSentencesPerChunk = minSentencesPerChunk;
    this.minCharactersPerSentence = minCharactersPerSentence;
    this.approximate = approximate;
    this.delim = delim;
    this.includeDelim = includeDelim;
    this.sep = "\u2704";
  }
  /**
   * Create and initialize a SentenceChunker instance that is directly callable.
   *
   * Static factory resolving to a callable chunker: invoke it with a text
   * string or an array of text strings.
   *
   * Fix: the returned callable previously branched on
   * `typeof textOrTexts === "string"` but both branches were byte-identical —
   * the dead conditional is collapsed into a single call.
   *
   * @param {SentenceChunkerOptions} [options] - Options for configuring the SentenceChunker.
   * @returns {Promise<CallableSentenceChunker>} A promise that resolves to a callable SentenceChunker instance.
   *
   * @example
   * const chunker = await SentenceChunker.create();
   * const chunks = await chunker("This is a sample text.");
   * const batchChunks = await chunker(["Text 1", "Text 2"]);
   *
   * @see SentenceChunkerOptions
   */
  static async create(options = {}) {
    const {
      tokenizer = "gpt-3.5-turbo",
      chunkSize = 512,
      chunkOverlap = 0,
      minSentencesPerChunk = 1,
      minCharactersPerSentence = 12,
      approximate = false,
      delim = [". ", "! ", "? ", "\n"],
      includeDelim = "prev"
    } = options;
    // NOTE(review): `await` on a constructor result is a no-op unless
    // ExuluTokenizer is thenable — kept as-is to avoid timing changes.
    const tokenizerInstance = await new ExuluTokenizer();
    await tokenizerInstance.create(tokenizer);
    const plainInstance = new _SentenceChunker(
      tokenizerInstance,
      chunkSize,
      chunkOverlap,
      minSentencesPerChunk,
      minCharactersPerSentence,
      approximate,
      delim,
      includeDelim
    );
    const callableFn = function(textOrTexts, showProgress) {
      return plainInstance.call(textOrTexts, showProgress);
    };
    Object.setPrototypeOf(callableFn, _SentenceChunker.prototype);
    Object.assign(callableFn, plainInstance);
    return callableFn;
  }
  // NOTE: The replace + split method is not the best/most efficient way in general to be doing this. It works well in python because python implements .replace and .split in C while the re library is much slower in python.
  // TODO: Implement a more efficient method for splitting text into sentences.
  /**
   * Fast sentence splitting while maintaining accuracy.
   *
   * Marks every delimiter occurrence with the internal separator (attaching
   * the delimiter to the previous or next piece per includeDelim), splits on
   * the separator, then folds pieces shorter than minCharactersPerSentence
   * into the following piece.
   *
   * @param {string} text - The text to split into sentences.
   * @returns {string[]} An array of sentences.
   */
  _splitText(text) {
    let t = text;
    for (const c of this.delim) {
      if (this.includeDelim === "prev") {
        t = t.split(c).join(c + this.sep);
      } else if (this.includeDelim === "next") {
        t = t.split(c).join(this.sep + c);
      } else {
        t = t.split(c).join(this.sep);
      }
    }
    const splits = t.split(this.sep);
    const sentences = [];
    let current = "";
    for (const s of splits) {
      if (!current) {
        current = s;
      } else {
        if (current.length >= this.minCharactersPerSentence) {
          sentences.push(current);
          current = s;
        } else {
          // Too short to stand alone: absorb into the running piece.
          current += s;
        }
      }
    }
    if (current) {
      sentences.push(current);
    }
    return sentences;
  }
  /**
   * Split text into sentences and calculate token counts for each sentence.
   *
   * @param {string} text - The text to split into sentences.
   * @returns {Promise<Sentence[]>} An array of Sentence objects with absolute offsets.
   */
  async _prepareSentences(text) {
    const sentenceTexts = this._splitText(text);
    if (!sentenceTexts.length) {
      return [];
    }
    // Offsets are cumulative lengths; valid because _splitText preserves all
    // characters of the input across the returned pieces.
    const positions = [];
    let currentPos = 0;
    for (const sent of sentenceTexts) {
      positions.push(currentPos);
      currentPos += sent.length;
    }
    const tokenCounts = await this.tokenizer.countTokensBatch(sentenceTexts);
    return sentenceTexts.map((sent, i) => new Sentence({
      text: sent,
      startIndex: positions[i],
      endIndex: positions[i] + sent.length,
      tokenCount: tokenCounts[i]
    }));
  }
  /**
   * Create a chunk from a list of sentences.
   *
   * @param {Sentence[]} sentences - The sentences to create a chunk from (must be non-empty).
   * @returns {Promise<SentenceChunk>} A promise that resolves to a SentenceChunk object.
   */
  async _createChunk(sentences) {
    const chunkText = sentences.map((sentence) => sentence.text).join("");
    const tokenCount = await this.tokenizer.countTokens(chunkText);
    return new SentenceChunk({
      text: chunkText,
      startIndex: sentences[0].startIndex,
      endIndex: sentences[sentences.length - 1].endIndex,
      tokenCount,
      sentences
    });
  }
  /**
   * Split text into overlapping chunks based on sentences while respecting token limits.
   *
   * Packs consecutive sentences into chunks of at most chunkSize tokens
   * (binary search over prefix sums), enforcing minSentencesPerChunk, and
   * when chunkOverlap > 0 re-starts the next chunk early enough to share
   * roughly chunkOverlap tokens with the previous one.
   *
   * @param {string} text - The text to split into chunks.
   * @returns {Promise<SentenceChunk[]>} A promise that resolves to an array of SentenceChunk objects.
   */
  async chunk(text) {
    if (!text.trim()) {
      return [];
    }
    const sentences = await this._prepareSentences(text);
    if (!sentences.length) {
      return [];
    }
    // Prefix sums of sentence token counts; tokenSums[i] = tokens before sentence i.
    const tokenSums = [];
    let sum = 0;
    for (const sentence of sentences) {
      tokenSums.push(sum);
      sum += sentence.tokenCount;
    }
    tokenSums.push(sum);
    const chunks = [];
    let pos = 0;
    while (pos < sentences.length) {
      const targetTokens = tokenSums[pos] + this.chunkSize;
      let splitIdx = this._bisectLeft(tokenSums, targetTokens, pos) - 1;
      splitIdx = Math.min(splitIdx, sentences.length);
      // Always make progress by at least one sentence.
      splitIdx = Math.max(splitIdx, pos + 1);
      if (splitIdx - pos < this.minSentencesPerChunk) {
        if (pos + this.minSentencesPerChunk <= sentences.length) {
          splitIdx = pos + this.minSentencesPerChunk;
        } else {
          console.warn(
            `Minimum sentences per chunk as ${this.minSentencesPerChunk} could not be met for all chunks. Last chunk of the text will have only ${sentences.length - pos} sentences. Consider increasing the chunk_size or decreasing the min_sentences_per_chunk.`
          );
          splitIdx = sentences.length;
        }
      }
      const chunkSentences = sentences.slice(pos, splitIdx);
      chunks.push(await this._createChunk(chunkSentences));
      if (this.chunkOverlap > 0 && splitIdx < sentences.length) {
        // Walk backwards from the split point collecting whole sentences
        // until the overlap token budget would be exceeded.
        let overlapTokens = 0;
        let overlapIdx = splitIdx - 1;
        while (overlapIdx > pos && overlapTokens < this.chunkOverlap) {
          const sent = sentences[overlapIdx];
          const nextTokens = overlapTokens + sent.tokenCount + 1;
          if (nextTokens > this.chunkOverlap) {
            break;
          }
          overlapTokens = nextTokens;
          overlapIdx--;
        }
        pos = overlapIdx + 1;
      } else {
        pos = splitIdx;
      }
    }
    // NOTE(review): freeing the tokenizer makes the chunker single-use unless
    // the tokenizer re-initializes lazily — confirm ExuluTokenizer.free().
    await this.tokenizer.free();
    return chunks;
  }
  /**
   * Binary search for the leftmost index at which `value` could be inserted
   * into the sorted array while keeping it sorted.
   *
   * @param {number[]} arr - The array to search.
   * @param {number} value - The value to search for.
   * @param {number} [lo] - The starting index of the search.
   * @returns {number} The index of the leftmost position where value should be inserted.
   */
  _bisectLeft(arr, value, lo = 0) {
    let hi = arr.length;
    while (lo < hi) {
      const mid = lo + hi >>> 1;
      if (arr[mid] < value) {
        lo = mid + 1;
      } else {
        hi = mid;
      }
    }
    return lo;
  }
  /**
   * Return a string representation of the SentenceChunker.
   *
   * @returns {string} A string representation of the SentenceChunker.
   */
  toString() {
    return `SentenceChunker(tokenizer=${this.tokenizer}, chunkSize=${this.chunkSize}, chunkOverlap=${this.chunkOverlap}, minSentencesPerChunk=${this.minSentencesPerChunk}, minCharactersPerSentence=${this.minCharactersPerSentence}, approximate=${this.approximate}, delim=${this.delim}, includeDelim=${this.includeDelim})`;
  }
};
|
|
4237
5450
|
|
|
4238
5451
|
// src/cli/index.tsx
|
|
4239
5452
|
var import_react2 = require("react");
|
|
@@ -4433,12 +5646,10 @@ var ExuluJobs = {
|
|
|
4433
5646
|
}
|
|
4434
5647
|
};
|
|
4435
5648
|
// Registry of chunker implementations exposed by the backend: a sentence
// chunker plus a recursive chunker bundled with its rule type.
// NOTE(review): reconstructed from a diff hunk to its post-change state
// (old `import_chonkie.RecursiveRules` reference removed) — confirm against
// the released file.
var ExuluChunkers = {
  sentence: SentenceChunker,
  recursive: {
    function: RecursiveChunker,
    rules: RecursiveRules
  }
};
|
|
4444
5655
|
var ExuluDatabase = {
|