@exulu/backend 1.50.0 → 1.51.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +561 -119
- package/dist/index.d.cts +16 -3
- package/dist/index.d.ts +16 -3
- package/dist/index.js +564 -122
- package/ee/chunking/markdown.ts +83 -5
- package/ee/python/documents/processing/doc_processor.ts +380 -84
- package/ee/workers.ts +214 -18
- package/package.json +4 -1
package/dist/index.cjs
CHANGED
|
@@ -205,17 +205,19 @@ async function postgresClient() {
|
|
|
205
205
|
// 30 minutes
|
|
206
206
|
},
|
|
207
207
|
pool: {
|
|
208
|
-
min:
|
|
209
|
-
//
|
|
210
|
-
max:
|
|
211
|
-
// Increased
|
|
212
|
-
acquireTimeoutMillis:
|
|
213
|
-
//
|
|
208
|
+
min: 10,
|
|
209
|
+
// Minimum connections always ready
|
|
210
|
+
max: 300,
|
|
211
|
+
// Increased to support high worker concurrency (250+ concurrent jobs)
|
|
212
|
+
acquireTimeoutMillis: 12e4,
|
|
213
|
+
// 2 minutes - increased to handle high contention during bursts
|
|
214
214
|
createTimeoutMillis: 3e4,
|
|
215
215
|
idleTimeoutMillis: 6e4,
|
|
216
|
-
//
|
|
216
|
+
// Keep connections alive for reuse
|
|
217
217
|
reapIntervalMillis: 1e3,
|
|
218
218
|
createRetryIntervalMillis: 200,
|
|
219
|
+
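// Keep propagateCreateError disabled: a failed connection creation is retried (every createRetryIntervalMillis) until acquireTimeoutMillis elapses instead of rejecting the pending acquire immediately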
|
|
220
|
+
propagateCreateError: false,
|
|
219
221
|
// Log pool events to help debug connection issues
|
|
220
222
|
afterCreate: (conn, done) => {
|
|
221
223
|
console.log("[EXULU] New database connection created");
|
|
@@ -3618,7 +3620,7 @@ var convertExuluToolsToAiSdkTools = async (currentTools, approvedTools, allExulu
|
|
|
3618
3620
|
description,
|
|
3619
3621
|
// The approvedTools array uses the tool.name lookup as the frontend
|
|
3620
3622
|
// Vercel AI SDK uses the sanitized tool name as the key, so this matches.
|
|
3621
|
-
needsApproval: approvedTools?.includes("tool-" + cur.name) ? false : true,
|
|
3623
|
+
needsApproval: approvedTools?.includes("tool-" + cur.name) || !cur.needsApproval ? false : true,
|
|
3622
3624
|
// todo make configurable
|
|
3623
3625
|
async *execute(inputs, options) {
|
|
3624
3626
|
console.log(
|
|
@@ -3768,6 +3770,7 @@ var ExuluTool = class {
|
|
|
3768
3770
|
inputSchema;
|
|
3769
3771
|
type;
|
|
3770
3772
|
tool;
|
|
3773
|
+
needsApproval;
|
|
3771
3774
|
config;
|
|
3772
3775
|
constructor({
|
|
3773
3776
|
id,
|
|
@@ -3777,10 +3780,12 @@ var ExuluTool = class {
|
|
|
3777
3780
|
inputSchema,
|
|
3778
3781
|
type,
|
|
3779
3782
|
execute: execute2,
|
|
3780
|
-
config
|
|
3783
|
+
config,
|
|
3784
|
+
needsApproval
|
|
3781
3785
|
}) {
|
|
3782
3786
|
this.id = id;
|
|
3783
3787
|
this.config = config;
|
|
3788
|
+
this.needsApproval = needsApproval ?? true;
|
|
3784
3789
|
this.category = category || "default";
|
|
3785
3790
|
this.name = name;
|
|
3786
3791
|
this.description = description;
|
|
@@ -4187,11 +4192,30 @@ var ExuluContext2 = class {
|
|
|
4187
4192
|
);
|
|
4188
4193
|
await db2.from(getChunksTableName(this.id)).where({ source }).delete();
|
|
4189
4194
|
if (chunks?.length) {
|
|
4195
|
+
// Removes NUL (0x00) characters from a value before it is written to the chunks table.
// Falsy input (null, undefined, "", 0) yields ""; any other non-string input is
// stringified first so a numeric or otherwise non-string value cannot crash on `.replace`.
const sanitizeString = (str) => {
  if (!str) return "";
  return String(str).replace(/\0/g, "");
};
// Returns a NUL-free copy of a chunk's metadata record.
// Falsy input yields {}. String values AND keys are sanitized at every depth: nested
// plain objects and arrays are walked recursively, because a NUL byte hidden inside a
// nested value is just as invalid for the insert as one at the top level.
// Anything that is not a string, array or plain object (numbers, booleans, null,
// Date, Buffer, class instances, ...) is passed through untouched.
const sanitizeMetadata2 = (metadata) => {
  if (!metadata) return {};
  // Only recurse into object literals / null-prototype dictionaries; rebuilding other
  // objects from Object.entries would silently destroy them (e.g. a Date becomes {}).
  const isPlainObject = (value) => {
    if (value === null || typeof value !== "object") return false;
    const proto = Object.getPrototypeOf(value);
    return proto === Object.prototype || proto === null;
  };
  const sanitizeValue = (value) => {
    if (typeof value === "string") return sanitizeString(value);
    if (Array.isArray(value)) return value.map(sanitizeValue);
    if (isPlainObject(value)) return sanitizeEntries(value);
    return value;
  };
  const sanitizeEntries = (record) => {
    const sanitized = {};
    for (const [key, value] of Object.entries(record)) {
      sanitized[sanitizeString(key)] = sanitizeValue(value);
    }
    return sanitized;
  };
  // The top level is always rebuilt as a plain object, exactly as before.
  return sanitizeEntries(metadata);
};
|
|
4190
4211
|
await db2.from(getChunksTableName(this.id)).insert(
|
|
4191
4212
|
chunks.map((chunk) => ({
|
|
4192
|
-
source
|
|
4193
|
-
|
|
4194
|
-
|
|
4213
|
+
// Sanitize source to remove null bytes
|
|
4214
|
+
source: sanitizeString(source),
|
|
4215
|
+
// Sanitize metadata to remove null bytes from string values
|
|
4216
|
+
metadata: sanitizeMetadata2(chunk.metadata),
|
|
4217
|
+
// Remove null bytes (0x00) which are invalid in PostgreSQL UTF8 encoding
|
|
4218
|
+
content: sanitizeString(chunk.content),
|
|
4195
4219
|
chunk_index: chunk.index,
|
|
4196
4220
|
embedding: import_knex5.default.toSql(chunk.vector)
|
|
4197
4221
|
}))
|
|
@@ -4580,6 +4604,8 @@ var ExuluContext2 = class {
|
|
|
4580
4604
|
name: `${this.name}_context_search`,
|
|
4581
4605
|
type: "context",
|
|
4582
4606
|
category: "contexts",
|
|
4607
|
+
needsApproval: true,
|
|
4608
|
+
// todo make configurable
|
|
4583
4609
|
inputSchema: import_zod4.z.object({
|
|
4584
4610
|
query: import_zod4.z.string().describe("The original question that the user asked"),
|
|
4585
4611
|
keywords: import_zod4.z.array(import_zod4.z.string()).describe(
|
|
@@ -5773,7 +5799,7 @@ var finalizeRequestedFields = async ({
|
|
|
5773
5799
|
return result;
|
|
5774
5800
|
}
|
|
5775
5801
|
const { db: db2 } = await postgresClient();
|
|
5776
|
-
const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt");
|
|
5802
|
+
const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt", "metadata");
|
|
5777
5803
|
const chunks = await query;
|
|
5778
5804
|
result.chunks = chunks.map((chunk) => ({
|
|
5779
5805
|
chunk_content: chunk.content,
|
|
@@ -5786,7 +5812,8 @@ var finalizeRequestedFields = async ({
|
|
|
5786
5812
|
item_created_at: chunk.item_created_at,
|
|
5787
5813
|
item_id: chunk.item_id,
|
|
5788
5814
|
item_external_id: chunk.item_external_id,
|
|
5789
|
-
item_name: chunk.item_name
|
|
5815
|
+
item_name: chunk.item_name,
|
|
5816
|
+
chunk_metadata: chunk.metadata
|
|
5790
5817
|
}));
|
|
5791
5818
|
}
|
|
5792
5819
|
}
|
|
@@ -7160,6 +7187,36 @@ var import_ai3 = require("ai");
|
|
|
7160
7187
|
var import_crypto_js4 = __toESM(require("crypto-js"), 1);
|
|
7161
7188
|
var redisConnection;
|
|
7162
7189
|
var unhandledRejectionHandlerInstalled = false;
|
|
7190
|
+
var poolMonitoringInterval;
|
|
7191
|
+
/**
 * Starts a repeating (every 30 000 ms) health report for the database connection pool.
 *
 * Calling it again while a monitor is already registered in `poolMonitoringInterval`
 * is a no-op. Each tick resolves the client via `postgresClient()`, reads the
 * used / free / pending-acquire counters from `db.client.pool` (a missing counter
 * method or a falsy result counts as 0), logs them together with a utilization
 * percentage, and emits a warning when more than 10 acquires are pending.
 * Any error raised during a tick is logged and swallowed so the timer keeps running.
 */
var startPoolMonitoring = () => {
  if (poolMonitoringInterval) {
    return;
  }

  // Invokes an optional counter method on the pool, defaulting to 0.
  const readCounter = (pool, counterName) => pool[counterName]?.() || 0;

  const reportPoolHealth = async () => {
    try {
      const { db } = await postgresClient();
      const pool = db.client.pool;
      if (!pool) {
        return;
      }

      const used = readCounter(pool, "numUsed");
      const free = readCounter(pool, "numFree");
      const pending = readCounter(pool, "numPendingAcquires");
      const total = used + free;

      let utilization = "0%";
      if (total > 0) {
        utilization = `${Math.round(used / total * 100)}%`;
      }

      console.log("[EXULU] Connection pool health check:", {
        used,
        free,
        pending,
        total,
        utilization
      });

      if (pending > 10) {
        console.warn(
          `[EXULU] WARNING: ${pending} jobs waiting for database connections. Consider increasing pool size or reducing worker concurrency.`
        );
      }
    } catch (error) {
      console.error("[EXULU] Error checking pool health:", error);
    }
  };

  poolMonitoringInterval = setInterval(reportPoolHealth, 3e4);
};
|
|
7163
7220
|
var installGlobalErrorHandlers = () => {
|
|
7164
7221
|
if (unhandledRejectionHandlerInstalled) return;
|
|
7165
7222
|
process.on("unhandledRejection", (reason) => {
|
|
@@ -7184,6 +7241,7 @@ var installGlobalErrorHandlers = () => {
|
|
|
7184
7241
|
unhandledRejectionHandlerInstalled = true;
|
|
7185
7242
|
console.log("[EXULU] Global error handlers installed to prevent worker crashes");
|
|
7186
7243
|
};
|
|
7244
|
+
var isShuttingDown = false;
|
|
7187
7245
|
var createWorkers = async (providers, queues2, config, contexts, rerankers, evals, tools, tracer) => {
|
|
7188
7246
|
console.log("[EXULU] creating workers for " + queues2?.length + " queues.");
|
|
7189
7247
|
console.log(
|
|
@@ -7191,7 +7249,8 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7191
7249
|
queues2.map((q) => q.queue.name)
|
|
7192
7250
|
);
|
|
7193
7251
|
installGlobalErrorHandlers();
|
|
7194
|
-
|
|
7252
|
+
startPoolMonitoring();
|
|
7253
|
+
process.setMaxListeners(Math.max(15, process.getMaxListeners()));
|
|
7195
7254
|
if (!redisServer.host || !redisServer.port) {
|
|
7196
7255
|
console.error(
|
|
7197
7256
|
"[EXULU] you are trying to start worker, but no redis server is configured in the environment."
|
|
@@ -7224,7 +7283,53 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7224
7283
|
status: await bullmqJob.getState(),
|
|
7225
7284
|
type: bullmqJob.data.type
|
|
7226
7285
|
});
|
|
7227
|
-
|
|
7286
|
+
let progressInterval;
|
|
7287
|
+
if (bullmqJob.data.type === "processor") {
|
|
7288
|
+
progressInterval = setInterval(async () => {
|
|
7289
|
+
try {
|
|
7290
|
+
await bullmqJob.updateProgress({
|
|
7291
|
+
status: "processing",
|
|
7292
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
7293
|
+
});
|
|
7294
|
+
console.log(`[EXULU] Job ${bullmqJob.id} heartbeat sent to prevent stalling`);
|
|
7295
|
+
} catch (error) {
|
|
7296
|
+
console.error(`[EXULU] Error updating job progress:`, error);
|
|
7297
|
+
}
|
|
7298
|
+
}, 25e3);
|
|
7299
|
+
}
|
|
7300
|
+
let db2;
|
|
7301
|
+
let retries = 3;
|
|
7302
|
+
let lastError;
|
|
7303
|
+
for (let attempt = 1; attempt <= retries; attempt++) {
|
|
7304
|
+
try {
|
|
7305
|
+
const client2 = await postgresClient();
|
|
7306
|
+
db2 = client2.db;
|
|
7307
|
+
const poolStats = db2.client.pool;
|
|
7308
|
+
if (poolStats) {
|
|
7309
|
+
console.log(`[EXULU] Connection pool stats for job ${bullmqJob.id}:`, {
|
|
7310
|
+
size: poolStats.numUsed?.() || 0,
|
|
7311
|
+
available: poolStats.numFree?.() || 0,
|
|
7312
|
+
pending: poolStats.numPendingAcquires?.() || 0
|
|
7313
|
+
});
|
|
7314
|
+
}
|
|
7315
|
+
break;
|
|
7316
|
+
} catch (error) {
|
|
7317
|
+
lastError = error instanceof Error ? error : new Error(String(error));
|
|
7318
|
+
console.error(
|
|
7319
|
+
`[EXULU] Failed to acquire database connection (attempt ${attempt}/${retries}) for job ${bullmqJob.id}:`,
|
|
7320
|
+
lastError.message
|
|
7321
|
+
);
|
|
7322
|
+
if (attempt < retries) {
|
|
7323
|
+
const backoffMs = 500 * Math.pow(2, attempt - 1);
|
|
7324
|
+
await new Promise((resolve3) => setTimeout(resolve3, backoffMs));
|
|
7325
|
+
}
|
|
7326
|
+
}
|
|
7327
|
+
}
|
|
7328
|
+
if (!db2) {
|
|
7329
|
+
throw new Error(
|
|
7330
|
+
`Failed to acquire database connection after ${retries} attempts: ${lastError?.message}`
|
|
7331
|
+
);
|
|
7332
|
+
}
|
|
7228
7333
|
const data = bullmqJob.data;
|
|
7229
7334
|
const timeoutInSeconds = data.timeoutInSeconds || queue.timeoutInSeconds || 600;
|
|
7230
7335
|
const timeoutMs = timeoutInSeconds * 1e3;
|
|
@@ -7316,7 +7421,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7316
7421
|
}
|
|
7317
7422
|
const exuluStorage = new ExuluStorage({ config });
|
|
7318
7423
|
console.log("[EXULU] POS 2 -- EXULU CONTEXT PROCESS FIELD");
|
|
7319
|
-
|
|
7424
|
+
let processorResult = await context.processor.execute({
|
|
7320
7425
|
item: data.inputs,
|
|
7321
7426
|
user: data.user,
|
|
7322
7427
|
role: data.role,
|
|
@@ -7331,12 +7436,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7331
7436
|
);
|
|
7332
7437
|
}
|
|
7333
7438
|
delete processorResult.field;
|
|
7439
|
+
const updateData = { ...processorResult };
|
|
7334
7440
|
await db2.from(getTableName(context.id)).where({
|
|
7335
7441
|
id: processorResult.id
|
|
7336
7442
|
}).update({
|
|
7337
|
-
...
|
|
7443
|
+
...updateData,
|
|
7338
7444
|
last_processed_at: (/* @__PURE__ */ new Date()).toISOString()
|
|
7339
7445
|
});
|
|
7446
|
+
Object.keys(updateData).forEach((key) => {
|
|
7447
|
+
delete updateData[key];
|
|
7448
|
+
});
|
|
7340
7449
|
let jobs = [];
|
|
7341
7450
|
if (context.processor?.config?.generateEmbeddings) {
|
|
7342
7451
|
const fullItem = await db2.from(getTableName(context.id)).where({
|
|
@@ -7358,12 +7467,18 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7358
7467
|
jobs.push(embeddingsJob);
|
|
7359
7468
|
}
|
|
7360
7469
|
}
|
|
7361
|
-
|
|
7362
|
-
result: processorResult,
|
|
7470
|
+
const result = {
|
|
7471
|
+
result: { id: processorResult.id },
|
|
7363
7472
|
metadata: {
|
|
7364
7473
|
jobs: jobs.length > 0 ? jobs.join(",") : void 0
|
|
7365
7474
|
}
|
|
7366
7475
|
};
|
|
7476
|
+
processorResult = null;
|
|
7477
|
+
const memUsage = process.memoryUsage();
|
|
7478
|
+
console.log(
|
|
7479
|
+
`[EXULU] Memory after processor job ${bullmqJob.id}: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`
|
|
7480
|
+
);
|
|
7481
|
+
return result;
|
|
7367
7482
|
}
|
|
7368
7483
|
if (data.type === "workflow") {
|
|
7369
7484
|
console.log("[EXULU] running a workflow job.", bullmqJob.name);
|
|
@@ -7382,10 +7497,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7382
7497
|
user,
|
|
7383
7498
|
messages: inputMessages
|
|
7384
7499
|
} = await validateWorkflowPayload(data, providers);
|
|
7385
|
-
const
|
|
7500
|
+
const retries2 = 3;
|
|
7386
7501
|
let attempts = 0;
|
|
7387
7502
|
const promise = new Promise(async (resolve3, reject) => {
|
|
7388
|
-
while (attempts <
|
|
7503
|
+
while (attempts < retries2) {
|
|
7389
7504
|
try {
|
|
7390
7505
|
const messages2 = await processUiMessagesFlow({
|
|
7391
7506
|
providers,
|
|
@@ -7407,7 +7522,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7407
7522
|
error instanceof Error ? error.message : String(error)
|
|
7408
7523
|
);
|
|
7409
7524
|
attempts++;
|
|
7410
|
-
if (attempts >=
|
|
7525
|
+
if (attempts >= retries2) {
|
|
7411
7526
|
reject(new Error(error instanceof Error ? error.message : String(error)));
|
|
7412
7527
|
}
|
|
7413
7528
|
await new Promise((resolve4) => setTimeout((resolve5) => resolve5(true), 2e3));
|
|
@@ -7458,10 +7573,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7458
7573
|
testCase,
|
|
7459
7574
|
messages: inputMessages
|
|
7460
7575
|
} = await validateEvalPayload(data, providers);
|
|
7461
|
-
const
|
|
7576
|
+
const retries2 = 3;
|
|
7462
7577
|
let attempts = 0;
|
|
7463
7578
|
const promise = new Promise(async (resolve3, reject) => {
|
|
7464
|
-
while (attempts <
|
|
7579
|
+
while (attempts < retries2) {
|
|
7465
7580
|
try {
|
|
7466
7581
|
const messages2 = await processUiMessagesFlow({
|
|
7467
7582
|
providers,
|
|
@@ -7482,7 +7597,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7482
7597
|
error instanceof Error ? error.message : String(error)
|
|
7483
7598
|
);
|
|
7484
7599
|
attempts++;
|
|
7485
|
-
if (attempts >=
|
|
7600
|
+
if (attempts >= retries2) {
|
|
7486
7601
|
reject(new Error(error instanceof Error ? error.message : String(error)));
|
|
7487
7602
|
}
|
|
7488
7603
|
await new Promise((resolve4) => setTimeout((resolve5) => resolve5(true), 2e3));
|
|
@@ -7731,9 +7846,15 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7731
7846
|
try {
|
|
7732
7847
|
const result = await Promise.race([workPromise, timeoutPromise]);
|
|
7733
7848
|
clearTimeout(timeoutHandle);
|
|
7849
|
+
if (progressInterval) {
|
|
7850
|
+
clearInterval(progressInterval);
|
|
7851
|
+
}
|
|
7734
7852
|
return result;
|
|
7735
7853
|
} catch (error) {
|
|
7736
7854
|
clearTimeout(timeoutHandle);
|
|
7855
|
+
if (progressInterval) {
|
|
7856
|
+
clearInterval(progressInterval);
|
|
7857
|
+
}
|
|
7737
7858
|
console.error(
|
|
7738
7859
|
`[EXULU] job ${bullmqJob.id} failed (error caught in race handler).`,
|
|
7739
7860
|
error instanceof Error ? error.message : String(error)
|
|
@@ -7747,6 +7868,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7747
7868
|
concurrency: queue.concurrency?.worker || 1,
|
|
7748
7869
|
removeOnComplete: { count: 1e3 },
|
|
7749
7870
|
removeOnFail: { count: 5e3 },
|
|
7871
|
+
// Configure settings for long-running jobs (especially processor jobs)
|
|
7872
|
+
// lockDuration: How long a worker can hold a job before it's considered stalled
|
|
7873
|
+
// Set to 5 minutes to accommodate CPU-intensive operations
|
|
7874
|
+
lockDuration: 3e5,
|
|
7875
|
+
// 5 minutes in milliseconds
|
|
7876
|
+
// stalledInterval: How often to check for stalled jobs
|
|
7877
|
+
// Set to 2 minutes to reduce false positives for long-running operations
|
|
7878
|
+
stalledInterval: 12e4,
|
|
7879
|
+
// 2 minutes in milliseconds
|
|
7880
|
+
maxStalledCount: 1,
|
|
7750
7881
|
...queue.ratelimit && {
|
|
7751
7882
|
limiter: {
|
|
7752
7883
|
max: queue.ratelimit,
|
|
@@ -7783,24 +7914,68 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7783
7914
|
error: error instanceof Error ? error.message : String(error)
|
|
7784
7915
|
} : error
|
|
7785
7916
|
);
|
|
7917
|
+
throw error;
|
|
7786
7918
|
});
|
|
7787
7919
|
worker.on("error", (error) => {
|
|
7788
7920
|
console.error(`[EXULU] worker error.`, error);
|
|
7921
|
+
throw error;
|
|
7789
7922
|
});
|
|
7790
7923
|
worker.on("progress", (job, progress) => {
|
|
7791
7924
|
console.log(`[EXULU] job progress ${job.id}.`, job.name, {
|
|
7792
7925
|
progress
|
|
7793
7926
|
});
|
|
7794
7927
|
});
|
|
7795
|
-
const gracefulShutdown = async (signal) => {
|
|
7796
|
-
console.log(`Received ${signal}, closing server...`);
|
|
7797
|
-
await worker.close();
|
|
7798
|
-
process.exit(0);
|
|
7799
|
-
};
|
|
7800
|
-
process.on("SIGINT", () => gracefulShutdown("SIGINT"));
|
|
7801
|
-
process.on("SIGTERM", () => gracefulShutdown("SIGTERM"));
|
|
7802
7928
|
return worker;
|
|
7803
7929
|
});
|
|
7930
|
+
/**
 * Shuts the worker process down in response to a termination signal.
 *
 * Order of operations:
 *   1. Ignore the call if a shutdown is already running (`isShuttingDown`).
 *   2. Stop the pool-monitoring timer.
 *   3. Close every worker in parallel, giving each at most 30 s; a failure or
 *      timeout is logged per worker and does not abort the shutdown.
 *   4. Quit the shared Redis connection, if one exists.
 *   5. Destroy the database client obtained from `postgresClient()`.
 *   6. Exit the process.
 *
 * Exit code: 0 on success, 1 if quitting Redis fails or an unexpected error escapes.
 * Worker-close and database-close failures are logged only, as before.
 *
 * @param {string} signal Name of the signal that triggered the shutdown (used for logging).
 * @returns {Promise<void>} Resolves only on the "already shutting down" path; otherwise the process exits.
 */
const gracefulShutdown = async (signal) => {
  if (isShuttingDown) {
    console.log(`[EXULU] Shutdown already in progress, ignoring additional ${signal}`);
    return;
  }
  isShuttingDown = true;
  console.log(`[EXULU] Received ${signal}, shutting down gracefully...`);
  let exitCode = 0;
  try {
    if (poolMonitoringInterval) {
      clearInterval(poolMonitoringInterval);
      poolMonitoringInterval = void 0;
    }
    console.log(`[EXULU] Closing ${workers.length} worker(s)...`);
    const closePromises = workers.map(async (worker, index) => {
      let closeTimer;
      try {
        await Promise.race([
          worker.close(),
          new Promise((_, reject) => {
            closeTimer = setTimeout(() => reject(new Error("Worker close timeout")), 3e4);
          })
        ]);
        console.log(`[EXULU] Worker ${index + 1} closed successfully`);
      } catch (error) {
        console.error(`[EXULU] Error closing worker ${index + 1}:`, error);
      } finally {
        // Drop the pending timeout once the race is decided so it cannot linger.
        clearTimeout(closeTimer);
      }
    });
    await Promise.allSettled(closePromises);
    if (redisConnection) {
      console.log(`[EXULU] Closing Redis connection...`);
      try {
        await redisConnection.quit();
      } catch (error) {
        // A failed Redis quit must not prevent the database pool from being torn
        // down below; remember the failure so the exit code still reports it.
        console.error(`[EXULU] Error closing Redis connection:`, error);
        exitCode = 1;
      }
    }
    try {
      const { db: db2 } = await postgresClient();
      if (db2?.client) {
        console.log(`[EXULU] Closing database connection pool...`);
        await db2.client.destroy();
      }
    } catch (error) {
      console.error(`[EXULU] Error closing database:`, error);
    }
    if (exitCode === 0) {
      console.log(`[EXULU] Graceful shutdown complete`);
    }
    process.exit(exitCode);
  } catch (error) {
    console.error(`[EXULU] Error during graceful shutdown:`, error);
    process.exit(1);
  }
};
|
|
7977
|
+
process.once("SIGINT", () => gracefulShutdown("SIGINT"));
|
|
7978
|
+
process.once("SIGTERM", () => gracefulShutdown("SIGTERM"));
|
|
7804
7979
|
return workers;
|
|
7805
7980
|
};
|
|
7806
7981
|
var validateWorkflowPayload = async (data, providers) => {
|
|
@@ -9514,6 +9689,7 @@ type AgentEvalFunctionConfig {
|
|
|
9514
9689
|
|
|
9515
9690
|
type ItemChunks {
|
|
9516
9691
|
chunk_id: String!
|
|
9692
|
+
chunk_metadata: JSON!
|
|
9517
9693
|
chunk_index: Int!
|
|
9518
9694
|
chunk_content: String!
|
|
9519
9695
|
chunk_source: String!
|
|
@@ -9920,7 +10096,7 @@ var ExuluProvider = class {
|
|
|
9920
10096
|
prompt: import_zod7.z.string().describe("The prompt (usually a question for the agent) to send to the agent."),
|
|
9921
10097
|
information: import_zod7.z.string().describe("A summary of relevant context / information from the current session")
|
|
9922
10098
|
}),
|
|
9923
|
-
description: `This tool calls an
|
|
10099
|
+
description: `This tool calls an agent named: ${agent.name}. The agent does the following: ${agent.description}.`,
|
|
9924
10100
|
config: [],
|
|
9925
10101
|
execute: async ({ prompt, information, user, allExuluTools }) => {
|
|
9926
10102
|
const hasAccessToAgent = await checkRecordAccess(agent, "read", user);
|
|
@@ -10033,9 +10209,6 @@ var ExuluProvider = class {
|
|
|
10033
10209
|
if (!prompt && !inputMessages?.length) {
|
|
10034
10210
|
throw new Error("Prompt or message is required for generating.");
|
|
10035
10211
|
}
|
|
10036
|
-
if (outputSchema && !prompt) {
|
|
10037
|
-
throw new Error("Prompt is required for generating with an output schema.");
|
|
10038
|
-
}
|
|
10039
10212
|
const model = this.model.create({
|
|
10040
10213
|
...providerapikey ? { apiKey: providerapikey } : {}
|
|
10041
10214
|
});
|
|
@@ -10172,14 +10345,18 @@ var ExuluProvider = class {
|
|
|
10172
10345
|
let inputTokens = 0;
|
|
10173
10346
|
let outputTokens = 0;
|
|
10174
10347
|
if (outputSchema) {
|
|
10175
|
-
const {
|
|
10348
|
+
const { output, usage } = await (0, import_ai4.generateText)({
|
|
10176
10349
|
model,
|
|
10177
10350
|
system,
|
|
10178
|
-
prompt,
|
|
10179
10351
|
maxRetries: 3,
|
|
10180
|
-
|
|
10352
|
+
output: import_ai4.Output.object({
|
|
10353
|
+
schema: outputSchema
|
|
10354
|
+
}),
|
|
10355
|
+
prompt,
|
|
10356
|
+
stopWhen: [(0, import_ai4.stepCountIs)(5)]
|
|
10357
|
+
// make configurable
|
|
10181
10358
|
});
|
|
10182
|
-
result.object =
|
|
10359
|
+
result.object = output;
|
|
10183
10360
|
inputTokens = usage.inputTokens || 0;
|
|
10184
10361
|
outputTokens = usage.outputTokens || 0;
|
|
10185
10362
|
} else {
|
|
@@ -10210,6 +10387,7 @@ var ExuluProvider = class {
|
|
|
10210
10387
|
agent
|
|
10211
10388
|
),
|
|
10212
10389
|
stopWhen: [(0, import_ai4.stepCountIs)(5)]
|
|
10390
|
+
// make configurable
|
|
10213
10391
|
});
|
|
10214
10392
|
result.text = text;
|
|
10215
10393
|
inputTokens = totalUsage?.inputTokens || 0;
|
|
@@ -10684,6 +10862,7 @@ var providerRateLimiter = async (key, windowSeconds, limit, points) => {
|
|
|
10684
10862
|
};
|
|
10685
10863
|
|
|
10686
10864
|
// src/exulu/routes.ts
|
|
10865
|
+
var import_zod_from_json_schema = require("zod-from-json-schema");
|
|
10687
10866
|
var REQUEST_SIZE_LIMIT = "50mb";
|
|
10688
10867
|
var getExuluVersionNumber = async () => {
|
|
10689
10868
|
try {
|
|
@@ -11104,6 +11283,16 @@ Mood: friendly and intelligent.
|
|
|
11104
11283
|
providers,
|
|
11105
11284
|
user
|
|
11106
11285
|
);
|
|
11286
|
+
if (req.body.outputSchema && !!headers.stream) {
|
|
11287
|
+
throw new Error("Providing a outputSchema in the POST body is not allowed when using the streaming API, set 'stream' to false in the headers when defining a response schema.");
|
|
11288
|
+
}
|
|
11289
|
+
let outputSchema;
|
|
11290
|
+
if (req.body.outputSchema) {
|
|
11291
|
+
if (typeof req.body.outputSchema === "string") {
|
|
11292
|
+
req.body.outputSchema = JSON.parse(req.body.outputSchema);
|
|
11293
|
+
}
|
|
11294
|
+
outputSchema = (0, import_zod_from_json_schema.convertJsonSchemaToZod)(req.body.outputSchema);
|
|
11295
|
+
}
|
|
11107
11296
|
let providerapikey;
|
|
11108
11297
|
const variableName = agent.providerapikey;
|
|
11109
11298
|
if (variableName) {
|
|
@@ -11250,6 +11439,7 @@ Mood: friendly and intelligent.
|
|
|
11250
11439
|
const response = await provider.generateSync({
|
|
11251
11440
|
contexts,
|
|
11252
11441
|
rerankers: rerankers || [],
|
|
11442
|
+
outputSchema,
|
|
11253
11443
|
agent,
|
|
11254
11444
|
user,
|
|
11255
11445
|
req,
|
|
@@ -15485,6 +15675,22 @@ var MarkdownChunker = class {
|
|
|
15485
15675
|
});
|
|
15486
15676
|
return result;
|
|
15487
15677
|
}
|
|
15678
|
+
/**
|
|
15679
|
+
* Checks if a position in the text falls within a <diagram> tag.
|
|
15680
|
+
* Returns the adjusted position (before the diagram) if inside a diagram, otherwise returns the original position.
|
|
15681
|
+
*/
|
|
15682
|
+
adjustForDiagramTags(text, position) {
|
|
15683
|
+
const diagramRegex = /<diagram>[\s\S]*?<\/diagram>/gi;
|
|
15684
|
+
let match;
|
|
15685
|
+
while ((match = diagramRegex.exec(text)) !== null) {
|
|
15686
|
+
const diagramStart = match.index;
|
|
15687
|
+
const diagramEnd = match.index + match[0].length;
|
|
15688
|
+
if (position > diagramStart && position < diagramEnd) {
|
|
15689
|
+
return diagramStart;
|
|
15690
|
+
}
|
|
15691
|
+
}
|
|
15692
|
+
return position;
|
|
15693
|
+
}
|
|
15488
15694
|
/**
|
|
15489
15695
|
* Find the nearest logical breakpoint working backwards from the end of the text.
|
|
15490
15696
|
* Logical breakpoints are prioritized as follows:
|
|
@@ -15496,6 +15702,7 @@ var MarkdownChunker = class {
|
|
|
15496
15702
|
*
|
|
15497
15703
|
* Only considers breakpoints in the last 50% of the text to avoid creating very small chunks.
|
|
15498
15704
|
* Returns the position of the breakpoint, or null if none found
|
|
15705
|
+
* IMPORTANT: Never splits content within <diagram> tags
|
|
15499
15706
|
*/
|
|
15500
15707
|
findLogicalBreakpoint(text) {
|
|
15501
15708
|
if (text.length === 0) return null;
|
|
@@ -15515,7 +15722,7 @@ var MarkdownChunker = class {
|
|
|
15515
15722
|
}
|
|
15516
15723
|
}
|
|
15517
15724
|
if (lastHeaderPosition > 0) {
|
|
15518
|
-
return lastHeaderPosition;
|
|
15725
|
+
return this.adjustForDiagramTags(text, lastHeaderPosition);
|
|
15519
15726
|
}
|
|
15520
15727
|
let lastParagraphBreak = -1;
|
|
15521
15728
|
let searchPos = text.length;
|
|
@@ -15528,11 +15735,12 @@ var MarkdownChunker = class {
|
|
|
15528
15735
|
searchPos = pos;
|
|
15529
15736
|
}
|
|
15530
15737
|
if (lastParagraphBreak > 0) {
|
|
15531
|
-
|
|
15738
|
+
const adjusted = this.adjustForDiagramTags(text, lastParagraphBreak + 2);
|
|
15739
|
+
return adjusted;
|
|
15532
15740
|
}
|
|
15533
15741
|
const newlineIndex = text.lastIndexOf("\n");
|
|
15534
15742
|
if (newlineIndex >= minPosition) {
|
|
15535
|
-
return newlineIndex + 1;
|
|
15743
|
+
return this.adjustForDiagramTags(text, newlineIndex + 1);
|
|
15536
15744
|
}
|
|
15537
15745
|
const sentenceEndRegex = /[.!?](?:\s|$)/g;
|
|
15538
15746
|
let lastSentenceEnd = -1;
|
|
@@ -15542,13 +15750,13 @@ var MarkdownChunker = class {
|
|
|
15542
15750
|
}
|
|
15543
15751
|
}
|
|
15544
15752
|
if (lastSentenceEnd > 0) {
|
|
15545
|
-
return lastSentenceEnd;
|
|
15753
|
+
return this.adjustForDiagramTags(text, lastSentenceEnd);
|
|
15546
15754
|
}
|
|
15547
15755
|
let lastSpace = text.length;
|
|
15548
15756
|
while (lastSpace > minPosition) {
|
|
15549
15757
|
const pos = text.lastIndexOf(" ", lastSpace - 1);
|
|
15550
15758
|
if (pos >= minPosition) {
|
|
15551
|
-
return pos + 1;
|
|
15759
|
+
return this.adjustForDiagramTags(text, pos + 1);
|
|
15552
15760
|
}
|
|
15553
15761
|
lastSpace = pos;
|
|
15554
15762
|
}
|
|
@@ -15680,6 +15888,38 @@ var MarkdownChunker = class {
|
|
|
15680
15888
|
targetPosition = currentPosition + decoded.length;
|
|
15681
15889
|
}
|
|
15682
15890
|
}
|
|
15891
|
+
const diagramCheck = /<diagram>/gi;
|
|
15892
|
+
const diagramCloseCheck = /<\/diagram>/gi;
|
|
15893
|
+
let openDiagramsInSlice = 0;
|
|
15894
|
+
while (diagramCheck.exec(currentSlice) !== null) {
|
|
15895
|
+
openDiagramsInSlice++;
|
|
15896
|
+
}
|
|
15897
|
+
let closeDiagramsInSlice = 0;
|
|
15898
|
+
while (diagramCloseCheck.exec(currentSlice) !== null) {
|
|
15899
|
+
closeDiagramsInSlice++;
|
|
15900
|
+
}
|
|
15901
|
+
if (openDiagramsInSlice > closeDiagramsInSlice) {
|
|
15902
|
+
const lastDiagramOpenIndex = currentSlice.lastIndexOf("<diagram>");
|
|
15903
|
+
if (lastDiagramOpenIndex !== -1) {
|
|
15904
|
+
const remainingText = text.slice(currentPosition + lastDiagramOpenIndex);
|
|
15905
|
+
const closingTagMatch = /<\/diagram>/i.exec(remainingText);
|
|
15906
|
+
if (closingTagMatch) {
|
|
15907
|
+
const closingTagPosition = lastDiagramOpenIndex + closingTagMatch.index + closingTagMatch[0].length;
|
|
15908
|
+
const extendedSlice = text.slice(currentPosition, currentPosition + closingTagPosition);
|
|
15909
|
+
const extendedTokens = tokenizer.encode(extendedSlice);
|
|
15910
|
+
if (extendedTokens.length <= adjustedChunkSize * 1.5) {
|
|
15911
|
+
currentSlice = extendedSlice;
|
|
15912
|
+
targetPosition = currentPosition + closingTagPosition;
|
|
15913
|
+
} else {
|
|
15914
|
+
currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
|
|
15915
|
+
targetPosition = currentPosition + lastDiagramOpenIndex;
|
|
15916
|
+
}
|
|
15917
|
+
} else {
|
|
15918
|
+
currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
|
|
15919
|
+
targetPosition = currentPosition + lastDiagramOpenIndex;
|
|
15920
|
+
}
|
|
15921
|
+
}
|
|
15922
|
+
}
|
|
15683
15923
|
const breakpointPosition = this.findLogicalBreakpoint(currentSlice);
|
|
15684
15924
|
if (breakpointPosition !== null) {
|
|
15685
15925
|
currentSlice = currentSlice.slice(0, breakpointPosition);
|
|
@@ -16101,6 +16341,8 @@ ${command}`;
|
|
|
16101
16341
|
}
|
|
16102
16342
|
|
|
16103
16343
|
// ee/python/documents/processing/doc_processor.ts
|
|
16344
|
+
var import_liteparse = require("@llamaindex/liteparse");
|
|
16345
|
+
var import_mistralai = require("@mistralai/mistralai");
|
|
16104
16346
|
async function processDocx(file) {
|
|
16105
16347
|
const html = await mammoth.convertToHtml({ buffer: file });
|
|
16106
16348
|
const turndownService = new import_turndown.default();
|
|
@@ -16175,43 +16417,80 @@ async function validatePageWithVLM(page, imagePath, model) {
|
|
|
16175
16417
|
const imageBuffer = await fs2.promises.readFile(imagePath);
|
|
16176
16418
|
const imageBase64 = imageBuffer.toString("base64");
|
|
16177
16419
|
const mimeType = "image/png";
|
|
16178
|
-
const prompt = `You are
|
|
16179
|
-
|
|
16180
|
-
Here is the current OCR/parsed content for this page:
|
|
16420
|
+
const prompt = `You are a document validation assistant. Your task is to analyze a page image and correct the output of an OCR/parsing pipeline. The content may include tables, technical diagrams, schematics, and structured text.
|
|
16181
16421
|
|
|
16182
16422
|
---
|
|
16423
|
+
## CURRENT OCR OUTPUT
|
|
16424
|
+
|
|
16183
16425
|
${page.content}
|
|
16184
16426
|
---
|
|
16185
16427
|
|
|
16186
|
-
|
|
16187
|
-
|
|
16188
|
-
1. Check if the extracted markdown text accurately represents the content from the page, including:
|
|
16189
|
-
- Table data (rows, columns, headers, values)
|
|
16190
|
-
- Technical diagrams, schematics, control boards
|
|
16191
|
-
- Icons, checkmarks, symbols
|
|
16192
|
-
- Image captions and labels
|
|
16193
|
-
|
|
16194
|
-
2. If the page has significant errors or omissions, provide a corrected version for the page.
|
|
16428
|
+
## YOUR TASK
|
|
16195
16429
|
|
|
16196
|
-
|
|
16430
|
+
Compare the page image to the OCR output above. Identify errors, omissions, and formatting issues, then return a structured validation result (see OUTPUT FORMAT below).
|
|
16197
16431
|
|
|
16198
|
-
|
|
16199
|
-
|
|
16200
|
-
|
|
16201
|
-
|
|
16202
|
-
|
|
16432
|
+
---
|
|
16433
|
+
## VALIDATION CHECKLIST
|
|
16434
|
+
|
|
16435
|
+
Work through these checks in order:
|
|
16436
|
+
|
|
16437
|
+
### 1. Text Accuracy
|
|
16438
|
+
- Verify all text is correctly transcribed.
|
|
16439
|
+
- For minor character-level OCR errors (e.g. "\xF6" vs "\xFC", "rn" vs "m"), **prefer the original OCR output** unless you are certain of an error. Do not silently "fix" characters based on guesswork.
|
|
16440
|
+
|
|
16441
|
+
### 2. Heading Levels
|
|
16442
|
+
- Verify that headings use correct Markdown levels (#, ##, ###, ####, #####, ######).
|
|
16443
|
+
- Determine heading level using the following priority:
|
|
16444
|
+
1. **Hierarchical numbering** (strongest signal): e.g. "1" \u2192 #, "2.1" \u2192 ##, "2.1.1" \u2192 ###, "2.1.2.5" \u2192 ####
|
|
16445
|
+
2. Font size (larger = higher level)
|
|
16446
|
+
3. Indentation
|
|
16447
|
+
4. Bold/emphasis styling
|
|
16448
|
+
|
|
16449
|
+
### 3. Tables
|
|
16450
|
+
|
|
16451
|
+
**First, decide whether the table should be Markdown or plain text:**
|
|
16452
|
+
- Use **Markdown table format** if the table has a consistent, clear header structure and uniform column layout throughout.
|
|
16453
|
+
- Use **plain text structured description** if the table:
|
|
16454
|
+
- Lacks a clear header row
|
|
16455
|
+
- Uses mixed or irregular column structures across rows
|
|
16456
|
+
- Functions more like a certificate, form, or label layout
|
|
16457
|
+
|
|
16458
|
+
**If using Markdown format**, follow these rules strictly:
|
|
16459
|
+
- Every table must have: header row \u2192 separator row \u2192 data rows
|
|
16460
|
+
- Use simple separators only: \`| --- | --- |\` (NOT \`|---|---|\` or long dashes)
|
|
16461
|
+
- Example:
|
|
16462
|
+
\`\`\`
|
|
16203
16463
|
| Column 1 | Column 2 |
|
|
16204
16464
|
| --- | --- |
|
|
16205
|
-
| Data 1
|
|
16206
|
-
|
|
16207
|
-
-
|
|
16208
|
-
|
|
16209
|
-
|
|
16210
|
-
|
|
16211
|
-
|
|
16212
|
-
-
|
|
16213
|
-
|
|
16214
|
-
|
|
16465
|
+
| Data 1 | Data 2 |
|
|
16466
|
+
\`\`\`
|
|
16467
|
+
- Important: do not use the | character as part of the data inside a cell, this would break the table, if a cell contains a | character, use a capital I.
|
|
16468
|
+
|
|
16469
|
+
**Symbol translation rules for table cells:**
|
|
16470
|
+
- Black/filled dot \u2192 \`+\` (active); White/empty dot \u2192 \`-\` (inactive)
|
|
16471
|
+
*(e.g. Rufe-LED columns)*
|
|
16472
|
+
- Green or black checkmark \u2192 \`+\` (active); Red or black cross \u2192 \`-\` (inactive)
|
|
16473
|
+
|
|
16474
|
+
### 4. Multi-Page Table Continuity
|
|
16475
|
+
- If this page contains a table with a header row that runs to the bottom of the page (suggesting it may continue on the next page), extract the header row and include it in the \`current_page_table.headers\` field.
|
|
16476
|
+
- If this page contains a table WITHOUT a header row (suggesting it's a continuation from a previous page), set \`current_page_table.is_continuation\` to true and try to identify what the headers might be based on the data structure. Include your best guess for headers in \`current_page_table.headers\`.
|
|
16477
|
+
|
|
16478
|
+
### 5. Technical Diagrams & Schematics
|
|
16479
|
+
If the page contains a flow-chart, schematic, technical drawing or control board layout that is **absent or poorly described** in the OCR output do the following:
|
|
16480
|
+
- Open a <diagram> tag with the following content:
|
|
16481
|
+
<diagram>
|
|
16482
|
+
<description>
|
|
16483
|
+
Add a detailed description of the diagram here.
|
|
16484
|
+
</description>
|
|
16485
|
+
<mermaid>
|
|
16486
|
+
Add a mermaid diagram schema here that in detail describes the diagram.
|
|
16487
|
+
</mermaid>
|
|
16488
|
+
</diagram>
|
|
16489
|
+
|
|
16490
|
+
### 6. Captions, Icons & Symbols
|
|
16491
|
+
- Verify that image captions, labels, icons, and checkmarks are present and correctly transcribed.
|
|
16492
|
+
|
|
16493
|
+
### 7. Only populate \`corrected_text\` when \`needs_correction\` is true. If the OCR output is accurate, return \`needs_correction: false\` and \`corrected_content: null\`.
|
|
16215
16494
|
`;
|
|
16216
16495
|
const result = await (0, import_ai7.generateText)({
|
|
16217
16496
|
model,
|
|
@@ -16219,6 +16498,10 @@ Specific notes and guidelines:
|
|
|
16219
16498
|
schema: import_zod12.z.object({
|
|
16220
16499
|
needs_correction: import_zod12.z.boolean(),
|
|
16221
16500
|
corrected_text: import_zod12.z.string().nullable(),
|
|
16501
|
+
current_page_table: import_zod12.z.object({
|
|
16502
|
+
headers: import_zod12.z.array(import_zod12.z.string()),
|
|
16503
|
+
is_continuation: import_zod12.z.boolean()
|
|
16504
|
+
}).nullable(),
|
|
16222
16505
|
confidence: import_zod12.z.enum(["high", "medium", "low"]),
|
|
16223
16506
|
reasoning: import_zod12.z.string()
|
|
16224
16507
|
})
|
|
@@ -16241,23 +16524,80 @@ Specific notes and guidelines:
|
|
|
16241
16524
|
needs_correction: parsedOutput.needs_correction,
|
|
16242
16525
|
corrected_text: parsedOutput.corrected_text || void 0,
|
|
16243
16526
|
confidence: parsedOutput.confidence,
|
|
16527
|
+
current_page_table: parsedOutput.current_page_table || void 0,
|
|
16244
16528
|
reasoning: parsedOutput.reasoning
|
|
16245
16529
|
};
|
|
16246
16530
|
return validation;
|
|
16247
16531
|
}
|
|
16532
|
+
/**
 * Re-attach table headers to tables that continue across a page break.
 *
 * Pages are walked in document order while remembering the headers of the most
 * recently seen table. When a page's table is flagged as a continuation and
 * headers are being carried, a Markdown header row and separator row built from
 * the carried headers are inserted directly above the first line that starts
 * with "|". The edit is written back to `vlm_corrected_text` when that field is
 * set, otherwise to `content`.
 *
 * The carried headers are dropped whenever a page has no validation result or
 * reports no table headers, so headers can never jump over a table-less page
 * into an unrelated table further down the document.
 *
 * @param {Array<object>} document2 - Pages in reading order. Each page exposes
 *   `page`, `content` and optionally `vlm_corrected_text`. Mutated in place.
 * @param {Map<*, object>} validationResults - Validation results keyed by
 *   `page.page`; only `current_page_table` ({ headers, is_continuation }) is read.
 * @param {boolean} [verbose=false] - Log each reconstruction step when true.
 * @returns {void}
 */
function reconstructTableHeaders(document2, validationResults, verbose = false) {
  let lastTableHeaders = void 0;
  for (const page of document2) {
    const tableInfo = validationResults.get(page.page)?.current_page_table;

    // No validation result, no table, or a table without headers: the chain of
    // continued tables is broken, so forget whatever headers were carried.
    if (!tableInfo || tableInfo.headers.length === 0) {
      lastTableHeaders = void 0;
      continue;
    }

    // A table that starts here (or a continuation with nothing to inherit)
    // becomes the header source for the pages that follow.
    if (!tableInfo.is_continuation || !lastTableHeaders) {
      lastTableHeaders = tableInfo.headers;
      if (verbose) {
        console.log(`[EXULU] Page ${page.page}: Storing table headers for potential continuation`);
        console.log(`[EXULU] Headers: ${lastTableHeaders.join(" | ")}`);
      }
      continue;
    }

    if (verbose) {
      console.log(`[EXULU] Page ${page.page}: Reconstructing table headers from previous page`);
      console.log(`[EXULU] Previous headers: ${lastTableHeaders.join(" | ")}`);
    }

    const contentToModify = page.vlm_corrected_text || page.content;
    // Nothing to patch when the page carries no text at all.
    if (typeof contentToModify !== "string") continue;

    const lines = contentToModify.split("\n");
    const firstTableLineIndex = lines.findIndex((line) => line.trim().startsWith("|"));
    if (firstTableLineIndex === -1) continue;

    const headerRow = `| ${lastTableHeaders.join(" | ")} |`;
    const separatorRow = `| ${lastTableHeaders.map(() => "---").join(" | ")} |`;
    lines.splice(firstTableLineIndex, 0, headerRow, separatorRow);
    const reconstructedContent = lines.join("\n");

    // Write back to the same field the text was read from.
    if (page.vlm_corrected_text) {
      page.vlm_corrected_text = reconstructedContent;
    } else {
      page.content = reconstructedContent;
    }
    if (verbose) {
      console.log(`[EXULU] Page ${page.page}: Added table headers successfully`);
    }
  }
}
|
|
16248
16576
|
async function validateWithVLM(document2, model, verbose = false, concurrency = 10) {
|
|
16249
16577
|
console.log(`[EXULU] Starting VLM validation for docling output, ${document2.length} pages...`);
|
|
16250
|
-
console.log(
|
|
16251
|
-
|
|
16252
|
-
);
|
|
16578
|
+
console.log(`[EXULU] Concurrency limit: ${concurrency}`);
|
|
16579
|
+
const limit = (0, import_p_limit.default)(concurrency);
|
|
16580
|
+
const validationResults = /* @__PURE__ */ new Map();
|
|
16253
16581
|
let validatedCount = 0;
|
|
16254
16582
|
let correctedCount = 0;
|
|
16255
|
-
const limit = (0, import_p_limit.default)(concurrency);
|
|
16256
16583
|
const validationTasks = document2.map(
|
|
16257
16584
|
(page) => limit(async () => {
|
|
16585
|
+
await new Promise((resolve3) => setImmediate(resolve3));
|
|
16258
16586
|
const imagePath = page.image;
|
|
16587
|
+
if (!page.content) {
|
|
16588
|
+
console.warn(`[EXULU] Page ${page.page}: No content found, skipping validation`);
|
|
16589
|
+
return;
|
|
16590
|
+
}
|
|
16259
16591
|
if (!imagePath) {
|
|
16260
|
-
console.
|
|
16592
|
+
console.warn(`[EXULU] Page ${page.page}: No image found, skipping validation`);
|
|
16593
|
+
return;
|
|
16594
|
+
}
|
|
16595
|
+
const hasImage = page.content.match(/\.(jpeg|jpg|png|gif|webp)/i);
|
|
16596
|
+
const hasTable = (page.content.match(/\|/g)?.length || 0) > 1;
|
|
16597
|
+
if (!hasImage && !hasTable) {
|
|
16598
|
+
if (verbose) {
|
|
16599
|
+
console.log(`[EXULU] Page ${page.page}: No image or table found, SKIPPING VLM validation`);
|
|
16600
|
+
}
|
|
16261
16601
|
return;
|
|
16262
16602
|
}
|
|
16263
16603
|
let validation;
|
|
@@ -16265,6 +16605,13 @@ async function validateWithVLM(document2, model, verbose = false, concurrency =
|
|
|
16265
16605
|
validation = await withRetry(async () => {
|
|
16266
16606
|
return await validatePageWithVLM(page, imagePath, model);
|
|
16267
16607
|
}, 3);
|
|
16608
|
+
validationResults.set(page.page, validation);
|
|
16609
|
+
if (verbose && validation.current_page_table) {
|
|
16610
|
+
console.log(`[EXULU] Page ${page.page} table info:`, {
|
|
16611
|
+
headers: validation.current_page_table.headers,
|
|
16612
|
+
is_continuation: validation.current_page_table.is_continuation
|
|
16613
|
+
});
|
|
16614
|
+
}
|
|
16268
16615
|
} catch (error) {
|
|
16269
16616
|
console.error(`[EXULU] Error validating page ${page.page} with VLM more than 3 times, skipping:`, error);
|
|
16270
16617
|
throw error;
|
|
@@ -16295,9 +16642,12 @@ async function validateWithVLM(document2, model, verbose = false, concurrency =
|
|
|
16295
16642
|
})
|
|
16296
16643
|
);
|
|
16297
16644
|
await Promise.all(validationTasks);
|
|
16298
|
-
console.log(`[EXULU] VLM validation complete:`);
|
|
16299
|
-
console.log(`[EXULU] Validated: ${validatedCount}
|
|
16300
|
-
console.log(`[EXULU] Corrected: ${correctedCount}
|
|
16645
|
+
console.log(`[EXULU] VLM validation complete (parallel processing):`);
|
|
16646
|
+
console.log(`[EXULU] Validated: ${validatedCount} pages`);
|
|
16647
|
+
console.log(`[EXULU] Corrected: ${correctedCount} pages`);
|
|
16648
|
+
console.log(`[EXULU] Starting sequential table header reconstruction...`);
|
|
16649
|
+
reconstructTableHeaders(document2, validationResults, verbose);
|
|
16650
|
+
console.log(`[EXULU] Table header reconstruction complete`);
|
|
16301
16651
|
return document2;
|
|
16302
16652
|
}
|
|
16303
16653
|
async function processDocument(filePath, fileType, buffer, tempDir, config, verbose = false) {
|
|
@@ -16312,15 +16662,6 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
|
|
|
16312
16662
|
const stripped = filePath.split(".").pop()?.trim();
|
|
16313
16663
|
let result;
|
|
16314
16664
|
switch (stripped) {
|
|
16315
|
-
case "pdf":
|
|
16316
|
-
result = await processPdf(buffer, paths, config, verbose);
|
|
16317
|
-
break;
|
|
16318
|
-
case "docx":
|
|
16319
|
-
result = await processDocx(buffer);
|
|
16320
|
-
break;
|
|
16321
|
-
case "doc":
|
|
16322
|
-
result = await processWord(buffer);
|
|
16323
|
-
break;
|
|
16324
16665
|
case "txt":
|
|
16325
16666
|
case "md":
|
|
16326
16667
|
let content = buffer.toString();
|
|
@@ -16334,6 +16675,15 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
|
|
|
16334
16675
|
}]
|
|
16335
16676
|
};
|
|
16336
16677
|
break;
|
|
16678
|
+
case "pdf":
|
|
16679
|
+
result = await processPdf(buffer, paths, config, verbose);
|
|
16680
|
+
break;
|
|
16681
|
+
case "docx":
|
|
16682
|
+
result = await processDocx(buffer);
|
|
16683
|
+
break;
|
|
16684
|
+
case "doc":
|
|
16685
|
+
result = await processWord(buffer);
|
|
16686
|
+
break;
|
|
16337
16687
|
// Todo other file types with docx and officeparser
|
|
16338
16688
|
default:
|
|
16339
16689
|
throw new Error(`[EXULU] Unsupported file type: ${fileType}`);
|
|
@@ -16348,8 +16698,8 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
|
|
|
16348
16698
|
}
|
|
16349
16699
|
async function processPdf(buffer, paths, config, verbose = false) {
|
|
16350
16700
|
try {
|
|
16351
|
-
let json;
|
|
16352
|
-
if (config?.docling) {
|
|
16701
|
+
let json = [];
|
|
16702
|
+
if (config?.processor.name === "docling") {
|
|
16353
16703
|
console.log(`[EXULU] Validating Python environment...`);
|
|
16354
16704
|
const validation = await validatePythonEnvironment(void 0, true);
|
|
16355
16705
|
if (!validation.valid) {
|
|
@@ -16390,7 +16740,7 @@ ${setupResult.output || ""}`);
|
|
|
16390
16740
|
}
|
|
16391
16741
|
const jsonContent = await fs2.promises.readFile(paths.json, "utf-8");
|
|
16392
16742
|
json = JSON.parse(jsonContent);
|
|
16393
|
-
} else {
|
|
16743
|
+
} else if (config?.processor.name === "officeparser") {
|
|
16394
16744
|
const text = await (0, import_officeparser2.parseOfficeAsync)(buffer, {
|
|
16395
16745
|
outputErrorToConsole: false,
|
|
16396
16746
|
newlineDelimiter: "\n"
|
|
@@ -16400,15 +16750,69 @@ ${setupResult.output || ""}`);
|
|
|
16400
16750
|
content: text,
|
|
16401
16751
|
headings: []
|
|
16402
16752
|
}];
|
|
16753
|
+
} else if (config?.processor.name === "mistral") {
|
|
16754
|
+
if (!process.env.MISTRAL_API_KEY) {
|
|
16755
|
+
throw new Error("[EXULU] MISTRAL_API_KEY is not set, please set it in the environment variables.");
|
|
16756
|
+
}
|
|
16757
|
+
await new Promise((resolve3) => setTimeout(resolve3, Math.floor(Math.random() * 4e3) + 1e3));
|
|
16758
|
+
const base64Pdf = buffer.toString("base64");
|
|
16759
|
+
const client2 = new import_mistralai.Mistral({ apiKey: process.env.MISTRAL_API_KEY });
|
|
16760
|
+
const ocrResponse = await withRetry(async () => {
|
|
16761
|
+
const ocrResponse2 = await client2.ocr.process({
|
|
16762
|
+
document: {
|
|
16763
|
+
type: "document_url",
|
|
16764
|
+
documentUrl: "data:application/pdf;base64," + base64Pdf
|
|
16765
|
+
},
|
|
16766
|
+
model: "mistral-ocr-latest",
|
|
16767
|
+
includeImageBase64: false
|
|
16768
|
+
});
|
|
16769
|
+
return ocrResponse2;
|
|
16770
|
+
}, 10);
|
|
16771
|
+
const parser = new import_liteparse.LiteParse();
|
|
16772
|
+
const screenshots = await parser.screenshot(paths.source, void 0);
|
|
16773
|
+
await fs2.promises.mkdir(paths.images, { recursive: true });
|
|
16774
|
+
for (const screenshot of screenshots) {
|
|
16775
|
+
await fs2.promises.writeFile(
|
|
16776
|
+
path.join(
|
|
16777
|
+
paths.images,
|
|
16778
|
+
`${screenshot.pageNum}.png`
|
|
16779
|
+
),
|
|
16780
|
+
screenshot.imageBuffer
|
|
16781
|
+
);
|
|
16782
|
+
screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
|
|
16783
|
+
}
|
|
16784
|
+
json = ocrResponse.pages.map((page) => ({
|
|
16785
|
+
page: page.index + 1,
|
|
16786
|
+
content: page.markdown,
|
|
16787
|
+
image: screenshots.find((s) => s.pageNum === page.index + 1)?.imagePath,
|
|
16788
|
+
headings: []
|
|
16789
|
+
}));
|
|
16790
|
+
fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
|
|
16791
|
+
} else if (config?.processor.name === "liteparse") {
|
|
16792
|
+
const parser = new import_liteparse.LiteParse();
|
|
16793
|
+
const result = await parser.parse(paths.source);
|
|
16794
|
+
const screenshots = await parser.screenshot(paths.source, void 0);
|
|
16795
|
+
console.log(`[EXULU] Liteparse screenshots: ${JSON.stringify(screenshots)}`);
|
|
16796
|
+
await fs2.promises.mkdir(paths.images, { recursive: true });
|
|
16797
|
+
for (const screenshot of screenshots) {
|
|
16798
|
+
await fs2.promises.writeFile(path.join(paths.images, `${screenshot.pageNum}.png`), screenshot.imageBuffer);
|
|
16799
|
+
screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
|
|
16800
|
+
}
|
|
16801
|
+
json = result.pages.map((page) => ({
|
|
16802
|
+
page: page.pageNum,
|
|
16803
|
+
content: page.text,
|
|
16804
|
+
image: screenshots.find((s) => s.pageNum === page.pageNum)?.imagePath
|
|
16805
|
+
}));
|
|
16806
|
+
fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
|
|
16403
16807
|
}
|
|
16404
16808
|
console.log(`[EXULU]
|
|
16405
16809
|
\u2713 Document processing completed successfully`);
|
|
16406
16810
|
console.log(`[EXULU] Total pages: ${json.length}`);
|
|
16407
16811
|
console.log(`[EXULU] Output file: ${paths.json}`);
|
|
16408
|
-
if (
|
|
16812
|
+
if (config?.vlm?.model) {
|
|
16409
16813
|
console.error("[EXULU] VLM validation is only supported when docling is enabled, skipping validation.");
|
|
16410
16814
|
}
|
|
16411
|
-
if (config?.
|
|
16815
|
+
if (config?.vlm?.model && json.length > 0) {
|
|
16412
16816
|
json = await validateWithVLM(
|
|
16413
16817
|
json,
|
|
16414
16818
|
config.vlm.model,
|
|
@@ -16436,29 +16840,37 @@ ${setupResult.output || ""}`);
|
|
|
16436
16840
|
"utf-8"
|
|
16437
16841
|
);
|
|
16438
16842
|
}
|
|
16439
|
-
const
|
|
16440
|
-
|
|
16441
|
-
|
|
16442
|
-
|
|
16443
|
-
|
|
16843
|
+
const markdownStream = fs2.createWriteStream(paths.markdown, { encoding: "utf-8" });
|
|
16844
|
+
for (let i = 0; i < json.length; i++) {
|
|
16845
|
+
const p = json[i];
|
|
16846
|
+
if (!p) continue;
|
|
16847
|
+
const content = p.vlm_corrected_text ?? p.content;
|
|
16848
|
+
markdownStream.write(content);
|
|
16849
|
+
if (i < json.length - 1) {
|
|
16850
|
+
markdownStream.write("\n\n\n<!-- END_OF_PAGE -->\n\n\n");
|
|
16444
16851
|
}
|
|
16445
|
-
}
|
|
16446
|
-
await
|
|
16447
|
-
|
|
16448
|
-
|
|
16449
|
-
|
|
16450
|
-
);
|
|
16852
|
+
}
|
|
16853
|
+
await new Promise((resolve3, reject) => {
|
|
16854
|
+
markdownStream.end(() => resolve3());
|
|
16855
|
+
markdownStream.on("error", reject);
|
|
16856
|
+
});
|
|
16451
16857
|
console.log(`[EXULU] Validated output saved to: ${paths.json}`);
|
|
16452
16858
|
console.log(`[EXULU] Validated markdown saved to: ${paths.markdown}`);
|
|
16859
|
+
const markdown = await fs2.promises.readFile(paths.markdown, "utf-8");
|
|
16860
|
+
const processedJson = json.map((e) => {
|
|
16861
|
+
const finalContent = e.vlm_corrected_text ?? e.content;
|
|
16862
|
+
return {
|
|
16863
|
+
page: e.page,
|
|
16864
|
+
content: finalContent
|
|
16865
|
+
};
|
|
16866
|
+
});
|
|
16867
|
+
json.length = 0;
|
|
16868
|
+
json = [];
|
|
16869
|
+
const memUsage = process.memoryUsage();
|
|
16870
|
+
console.log(`[EXULU] Memory after document processing: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`);
|
|
16453
16871
|
return {
|
|
16454
16872
|
markdown,
|
|
16455
|
-
json:
|
|
16456
|
-
const finalContent = e.vlm_corrected_text || e.content;
|
|
16457
|
-
return {
|
|
16458
|
-
page: e.page,
|
|
16459
|
-
content: finalContent
|
|
16460
|
-
};
|
|
16461
|
-
})
|
|
16873
|
+
json: processedJson
|
|
16462
16874
|
};
|
|
16463
16875
|
} catch (error) {
|
|
16464
16876
|
console.error("[EXULU] Error processing document:", error);
|
|
@@ -16471,9 +16883,9 @@ var loadFile = async (file, name, tempDir) => {
|
|
|
16471
16883
|
if (!fileType) {
|
|
16472
16884
|
throw new Error("[EXULU] File name does not include extension, extension is required for document processing.");
|
|
16473
16885
|
}
|
|
16886
|
+
const UUID = (0, import_crypto.randomUUID)();
|
|
16474
16887
|
let buffer;
|
|
16475
16888
|
if (Buffer.isBuffer(file)) {
|
|
16476
|
-
const UUID = (0, import_crypto.randomUUID)();
|
|
16477
16889
|
filePath = path.join(tempDir, `${UUID}.${fileType}`);
|
|
16478
16890
|
await fs2.promises.writeFile(filePath, file);
|
|
16479
16891
|
buffer = file;
|
|
@@ -16482,7 +16894,10 @@ var loadFile = async (file, name, tempDir) => {
|
|
|
16482
16894
|
if (filePath.startsWith("http")) {
|
|
16483
16895
|
const response = await fetch(filePath);
|
|
16484
16896
|
const array = await response.arrayBuffer();
|
|
16897
|
+
const tempFilePath = path.join(tempDir, `${UUID}.${fileType}`);
|
|
16898
|
+
await fs2.promises.writeFile(tempFilePath, Buffer.from(array));
|
|
16485
16899
|
buffer = Buffer.from(array);
|
|
16900
|
+
filePath = tempFilePath;
|
|
16486
16901
|
} else {
|
|
16487
16902
|
buffer = await fs2.promises.readFile(file);
|
|
16488
16903
|
}
|
|
@@ -16500,17 +16915,34 @@ async function documentProcessor({
|
|
|
16500
16915
|
}
|
|
16501
16916
|
const uuid = (0, import_crypto.randomUUID)();
|
|
16502
16917
|
const tempDir = path.join(process.cwd(), "temp", uuid);
|
|
16918
|
+
const localFilesAndFoldersToDelete = [tempDir];
|
|
16503
16919
|
console.log(`[EXULU] Temporary directory for processing document ${name}: ${tempDir}`);
|
|
16504
16920
|
await fs2.promises.mkdir(tempDir, { recursive: true });
|
|
16921
|
+
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
16922
|
+
await fs2.promises.writeFile(path.join(tempDir, "created_at.txt"), timestamp);
|
|
16505
16923
|
try {
|
|
16506
16924
|
const {
|
|
16507
16925
|
filePath,
|
|
16508
16926
|
fileType,
|
|
16509
16927
|
buffer
|
|
16510
16928
|
} = await loadFile(file, name, tempDir);
|
|
16511
|
-
|
|
16929
|
+
let supportedTypes = [];
|
|
16930
|
+
switch (config?.processor.name) {
|
|
16931
|
+
case "docling":
|
|
16932
|
+
supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
|
|
16933
|
+
break;
|
|
16934
|
+
case "officeparser":
|
|
16935
|
+
supportedTypes = [];
|
|
16936
|
+
break;
|
|
16937
|
+
case "liteparse":
|
|
16938
|
+
supportedTypes = ["pdf", "doc", "docx", "docm", "odt", "rtf", "ppt", "pptx", "pptm", "odp", "xls", "xlsx", "xlsm", "ods", "csv", "tsv"];
|
|
16939
|
+
break;
|
|
16940
|
+
case "mistral":
|
|
16941
|
+
supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
|
|
16942
|
+
break;
|
|
16943
|
+
}
|
|
16512
16944
|
if (!supportedTypes.includes(fileType)) {
|
|
16513
|
-
throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor.`);
|
|
16945
|
+
throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor, the ${config?.processor.name} processor only supports the following file types: ${supportedTypes.join(", ")}.`);
|
|
16514
16946
|
}
|
|
16515
16947
|
const { content } = await processDocument(
|
|
16516
16948
|
filePath,
|
|
@@ -16523,9 +16955,19 @@ async function documentProcessor({
|
|
|
16523
16955
|
return content.json;
|
|
16524
16956
|
} catch (error) {
|
|
16525
16957
|
console.error("Error during chunking:", error);
|
|
16526
|
-
|
|
16958
|
+
throw error;
|
|
16527
16959
|
} finally {
|
|
16528
|
-
|
|
16960
|
+
if (config?.debugging?.deleteTempFiles !== false) {
|
|
16961
|
+
for (const file2 of localFilesAndFoldersToDelete) {
|
|
16962
|
+
try {
|
|
16963
|
+
await fs2.promises.rm(file2, { recursive: true });
|
|
16964
|
+
console.log(`[EXULU] Deleted file or folder: ${file2}`);
|
|
16965
|
+
} catch (error) {
|
|
16966
|
+
console.error(`[EXULU] Error deleting file or folder: ${file2}`, error);
|
|
16967
|
+
console.log(`[EXULU] File or folder still exists: ${file2}`);
|
|
16968
|
+
}
|
|
16969
|
+
}
|
|
16970
|
+
}
|
|
16529
16971
|
}
|
|
16530
16972
|
}
|
|
16531
16973
|
|