@exulu/backend 1.50.0 → 1.51.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +597 -146
- package/dist/index.d.cts +16 -3
- package/dist/index.d.ts +16 -3
- package/dist/index.js +600 -149
- package/ee/chunking/markdown.ts +83 -5
- package/ee/python/documents/processing/doc_processor.ts +391 -84
- package/ee/workers.ts +214 -18
- package/package.json +4 -1
package/dist/index.cjs
CHANGED
|
@@ -205,17 +205,19 @@ async function postgresClient() {
|
|
|
205
205
|
// 30 minutes
|
|
206
206
|
},
|
|
207
207
|
pool: {
|
|
208
|
-
min:
|
|
209
|
-
//
|
|
210
|
-
max:
|
|
211
|
-
// Increased
|
|
212
|
-
acquireTimeoutMillis:
|
|
213
|
-
//
|
|
208
|
+
min: 10,
|
|
209
|
+
// Minimum connections always ready
|
|
210
|
+
max: 300,
|
|
211
|
+
// Increased to support high worker concurrency (250+ concurrent jobs)
|
|
212
|
+
acquireTimeoutMillis: 12e4,
|
|
213
|
+
// 2 minutes - increased to handle high contention during bursts
|
|
214
214
|
createTimeoutMillis: 3e4,
|
|
215
215
|
idleTimeoutMillis: 6e4,
|
|
216
|
-
//
|
|
216
|
+
// Keep connections alive for reuse
|
|
217
217
|
reapIntervalMillis: 1e3,
|
|
218
218
|
createRetryIntervalMillis: 200,
|
|
219
|
+
// Disable propagateCreateError so the pool keeps retrying failed connection creation (until acquireTimeoutMillis) instead of rejecting the pending acquire immediately
|
|
220
|
+
propagateCreateError: false,
|
|
219
221
|
// Log pool events to help debug connection issues
|
|
220
222
|
afterCreate: (conn, done) => {
|
|
221
223
|
console.log("[EXULU] New database connection created");
|
|
@@ -3618,7 +3620,7 @@ var convertExuluToolsToAiSdkTools = async (currentTools, approvedTools, allExulu
|
|
|
3618
3620
|
description,
|
|
3619
3621
|
// The approvedTools array uses the tool.name lookup as the frontend
|
|
3620
3622
|
// Vercel AI SDK uses the sanitized tool name as the key, so this matches.
|
|
3621
|
-
needsApproval: approvedTools?.includes("tool-" + cur.name) ? false : true,
|
|
3623
|
+
needsApproval: approvedTools?.includes("tool-" + cur.name) || !cur.needsApproval ? false : true,
|
|
3622
3624
|
// todo make configurable
|
|
3623
3625
|
async *execute(inputs, options) {
|
|
3624
3626
|
console.log(
|
|
@@ -3768,6 +3770,7 @@ var ExuluTool = class {
|
|
|
3768
3770
|
inputSchema;
|
|
3769
3771
|
type;
|
|
3770
3772
|
tool;
|
|
3773
|
+
needsApproval;
|
|
3771
3774
|
config;
|
|
3772
3775
|
constructor({
|
|
3773
3776
|
id,
|
|
@@ -3777,10 +3780,12 @@ var ExuluTool = class {
|
|
|
3777
3780
|
inputSchema,
|
|
3778
3781
|
type,
|
|
3779
3782
|
execute: execute2,
|
|
3780
|
-
config
|
|
3783
|
+
config,
|
|
3784
|
+
needsApproval
|
|
3781
3785
|
}) {
|
|
3782
3786
|
this.id = id;
|
|
3783
3787
|
this.config = config;
|
|
3788
|
+
this.needsApproval = needsApproval ?? true;
|
|
3784
3789
|
this.category = category || "default";
|
|
3785
3790
|
this.name = name;
|
|
3786
3791
|
this.description = description;
|
|
@@ -4187,11 +4192,30 @@ var ExuluContext2 = class {
|
|
|
4187
4192
|
);
|
|
4188
4193
|
await db2.from(getChunksTableName(this.id)).where({ source }).delete();
|
|
4189
4194
|
if (chunks?.length) {
|
|
4195
|
+
const sanitizeString = (str) => {
|
|
4196
|
+
if (!str) return "";
|
|
4197
|
+
return str.replace(/\0/g, "");
|
|
4198
|
+
};
|
|
4199
|
+
const sanitizeMetadata2 = (metadata) => {
|
|
4200
|
+
if (!metadata) return {};
|
|
4201
|
+
const sanitized = {};
|
|
4202
|
+
for (const [key, value] of Object.entries(metadata)) {
|
|
4203
|
+
if (typeof value === "string") {
|
|
4204
|
+
sanitized[key] = sanitizeString(value);
|
|
4205
|
+
} else {
|
|
4206
|
+
sanitized[key] = value;
|
|
4207
|
+
}
|
|
4208
|
+
}
|
|
4209
|
+
return sanitized;
|
|
4210
|
+
};
|
|
4190
4211
|
await db2.from(getChunksTableName(this.id)).insert(
|
|
4191
4212
|
chunks.map((chunk) => ({
|
|
4192
|
-
source
|
|
4193
|
-
|
|
4194
|
-
|
|
4213
|
+
// Sanitize source to remove null bytes
|
|
4214
|
+
source: sanitizeString(source),
|
|
4215
|
+
// Sanitize metadata to remove null bytes from string values
|
|
4216
|
+
metadata: sanitizeMetadata2(chunk.metadata),
|
|
4217
|
+
// Remove null bytes (0x00) which are invalid in PostgreSQL UTF8 encoding
|
|
4218
|
+
content: sanitizeString(chunk.content),
|
|
4195
4219
|
chunk_index: chunk.index,
|
|
4196
4220
|
embedding: import_knex5.default.toSql(chunk.vector)
|
|
4197
4221
|
}))
|
|
@@ -4580,6 +4604,8 @@ var ExuluContext2 = class {
|
|
|
4580
4604
|
name: `${this.name}_context_search`,
|
|
4581
4605
|
type: "context",
|
|
4582
4606
|
category: "contexts",
|
|
4607
|
+
needsApproval: true,
|
|
4608
|
+
// todo make configurable
|
|
4583
4609
|
inputSchema: import_zod4.z.object({
|
|
4584
4610
|
query: import_zod4.z.string().describe("The original question that the user asked"),
|
|
4585
4611
|
keywords: import_zod4.z.array(import_zod4.z.string()).describe(
|
|
@@ -5773,7 +5799,7 @@ var finalizeRequestedFields = async ({
|
|
|
5773
5799
|
return result;
|
|
5774
5800
|
}
|
|
5775
5801
|
const { db: db2 } = await postgresClient();
|
|
5776
|
-
const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt");
|
|
5802
|
+
const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt", "metadata");
|
|
5777
5803
|
const chunks = await query;
|
|
5778
5804
|
result.chunks = chunks.map((chunk) => ({
|
|
5779
5805
|
chunk_content: chunk.content,
|
|
@@ -5786,7 +5812,8 @@ var finalizeRequestedFields = async ({
|
|
|
5786
5812
|
item_created_at: chunk.item_created_at,
|
|
5787
5813
|
item_id: chunk.item_id,
|
|
5788
5814
|
item_external_id: chunk.item_external_id,
|
|
5789
|
-
item_name: chunk.item_name
|
|
5815
|
+
item_name: chunk.item_name,
|
|
5816
|
+
chunk_metadata: chunk.metadata
|
|
5790
5817
|
}));
|
|
5791
5818
|
}
|
|
5792
5819
|
}
|
|
@@ -7160,6 +7187,36 @@ var import_ai3 = require("ai");
|
|
|
7160
7187
|
var import_crypto_js4 = __toESM(require("crypto-js"), 1);
|
|
7161
7188
|
var redisConnection;
|
|
7162
7189
|
var unhandledRejectionHandlerInstalled = false;
|
|
7190
|
+
var poolMonitoringInterval;
|
|
7191
|
+
var startPoolMonitoring = () => {
|
|
7192
|
+
if (poolMonitoringInterval) return;
|
|
7193
|
+
poolMonitoringInterval = setInterval(async () => {
|
|
7194
|
+
try {
|
|
7195
|
+
const { db: db2 } = await postgresClient();
|
|
7196
|
+
const poolStats = db2.client.pool;
|
|
7197
|
+
if (poolStats) {
|
|
7198
|
+
const used = poolStats.numUsed?.() || 0;
|
|
7199
|
+
const free = poolStats.numFree?.() || 0;
|
|
7200
|
+
const pending = poolStats.numPendingAcquires?.() || 0;
|
|
7201
|
+
const total = used + free;
|
|
7202
|
+
console.log("[EXULU] Connection pool health check:", {
|
|
7203
|
+
used,
|
|
7204
|
+
free,
|
|
7205
|
+
pending,
|
|
7206
|
+
total,
|
|
7207
|
+
utilization: total > 0 ? `${Math.round(used / total * 100)}%` : "0%"
|
|
7208
|
+
});
|
|
7209
|
+
if (pending > 10) {
|
|
7210
|
+
console.warn(
|
|
7211
|
+
`[EXULU] WARNING: ${pending} jobs waiting for database connections. Consider increasing pool size or reducing worker concurrency.`
|
|
7212
|
+
);
|
|
7213
|
+
}
|
|
7214
|
+
}
|
|
7215
|
+
} catch (error) {
|
|
7216
|
+
console.error("[EXULU] Error checking pool health:", error);
|
|
7217
|
+
}
|
|
7218
|
+
}, 3e4);
|
|
7219
|
+
};
|
|
7163
7220
|
var installGlobalErrorHandlers = () => {
|
|
7164
7221
|
if (unhandledRejectionHandlerInstalled) return;
|
|
7165
7222
|
process.on("unhandledRejection", (reason) => {
|
|
@@ -7184,6 +7241,7 @@ var installGlobalErrorHandlers = () => {
|
|
|
7184
7241
|
unhandledRejectionHandlerInstalled = true;
|
|
7185
7242
|
console.log("[EXULU] Global error handlers installed to prevent worker crashes");
|
|
7186
7243
|
};
|
|
7244
|
+
var isShuttingDown = false;
|
|
7187
7245
|
var createWorkers = async (providers, queues2, config, contexts, rerankers, evals, tools, tracer) => {
|
|
7188
7246
|
console.log("[EXULU] creating workers for " + queues2?.length + " queues.");
|
|
7189
7247
|
console.log(
|
|
@@ -7191,7 +7249,8 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7191
7249
|
queues2.map((q) => q.queue.name)
|
|
7192
7250
|
);
|
|
7193
7251
|
installGlobalErrorHandlers();
|
|
7194
|
-
|
|
7252
|
+
startPoolMonitoring();
|
|
7253
|
+
process.setMaxListeners(Math.max(15, process.getMaxListeners()));
|
|
7195
7254
|
if (!redisServer.host || !redisServer.port) {
|
|
7196
7255
|
console.error(
|
|
7197
7256
|
"[EXULU] you are trying to start worker, but no redis server is configured in the environment."
|
|
@@ -7224,7 +7283,53 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7224
7283
|
status: await bullmqJob.getState(),
|
|
7225
7284
|
type: bullmqJob.data.type
|
|
7226
7285
|
});
|
|
7227
|
-
|
|
7286
|
+
let progressInterval;
|
|
7287
|
+
if (bullmqJob.data.type === "processor") {
|
|
7288
|
+
progressInterval = setInterval(async () => {
|
|
7289
|
+
try {
|
|
7290
|
+
await bullmqJob.updateProgress({
|
|
7291
|
+
status: "processing",
|
|
7292
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
7293
|
+
});
|
|
7294
|
+
console.log(`[EXULU] Job ${bullmqJob.id} heartbeat sent to prevent stalling`);
|
|
7295
|
+
} catch (error) {
|
|
7296
|
+
console.error(`[EXULU] Error updating job progress:`, error);
|
|
7297
|
+
}
|
|
7298
|
+
}, 25e3);
|
|
7299
|
+
}
|
|
7300
|
+
let db2;
|
|
7301
|
+
let retries = 3;
|
|
7302
|
+
let lastError;
|
|
7303
|
+
for (let attempt = 1; attempt <= retries; attempt++) {
|
|
7304
|
+
try {
|
|
7305
|
+
const client2 = await postgresClient();
|
|
7306
|
+
db2 = client2.db;
|
|
7307
|
+
const poolStats = db2.client.pool;
|
|
7308
|
+
if (poolStats) {
|
|
7309
|
+
console.log(`[EXULU] Connection pool stats for job ${bullmqJob.id}:`, {
|
|
7310
|
+
size: poolStats.numUsed?.() || 0,
|
|
7311
|
+
available: poolStats.numFree?.() || 0,
|
|
7312
|
+
pending: poolStats.numPendingAcquires?.() || 0
|
|
7313
|
+
});
|
|
7314
|
+
}
|
|
7315
|
+
break;
|
|
7316
|
+
} catch (error) {
|
|
7317
|
+
lastError = error instanceof Error ? error : new Error(String(error));
|
|
7318
|
+
console.error(
|
|
7319
|
+
`[EXULU] Failed to acquire database connection (attempt ${attempt}/${retries}) for job ${bullmqJob.id}:`,
|
|
7320
|
+
lastError.message
|
|
7321
|
+
);
|
|
7322
|
+
if (attempt < retries) {
|
|
7323
|
+
const backoffMs = 500 * Math.pow(2, attempt - 1);
|
|
7324
|
+
await new Promise((resolve3) => setTimeout(resolve3, backoffMs));
|
|
7325
|
+
}
|
|
7326
|
+
}
|
|
7327
|
+
}
|
|
7328
|
+
if (!db2) {
|
|
7329
|
+
throw new Error(
|
|
7330
|
+
`Failed to acquire database connection after ${retries} attempts: ${lastError?.message}`
|
|
7331
|
+
);
|
|
7332
|
+
}
|
|
7228
7333
|
const data = bullmqJob.data;
|
|
7229
7334
|
const timeoutInSeconds = data.timeoutInSeconds || queue.timeoutInSeconds || 600;
|
|
7230
7335
|
const timeoutMs = timeoutInSeconds * 1e3;
|
|
@@ -7316,7 +7421,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7316
7421
|
}
|
|
7317
7422
|
const exuluStorage = new ExuluStorage({ config });
|
|
7318
7423
|
console.log("[EXULU] POS 2 -- EXULU CONTEXT PROCESS FIELD");
|
|
7319
|
-
|
|
7424
|
+
let processorResult = await context.processor.execute({
|
|
7320
7425
|
item: data.inputs,
|
|
7321
7426
|
user: data.user,
|
|
7322
7427
|
role: data.role,
|
|
@@ -7331,12 +7436,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7331
7436
|
);
|
|
7332
7437
|
}
|
|
7333
7438
|
delete processorResult.field;
|
|
7439
|
+
const updateData = { ...processorResult };
|
|
7334
7440
|
await db2.from(getTableName(context.id)).where({
|
|
7335
7441
|
id: processorResult.id
|
|
7336
7442
|
}).update({
|
|
7337
|
-
...
|
|
7443
|
+
...updateData,
|
|
7338
7444
|
last_processed_at: (/* @__PURE__ */ new Date()).toISOString()
|
|
7339
7445
|
});
|
|
7446
|
+
Object.keys(updateData).forEach((key) => {
|
|
7447
|
+
delete updateData[key];
|
|
7448
|
+
});
|
|
7340
7449
|
let jobs = [];
|
|
7341
7450
|
if (context.processor?.config?.generateEmbeddings) {
|
|
7342
7451
|
const fullItem = await db2.from(getTableName(context.id)).where({
|
|
@@ -7358,12 +7467,18 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7358
7467
|
jobs.push(embeddingsJob);
|
|
7359
7468
|
}
|
|
7360
7469
|
}
|
|
7361
|
-
|
|
7362
|
-
result: processorResult,
|
|
7470
|
+
const result = {
|
|
7471
|
+
result: { id: processorResult.id },
|
|
7363
7472
|
metadata: {
|
|
7364
7473
|
jobs: jobs.length > 0 ? jobs.join(",") : void 0
|
|
7365
7474
|
}
|
|
7366
7475
|
};
|
|
7476
|
+
processorResult = null;
|
|
7477
|
+
const memUsage = process.memoryUsage();
|
|
7478
|
+
console.log(
|
|
7479
|
+
`[EXULU] Memory after processor job ${bullmqJob.id}: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`
|
|
7480
|
+
);
|
|
7481
|
+
return result;
|
|
7367
7482
|
}
|
|
7368
7483
|
if (data.type === "workflow") {
|
|
7369
7484
|
console.log("[EXULU] running a workflow job.", bullmqJob.name);
|
|
@@ -7382,10 +7497,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7382
7497
|
user,
|
|
7383
7498
|
messages: inputMessages
|
|
7384
7499
|
} = await validateWorkflowPayload(data, providers);
|
|
7385
|
-
const
|
|
7500
|
+
const retries2 = 3;
|
|
7386
7501
|
let attempts = 0;
|
|
7387
7502
|
const promise = new Promise(async (resolve3, reject) => {
|
|
7388
|
-
while (attempts <
|
|
7503
|
+
while (attempts < retries2) {
|
|
7389
7504
|
try {
|
|
7390
7505
|
const messages2 = await processUiMessagesFlow({
|
|
7391
7506
|
providers,
|
|
@@ -7407,7 +7522,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7407
7522
|
error instanceof Error ? error.message : String(error)
|
|
7408
7523
|
);
|
|
7409
7524
|
attempts++;
|
|
7410
|
-
if (attempts >=
|
|
7525
|
+
if (attempts >= retries2) {
|
|
7411
7526
|
reject(new Error(error instanceof Error ? error.message : String(error)));
|
|
7412
7527
|
}
|
|
7413
7528
|
await new Promise((resolve4) => setTimeout((resolve5) => resolve5(true), 2e3));
|
|
@@ -7458,10 +7573,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7458
7573
|
testCase,
|
|
7459
7574
|
messages: inputMessages
|
|
7460
7575
|
} = await validateEvalPayload(data, providers);
|
|
7461
|
-
const
|
|
7576
|
+
const retries2 = 3;
|
|
7462
7577
|
let attempts = 0;
|
|
7463
7578
|
const promise = new Promise(async (resolve3, reject) => {
|
|
7464
|
-
while (attempts <
|
|
7579
|
+
while (attempts < retries2) {
|
|
7465
7580
|
try {
|
|
7466
7581
|
const messages2 = await processUiMessagesFlow({
|
|
7467
7582
|
providers,
|
|
@@ -7482,7 +7597,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7482
7597
|
error instanceof Error ? error.message : String(error)
|
|
7483
7598
|
);
|
|
7484
7599
|
attempts++;
|
|
7485
|
-
if (attempts >=
|
|
7600
|
+
if (attempts >= retries2) {
|
|
7486
7601
|
reject(new Error(error instanceof Error ? error.message : String(error)));
|
|
7487
7602
|
}
|
|
7488
7603
|
await new Promise((resolve4) => setTimeout((resolve5) => resolve5(true), 2e3));
|
|
@@ -7731,9 +7846,15 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7731
7846
|
try {
|
|
7732
7847
|
const result = await Promise.race([workPromise, timeoutPromise]);
|
|
7733
7848
|
clearTimeout(timeoutHandle);
|
|
7849
|
+
if (progressInterval) {
|
|
7850
|
+
clearInterval(progressInterval);
|
|
7851
|
+
}
|
|
7734
7852
|
return result;
|
|
7735
7853
|
} catch (error) {
|
|
7736
7854
|
clearTimeout(timeoutHandle);
|
|
7855
|
+
if (progressInterval) {
|
|
7856
|
+
clearInterval(progressInterval);
|
|
7857
|
+
}
|
|
7737
7858
|
console.error(
|
|
7738
7859
|
`[EXULU] job ${bullmqJob.id} failed (error caught in race handler).`,
|
|
7739
7860
|
error instanceof Error ? error.message : String(error)
|
|
@@ -7747,6 +7868,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7747
7868
|
concurrency: queue.concurrency?.worker || 1,
|
|
7748
7869
|
removeOnComplete: { count: 1e3 },
|
|
7749
7870
|
removeOnFail: { count: 5e3 },
|
|
7871
|
+
// Configure settings for long-running jobs (especially processor jobs)
|
|
7872
|
+
// lockDuration: How long a worker can hold a job before it's considered stalled
|
|
7873
|
+
// Set to 5 minutes to accommodate CPU-intensive operations
|
|
7874
|
+
lockDuration: 3e5,
|
|
7875
|
+
// 5 minutes in milliseconds
|
|
7876
|
+
// stalledInterval: How often to check for stalled jobs
|
|
7877
|
+
// Set to 2 minutes to reduce false positives for long-running operations
|
|
7878
|
+
stalledInterval: 12e4,
|
|
7879
|
+
// 2 minutes in milliseconds
|
|
7880
|
+
maxStalledCount: 1,
|
|
7750
7881
|
...queue.ratelimit && {
|
|
7751
7882
|
limiter: {
|
|
7752
7883
|
max: queue.ratelimit,
|
|
@@ -7783,24 +7914,68 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7783
7914
|
error: error instanceof Error ? error.message : String(error)
|
|
7784
7915
|
} : error
|
|
7785
7916
|
);
|
|
7917
|
+
throw error;
|
|
7786
7918
|
});
|
|
7787
7919
|
worker.on("error", (error) => {
|
|
7788
7920
|
console.error(`[EXULU] worker error.`, error);
|
|
7921
|
+
throw error;
|
|
7789
7922
|
});
|
|
7790
7923
|
worker.on("progress", (job, progress) => {
|
|
7791
7924
|
console.log(`[EXULU] job progress ${job.id}.`, job.name, {
|
|
7792
7925
|
progress
|
|
7793
7926
|
});
|
|
7794
7927
|
});
|
|
7795
|
-
const gracefulShutdown = async (signal) => {
|
|
7796
|
-
console.log(`Received ${signal}, closing server...`);
|
|
7797
|
-
await worker.close();
|
|
7798
|
-
process.exit(0);
|
|
7799
|
-
};
|
|
7800
|
-
process.on("SIGINT", () => gracefulShutdown("SIGINT"));
|
|
7801
|
-
process.on("SIGTERM", () => gracefulShutdown("SIGTERM"));
|
|
7802
7928
|
return worker;
|
|
7803
7929
|
});
|
|
7930
|
+
const gracefulShutdown = async (signal) => {
|
|
7931
|
+
if (isShuttingDown) {
|
|
7932
|
+
console.log(`[EXULU] Shutdown already in progress, ignoring additional ${signal}`);
|
|
7933
|
+
return;
|
|
7934
|
+
}
|
|
7935
|
+
isShuttingDown = true;
|
|
7936
|
+
console.log(`[EXULU] Received ${signal}, shutting down gracefully...`);
|
|
7937
|
+
try {
|
|
7938
|
+
if (poolMonitoringInterval) {
|
|
7939
|
+
clearInterval(poolMonitoringInterval);
|
|
7940
|
+
poolMonitoringInterval = void 0;
|
|
7941
|
+
}
|
|
7942
|
+
console.log(`[EXULU] Closing ${workers.length} worker(s)...`);
|
|
7943
|
+
const closePromises = workers.map(async (worker, index) => {
|
|
7944
|
+
try {
|
|
7945
|
+
await Promise.race([
|
|
7946
|
+
worker.close(),
|
|
7947
|
+
new Promise(
|
|
7948
|
+
(_, reject) => setTimeout(() => reject(new Error("Worker close timeout")), 3e4)
|
|
7949
|
+
)
|
|
7950
|
+
]);
|
|
7951
|
+
console.log(`[EXULU] Worker ${index + 1} closed successfully`);
|
|
7952
|
+
} catch (error) {
|
|
7953
|
+
console.error(`[EXULU] Error closing worker ${index + 1}:`, error);
|
|
7954
|
+
}
|
|
7955
|
+
});
|
|
7956
|
+
await Promise.allSettled(closePromises);
|
|
7957
|
+
if (redisConnection) {
|
|
7958
|
+
console.log(`[EXULU] Closing Redis connection...`);
|
|
7959
|
+
await redisConnection.quit();
|
|
7960
|
+
}
|
|
7961
|
+
try {
|
|
7962
|
+
const { db: db2 } = await postgresClient();
|
|
7963
|
+
if (db2?.client) {
|
|
7964
|
+
console.log(`[EXULU] Closing database connection pool...`);
|
|
7965
|
+
await db2.client.destroy();
|
|
7966
|
+
}
|
|
7967
|
+
} catch (error) {
|
|
7968
|
+
console.error(`[EXULU] Error closing database:`, error);
|
|
7969
|
+
}
|
|
7970
|
+
console.log(`[EXULU] Graceful shutdown complete`);
|
|
7971
|
+
process.exit(0);
|
|
7972
|
+
} catch (error) {
|
|
7973
|
+
console.error(`[EXULU] Error during graceful shutdown:`, error);
|
|
7974
|
+
process.exit(1);
|
|
7975
|
+
}
|
|
7976
|
+
};
|
|
7977
|
+
process.once("SIGINT", () => gracefulShutdown("SIGINT"));
|
|
7978
|
+
process.once("SIGTERM", () => gracefulShutdown("SIGTERM"));
|
|
7804
7979
|
return workers;
|
|
7805
7980
|
};
|
|
7806
7981
|
var validateWorkflowPayload = async (data, providers) => {
|
|
@@ -9514,6 +9689,7 @@ type AgentEvalFunctionConfig {
|
|
|
9514
9689
|
|
|
9515
9690
|
type ItemChunks {
|
|
9516
9691
|
chunk_id: String!
|
|
9692
|
+
chunk_metadata: JSON!
|
|
9517
9693
|
chunk_index: Int!
|
|
9518
9694
|
chunk_content: String!
|
|
9519
9695
|
chunk_source: String!
|
|
@@ -9920,7 +10096,7 @@ var ExuluProvider = class {
|
|
|
9920
10096
|
prompt: import_zod7.z.string().describe("The prompt (usually a question for the agent) to send to the agent."),
|
|
9921
10097
|
information: import_zod7.z.string().describe("A summary of relevant context / information from the current session")
|
|
9922
10098
|
}),
|
|
9923
|
-
description: `This tool calls an
|
|
10099
|
+
description: `This tool calls an agent named: ${agent.name}. The agent does the following: ${agent.description}.`,
|
|
9924
10100
|
config: [],
|
|
9925
10101
|
execute: async ({ prompt, information, user, allExuluTools }) => {
|
|
9926
10102
|
const hasAccessToAgent = await checkRecordAccess(agent, "read", user);
|
|
@@ -10033,9 +10209,6 @@ var ExuluProvider = class {
|
|
|
10033
10209
|
if (!prompt && !inputMessages?.length) {
|
|
10034
10210
|
throw new Error("Prompt or message is required for generating.");
|
|
10035
10211
|
}
|
|
10036
|
-
if (outputSchema && !prompt) {
|
|
10037
|
-
throw new Error("Prompt is required for generating with an output schema.");
|
|
10038
|
-
}
|
|
10039
10212
|
const model = this.model.create({
|
|
10040
10213
|
...providerapikey ? { apiKey: providerapikey } : {}
|
|
10041
10214
|
});
|
|
@@ -10172,14 +10345,18 @@ var ExuluProvider = class {
|
|
|
10172
10345
|
let inputTokens = 0;
|
|
10173
10346
|
let outputTokens = 0;
|
|
10174
10347
|
if (outputSchema) {
|
|
10175
|
-
const {
|
|
10348
|
+
const { output, usage } = await (0, import_ai4.generateText)({
|
|
10176
10349
|
model,
|
|
10177
10350
|
system,
|
|
10178
|
-
prompt,
|
|
10179
10351
|
maxRetries: 3,
|
|
10180
|
-
|
|
10352
|
+
output: import_ai4.Output.object({
|
|
10353
|
+
schema: outputSchema
|
|
10354
|
+
}),
|
|
10355
|
+
prompt,
|
|
10356
|
+
stopWhen: [(0, import_ai4.stepCountIs)(5)]
|
|
10357
|
+
// make configurable
|
|
10181
10358
|
});
|
|
10182
|
-
result.object =
|
|
10359
|
+
result.object = output;
|
|
10183
10360
|
inputTokens = usage.inputTokens || 0;
|
|
10184
10361
|
outputTokens = usage.outputTokens || 0;
|
|
10185
10362
|
} else {
|
|
@@ -10210,6 +10387,7 @@ var ExuluProvider = class {
|
|
|
10210
10387
|
agent
|
|
10211
10388
|
),
|
|
10212
10389
|
stopWhen: [(0, import_ai4.stepCountIs)(5)]
|
|
10390
|
+
// make configurable
|
|
10213
10391
|
});
|
|
10214
10392
|
result.text = text;
|
|
10215
10393
|
inputTokens = totalUsage?.inputTokens || 0;
|
|
@@ -10684,6 +10862,8 @@ var providerRateLimiter = async (key, windowSeconds, limit, points) => {
|
|
|
10684
10862
|
};
|
|
10685
10863
|
|
|
10686
10864
|
// src/exulu/routes.ts
|
|
10865
|
+
var import_zod_from_json_schema = require("zod-from-json-schema");
|
|
10866
|
+
var import_zod8 = require("zod");
|
|
10687
10867
|
var REQUEST_SIZE_LIMIT = "50mb";
|
|
10688
10868
|
var getExuluVersionNumber = async () => {
|
|
10689
10869
|
try {
|
|
@@ -11104,6 +11284,16 @@ Mood: friendly and intelligent.
|
|
|
11104
11284
|
providers,
|
|
11105
11285
|
user
|
|
11106
11286
|
);
|
|
11287
|
+
if (req.body.outputSchema && !!headers.stream) {
|
|
11288
|
+
throw new Error("Providing a outputSchema in the POST body is not allowed when using the streaming API, set 'stream' to false in the headers when defining a response schema.");
|
|
11289
|
+
}
|
|
11290
|
+
let outputSchema;
|
|
11291
|
+
if (req.body.outputSchema) {
|
|
11292
|
+
if (typeof req.body.outputSchema === "string") {
|
|
11293
|
+
req.body.outputSchema = JSON.parse(req.body.outputSchema);
|
|
11294
|
+
}
|
|
11295
|
+
outputSchema = (0, import_zod_from_json_schema.convertJsonSchemaToZod)(req.body.outputSchema);
|
|
11296
|
+
}
|
|
11107
11297
|
let providerapikey;
|
|
11108
11298
|
const variableName = agent.providerapikey;
|
|
11109
11299
|
if (variableName) {
|
|
@@ -11250,6 +11440,7 @@ Mood: friendly and intelligent.
|
|
|
11250
11440
|
const response = await provider.generateSync({
|
|
11251
11441
|
contexts,
|
|
11252
11442
|
rerankers: rerankers || [],
|
|
11443
|
+
outputSchema,
|
|
11253
11444
|
agent,
|
|
11254
11445
|
user,
|
|
11255
11446
|
req,
|
|
@@ -11575,7 +11766,7 @@ var import_types2 = require("@modelcontextprotocol/sdk/types.js");
|
|
|
11575
11766
|
var import_express4 = require("express");
|
|
11576
11767
|
var import_api3 = require("@opentelemetry/api");
|
|
11577
11768
|
var import_crypto_js7 = __toESM(require("crypto-js"), 1);
|
|
11578
|
-
var
|
|
11769
|
+
var import_zod9 = require("zod");
|
|
11579
11770
|
var SESSION_ID_HEADER = "mcp-session-id";
|
|
11580
11771
|
var ExuluMCP = class {
|
|
11581
11772
|
server = {};
|
|
@@ -11658,7 +11849,7 @@ var ExuluMCP = class {
|
|
|
11658
11849
|
title: tool3.name + " agent",
|
|
11659
11850
|
description: tool3.description,
|
|
11660
11851
|
inputSchema: {
|
|
11661
|
-
inputs: tool3.inputSchema ||
|
|
11852
|
+
inputs: tool3.inputSchema || import_zod9.z.object({})
|
|
11662
11853
|
}
|
|
11663
11854
|
},
|
|
11664
11855
|
async ({ inputs }, args) => {
|
|
@@ -11710,7 +11901,7 @@ var ExuluMCP = class {
|
|
|
11710
11901
|
title: "Get List of Prompt Templates",
|
|
11711
11902
|
description: "Retrieves a list of prompt templates available for this agent. Returns the name, description, and ID of each template.",
|
|
11712
11903
|
inputSchema: {
|
|
11713
|
-
inputs:
|
|
11904
|
+
inputs: import_zod9.z.object({})
|
|
11714
11905
|
}
|
|
11715
11906
|
},
|
|
11716
11907
|
async ({ inputs }, args) => {
|
|
@@ -11756,8 +11947,8 @@ var ExuluMCP = class {
|
|
|
11756
11947
|
title: "Get Prompt Template Details",
|
|
11757
11948
|
description: "Retrieves the full details of a specific prompt template by ID, including the actual template content with variables.",
|
|
11758
11949
|
inputSchema: {
|
|
11759
|
-
inputs:
|
|
11760
|
-
id:
|
|
11950
|
+
inputs: import_zod9.z.object({
|
|
11951
|
+
id: import_zod9.z.string().describe("The ID of the prompt template to retrieve")
|
|
11761
11952
|
})
|
|
11762
11953
|
}
|
|
11763
11954
|
},
|
|
@@ -12665,7 +12856,7 @@ var ExuluEval = class {
|
|
|
12665
12856
|
};
|
|
12666
12857
|
|
|
12667
12858
|
// src/templates/evals/index.ts
|
|
12668
|
-
var
|
|
12859
|
+
var import_zod10 = require("zod");
|
|
12669
12860
|
var llmAsJudgeEval = () => {
|
|
12670
12861
|
if (process.env.REDIS_HOST?.length && process.env.REDIS_PORT?.length) {
|
|
12671
12862
|
return new ExuluEval({
|
|
@@ -12710,8 +12901,8 @@ var llmAsJudgeEval = () => {
|
|
|
12710
12901
|
contexts: [],
|
|
12711
12902
|
rerankers: [],
|
|
12712
12903
|
prompt,
|
|
12713
|
-
outputSchema:
|
|
12714
|
-
score:
|
|
12904
|
+
outputSchema: import_zod10.z.object({
|
|
12905
|
+
score: import_zod10.z.number().min(0).max(100).describe("The score between 0 and 100.")
|
|
12715
12906
|
}),
|
|
12716
12907
|
providerapikey
|
|
12717
12908
|
});
|
|
@@ -12939,12 +13130,12 @@ Usage:
|
|
|
12939
13130
|
- If no todos exist yet, an empty list will be returned`;
|
|
12940
13131
|
|
|
12941
13132
|
// src/templates/tools/todo/todo.ts
|
|
12942
|
-
var
|
|
12943
|
-
var TodoSchema =
|
|
12944
|
-
content:
|
|
12945
|
-
status:
|
|
12946
|
-
priority:
|
|
12947
|
-
id:
|
|
13133
|
+
var import_zod11 = __toESM(require("zod"), 1);
|
|
13134
|
+
var TodoSchema = import_zod11.default.object({
|
|
13135
|
+
content: import_zod11.default.string().describe("Brief description of the task"),
|
|
13136
|
+
status: import_zod11.default.string().describe("Current status of the task: pending, in_progress, completed, cancelled"),
|
|
13137
|
+
priority: import_zod11.default.string().describe("Priority level of the task: high, medium, low"),
|
|
13138
|
+
id: import_zod11.default.string().describe("Unique identifier for the todo item")
|
|
12948
13139
|
});
|
|
12949
13140
|
var TodoWriteTool = new ExuluTool({
|
|
12950
13141
|
id: "todo_write",
|
|
@@ -12960,8 +13151,8 @@ var TodoWriteTool = new ExuluTool({
|
|
|
12960
13151
|
default: todowrite_default
|
|
12961
13152
|
}
|
|
12962
13153
|
],
|
|
12963
|
-
inputSchema:
|
|
12964
|
-
todos:
|
|
13154
|
+
inputSchema: import_zod11.default.object({
|
|
13155
|
+
todos: import_zod11.default.array(TodoSchema).describe("The updated todo list")
|
|
12965
13156
|
}),
|
|
12966
13157
|
execute: async (inputs) => {
|
|
12967
13158
|
const { sessionID, todos, user } = inputs;
|
|
@@ -12996,7 +13187,7 @@ var TodoReadTool = new ExuluTool({
|
|
|
12996
13187
|
id: "todo_read",
|
|
12997
13188
|
name: "Todo Read",
|
|
12998
13189
|
description: "Use this tool to read your todo list",
|
|
12999
|
-
inputSchema:
|
|
13190
|
+
inputSchema: import_zod11.default.object({}),
|
|
13000
13191
|
type: "function",
|
|
13001
13192
|
category: "todo",
|
|
13002
13193
|
config: [
|
|
@@ -13035,15 +13226,15 @@ async function getTodos(sessionID) {
|
|
|
13035
13226
|
var todoTools = [TodoWriteTool, TodoReadTool];
|
|
13036
13227
|
|
|
13037
13228
|
// src/templates/tools/perplexity.ts
|
|
13038
|
-
var
|
|
13229
|
+
var import_zod12 = __toESM(require("zod"), 1);
|
|
13039
13230
|
var import_perplexity_ai = __toESM(require("@perplexity-ai/perplexity_ai"), 1);
|
|
13040
13231
|
var internetSearchTool = new ExuluTool({
|
|
13041
13232
|
id: "internet_search",
|
|
13042
13233
|
name: "Perplexity Live Internet Search",
|
|
13043
13234
|
description: "Search the internet for information.",
|
|
13044
|
-
inputSchema:
|
|
13045
|
-
query:
|
|
13046
|
-
search_recency_filter:
|
|
13235
|
+
inputSchema: import_zod12.default.object({
|
|
13236
|
+
query: import_zod12.default.string().describe("The query to the tool."),
|
|
13237
|
+
search_recency_filter: import_zod12.default.enum(["day", "week", "month", "year"]).optional().describe("The recency filter for the search, can be day, week, month or year.")
|
|
13047
13238
|
}),
|
|
13048
13239
|
category: "internet_search",
|
|
13049
13240
|
type: "web_search",
|
|
@@ -15485,6 +15676,22 @@ var MarkdownChunker = class {
|
|
|
15485
15676
|
});
|
|
15486
15677
|
return result;
|
|
15487
15678
|
}
|
|
15679
|
+
/**
|
|
15680
|
+
* Checks whether a position in the text falls strictly inside a <diagram>...</diagram> block.
|
|
15681
|
+
* Returns the adjusted position (before the diagram) if inside a diagram, otherwise returns the original position.
|
|
15682
|
+
*/
|
|
15683
|
+
adjustForDiagramTags(text, position) {
|
|
15684
|
+
const diagramRegex = /<diagram>[\s\S]*?<\/diagram>/gi;
|
|
15685
|
+
let match;
|
|
15686
|
+
while ((match = diagramRegex.exec(text)) !== null) {
|
|
15687
|
+
const diagramStart = match.index;
|
|
15688
|
+
const diagramEnd = match.index + match[0].length;
|
|
15689
|
+
if (position > diagramStart && position < diagramEnd) {
|
|
15690
|
+
return diagramStart;
|
|
15691
|
+
}
|
|
15692
|
+
}
|
|
15693
|
+
return position;
|
|
15694
|
+
}
|
|
15488
15695
|
/**
|
|
15489
15696
|
* Find the nearest logical breakpoint working backwards from the end of the text.
|
|
15490
15697
|
* Logical breakpoints are prioritized as follows:
|
|
@@ -15496,6 +15703,7 @@ var MarkdownChunker = class {
|
|
|
15496
15703
|
*
|
|
15497
15704
|
* Only considers breakpoints in the last 50% of the text to avoid creating very small chunks.
|
|
15498
15705
|
* Returns the position of the breakpoint, or null if none found
|
|
15706
|
+
* IMPORTANT: Never splits content within <diagram> tags
|
|
15499
15707
|
*/
|
|
15500
15708
|
findLogicalBreakpoint(text) {
|
|
15501
15709
|
if (text.length === 0) return null;
|
|
@@ -15515,7 +15723,7 @@ var MarkdownChunker = class {
|
|
|
15515
15723
|
}
|
|
15516
15724
|
}
|
|
15517
15725
|
if (lastHeaderPosition > 0) {
|
|
15518
|
-
return lastHeaderPosition;
|
|
15726
|
+
return this.adjustForDiagramTags(text, lastHeaderPosition);
|
|
15519
15727
|
}
|
|
15520
15728
|
let lastParagraphBreak = -1;
|
|
15521
15729
|
let searchPos = text.length;
|
|
@@ -15528,11 +15736,12 @@ var MarkdownChunker = class {
|
|
|
15528
15736
|
searchPos = pos;
|
|
15529
15737
|
}
|
|
15530
15738
|
if (lastParagraphBreak > 0) {
|
|
15531
|
-
|
|
15739
|
+
const adjusted = this.adjustForDiagramTags(text, lastParagraphBreak + 2);
|
|
15740
|
+
return adjusted;
|
|
15532
15741
|
}
|
|
15533
15742
|
const newlineIndex = text.lastIndexOf("\n");
|
|
15534
15743
|
if (newlineIndex >= minPosition) {
|
|
15535
|
-
return newlineIndex + 1;
|
|
15744
|
+
return this.adjustForDiagramTags(text, newlineIndex + 1);
|
|
15536
15745
|
}
|
|
15537
15746
|
const sentenceEndRegex = /[.!?](?:\s|$)/g;
|
|
15538
15747
|
let lastSentenceEnd = -1;
|
|
@@ -15542,13 +15751,13 @@ var MarkdownChunker = class {
|
|
|
15542
15751
|
}
|
|
15543
15752
|
}
|
|
15544
15753
|
if (lastSentenceEnd > 0) {
|
|
15545
|
-
return lastSentenceEnd;
|
|
15754
|
+
return this.adjustForDiagramTags(text, lastSentenceEnd);
|
|
15546
15755
|
}
|
|
15547
15756
|
let lastSpace = text.length;
|
|
15548
15757
|
while (lastSpace > minPosition) {
|
|
15549
15758
|
const pos = text.lastIndexOf(" ", lastSpace - 1);
|
|
15550
15759
|
if (pos >= minPosition) {
|
|
15551
|
-
return pos + 1;
|
|
15760
|
+
return this.adjustForDiagramTags(text, pos + 1);
|
|
15552
15761
|
}
|
|
15553
15762
|
lastSpace = pos;
|
|
15554
15763
|
}
|
|
@@ -15680,6 +15889,38 @@ var MarkdownChunker = class {
|
|
|
15680
15889
|
targetPosition = currentPosition + decoded.length;
|
|
15681
15890
|
}
|
|
15682
15891
|
}
|
|
15892
|
+
const diagramCheck = /<diagram>/gi;
|
|
15893
|
+
const diagramCloseCheck = /<\/diagram>/gi;
|
|
15894
|
+
let openDiagramsInSlice = 0;
|
|
15895
|
+
while (diagramCheck.exec(currentSlice) !== null) {
|
|
15896
|
+
openDiagramsInSlice++;
|
|
15897
|
+
}
|
|
15898
|
+
let closeDiagramsInSlice = 0;
|
|
15899
|
+
while (diagramCloseCheck.exec(currentSlice) !== null) {
|
|
15900
|
+
closeDiagramsInSlice++;
|
|
15901
|
+
}
|
|
15902
|
+
if (openDiagramsInSlice > closeDiagramsInSlice) {
|
|
15903
|
+
const lastDiagramOpenIndex = currentSlice.lastIndexOf("<diagram>");
|
|
15904
|
+
if (lastDiagramOpenIndex !== -1) {
|
|
15905
|
+
const remainingText = text.slice(currentPosition + lastDiagramOpenIndex);
|
|
15906
|
+
const closingTagMatch = /<\/diagram>/i.exec(remainingText);
|
|
15907
|
+
if (closingTagMatch) {
|
|
15908
|
+
const closingTagPosition = lastDiagramOpenIndex + closingTagMatch.index + closingTagMatch[0].length;
|
|
15909
|
+
const extendedSlice = text.slice(currentPosition, currentPosition + closingTagPosition);
|
|
15910
|
+
const extendedTokens = tokenizer.encode(extendedSlice);
|
|
15911
|
+
if (extendedTokens.length <= adjustedChunkSize * 1.5) {
|
|
15912
|
+
currentSlice = extendedSlice;
|
|
15913
|
+
targetPosition = currentPosition + closingTagPosition;
|
|
15914
|
+
} else {
|
|
15915
|
+
currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
|
|
15916
|
+
targetPosition = currentPosition + lastDiagramOpenIndex;
|
|
15917
|
+
}
|
|
15918
|
+
} else {
|
|
15919
|
+
currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
|
|
15920
|
+
targetPosition = currentPosition + lastDiagramOpenIndex;
|
|
15921
|
+
}
|
|
15922
|
+
}
|
|
15923
|
+
}
|
|
15683
15924
|
const breakpointPosition = this.findLogicalBreakpoint(currentSlice);
|
|
15684
15925
|
if (breakpointPosition !== null) {
|
|
15685
15926
|
currentSlice = currentSlice.slice(0, breakpointPosition);
|
|
@@ -15952,7 +16193,7 @@ Or manually run the setup script:
|
|
|
15952
16193
|
var fs2 = __toESM(require("fs"), 1);
|
|
15953
16194
|
var path = __toESM(require("path"), 1);
|
|
15954
16195
|
var import_ai7 = require("ai");
|
|
15955
|
-
var
|
|
16196
|
+
var import_zod13 = require("zod");
|
|
15956
16197
|
var import_p_limit = __toESM(require("p-limit"), 1);
|
|
15957
16198
|
var import_crypto = require("crypto");
|
|
15958
16199
|
var mammoth = __toESM(require("mammoth"), 1);
|
|
@@ -16101,6 +16342,8 @@ ${command}`;
|
|
|
16101
16342
|
}
|
|
16102
16343
|
|
|
16103
16344
|
// ee/python/documents/processing/doc_processor.ts
|
|
16345
|
+
var import_liteparse = require("@llamaindex/liteparse");
|
|
16346
|
+
var import_mistralai = require("@mistralai/mistralai");
|
|
16104
16347
|
async function processDocx(file) {
|
|
16105
16348
|
const html = await mammoth.convertToHtml({ buffer: file });
|
|
16106
16349
|
const turndownService = new import_turndown.default();
|
|
@@ -16175,52 +16418,93 @@ async function validatePageWithVLM(page, imagePath, model) {
|
|
|
16175
16418
|
const imageBuffer = await fs2.promises.readFile(imagePath);
|
|
16176
16419
|
const imageBase64 = imageBuffer.toString("base64");
|
|
16177
16420
|
const mimeType = "image/png";
|
|
16178
|
-
const prompt = `You are
|
|
16179
|
-
|
|
16180
|
-
Here is the current OCR/parsed content for this page:
|
|
16421
|
+
const prompt = `You are a document validation assistant. Your task is to analyze a page image and correct the output of an OCR/parsing pipeline. The content may include tables, technical diagrams, schematics, and structured text.
|
|
16181
16422
|
|
|
16182
16423
|
---
|
|
16424
|
+
## CURRENT OCR OUTPUT
|
|
16425
|
+
|
|
16183
16426
|
${page.content}
|
|
16184
16427
|
---
|
|
16185
16428
|
|
|
16186
|
-
|
|
16429
|
+
## YOUR TASK
|
|
16187
16430
|
|
|
16188
|
-
|
|
16189
|
-
- Table data (rows, columns, headers, values)
|
|
16190
|
-
- Technical diagrams, schematics, control boards
|
|
16191
|
-
- Icons, checkmarks, symbols
|
|
16192
|
-
- Image captions and labels
|
|
16431
|
+
Compare the page image to the OCR output above. Identify errors, omissions, and formatting issues, then return a structured validation result (see OUTPUT FORMAT below).
|
|
16193
16432
|
|
|
16194
|
-
|
|
16195
|
-
|
|
16196
|
-
|
|
16197
|
-
|
|
16198
|
-
|
|
16199
|
-
|
|
16200
|
-
-
|
|
16201
|
-
-
|
|
16202
|
-
|
|
16433
|
+
---
|
|
16434
|
+
## VALIDATION CHECKLIST
|
|
16435
|
+
|
|
16436
|
+
Work through these checks in order:
|
|
16437
|
+
|
|
16438
|
+
### 1. Text Accuracy
|
|
16439
|
+
- Verify all text is correctly transcribed.
|
|
16440
|
+
- For minor character-level OCR errors (e.g. "\xF6" vs "\xFC", "rn" vs "m"), **prefer the original OCR output** unless you are certain of an error. Do not silently "fix" characters based on guesswork.
|
|
16441
|
+
|
|
16442
|
+
### 2. Heading Levels
|
|
16443
|
+
- Verify that headings use correct Markdown levels (#, ##, ###, ####, #####, ######).
|
|
16444
|
+
- Determine heading level using the following priority:
|
|
16445
|
+
1. **Hierarchical numbering** (strongest signal): e.g. "1" \u2192 #, "2.1" \u2192 ##, "2.1.1" \u2192 ###, "2.1.2.5" \u2192 ####
|
|
16446
|
+
2. Font size (larger = higher level)
|
|
16447
|
+
3. Indentation
|
|
16448
|
+
4. Bold/emphasis styling
|
|
16449
|
+
|
|
16450
|
+
### 3. Tables
|
|
16451
|
+
|
|
16452
|
+
**First, decide whether the table should be Markdown or plain text:**
|
|
16453
|
+
- Use **Markdown table format** if the table has a consistent, clear header structure and uniform column layout throughout.
|
|
16454
|
+
- Use **plain text structured description** if the table:
|
|
16455
|
+
- Lacks a clear header row
|
|
16456
|
+
- Uses mixed or irregular column structures across rows
|
|
16457
|
+
- Functions more like a certificate, form, or label layout
|
|
16458
|
+
|
|
16459
|
+
**If using Markdown format**, follow these rules strictly:
|
|
16460
|
+
- Every table must have: header row \u2192 separator row \u2192 data rows
|
|
16461
|
+
- Use simple separators only: \`| --- | --- |\` (NOT \`|---|---|\` or long dashes)
|
|
16462
|
+
- Example:
|
|
16463
|
+
\`\`\`
|
|
16203
16464
|
| Column 1 | Column 2 |
|
|
16204
16465
|
| --- | --- |
|
|
16205
|
-
| Data 1
|
|
16206
|
-
|
|
16207
|
-
-
|
|
16208
|
-
|
|
16209
|
-
|
|
16210
|
-
|
|
16211
|
-
|
|
16212
|
-
-
|
|
16213
|
-
|
|
16214
|
-
|
|
16466
|
+
| Data 1 | Data 2 |
|
|
16467
|
+
\`\`\`
|
|
16468
|
+
- Important: do not use the | character as part of the data inside a cell, this would break the table, if a cell contains a | character, use a capital I.
|
|
16469
|
+
|
|
16470
|
+
**Symbol translation rules for table cells:**
|
|
16471
|
+
- Black/filled dot \u2192 \`+\` (active); White/empty dot \u2192 \`-\` (inactive)
|
|
16472
|
+
*(e.g. Rufe-LED columns)*
|
|
16473
|
+
- Green or black checkmark \u2192 \`+\` (active); Red or black cross \u2192 \`-\` (inactive)
|
|
16474
|
+
|
|
16475
|
+
### 4. Multi-Page Table Continuity
|
|
16476
|
+
- If this page contains a table with a header row that runs to the bottom of the page (suggesting it may continue on the next page), extract the header row and include it in the \`current_page_table.headers\` field.
|
|
16477
|
+
- If this page contains a table WITHOUT a header row (suggesting it's a continuation from a previous page), set \`current_page_table.is_continuation\` to true and try to identify what the headers might be based on the data structure. Include your best guess for headers in \`current_page_table.headers\`.
|
|
16478
|
+
|
|
16479
|
+
### 5. Technical Diagrams & Schematics
|
|
16480
|
+
If the page contains a flow-chart, schematic, technical drawing or control board layout that is **absent or poorly described** in the OCR output do the following:
|
|
16481
|
+
- Open a <diagram> tag with the following content:
|
|
16482
|
+
<diagram>
|
|
16483
|
+
<description>
|
|
16484
|
+
Add a detailed description of the diagram here.
|
|
16485
|
+
</description>
|
|
16486
|
+
<mermaid>
|
|
16487
|
+
Add a mermaid diagram schema here that in detail describes the diagram.
|
|
16488
|
+
</mermaid>
|
|
16489
|
+
</diagram>
|
|
16490
|
+
|
|
16491
|
+
### 6. Captions, Icons & Symbols
|
|
16492
|
+
- Verify that image captions, labels, icons, and checkmarks are present and correctly transcribed.
|
|
16493
|
+
|
|
16494
|
+
### 7. Only populate \`corrected_text\` when \`needs_correction\` is true. If the OCR output is accurate, return \`needs_correction: false\` and \`corrected_content: null\`.
|
|
16215
16495
|
`;
|
|
16216
16496
|
const result = await (0, import_ai7.generateText)({
|
|
16217
16497
|
model,
|
|
16218
16498
|
output: import_ai7.Output.object({
|
|
16219
|
-
schema:
|
|
16220
|
-
needs_correction:
|
|
16221
|
-
corrected_text:
|
|
16222
|
-
|
|
16223
|
-
|
|
16499
|
+
schema: import_zod13.z.object({
|
|
16500
|
+
needs_correction: import_zod13.z.boolean(),
|
|
16501
|
+
corrected_text: import_zod13.z.string().nullable(),
|
|
16502
|
+
current_page_table: import_zod13.z.object({
|
|
16503
|
+
headers: import_zod13.z.array(import_zod13.z.string()),
|
|
16504
|
+
is_continuation: import_zod13.z.boolean()
|
|
16505
|
+
}).nullable(),
|
|
16506
|
+
confidence: import_zod13.z.enum(["high", "medium", "low"]),
|
|
16507
|
+
reasoning: import_zod13.z.string()
|
|
16224
16508
|
})
|
|
16225
16509
|
}),
|
|
16226
16510
|
messages: [
|
|
@@ -16241,23 +16525,80 @@ Specific notes and guidelines:
|
|
|
16241
16525
|
needs_correction: parsedOutput.needs_correction,
|
|
16242
16526
|
corrected_text: parsedOutput.corrected_text || void 0,
|
|
16243
16527
|
confidence: parsedOutput.confidence,
|
|
16528
|
+
current_page_table: parsedOutput.current_page_table || void 0,
|
|
16244
16529
|
reasoning: parsedOutput.reasoning
|
|
16245
16530
|
};
|
|
16246
16531
|
return validation;
|
|
16247
16532
|
}
|
|
16533
|
+
/**
 * Re-attaches table headers to pages whose table was flagged as a
 * continuation of a table on an earlier page.
 *
 * Pages are visited in array order while the most recently stored header list
 * is remembered:
 *  - a page without a validation result is skipped and leaves the remembered
 *    headers untouched;
 *  - a page whose validation has no table info, or an empty header list,
 *    clears the remembered headers;
 *  - a page flagged `is_continuation` while headers are remembered gets a
 *    Markdown header row and separator row inserted in front of its first
 *    line that starts with "|" (after trimming); the remembered headers are
 *    kept so later continuation pages reuse them;
 *  - any other page with headers stores its own headers as the remembered ones.
 *
 * @param {Array<object>} document2 Page records. `page`, `content` and
 *   `vlm_corrected_text` are read; `vlm_corrected_text` (when truthy) or
 *   `content` is overwritten in place on continuation pages.
 * @param {Map<*, object>} validationResults Validation results keyed by
 *   `page.page`; only `current_page_table` ({ headers, is_continuation }) is read.
 * @param {boolean} [verbose=false] When true, progress is logged to the console.
 * @returns {void}
 */
function reconstructTableHeaders(document2, validationResults, verbose = false) {
  let lastTableHeaders = void 0;
  for (const page of document2) {
    const validation = validationResults.get(page.page);
    if (!validation) continue;
    const tableInfo = validation.current_page_table;

    // No usable table info on this page: forget any remembered headers.
    if (!tableInfo || !(tableInfo.headers.length > 0)) {
      lastTableHeaders = void 0;
      continue;
    }

    // Either a fresh table, or a continuation with nothing remembered yet:
    // store this page's headers for pages that follow.
    if (!(tableInfo.is_continuation && lastTableHeaders)) {
      lastTableHeaders = tableInfo.headers;
      if (verbose) {
        console.log(`[EXULU] Page ${page.page}: Storing table headers for potential continuation`);
        console.log(`[EXULU] Headers: ${lastTableHeaders.join(" | ")}`);
      }
      continue;
    }

    // Continuation page with remembered headers. The previous implementation
    // re-tested `!tableInfo.is_continuation` at the end of this path, which
    // could never be true here; that unreachable check has been removed.
    if (verbose) {
      console.log(`[EXULU] Page ${page.page}: Reconstructing table headers from previous page`);
      console.log(`[EXULU] Previous headers: ${lastTableHeaders.join(" | ")}`);
    }
    // Prefer the VLM-corrected text when present, and write back to the same field.
    const contentToModify = page.vlm_corrected_text || page.content;
    const lines = contentToModify.split("\n");
    const firstTableLineIndex = lines.findIndex((line) => line.trim().startsWith("|"));
    if (firstTableLineIndex === -1) {
      // No Markdown table row found on this page; nothing to prepend.
      continue;
    }
    const headerRow = `| ${lastTableHeaders.join(" | ")} |`;
    const separatorRow = `| ${lastTableHeaders.map(() => "---").join(" | ")} |`;
    lines.splice(firstTableLineIndex, 0, headerRow, separatorRow);
    const reconstructedContent = lines.join("\n");
    if (page.vlm_corrected_text) {
      page.vlm_corrected_text = reconstructedContent;
    } else {
      page.content = reconstructedContent;
    }
    if (verbose) {
      console.log(`[EXULU] Page ${page.page}: Added table headers successfully`);
    }
  }
}
|
|
16248
16577
|
async function validateWithVLM(document2, model, verbose = false, concurrency = 10) {
|
|
16249
16578
|
console.log(`[EXULU] Starting VLM validation for docling output, ${document2.length} pages...`);
|
|
16250
|
-
console.log(
|
|
16251
|
-
|
|
16252
|
-
);
|
|
16579
|
+
console.log(`[EXULU] Concurrency limit: ${concurrency}`);
|
|
16580
|
+
const limit = (0, import_p_limit.default)(concurrency);
|
|
16581
|
+
const validationResults = /* @__PURE__ */ new Map();
|
|
16253
16582
|
let validatedCount = 0;
|
|
16254
16583
|
let correctedCount = 0;
|
|
16255
|
-
const limit = (0, import_p_limit.default)(concurrency);
|
|
16256
16584
|
const validationTasks = document2.map(
|
|
16257
16585
|
(page) => limit(async () => {
|
|
16586
|
+
await new Promise((resolve3) => setImmediate(resolve3));
|
|
16258
16587
|
const imagePath = page.image;
|
|
16588
|
+
if (!page.content) {
|
|
16589
|
+
console.warn(`[EXULU] Page ${page.page}: No content found, skipping validation`);
|
|
16590
|
+
return;
|
|
16591
|
+
}
|
|
16259
16592
|
if (!imagePath) {
|
|
16260
|
-
console.
|
|
16593
|
+
console.warn(`[EXULU] Page ${page.page}: No image found, skipping validation`);
|
|
16594
|
+
return;
|
|
16595
|
+
}
|
|
16596
|
+
const hasImage = page.content.match(/\.(jpeg|jpg|png|gif|webp)/i);
|
|
16597
|
+
const hasTable = (page.content.match(/\|/g)?.length || 0) > 1;
|
|
16598
|
+
if (!hasImage && !hasTable) {
|
|
16599
|
+
if (verbose) {
|
|
16600
|
+
console.log(`[EXULU] Page ${page.page}: No image or table found, SKIPPING VLM validation`);
|
|
16601
|
+
}
|
|
16261
16602
|
return;
|
|
16262
16603
|
}
|
|
16263
16604
|
let validation;
|
|
@@ -16265,6 +16606,13 @@ async function validateWithVLM(document2, model, verbose = false, concurrency =
|
|
|
16265
16606
|
validation = await withRetry(async () => {
|
|
16266
16607
|
return await validatePageWithVLM(page, imagePath, model);
|
|
16267
16608
|
}, 3);
|
|
16609
|
+
validationResults.set(page.page, validation);
|
|
16610
|
+
if (verbose && validation.current_page_table) {
|
|
16611
|
+
console.log(`[EXULU] Page ${page.page} table info:`, {
|
|
16612
|
+
headers: validation.current_page_table.headers,
|
|
16613
|
+
is_continuation: validation.current_page_table.is_continuation
|
|
16614
|
+
});
|
|
16615
|
+
}
|
|
16268
16616
|
} catch (error) {
|
|
16269
16617
|
console.error(`[EXULU] Error validating page ${page.page} with VLM more than 3 times, skipping:`, error);
|
|
16270
16618
|
throw error;
|
|
@@ -16295,9 +16643,12 @@ async function validateWithVLM(document2, model, verbose = false, concurrency =
|
|
|
16295
16643
|
})
|
|
16296
16644
|
);
|
|
16297
16645
|
await Promise.all(validationTasks);
|
|
16298
|
-
console.log(`[EXULU] VLM validation complete:`);
|
|
16299
|
-
console.log(`[EXULU] Validated: ${validatedCount}
|
|
16300
|
-
console.log(`[EXULU] Corrected: ${correctedCount}
|
|
16646
|
+
console.log(`[EXULU] VLM validation complete (parallel processing):`);
|
|
16647
|
+
console.log(`[EXULU] Validated: ${validatedCount} pages`);
|
|
16648
|
+
console.log(`[EXULU] Corrected: ${correctedCount} pages`);
|
|
16649
|
+
console.log(`[EXULU] Starting sequential table header reconstruction...`);
|
|
16650
|
+
reconstructTableHeaders(document2, validationResults, verbose);
|
|
16651
|
+
console.log(`[EXULU] Table header reconstruction complete`);
|
|
16301
16652
|
return document2;
|
|
16302
16653
|
}
|
|
16303
16654
|
async function processDocument(filePath, fileType, buffer, tempDir, config, verbose = false) {
|
|
@@ -16312,15 +16663,6 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
|
|
|
16312
16663
|
const stripped = filePath.split(".").pop()?.trim();
|
|
16313
16664
|
let result;
|
|
16314
16665
|
switch (stripped) {
|
|
16315
|
-
case "pdf":
|
|
16316
|
-
result = await processPdf(buffer, paths, config, verbose);
|
|
16317
|
-
break;
|
|
16318
|
-
case "docx":
|
|
16319
|
-
result = await processDocx(buffer);
|
|
16320
|
-
break;
|
|
16321
|
-
case "doc":
|
|
16322
|
-
result = await processWord(buffer);
|
|
16323
|
-
break;
|
|
16324
16666
|
case "txt":
|
|
16325
16667
|
case "md":
|
|
16326
16668
|
let content = buffer.toString();
|
|
@@ -16334,6 +16676,15 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
|
|
|
16334
16676
|
}]
|
|
16335
16677
|
};
|
|
16336
16678
|
break;
|
|
16679
|
+
case "pdf":
|
|
16680
|
+
result = await processPdf(buffer, paths, config, verbose);
|
|
16681
|
+
break;
|
|
16682
|
+
case "docx":
|
|
16683
|
+
result = await processDocx(buffer);
|
|
16684
|
+
break;
|
|
16685
|
+
case "doc":
|
|
16686
|
+
result = await processWord(buffer);
|
|
16687
|
+
break;
|
|
16337
16688
|
// Todo other file types with docx and officeparser
|
|
16338
16689
|
default:
|
|
16339
16690
|
throw new Error(`[EXULU] Unsupported file type: ${fileType}`);
|
|
@@ -16346,10 +16697,17 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
|
|
|
16346
16697
|
}
|
|
16347
16698
|
};
|
|
16348
16699
|
}
|
|
16700
|
+
/**
 * Resolves the Mistral API key.
 *
 * A truthy `process.env.MISTRAL_API_KEY` is returned as-is; otherwise the
 * result of `ExuluVariables.get("MISTRAL_API_KEY")` is awaited and returned.
 *
 * @returns {Promise<*>} The environment value, or whatever the variable lookup yields.
 */
var getMistralApiKey = async () => {
  const keyFromEnv = process.env.MISTRAL_API_KEY;
  // An unset or empty environment value falls through to the variable lookup.
  return keyFromEnv ? keyFromEnv : await ExuluVariables.get("MISTRAL_API_KEY");
};
|
|
16349
16707
|
async function processPdf(buffer, paths, config, verbose = false) {
|
|
16350
16708
|
try {
|
|
16351
|
-
let json;
|
|
16352
|
-
if (config?.docling) {
|
|
16709
|
+
let json = [];
|
|
16710
|
+
if (config?.processor.name === "docling") {
|
|
16353
16711
|
console.log(`[EXULU] Validating Python environment...`);
|
|
16354
16712
|
const validation = await validatePythonEnvironment(void 0, true);
|
|
16355
16713
|
if (!validation.valid) {
|
|
@@ -16390,7 +16748,7 @@ ${setupResult.output || ""}`);
|
|
|
16390
16748
|
}
|
|
16391
16749
|
const jsonContent = await fs2.promises.readFile(paths.json, "utf-8");
|
|
16392
16750
|
json = JSON.parse(jsonContent);
|
|
16393
|
-
} else {
|
|
16751
|
+
} else if (config?.processor.name === "officeparser") {
|
|
16394
16752
|
const text = await (0, import_officeparser2.parseOfficeAsync)(buffer, {
|
|
16395
16753
|
outputErrorToConsole: false,
|
|
16396
16754
|
newlineDelimiter: "\n"
|
|
@@ -16400,15 +16758,70 @@ ${setupResult.output || ""}`);
|
|
|
16400
16758
|
content: text,
|
|
16401
16759
|
headings: []
|
|
16402
16760
|
}];
|
|
16761
|
+
} else if (config?.processor.name === "mistral") {
|
|
16762
|
+
const MISTRAL_API_KEY = await getMistralApiKey();
|
|
16763
|
+
if (MISTRAL_API_KEY) {
|
|
16764
|
+
throw new Error('[EXULU] MISTRAL_API_KEY is not set, please set it in the environment variable via process.env or via an Exulu variable named "MISTRAL_API_KEY".');
|
|
16765
|
+
}
|
|
16766
|
+
await new Promise((resolve3) => setTimeout(resolve3, Math.floor(Math.random() * 4e3) + 1e3));
|
|
16767
|
+
const base64Pdf = buffer.toString("base64");
|
|
16768
|
+
const client2 = new import_mistralai.Mistral({ apiKey: MISTRAL_API_KEY });
|
|
16769
|
+
const ocrResponse = await withRetry(async () => {
|
|
16770
|
+
const ocrResponse2 = await client2.ocr.process({
|
|
16771
|
+
document: {
|
|
16772
|
+
type: "document_url",
|
|
16773
|
+
documentUrl: "data:application/pdf;base64," + base64Pdf
|
|
16774
|
+
},
|
|
16775
|
+
model: "mistral-ocr-latest",
|
|
16776
|
+
includeImageBase64: false
|
|
16777
|
+
});
|
|
16778
|
+
return ocrResponse2;
|
|
16779
|
+
}, 10);
|
|
16780
|
+
const parser = new import_liteparse.LiteParse();
|
|
16781
|
+
const screenshots = await parser.screenshot(paths.source, void 0);
|
|
16782
|
+
await fs2.promises.mkdir(paths.images, { recursive: true });
|
|
16783
|
+
for (const screenshot of screenshots) {
|
|
16784
|
+
await fs2.promises.writeFile(
|
|
16785
|
+
path.join(
|
|
16786
|
+
paths.images,
|
|
16787
|
+
`${screenshot.pageNum}.png`
|
|
16788
|
+
),
|
|
16789
|
+
screenshot.imageBuffer
|
|
16790
|
+
);
|
|
16791
|
+
screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
|
|
16792
|
+
}
|
|
16793
|
+
json = ocrResponse.pages.map((page) => ({
|
|
16794
|
+
page: page.index + 1,
|
|
16795
|
+
content: page.markdown,
|
|
16796
|
+
image: screenshots.find((s) => s.pageNum === page.index + 1)?.imagePath,
|
|
16797
|
+
headings: []
|
|
16798
|
+
}));
|
|
16799
|
+
fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
|
|
16800
|
+
} else if (config?.processor.name === "liteparse") {
|
|
16801
|
+
const parser = new import_liteparse.LiteParse();
|
|
16802
|
+
const result = await parser.parse(paths.source);
|
|
16803
|
+
const screenshots = await parser.screenshot(paths.source, void 0);
|
|
16804
|
+
console.log(`[EXULU] Liteparse screenshots: ${JSON.stringify(screenshots)}`);
|
|
16805
|
+
await fs2.promises.mkdir(paths.images, { recursive: true });
|
|
16806
|
+
for (const screenshot of screenshots) {
|
|
16807
|
+
await fs2.promises.writeFile(path.join(paths.images, `${screenshot.pageNum}.png`), screenshot.imageBuffer);
|
|
16808
|
+
screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
|
|
16809
|
+
}
|
|
16810
|
+
json = result.pages.map((page) => ({
|
|
16811
|
+
page: page.pageNum,
|
|
16812
|
+
content: page.text,
|
|
16813
|
+
image: screenshots.find((s) => s.pageNum === page.pageNum)?.imagePath
|
|
16814
|
+
}));
|
|
16815
|
+
fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
|
|
16403
16816
|
}
|
|
16404
16817
|
console.log(`[EXULU]
|
|
16405
16818
|
\u2713 Document processing completed successfully`);
|
|
16406
16819
|
console.log(`[EXULU] Total pages: ${json.length}`);
|
|
16407
16820
|
console.log(`[EXULU] Output file: ${paths.json}`);
|
|
16408
|
-
if (
|
|
16821
|
+
if (config?.vlm?.model) {
|
|
16409
16822
|
console.error("[EXULU] VLM validation is only supported when docling is enabled, skipping validation.");
|
|
16410
16823
|
}
|
|
16411
|
-
if (config?.
|
|
16824
|
+
if (config?.vlm?.model && json.length > 0) {
|
|
16412
16825
|
json = await validateWithVLM(
|
|
16413
16826
|
json,
|
|
16414
16827
|
config.vlm.model,
|
|
@@ -16436,29 +16849,37 @@ ${setupResult.output || ""}`);
|
|
|
16436
16849
|
"utf-8"
|
|
16437
16850
|
);
|
|
16438
16851
|
}
|
|
16439
|
-
const
|
|
16440
|
-
|
|
16441
|
-
|
|
16442
|
-
|
|
16443
|
-
|
|
16852
|
+
const markdownStream = fs2.createWriteStream(paths.markdown, { encoding: "utf-8" });
|
|
16853
|
+
for (let i = 0; i < json.length; i++) {
|
|
16854
|
+
const p = json[i];
|
|
16855
|
+
if (!p) continue;
|
|
16856
|
+
const content = p.vlm_corrected_text ?? p.content;
|
|
16857
|
+
markdownStream.write(content);
|
|
16858
|
+
if (i < json.length - 1) {
|
|
16859
|
+
markdownStream.write("\n\n\n<!-- END_OF_PAGE -->\n\n\n");
|
|
16444
16860
|
}
|
|
16445
|
-
}
|
|
16446
|
-
await
|
|
16447
|
-
|
|
16448
|
-
|
|
16449
|
-
|
|
16450
|
-
);
|
|
16861
|
+
}
|
|
16862
|
+
await new Promise((resolve3, reject) => {
|
|
16863
|
+
markdownStream.end(() => resolve3());
|
|
16864
|
+
markdownStream.on("error", reject);
|
|
16865
|
+
});
|
|
16451
16866
|
console.log(`[EXULU] Validated output saved to: ${paths.json}`);
|
|
16452
16867
|
console.log(`[EXULU] Validated markdown saved to: ${paths.markdown}`);
|
|
16868
|
+
const markdown = await fs2.promises.readFile(paths.markdown, "utf-8");
|
|
16869
|
+
const processedJson = json.map((e) => {
|
|
16870
|
+
const finalContent = e.vlm_corrected_text ?? e.content;
|
|
16871
|
+
return {
|
|
16872
|
+
page: e.page,
|
|
16873
|
+
content: finalContent
|
|
16874
|
+
};
|
|
16875
|
+
});
|
|
16876
|
+
json.length = 0;
|
|
16877
|
+
json = [];
|
|
16878
|
+
const memUsage = process.memoryUsage();
|
|
16879
|
+
console.log(`[EXULU] Memory after document processing: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`);
|
|
16453
16880
|
return {
|
|
16454
16881
|
markdown,
|
|
16455
|
-
json:
|
|
16456
|
-
const finalContent = e.vlm_corrected_text || e.content;
|
|
16457
|
-
return {
|
|
16458
|
-
page: e.page,
|
|
16459
|
-
content: finalContent
|
|
16460
|
-
};
|
|
16461
|
-
})
|
|
16882
|
+
json: processedJson
|
|
16462
16883
|
};
|
|
16463
16884
|
} catch (error) {
|
|
16464
16885
|
console.error("[EXULU] Error processing document:", error);
|
|
@@ -16471,9 +16892,9 @@ var loadFile = async (file, name, tempDir) => {
|
|
|
16471
16892
|
if (!fileType) {
|
|
16472
16893
|
throw new Error("[EXULU] File name does not include extension, extension is required for document processing.");
|
|
16473
16894
|
}
|
|
16895
|
+
const UUID = (0, import_crypto.randomUUID)();
|
|
16474
16896
|
let buffer;
|
|
16475
16897
|
if (Buffer.isBuffer(file)) {
|
|
16476
|
-
const UUID = (0, import_crypto.randomUUID)();
|
|
16477
16898
|
filePath = path.join(tempDir, `${UUID}.${fileType}`);
|
|
16478
16899
|
await fs2.promises.writeFile(filePath, file);
|
|
16479
16900
|
buffer = file;
|
|
@@ -16482,7 +16903,10 @@ var loadFile = async (file, name, tempDir) => {
|
|
|
16482
16903
|
if (filePath.startsWith("http")) {
|
|
16483
16904
|
const response = await fetch(filePath);
|
|
16484
16905
|
const array = await response.arrayBuffer();
|
|
16906
|
+
const tempFilePath = path.join(tempDir, `${UUID}.${fileType}`);
|
|
16907
|
+
await fs2.promises.writeFile(tempFilePath, Buffer.from(array));
|
|
16485
16908
|
buffer = Buffer.from(array);
|
|
16909
|
+
filePath = tempFilePath;
|
|
16486
16910
|
} else {
|
|
16487
16911
|
buffer = await fs2.promises.readFile(file);
|
|
16488
16912
|
}
|
|
@@ -16500,17 +16924,34 @@ async function documentProcessor({
|
|
|
16500
16924
|
}
|
|
16501
16925
|
const uuid = (0, import_crypto.randomUUID)();
|
|
16502
16926
|
const tempDir = path.join(process.cwd(), "temp", uuid);
|
|
16927
|
+
const localFilesAndFoldersToDelete = [tempDir];
|
|
16503
16928
|
console.log(`[EXULU] Temporary directory for processing document ${name}: ${tempDir}`);
|
|
16504
16929
|
await fs2.promises.mkdir(tempDir, { recursive: true });
|
|
16930
|
+
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
16931
|
+
await fs2.promises.writeFile(path.join(tempDir, "created_at.txt"), timestamp);
|
|
16505
16932
|
try {
|
|
16506
16933
|
const {
|
|
16507
16934
|
filePath,
|
|
16508
16935
|
fileType,
|
|
16509
16936
|
buffer
|
|
16510
16937
|
} = await loadFile(file, name, tempDir);
|
|
16511
|
-
|
|
16938
|
+
let supportedTypes = [];
|
|
16939
|
+
switch (config?.processor.name) {
|
|
16940
|
+
case "docling":
|
|
16941
|
+
supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
|
|
16942
|
+
break;
|
|
16943
|
+
case "officeparser":
|
|
16944
|
+
supportedTypes = [];
|
|
16945
|
+
break;
|
|
16946
|
+
case "liteparse":
|
|
16947
|
+
supportedTypes = ["pdf", "doc", "docx", "docm", "odt", "rtf", "ppt", "pptx", "pptm", "odp", "xls", "xlsx", "xlsm", "ods", "csv", "tsv"];
|
|
16948
|
+
break;
|
|
16949
|
+
case "mistral":
|
|
16950
|
+
supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
|
|
16951
|
+
break;
|
|
16952
|
+
}
|
|
16512
16953
|
if (!supportedTypes.includes(fileType)) {
|
|
16513
|
-
throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor.`);
|
|
16954
|
+
throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor, the ${config?.processor.name} processor only supports the following file types: ${supportedTypes.join(", ")}.`);
|
|
16514
16955
|
}
|
|
16515
16956
|
const { content } = await processDocument(
|
|
16516
16957
|
filePath,
|
|
@@ -16523,9 +16964,19 @@ async function documentProcessor({
|
|
|
16523
16964
|
return content.json;
|
|
16524
16965
|
} catch (error) {
|
|
16525
16966
|
console.error("Error during chunking:", error);
|
|
16526
|
-
|
|
16967
|
+
throw error;
|
|
16527
16968
|
} finally {
|
|
16528
|
-
|
|
16969
|
+
if (config?.debugging?.deleteTempFiles !== false) {
|
|
16970
|
+
for (const file2 of localFilesAndFoldersToDelete) {
|
|
16971
|
+
try {
|
|
16972
|
+
await fs2.promises.rm(file2, { recursive: true });
|
|
16973
|
+
console.log(`[EXULU] Deleted file or folder: ${file2}`);
|
|
16974
|
+
} catch (error) {
|
|
16975
|
+
console.error(`[EXULU] Error deleting file or folder: ${file2}`, error);
|
|
16976
|
+
console.log(`[EXULU] File or folder still exists: ${file2}`);
|
|
16977
|
+
}
|
|
16978
|
+
}
|
|
16979
|
+
}
|
|
16529
16980
|
}
|
|
16530
16981
|
}
|
|
16531
16982
|
|