@exulu/backend 1.49.2 → 1.51.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/setup-python.cjs +140 -0
- package/dist/index.cjs +561 -119
- package/dist/index.d.cts +16 -3
- package/dist/index.d.ts +16 -3
- package/dist/index.js +564 -122
- package/ee/chunking/markdown.ts +83 -5
- package/ee/python/documents/processing/doc_processor.ts +380 -84
- package/ee/workers.ts +214 -18
- package/package.json +8 -1
package/dist/index.js
CHANGED
|
@@ -147,17 +147,19 @@ async function postgresClient() {
|
|
|
147
147
|
// 30 minutes
|
|
148
148
|
},
|
|
149
149
|
pool: {
|
|
150
|
-
min:
|
|
151
|
-
//
|
|
152
|
-
max:
|
|
153
|
-
// Increased
|
|
154
|
-
acquireTimeoutMillis:
|
|
155
|
-
//
|
|
150
|
+
min: 10,
|
|
151
|
+
// Minimum connections always ready
|
|
152
|
+
max: 300,
|
|
153
|
+
// Increased to support high worker concurrency (250+ concurrent jobs)
|
|
154
|
+
acquireTimeoutMillis: 12e4,
|
|
155
|
+
// 2 minutes - increased to handle high contention during bursts
|
|
156
156
|
createTimeoutMillis: 3e4,
|
|
157
157
|
idleTimeoutMillis: 6e4,
|
|
158
|
-
//
|
|
158
|
+
// Keep connections alive for reuse
|
|
159
159
|
reapIntervalMillis: 1e3,
|
|
160
160
|
createRetryIntervalMillis: 200,
|
|
161
|
+
// Keep propagateCreateError disabled so failed connection creation is retried until acquireTimeoutMillis elapses instead of failing the acquire immediately
|
|
162
|
+
propagateCreateError: false,
|
|
161
163
|
// Log pool events to help debug connection issues
|
|
162
164
|
afterCreate: (conn, done) => {
|
|
163
165
|
console.log("[EXULU] New database connection created");
|
|
@@ -3577,7 +3579,7 @@ var convertExuluToolsToAiSdkTools = async (currentTools, approvedTools, allExulu
|
|
|
3577
3579
|
description,
|
|
3578
3580
|
// The approvedTools array uses the tool.name lookup as the frontend
|
|
3579
3581
|
// Vercel AI SDK uses the sanitized tool name as the key, so this matches.
|
|
3580
|
-
needsApproval: approvedTools?.includes("tool-" + cur.name) ? false : true,
|
|
3582
|
+
needsApproval: approvedTools?.includes("tool-" + cur.name) || !cur.needsApproval ? false : true,
|
|
3581
3583
|
// todo make configurable
|
|
3582
3584
|
async *execute(inputs, options) {
|
|
3583
3585
|
console.log(
|
|
@@ -3727,6 +3729,7 @@ var ExuluTool = class {
|
|
|
3727
3729
|
inputSchema;
|
|
3728
3730
|
type;
|
|
3729
3731
|
tool;
|
|
3732
|
+
needsApproval;
|
|
3730
3733
|
config;
|
|
3731
3734
|
constructor({
|
|
3732
3735
|
id,
|
|
@@ -3736,10 +3739,12 @@ var ExuluTool = class {
|
|
|
3736
3739
|
inputSchema,
|
|
3737
3740
|
type,
|
|
3738
3741
|
execute: execute2,
|
|
3739
|
-
config
|
|
3742
|
+
config,
|
|
3743
|
+
needsApproval
|
|
3740
3744
|
}) {
|
|
3741
3745
|
this.id = id;
|
|
3742
3746
|
this.config = config;
|
|
3747
|
+
this.needsApproval = needsApproval ?? true;
|
|
3743
3748
|
this.category = category || "default";
|
|
3744
3749
|
this.name = name;
|
|
3745
3750
|
this.description = description;
|
|
@@ -4146,11 +4151,30 @@ var ExuluContext2 = class {
|
|
|
4146
4151
|
);
|
|
4147
4152
|
await db2.from(getChunksTableName(this.id)).where({ source }).delete();
|
|
4148
4153
|
if (chunks?.length) {
|
|
4154
|
+
/**
 * Removes NUL characters (0x00) from a string.
 *
 * @param {string|null|undefined} str - Value to clean.
 * @returns {string} The string without NUL characters; "" for any falsy input.
 */
const sanitizeString = (str) => {
  if (!str) return "";
  return str.replace(/\0/g, "");
};
/**
 * Cleans a single metadata value, descending into arrays and plain objects so
 * that nested strings are sanitized too. Non-plain objects (Date, Buffer, …)
 * and primitives other than strings are returned untouched.
 *
 * @param {*} value - Metadata value of any type.
 * @returns {*} The value with NUL characters removed from every reachable string.
 */
const sanitizeMetadataValue = (value) => {
  if (typeof value === "string") return sanitizeString(value);
  if (Array.isArray(value)) return value.map(sanitizeMetadataValue);
  if (value !== null && typeof value === "object") {
    const proto = Object.getPrototypeOf(value);
    // Only rebuild plain objects; rebuilding e.g. a Date would turn it into {}.
    if (proto === Object.prototype || proto === null) {
      return Object.fromEntries(
        Object.entries(value).map(([key, nested]) => [key, sanitizeMetadataValue(nested)])
      );
    }
  }
  return value;
};
/**
 * Returns a copy of a chunk's metadata with NUL characters stripped from all
 * string values, including those nested inside objects and arrays.
 *
 * @param {object|null|undefined} metadata - Chunk metadata.
 * @returns {object} Sanitized copy; {} when no metadata is given.
 */
const sanitizeMetadata2 = (metadata) => {
  if (!metadata) return {};
  const sanitized = {};
  for (const [key, value] of Object.entries(metadata)) {
    sanitized[key] = sanitizeMetadataValue(value);
  }
  return sanitized;
};
|
|
4149
4170
|
await db2.from(getChunksTableName(this.id)).insert(
|
|
4150
4171
|
chunks.map((chunk) => ({
|
|
4151
|
-
source
|
|
4152
|
-
|
|
4153
|
-
|
|
4172
|
+
// Sanitize source to remove null bytes
|
|
4173
|
+
source: sanitizeString(source),
|
|
4174
|
+
// Sanitize metadata to remove null bytes from string values
|
|
4175
|
+
metadata: sanitizeMetadata2(chunk.metadata),
|
|
4176
|
+
// Remove null bytes (0x00) which are invalid in PostgreSQL UTF8 encoding
|
|
4177
|
+
content: sanitizeString(chunk.content),
|
|
4154
4178
|
chunk_index: chunk.index,
|
|
4155
4179
|
embedding: pgvector2.toSql(chunk.vector)
|
|
4156
4180
|
}))
|
|
@@ -4539,6 +4563,8 @@ var ExuluContext2 = class {
|
|
|
4539
4563
|
name: `${this.name}_context_search`,
|
|
4540
4564
|
type: "context",
|
|
4541
4565
|
category: "contexts",
|
|
4566
|
+
needsApproval: true,
|
|
4567
|
+
// todo make configurable
|
|
4542
4568
|
inputSchema: z4.object({
|
|
4543
4569
|
query: z4.string().describe("The original question that the user asked"),
|
|
4544
4570
|
keywords: z4.array(z4.string()).describe(
|
|
@@ -5732,7 +5758,7 @@ var finalizeRequestedFields = async ({
|
|
|
5732
5758
|
return result;
|
|
5733
5759
|
}
|
|
5734
5760
|
const { db: db2 } = await postgresClient();
|
|
5735
|
-
const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt");
|
|
5761
|
+
const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt", "metadata");
|
|
5736
5762
|
const chunks = await query;
|
|
5737
5763
|
result.chunks = chunks.map((chunk) => ({
|
|
5738
5764
|
chunk_content: chunk.content,
|
|
@@ -5745,7 +5771,8 @@ var finalizeRequestedFields = async ({
|
|
|
5745
5771
|
item_created_at: chunk.item_created_at,
|
|
5746
5772
|
item_id: chunk.item_id,
|
|
5747
5773
|
item_external_id: chunk.item_external_id,
|
|
5748
|
-
item_name: chunk.item_name
|
|
5774
|
+
item_name: chunk.item_name,
|
|
5775
|
+
chunk_metadata: chunk.metadata
|
|
5749
5776
|
}));
|
|
5750
5777
|
}
|
|
5751
5778
|
}
|
|
@@ -7119,6 +7146,36 @@ import "ai";
|
|
|
7119
7146
|
import CryptoJS4 from "crypto-js";
|
|
7120
7147
|
var redisConnection;
|
|
7121
7148
|
var unhandledRejectionHandlerInstalled = false;
|
|
7149
|
+
// Handle of the pool-monitoring timer; undefined while monitoring is not running.
var poolMonitoringInterval;
/**
 * Starts a periodic (every 30 seconds) health log of the database connection
 * pool: used, free and pending-acquire counts plus a used/(used+free) ratio.
 * Warns when more than 10 acquires are pending. Calling it again while the
 * timer is already running is a no-op.
 */
var startPoolMonitoring = () => {
  if (poolMonitoringInterval) return;
  poolMonitoringInterval = setInterval(async () => {
    try {
      const { db: db2 } = await postgresClient();
      const poolStats = db2.client.pool;
      if (poolStats) {
        const used = poolStats.numUsed?.() || 0;
        const free = poolStats.numFree?.() || 0;
        const pending = poolStats.numPendingAcquires?.() || 0;
        const total = used + free;
        console.log("[EXULU] Connection pool health check:", {
          used,
          free,
          pending,
          total,
          utilization: total > 0 ? `${Math.round(used / total * 100)}%` : "0%"
        });
        if (pending > 10) {
          console.warn(
            `[EXULU] WARNING: ${pending} jobs waiting for database connections. Consider increasing pool size or reducing worker concurrency.`
          );
        }
      }
    } catch (error) {
      // Monitoring is best-effort: log and try again on the next tick.
      console.error("[EXULU] Error checking pool health:", error);
    }
  }, 3e4);
  // This timer only produces diagnostics, so it must not be the thing that
  // keeps the Node.js event loop (and therefore the process) alive.
  poolMonitoringInterval.unref?.();
};
|
|
7122
7179
|
var installGlobalErrorHandlers = () => {
|
|
7123
7180
|
if (unhandledRejectionHandlerInstalled) return;
|
|
7124
7181
|
process.on("unhandledRejection", (reason) => {
|
|
@@ -7143,6 +7200,7 @@ var installGlobalErrorHandlers = () => {
|
|
|
7143
7200
|
unhandledRejectionHandlerInstalled = true;
|
|
7144
7201
|
console.log("[EXULU] Global error handlers installed to prevent worker crashes");
|
|
7145
7202
|
};
|
|
7203
|
+
var isShuttingDown = false;
|
|
7146
7204
|
var createWorkers = async (providers, queues2, config, contexts, rerankers, evals, tools, tracer) => {
|
|
7147
7205
|
console.log("[EXULU] creating workers for " + queues2?.length + " queues.");
|
|
7148
7206
|
console.log(
|
|
@@ -7150,7 +7208,8 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7150
7208
|
queues2.map((q) => q.queue.name)
|
|
7151
7209
|
);
|
|
7152
7210
|
installGlobalErrorHandlers();
|
|
7153
|
-
|
|
7211
|
+
startPoolMonitoring();
|
|
7212
|
+
process.setMaxListeners(Math.max(15, process.getMaxListeners()));
|
|
7154
7213
|
if (!redisServer.host || !redisServer.port) {
|
|
7155
7214
|
console.error(
|
|
7156
7215
|
"[EXULU] you are trying to start worker, but no redis server is configured in the environment."
|
|
@@ -7183,7 +7242,53 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7183
7242
|
status: await bullmqJob.getState(),
|
|
7184
7243
|
type: bullmqJob.data.type
|
|
7185
7244
|
});
|
|
7186
|
-
|
|
7245
|
+
let progressInterval;
|
|
7246
|
+
if (bullmqJob.data.type === "processor") {
|
|
7247
|
+
progressInterval = setInterval(async () => {
|
|
7248
|
+
try {
|
|
7249
|
+
await bullmqJob.updateProgress({
|
|
7250
|
+
status: "processing",
|
|
7251
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
7252
|
+
});
|
|
7253
|
+
console.log(`[EXULU] Job ${bullmqJob.id} heartbeat sent to prevent stalling`);
|
|
7254
|
+
} catch (error) {
|
|
7255
|
+
console.error(`[EXULU] Error updating job progress:`, error);
|
|
7256
|
+
}
|
|
7257
|
+
}, 25e3);
|
|
7258
|
+
}
|
|
7259
|
+
let db2;
|
|
7260
|
+
let retries = 3;
|
|
7261
|
+
let lastError;
|
|
7262
|
+
for (let attempt = 1; attempt <= retries; attempt++) {
|
|
7263
|
+
try {
|
|
7264
|
+
const client2 = await postgresClient();
|
|
7265
|
+
db2 = client2.db;
|
|
7266
|
+
const poolStats = db2.client.pool;
|
|
7267
|
+
if (poolStats) {
|
|
7268
|
+
console.log(`[EXULU] Connection pool stats for job ${bullmqJob.id}:`, {
|
|
7269
|
+
size: poolStats.numUsed?.() || 0,
|
|
7270
|
+
available: poolStats.numFree?.() || 0,
|
|
7271
|
+
pending: poolStats.numPendingAcquires?.() || 0
|
|
7272
|
+
});
|
|
7273
|
+
}
|
|
7274
|
+
break;
|
|
7275
|
+
} catch (error) {
|
|
7276
|
+
lastError = error instanceof Error ? error : new Error(String(error));
|
|
7277
|
+
console.error(
|
|
7278
|
+
`[EXULU] Failed to acquire database connection (attempt ${attempt}/${retries}) for job ${bullmqJob.id}:`,
|
|
7279
|
+
lastError.message
|
|
7280
|
+
);
|
|
7281
|
+
if (attempt < retries) {
|
|
7282
|
+
const backoffMs = 500 * Math.pow(2, attempt - 1);
|
|
7283
|
+
await new Promise((resolve3) => setTimeout(resolve3, backoffMs));
|
|
7284
|
+
}
|
|
7285
|
+
}
|
|
7286
|
+
}
|
|
7287
|
+
if (!db2) {
|
|
7288
|
+
throw new Error(
|
|
7289
|
+
`Failed to acquire database connection after ${retries} attempts: ${lastError?.message}`
|
|
7290
|
+
);
|
|
7291
|
+
}
|
|
7187
7292
|
const data = bullmqJob.data;
|
|
7188
7293
|
const timeoutInSeconds = data.timeoutInSeconds || queue.timeoutInSeconds || 600;
|
|
7189
7294
|
const timeoutMs = timeoutInSeconds * 1e3;
|
|
@@ -7275,7 +7380,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7275
7380
|
}
|
|
7276
7381
|
const exuluStorage = new ExuluStorage({ config });
|
|
7277
7382
|
console.log("[EXULU] POS 2 -- EXULU CONTEXT PROCESS FIELD");
|
|
7278
|
-
|
|
7383
|
+
let processorResult = await context.processor.execute({
|
|
7279
7384
|
item: data.inputs,
|
|
7280
7385
|
user: data.user,
|
|
7281
7386
|
role: data.role,
|
|
@@ -7290,12 +7395,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7290
7395
|
);
|
|
7291
7396
|
}
|
|
7292
7397
|
delete processorResult.field;
|
|
7398
|
+
const updateData = { ...processorResult };
|
|
7293
7399
|
await db2.from(getTableName(context.id)).where({
|
|
7294
7400
|
id: processorResult.id
|
|
7295
7401
|
}).update({
|
|
7296
|
-
...
|
|
7402
|
+
...updateData,
|
|
7297
7403
|
last_processed_at: (/* @__PURE__ */ new Date()).toISOString()
|
|
7298
7404
|
});
|
|
7405
|
+
Object.keys(updateData).forEach((key) => {
|
|
7406
|
+
delete updateData[key];
|
|
7407
|
+
});
|
|
7299
7408
|
let jobs = [];
|
|
7300
7409
|
if (context.processor?.config?.generateEmbeddings) {
|
|
7301
7410
|
const fullItem = await db2.from(getTableName(context.id)).where({
|
|
@@ -7317,12 +7426,18 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7317
7426
|
jobs.push(embeddingsJob);
|
|
7318
7427
|
}
|
|
7319
7428
|
}
|
|
7320
|
-
|
|
7321
|
-
result: processorResult,
|
|
7429
|
+
const result = {
|
|
7430
|
+
result: { id: processorResult.id },
|
|
7322
7431
|
metadata: {
|
|
7323
7432
|
jobs: jobs.length > 0 ? jobs.join(",") : void 0
|
|
7324
7433
|
}
|
|
7325
7434
|
};
|
|
7435
|
+
processorResult = null;
|
|
7436
|
+
const memUsage = process.memoryUsage();
|
|
7437
|
+
console.log(
|
|
7438
|
+
`[EXULU] Memory after processor job ${bullmqJob.id}: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`
|
|
7439
|
+
);
|
|
7440
|
+
return result;
|
|
7326
7441
|
}
|
|
7327
7442
|
if (data.type === "workflow") {
|
|
7328
7443
|
console.log("[EXULU] running a workflow job.", bullmqJob.name);
|
|
@@ -7341,10 +7456,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7341
7456
|
user,
|
|
7342
7457
|
messages: inputMessages
|
|
7343
7458
|
} = await validateWorkflowPayload(data, providers);
|
|
7344
|
-
const
|
|
7459
|
+
const retries2 = 3;
|
|
7345
7460
|
let attempts = 0;
|
|
7346
7461
|
const promise = new Promise(async (resolve3, reject) => {
|
|
7347
|
-
while (attempts <
|
|
7462
|
+
while (attempts < retries2) {
|
|
7348
7463
|
try {
|
|
7349
7464
|
const messages2 = await processUiMessagesFlow({
|
|
7350
7465
|
providers,
|
|
@@ -7366,7 +7481,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7366
7481
|
error instanceof Error ? error.message : String(error)
|
|
7367
7482
|
);
|
|
7368
7483
|
attempts++;
|
|
7369
|
-
if (attempts >=
|
|
7484
|
+
if (attempts >= retries2) {
|
|
7370
7485
|
reject(new Error(error instanceof Error ? error.message : String(error)));
|
|
7371
7486
|
}
|
|
7372
7487
|
// Wait 2 seconds before the next attempt. The timer must call the promise's own
// resolver: setTimeout passes no arguments to its callback, so resolving through
// a callback parameter would throw and leave this await pending forever.
await new Promise((resolve4) => setTimeout(() => resolve4(true), 2e3));
|
|
@@ -7417,10 +7532,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7417
7532
|
testCase,
|
|
7418
7533
|
messages: inputMessages
|
|
7419
7534
|
} = await validateEvalPayload(data, providers);
|
|
7420
|
-
const
|
|
7535
|
+
const retries2 = 3;
|
|
7421
7536
|
let attempts = 0;
|
|
7422
7537
|
const promise = new Promise(async (resolve3, reject) => {
|
|
7423
|
-
while (attempts <
|
|
7538
|
+
while (attempts < retries2) {
|
|
7424
7539
|
try {
|
|
7425
7540
|
const messages2 = await processUiMessagesFlow({
|
|
7426
7541
|
providers,
|
|
@@ -7441,7 +7556,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7441
7556
|
error instanceof Error ? error.message : String(error)
|
|
7442
7557
|
);
|
|
7443
7558
|
attempts++;
|
|
7444
|
-
if (attempts >=
|
|
7559
|
+
if (attempts >= retries2) {
|
|
7445
7560
|
reject(new Error(error instanceof Error ? error.message : String(error)));
|
|
7446
7561
|
}
|
|
7447
7562
|
// Wait 2 seconds before the next attempt. The timer must call the promise's own
// resolver: setTimeout passes no arguments to its callback, so resolving through
// a callback parameter would throw and leave this await pending forever.
await new Promise((resolve4) => setTimeout(() => resolve4(true), 2e3));
|
|
@@ -7690,9 +7805,15 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7690
7805
|
try {
|
|
7691
7806
|
const result = await Promise.race([workPromise, timeoutPromise]);
|
|
7692
7807
|
clearTimeout(timeoutHandle);
|
|
7808
|
+
if (progressInterval) {
|
|
7809
|
+
clearInterval(progressInterval);
|
|
7810
|
+
}
|
|
7693
7811
|
return result;
|
|
7694
7812
|
} catch (error) {
|
|
7695
7813
|
clearTimeout(timeoutHandle);
|
|
7814
|
+
if (progressInterval) {
|
|
7815
|
+
clearInterval(progressInterval);
|
|
7816
|
+
}
|
|
7696
7817
|
console.error(
|
|
7697
7818
|
`[EXULU] job ${bullmqJob.id} failed (error caught in race handler).`,
|
|
7698
7819
|
error instanceof Error ? error.message : String(error)
|
|
@@ -7706,6 +7827,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7706
7827
|
concurrency: queue.concurrency?.worker || 1,
|
|
7707
7828
|
removeOnComplete: { count: 1e3 },
|
|
7708
7829
|
removeOnFail: { count: 5e3 },
|
|
7830
|
+
// Configure settings for long-running jobs (especially processor jobs)
|
|
7831
|
+
// lockDuration: How long a worker can hold a job before it's considered stalled
|
|
7832
|
+
// Set to 5 minutes to accommodate CPU-intensive operations
|
|
7833
|
+
lockDuration: 3e5,
|
|
7834
|
+
// 5 minutes in milliseconds
|
|
7835
|
+
// stalledInterval: How often to check for stalled jobs
|
|
7836
|
+
// Set to 2 minutes to reduce false positives for long-running operations
|
|
7837
|
+
stalledInterval: 12e4,
|
|
7838
|
+
// 2 minutes in milliseconds
|
|
7839
|
+
maxStalledCount: 1,
|
|
7709
7840
|
...queue.ratelimit && {
|
|
7710
7841
|
limiter: {
|
|
7711
7842
|
max: queue.ratelimit,
|
|
@@ -7742,24 +7873,68 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
|
|
|
7742
7873
|
error: error instanceof Error ? error.message : String(error)
|
|
7743
7874
|
} : error
|
|
7744
7875
|
);
|
|
7876
|
+
throw error;
|
|
7745
7877
|
});
|
|
7746
7878
|
worker.on("error", (error) => {
|
|
7747
7879
|
console.error(`[EXULU] worker error.`, error);
|
|
7880
|
+
throw error;
|
|
7748
7881
|
});
|
|
7749
7882
|
worker.on("progress", (job, progress) => {
|
|
7750
7883
|
console.log(`[EXULU] job progress ${job.id}.`, job.name, {
|
|
7751
7884
|
progress
|
|
7752
7885
|
});
|
|
7753
7886
|
});
|
|
7754
|
-
const gracefulShutdown = async (signal) => {
|
|
7755
|
-
console.log(`Received ${signal}, closing server...`);
|
|
7756
|
-
await worker.close();
|
|
7757
|
-
process.exit(0);
|
|
7758
|
-
};
|
|
7759
|
-
process.on("SIGINT", () => gracefulShutdown("SIGINT"));
|
|
7760
|
-
process.on("SIGTERM", () => gracefulShutdown("SIGTERM"));
|
|
7761
7887
|
return worker;
|
|
7762
7888
|
});
|
|
7889
|
+
const gracefulShutdown = async (signal) => {
|
|
7890
|
+
if (isShuttingDown) {
|
|
7891
|
+
console.log(`[EXULU] Shutdown already in progress, ignoring additional ${signal}`);
|
|
7892
|
+
return;
|
|
7893
|
+
}
|
|
7894
|
+
isShuttingDown = true;
|
|
7895
|
+
console.log(`[EXULU] Received ${signal}, shutting down gracefully...`);
|
|
7896
|
+
try {
|
|
7897
|
+
if (poolMonitoringInterval) {
|
|
7898
|
+
clearInterval(poolMonitoringInterval);
|
|
7899
|
+
poolMonitoringInterval = void 0;
|
|
7900
|
+
}
|
|
7901
|
+
console.log(`[EXULU] Closing ${workers.length} worker(s)...`);
|
|
7902
|
+
const closePromises = workers.map(async (worker, index) => {
|
|
7903
|
+
try {
|
|
7904
|
+
await Promise.race([
|
|
7905
|
+
worker.close(),
|
|
7906
|
+
new Promise(
|
|
7907
|
+
(_, reject) => setTimeout(() => reject(new Error("Worker close timeout")), 3e4)
|
|
7908
|
+
)
|
|
7909
|
+
]);
|
|
7910
|
+
console.log(`[EXULU] Worker ${index + 1} closed successfully`);
|
|
7911
|
+
} catch (error) {
|
|
7912
|
+
console.error(`[EXULU] Error closing worker ${index + 1}:`, error);
|
|
7913
|
+
}
|
|
7914
|
+
});
|
|
7915
|
+
await Promise.allSettled(closePromises);
|
|
7916
|
+
if (redisConnection) {
|
|
7917
|
+
console.log(`[EXULU] Closing Redis connection...`);
|
|
7918
|
+
await redisConnection.quit();
|
|
7919
|
+
}
|
|
7920
|
+
try {
|
|
7921
|
+
const { db: db2 } = await postgresClient();
|
|
7922
|
+
if (db2?.client) {
|
|
7923
|
+
console.log(`[EXULU] Closing database connection pool...`);
|
|
7924
|
+
await db2.client.destroy();
|
|
7925
|
+
}
|
|
7926
|
+
} catch (error) {
|
|
7927
|
+
console.error(`[EXULU] Error closing database:`, error);
|
|
7928
|
+
}
|
|
7929
|
+
console.log(`[EXULU] Graceful shutdown complete`);
|
|
7930
|
+
process.exit(0);
|
|
7931
|
+
} catch (error) {
|
|
7932
|
+
console.error(`[EXULU] Error during graceful shutdown:`, error);
|
|
7933
|
+
process.exit(1);
|
|
7934
|
+
}
|
|
7935
|
+
};
|
|
7936
|
+
process.once("SIGINT", () => gracefulShutdown("SIGINT"));
|
|
7937
|
+
process.once("SIGTERM", () => gracefulShutdown("SIGTERM"));
|
|
7763
7938
|
return workers;
|
|
7764
7939
|
};
|
|
7765
7940
|
var validateWorkflowPayload = async (data, providers) => {
|
|
@@ -9473,6 +9648,7 @@ type AgentEvalFunctionConfig {
|
|
|
9473
9648
|
|
|
9474
9649
|
type ItemChunks {
|
|
9475
9650
|
chunk_id: String!
|
|
9651
|
+
chunk_metadata: JSON!
|
|
9476
9652
|
chunk_index: Int!
|
|
9477
9653
|
chunk_content: String!
|
|
9478
9654
|
chunk_source: String!
|
|
@@ -9691,7 +9867,7 @@ import cookieParser from "cookie-parser";
|
|
|
9691
9867
|
import { z as z7 } from "zod";
|
|
9692
9868
|
import {
|
|
9693
9869
|
convertToModelMessages,
|
|
9694
|
-
|
|
9870
|
+
Output as Output2,
|
|
9695
9871
|
generateText as generateText2,
|
|
9696
9872
|
streamText,
|
|
9697
9873
|
validateUIMessages,
|
|
@@ -9886,7 +10062,7 @@ var ExuluProvider = class {
|
|
|
9886
10062
|
prompt: z7.string().describe("The prompt (usually a question for the agent) to send to the agent."),
|
|
9887
10063
|
information: z7.string().describe("A summary of relevant context / information from the current session")
|
|
9888
10064
|
}),
|
|
9889
|
-
description: `This tool calls an
|
|
10065
|
+
description: `This tool calls an agent named: ${agent.name}. The agent does the following: ${agent.description}.`,
|
|
9890
10066
|
config: [],
|
|
9891
10067
|
execute: async ({ prompt, information, user, allExuluTools }) => {
|
|
9892
10068
|
const hasAccessToAgent = await checkRecordAccess(agent, "read", user);
|
|
@@ -9999,9 +10175,6 @@ var ExuluProvider = class {
|
|
|
9999
10175
|
if (!prompt && !inputMessages?.length) {
|
|
10000
10176
|
throw new Error("Prompt or message is required for generating.");
|
|
10001
10177
|
}
|
|
10002
|
-
if (outputSchema && !prompt) {
|
|
10003
|
-
throw new Error("Prompt is required for generating with an output schema.");
|
|
10004
|
-
}
|
|
10005
10178
|
const model = this.model.create({
|
|
10006
10179
|
...providerapikey ? { apiKey: providerapikey } : {}
|
|
10007
10180
|
});
|
|
@@ -10138,14 +10311,18 @@ var ExuluProvider = class {
|
|
|
10138
10311
|
let inputTokens = 0;
|
|
10139
10312
|
let outputTokens = 0;
|
|
10140
10313
|
if (outputSchema) {
|
|
10141
|
-
const {
|
|
10314
|
+
const { output, usage } = await generateText2({
|
|
10142
10315
|
model,
|
|
10143
10316
|
system,
|
|
10144
|
-
prompt,
|
|
10145
10317
|
maxRetries: 3,
|
|
10146
|
-
|
|
10318
|
+
output: Output2.object({
|
|
10319
|
+
schema: outputSchema
|
|
10320
|
+
}),
|
|
10321
|
+
prompt,
|
|
10322
|
+
stopWhen: [stepCountIs2(5)]
|
|
10323
|
+
// make configurable
|
|
10147
10324
|
});
|
|
10148
|
-
result.object =
|
|
10325
|
+
result.object = output;
|
|
10149
10326
|
inputTokens = usage.inputTokens || 0;
|
|
10150
10327
|
outputTokens = usage.outputTokens || 0;
|
|
10151
10328
|
} else {
|
|
@@ -10176,6 +10353,7 @@ var ExuluProvider = class {
|
|
|
10176
10353
|
agent
|
|
10177
10354
|
),
|
|
10178
10355
|
stopWhen: [stepCountIs2(5)]
|
|
10356
|
+
// make configurable
|
|
10179
10357
|
});
|
|
10180
10358
|
result.text = text;
|
|
10181
10359
|
inputTokens = totalUsage?.inputTokens || 0;
|
|
@@ -10650,6 +10828,7 @@ var providerRateLimiter = async (key, windowSeconds, limit, points) => {
|
|
|
10650
10828
|
};
|
|
10651
10829
|
|
|
10652
10830
|
// src/exulu/routes.ts
|
|
10831
|
+
import { convertJsonSchemaToZod } from "zod-from-json-schema";
|
|
10653
10832
|
var REQUEST_SIZE_LIMIT = "50mb";
|
|
10654
10833
|
var getExuluVersionNumber = async () => {
|
|
10655
10834
|
try {
|
|
@@ -11070,6 +11249,16 @@ Mood: friendly and intelligent.
|
|
|
11070
11249
|
providers,
|
|
11071
11250
|
user
|
|
11072
11251
|
);
|
|
11252
|
+
if (req.body.outputSchema && !!headers.stream) {
|
|
11253
|
+
throw new Error("Providing a outputSchema in the POST body is not allowed when using the streaming API, set 'stream' to false in the headers when defining a response schema.");
|
|
11254
|
+
}
|
|
11255
|
+
let outputSchema;
|
|
11256
|
+
if (req.body.outputSchema) {
|
|
11257
|
+
if (typeof req.body.outputSchema === "string") {
|
|
11258
|
+
req.body.outputSchema = JSON.parse(req.body.outputSchema);
|
|
11259
|
+
}
|
|
11260
|
+
outputSchema = convertJsonSchemaToZod(req.body.outputSchema);
|
|
11261
|
+
}
|
|
11073
11262
|
let providerapikey;
|
|
11074
11263
|
const variableName = agent.providerapikey;
|
|
11075
11264
|
if (variableName) {
|
|
@@ -11216,6 +11405,7 @@ Mood: friendly and intelligent.
|
|
|
11216
11405
|
const response = await provider.generateSync({
|
|
11217
11406
|
contexts,
|
|
11218
11407
|
rerankers: rerankers || [],
|
|
11408
|
+
outputSchema,
|
|
11219
11409
|
agent,
|
|
11220
11410
|
user,
|
|
11221
11411
|
req,
|
|
@@ -15451,6 +15641,22 @@ var MarkdownChunker = class {
|
|
|
15451
15641
|
});
|
|
15452
15642
|
return result;
|
|
15453
15643
|
}
|
|
15644
|
+
/**
|
|
15645
|
+
* Checks if a position in the text falls within a <diagram> tag.
|
|
15646
|
+
* Returns the adjusted position (before the diagram) if inside a diagram, otherwise returns the original position.
|
|
15647
|
+
*/
|
|
15648
|
+
adjustForDiagramTags(text, position) {
|
|
15649
|
+
const diagramRegex = /<diagram>[\s\S]*?<\/diagram>/gi;
|
|
15650
|
+
let match;
|
|
15651
|
+
while ((match = diagramRegex.exec(text)) !== null) {
|
|
15652
|
+
const diagramStart = match.index;
|
|
15653
|
+
const diagramEnd = match.index + match[0].length;
|
|
15654
|
+
if (position > diagramStart && position < diagramEnd) {
|
|
15655
|
+
return diagramStart;
|
|
15656
|
+
}
|
|
15657
|
+
}
|
|
15658
|
+
return position;
|
|
15659
|
+
}
|
|
15454
15660
|
/**
|
|
15455
15661
|
* Find the nearest logical breakpoint working backwards from the end of the text.
|
|
15456
15662
|
* Logical breakpoints are prioritized as follows:
|
|
@@ -15462,6 +15668,7 @@ var MarkdownChunker = class {
|
|
|
15462
15668
|
*
|
|
15463
15669
|
* Only considers breakpoints in the last 50% of the text to avoid creating very small chunks.
|
|
15464
15670
|
* Returns the position of the breakpoint, or null if none found
|
|
15671
|
+
* IMPORTANT: Never splits content within <diagram> tags
|
|
15465
15672
|
*/
|
|
15466
15673
|
findLogicalBreakpoint(text) {
|
|
15467
15674
|
if (text.length === 0) return null;
|
|
@@ -15481,7 +15688,7 @@ var MarkdownChunker = class {
|
|
|
15481
15688
|
}
|
|
15482
15689
|
}
|
|
15483
15690
|
if (lastHeaderPosition > 0) {
|
|
15484
|
-
return lastHeaderPosition;
|
|
15691
|
+
return this.adjustForDiagramTags(text, lastHeaderPosition);
|
|
15485
15692
|
}
|
|
15486
15693
|
let lastParagraphBreak = -1;
|
|
15487
15694
|
let searchPos = text.length;
|
|
@@ -15494,11 +15701,12 @@ var MarkdownChunker = class {
|
|
|
15494
15701
|
searchPos = pos;
|
|
15495
15702
|
}
|
|
15496
15703
|
if (lastParagraphBreak > 0) {
|
|
15497
|
-
|
|
15704
|
+
const adjusted = this.adjustForDiagramTags(text, lastParagraphBreak + 2);
|
|
15705
|
+
return adjusted;
|
|
15498
15706
|
}
|
|
15499
15707
|
const newlineIndex = text.lastIndexOf("\n");
|
|
15500
15708
|
if (newlineIndex >= minPosition) {
|
|
15501
|
-
return newlineIndex + 1;
|
|
15709
|
+
return this.adjustForDiagramTags(text, newlineIndex + 1);
|
|
15502
15710
|
}
|
|
15503
15711
|
const sentenceEndRegex = /[.!?](?:\s|$)/g;
|
|
15504
15712
|
let lastSentenceEnd = -1;
|
|
@@ -15508,13 +15716,13 @@ var MarkdownChunker = class {
|
|
|
15508
15716
|
}
|
|
15509
15717
|
}
|
|
15510
15718
|
if (lastSentenceEnd > 0) {
|
|
15511
|
-
return lastSentenceEnd;
|
|
15719
|
+
return this.adjustForDiagramTags(text, lastSentenceEnd);
|
|
15512
15720
|
}
|
|
15513
15721
|
let lastSpace = text.length;
|
|
15514
15722
|
while (lastSpace > minPosition) {
|
|
15515
15723
|
const pos = text.lastIndexOf(" ", lastSpace - 1);
|
|
15516
15724
|
if (pos >= minPosition) {
|
|
15517
|
-
return pos + 1;
|
|
15725
|
+
return this.adjustForDiagramTags(text, pos + 1);
|
|
15518
15726
|
}
|
|
15519
15727
|
lastSpace = pos;
|
|
15520
15728
|
}
|
|
@@ -15646,6 +15854,38 @@ var MarkdownChunker = class {
|
|
|
15646
15854
|
targetPosition = currentPosition + decoded.length;
|
|
15647
15855
|
}
|
|
15648
15856
|
}
|
|
15857
|
+
const diagramCheck = /<diagram>/gi;
|
|
15858
|
+
const diagramCloseCheck = /<\/diagram>/gi;
|
|
15859
|
+
let openDiagramsInSlice = 0;
|
|
15860
|
+
while (diagramCheck.exec(currentSlice) !== null) {
|
|
15861
|
+
openDiagramsInSlice++;
|
|
15862
|
+
}
|
|
15863
|
+
let closeDiagramsInSlice = 0;
|
|
15864
|
+
while (diagramCloseCheck.exec(currentSlice) !== null) {
|
|
15865
|
+
closeDiagramsInSlice++;
|
|
15866
|
+
}
|
|
15867
|
+
if (openDiagramsInSlice > closeDiagramsInSlice) {
|
|
15868
|
+
const lastDiagramOpenIndex = currentSlice.lastIndexOf("<diagram>");
|
|
15869
|
+
if (lastDiagramOpenIndex !== -1) {
|
|
15870
|
+
const remainingText = text.slice(currentPosition + lastDiagramOpenIndex);
|
|
15871
|
+
const closingTagMatch = /<\/diagram>/i.exec(remainingText);
|
|
15872
|
+
if (closingTagMatch) {
|
|
15873
|
+
const closingTagPosition = lastDiagramOpenIndex + closingTagMatch.index + closingTagMatch[0].length;
|
|
15874
|
+
const extendedSlice = text.slice(currentPosition, currentPosition + closingTagPosition);
|
|
15875
|
+
const extendedTokens = tokenizer.encode(extendedSlice);
|
|
15876
|
+
if (extendedTokens.length <= adjustedChunkSize * 1.5) {
|
|
15877
|
+
currentSlice = extendedSlice;
|
|
15878
|
+
targetPosition = currentPosition + closingTagPosition;
|
|
15879
|
+
} else {
|
|
15880
|
+
currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
|
|
15881
|
+
targetPosition = currentPosition + lastDiagramOpenIndex;
|
|
15882
|
+
}
|
|
15883
|
+
} else {
|
|
15884
|
+
currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
|
|
15885
|
+
targetPosition = currentPosition + lastDiagramOpenIndex;
|
|
15886
|
+
}
|
|
15887
|
+
}
|
|
15888
|
+
}
|
|
15649
15889
|
const breakpointPosition = this.findLogicalBreakpoint(currentSlice);
|
|
15650
15890
|
if (breakpointPosition !== null) {
|
|
15651
15891
|
currentSlice = currentSlice.slice(0, breakpointPosition);
|
|
@@ -15917,7 +16157,7 @@ Or manually run the setup script:
|
|
|
15917
16157
|
// ee/python/documents/processing/doc_processor.ts
|
|
15918
16158
|
import * as fs2 from "fs";
|
|
15919
16159
|
import * as path from "path";
|
|
15920
|
-
import { generateText as generateText3, Output as
|
|
16160
|
+
import { generateText as generateText3, Output as Output3 } from "ai";
|
|
15921
16161
|
import { z as z12 } from "zod";
|
|
15922
16162
|
import pLimit from "p-limit";
|
|
15923
16163
|
import { randomUUID as randomUUID6 } from "crypto";
|
|
@@ -16067,6 +16307,8 @@ ${command}`;
|
|
|
16067
16307
|
}
|
|
16068
16308
|
|
|
16069
16309
|
// ee/python/documents/processing/doc_processor.ts
|
|
16310
|
+
import { LiteParse } from "@llamaindex/liteparse";
|
|
16311
|
+
import { Mistral } from "@mistralai/mistralai";
|
|
16070
16312
|
async function processDocx(file) {
|
|
16071
16313
|
const html = await mammoth.convertToHtml({ buffer: file });
|
|
16072
16314
|
const turndownService = new TurndownService();
|
|
@@ -16141,50 +16383,91 @@ async function validatePageWithVLM(page, imagePath, model) {
|
|
|
16141
16383
|
const imageBuffer = await fs2.promises.readFile(imagePath);
|
|
16142
16384
|
const imageBase64 = imageBuffer.toString("base64");
|
|
16143
16385
|
const mimeType = "image/png";
|
|
16144
|
-
const prompt = `You are
|
|
16145
|
-
|
|
16146
|
-
Here is the current OCR/parsed content for this page:
|
|
16386
|
+
const prompt = `You are a document validation assistant. Your task is to analyze a page image and correct the output of an OCR/parsing pipeline. The content may include tables, technical diagrams, schematics, and structured text.
|
|
16147
16387
|
|
|
16148
16388
|
---
|
|
16389
|
+
## CURRENT OCR OUTPUT
|
|
16390
|
+
|
|
16149
16391
|
${page.content}
|
|
16150
16392
|
---
|
|
16151
16393
|
|
|
16152
|
-
|
|
16153
|
-
|
|
16154
|
-
1. Check if the extracted markdown text accurately represents the content from the page, including:
|
|
16155
|
-
- Table data (rows, columns, headers, values)
|
|
16156
|
-
- Technical diagrams, schematics, control boards
|
|
16157
|
-
- Icons, checkmarks, symbols
|
|
16158
|
-
- Image captions and labels
|
|
16159
|
-
|
|
16160
|
-
2. If the page has significant errors or omissions, provide a corrected version for the page.
|
|
16394
|
+
## YOUR TASK
|
|
16161
16395
|
|
|
16162
|
-
|
|
16396
|
+
Compare the page image to the OCR output above. Identify errors, omissions, and formatting issues, then return a structured validation result (see OUTPUT FORMAT below).
|
|
16163
16397
|
|
|
16164
|
-
|
|
16165
|
-
|
|
16166
|
-
|
|
16167
|
-
|
|
16168
|
-
|
|
16398
|
+
---
|
|
16399
|
+
## VALIDATION CHECKLIST
|
|
16400
|
+
|
|
16401
|
+
Work through these checks in order:
|
|
16402
|
+
|
|
16403
|
+
### 1. Text Accuracy
|
|
16404
|
+
- Verify all text is correctly transcribed.
|
|
16405
|
+
- For minor character-level OCR errors (e.g. "\xF6" vs "\xFC", "rn" vs "m"), **prefer the original OCR output** unless you are certain of an error. Do not silently "fix" characters based on guesswork.
|
|
16406
|
+
|
|
16407
|
+
### 2. Heading Levels
|
|
16408
|
+
- Verify that headings use correct Markdown levels (#, ##, ###, ####, #####, ######).
|
|
16409
|
+
- Determine heading level using the following priority:
|
|
16410
|
+
1. **Hierarchical numbering** (strongest signal): e.g. "1" \u2192 #, "2.1" \u2192 ##, "2.1.1" \u2192 ###, "2.1.2.5" \u2192 ####
|
|
16411
|
+
2. Font size (larger = higher level)
|
|
16412
|
+
3. Indentation
|
|
16413
|
+
4. Bold/emphasis styling
|
|
16414
|
+
|
|
16415
|
+
### 3. Tables
|
|
16416
|
+
|
|
16417
|
+
**First, decide whether the table should be Markdown or plain text:**
|
|
16418
|
+
- Use **Markdown table format** if the table has a consistent, clear header structure and uniform column layout throughout.
|
|
16419
|
+
- Use **plain text structured description** if the table:
|
|
16420
|
+
- Lacks a clear header row
|
|
16421
|
+
- Uses mixed or irregular column structures across rows
|
|
16422
|
+
- Functions more like a certificate, form, or label layout
|
|
16423
|
+
|
|
16424
|
+
**If using Markdown format**, follow these rules strictly:
|
|
16425
|
+
- Every table must have: header row \u2192 separator row \u2192 data rows
|
|
16426
|
+
- Use simple separators only: \`| --- | --- |\` (NOT \`|---|---|\` or long dashes)
|
|
16427
|
+
- Example:
|
|
16428
|
+
\`\`\`
|
|
16169
16429
|
| Column 1 | Column 2 |
|
|
16170
16430
|
| --- | --- |
|
|
16171
|
-
| Data 1
|
|
16172
|
-
|
|
16173
|
-
-
|
|
16174
|
-
|
|
16175
|
-
|
|
16176
|
-
|
|
16177
|
-
|
|
16178
|
-
-
|
|
16179
|
-
|
|
16180
|
-
|
|
16431
|
+
| Data 1 | Data 2 |
|
|
16432
|
+
\`\`\`
|
|
16433
|
+
- Important: do not use the | character as part of the data inside a cell, this would break the table, if a cell contains a | character, use a capital I.
|
|
16434
|
+
|
|
16435
|
+
**Symbol translation rules for table cells:**
|
|
16436
|
+
- Black/filled dot \u2192 \`+\` (active); White/empty dot \u2192 \`-\` (inactive)
|
|
16437
|
+
*(e.g. Rufe-LED columns)*
|
|
16438
|
+
- Green or black checkmark \u2192 \`+\` (active); Red or black cross \u2192 \`-\` (inactive)
|
|
16439
|
+
|
|
16440
|
+
### 4. Multi-Page Table Continuity
|
|
16441
|
+
- If this page contains a table with a header row that runs to the bottom of the page (suggesting it may continue on the next page), extract the header row and include it in the \`current_page_table.headers\` field.
|
|
16442
|
+
- If this page contains a table WITHOUT a header row (suggesting it's a continuation from a previous page), set \`current_page_table.is_continuation\` to true and try to identify what the headers might be based on the data structure. Include your best guess for headers in \`current_page_table.headers\`.
|
|
16443
|
+
|
|
16444
|
+
### 5. Technical Diagrams & Schematics
|
|
16445
|
+
If the page contains a flow-chart, schematic, technical drawing or control board layout that is **absent or poorly described** in the OCR output do the following:
|
|
16446
|
+
- Open a <diagram> tag with the following content:
|
|
16447
|
+
<diagram>
|
|
16448
|
+
<description>
|
|
16449
|
+
Add a detailed description of the diagram here.
|
|
16450
|
+
</description>
|
|
16451
|
+
<mermaid>
|
|
16452
|
+
Add a mermaid diagram schema here that in detail describes the diagram.
|
|
16453
|
+
</mermaid>
|
|
16454
|
+
</diagram>
|
|
16455
|
+
|
|
16456
|
+
### 6. Captions, Icons & Symbols
|
|
16457
|
+
- Verify that image captions, labels, icons, and checkmarks are present and correctly transcribed.
|
|
16458
|
+
|
|
16459
|
+
### 7. Only populate \`corrected_text\` when \`needs_correction\` is true. If the OCR output is accurate, return \`needs_correction: false\` and \`corrected_content: null\`.
|
|
16181
16460
|
`;
|
|
16182
16461
|
const result = await generateText3({
|
|
16183
16462
|
model,
|
|
16184
|
-
output:
|
|
16463
|
+
output: Output3.object({
|
|
16185
16464
|
schema: z12.object({
|
|
16186
16465
|
needs_correction: z12.boolean(),
|
|
16187
16466
|
corrected_text: z12.string().nullable(),
|
|
16467
|
+
current_page_table: z12.object({
|
|
16468
|
+
headers: z12.array(z12.string()),
|
|
16469
|
+
is_continuation: z12.boolean()
|
|
16470
|
+
}).nullable(),
|
|
16188
16471
|
confidence: z12.enum(["high", "medium", "low"]),
|
|
16189
16472
|
reasoning: z12.string()
|
|
16190
16473
|
})
|
|
@@ -16207,23 +16490,80 @@ Specific notes and guidelines:
|
|
|
16207
16490
|
needs_correction: parsedOutput.needs_correction,
|
|
16208
16491
|
corrected_text: parsedOutput.corrected_text || void 0,
|
|
16209
16492
|
confidence: parsedOutput.confidence,
|
|
16493
|
+
current_page_table: parsedOutput.current_page_table || void 0,
|
|
16210
16494
|
reasoning: parsedOutput.reasoning
|
|
16211
16495
|
};
|
|
16212
16496
|
return validation;
|
|
16213
16497
|
}
|
|
16498
|
+
function reconstructTableHeaders(document, validationResults, verbose = false) {
|
|
16499
|
+
let lastTableHeaders = void 0;
|
|
16500
|
+
for (const page of document) {
|
|
16501
|
+
const validation = validationResults.get(page.page);
|
|
16502
|
+
if (!validation) continue;
|
|
16503
|
+
const tableInfo = validation.current_page_table;
|
|
16504
|
+
if (tableInfo && tableInfo.headers.length > 0) {
|
|
16505
|
+
if (tableInfo.is_continuation && lastTableHeaders) {
|
|
16506
|
+
if (verbose) {
|
|
16507
|
+
console.log(`[EXULU] Page ${page.page}: Reconstructing table headers from previous page`);
|
|
16508
|
+
console.log(`[EXULU] Previous headers: ${lastTableHeaders.join(" | ")}`);
|
|
16509
|
+
}
|
|
16510
|
+
const contentToModify = page.vlm_corrected_text || page.content;
|
|
16511
|
+
const lines = contentToModify.split("\n");
|
|
16512
|
+
const firstTableLineIndex = lines.findIndex((line) => line.trim().startsWith("|"));
|
|
16513
|
+
if (firstTableLineIndex !== -1) {
|
|
16514
|
+
const headerRow = `| ${lastTableHeaders.join(" | ")} |`;
|
|
16515
|
+
const separatorRow = `| ${lastTableHeaders.map(() => "---").join(" | ")} |`;
|
|
16516
|
+
lines.splice(firstTableLineIndex, 0, headerRow, separatorRow);
|
|
16517
|
+
const reconstructedContent = lines.join("\n");
|
|
16518
|
+
if (page.vlm_corrected_text) {
|
|
16519
|
+
page.vlm_corrected_text = reconstructedContent;
|
|
16520
|
+
} else {
|
|
16521
|
+
page.content = reconstructedContent;
|
|
16522
|
+
}
|
|
16523
|
+
if (verbose) {
|
|
16524
|
+
console.log(`[EXULU] Page ${page.page}: Added table headers successfully`);
|
|
16525
|
+
}
|
|
16526
|
+
}
|
|
16527
|
+
if (!tableInfo.is_continuation) {
|
|
16528
|
+
lastTableHeaders = tableInfo.headers;
|
|
16529
|
+
}
|
|
16530
|
+
} else {
|
|
16531
|
+
lastTableHeaders = tableInfo.headers;
|
|
16532
|
+
if (verbose) {
|
|
16533
|
+
console.log(`[EXULU] Page ${page.page}: Storing table headers for potential continuation`);
|
|
16534
|
+
console.log(`[EXULU] Headers: ${lastTableHeaders.join(" | ")}`);
|
|
16535
|
+
}
|
|
16536
|
+
}
|
|
16537
|
+
} else {
|
|
16538
|
+
lastTableHeaders = void 0;
|
|
16539
|
+
}
|
|
16540
|
+
}
|
|
16541
|
+
}
|
|
16214
16542
|
async function validateWithVLM(document, model, verbose = false, concurrency = 10) {
|
|
16215
16543
|
console.log(`[EXULU] Starting VLM validation for docling output, ${document.length} pages...`);
|
|
16216
|
-
console.log(
|
|
16217
|
-
|
|
16218
|
-
);
|
|
16544
|
+
console.log(`[EXULU] Concurrency limit: ${concurrency}`);
|
|
16545
|
+
const limit = pLimit(concurrency);
|
|
16546
|
+
const validationResults = /* @__PURE__ */ new Map();
|
|
16219
16547
|
let validatedCount = 0;
|
|
16220
16548
|
let correctedCount = 0;
|
|
16221
|
-
const limit = pLimit(concurrency);
|
|
16222
16549
|
const validationTasks = document.map(
|
|
16223
16550
|
(page) => limit(async () => {
|
|
16551
|
+
await new Promise((resolve3) => setImmediate(resolve3));
|
|
16224
16552
|
const imagePath = page.image;
|
|
16553
|
+
if (!page.content) {
|
|
16554
|
+
console.warn(`[EXULU] Page ${page.page}: No content found, skipping validation`);
|
|
16555
|
+
return;
|
|
16556
|
+
}
|
|
16225
16557
|
if (!imagePath) {
|
|
16226
|
-
console.
|
|
16558
|
+
console.warn(`[EXULU] Page ${page.page}: No image found, skipping validation`);
|
|
16559
|
+
return;
|
|
16560
|
+
}
|
|
16561
|
+
const hasImage = page.content.match(/\.(jpeg|jpg|png|gif|webp)/i);
|
|
16562
|
+
const hasTable = (page.content.match(/\|/g)?.length || 0) > 1;
|
|
16563
|
+
if (!hasImage && !hasTable) {
|
|
16564
|
+
if (verbose) {
|
|
16565
|
+
console.log(`[EXULU] Page ${page.page}: No image or table found, SKIPPING VLM validation`);
|
|
16566
|
+
}
|
|
16227
16567
|
return;
|
|
16228
16568
|
}
|
|
16229
16569
|
let validation;
|
|
@@ -16231,6 +16571,13 @@ async function validateWithVLM(document, model, verbose = false, concurrency = 1
|
|
|
16231
16571
|
validation = await withRetry(async () => {
|
|
16232
16572
|
return await validatePageWithVLM(page, imagePath, model);
|
|
16233
16573
|
}, 3);
|
|
16574
|
+
validationResults.set(page.page, validation);
|
|
16575
|
+
if (verbose && validation.current_page_table) {
|
|
16576
|
+
console.log(`[EXULU] Page ${page.page} table info:`, {
|
|
16577
|
+
headers: validation.current_page_table.headers,
|
|
16578
|
+
is_continuation: validation.current_page_table.is_continuation
|
|
16579
|
+
});
|
|
16580
|
+
}
|
|
16234
16581
|
} catch (error) {
|
|
16235
16582
|
console.error(`[EXULU] Error validating page ${page.page} with VLM more than 3 times, skipping:`, error);
|
|
16236
16583
|
throw error;
|
|
@@ -16261,9 +16608,12 @@ async function validateWithVLM(document, model, verbose = false, concurrency = 1
|
|
|
16261
16608
|
})
|
|
16262
16609
|
);
|
|
16263
16610
|
await Promise.all(validationTasks);
|
|
16264
|
-
console.log(`[EXULU] VLM validation complete:`);
|
|
16265
|
-
console.log(`[EXULU] Validated: ${validatedCount}
|
|
16266
|
-
console.log(`[EXULU] Corrected: ${correctedCount}
|
|
16611
|
+
console.log(`[EXULU] VLM validation complete (parallel processing):`);
|
|
16612
|
+
console.log(`[EXULU] Validated: ${validatedCount} pages`);
|
|
16613
|
+
console.log(`[EXULU] Corrected: ${correctedCount} pages`);
|
|
16614
|
+
console.log(`[EXULU] Starting sequential table header reconstruction...`);
|
|
16615
|
+
reconstructTableHeaders(document, validationResults, verbose);
|
|
16616
|
+
console.log(`[EXULU] Table header reconstruction complete`);
|
|
16267
16617
|
return document;
|
|
16268
16618
|
}
|
|
16269
16619
|
async function processDocument(filePath, fileType, buffer, tempDir, config, verbose = false) {
|
|
@@ -16278,15 +16628,6 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
|
|
|
16278
16628
|
const stripped = filePath.split(".").pop()?.trim();
|
|
16279
16629
|
let result;
|
|
16280
16630
|
switch (stripped) {
|
|
16281
|
-
case "pdf":
|
|
16282
|
-
result = await processPdf(buffer, paths, config, verbose);
|
|
16283
|
-
break;
|
|
16284
|
-
case "docx":
|
|
16285
|
-
result = await processDocx(buffer);
|
|
16286
|
-
break;
|
|
16287
|
-
case "doc":
|
|
16288
|
-
result = await processWord(buffer);
|
|
16289
|
-
break;
|
|
16290
16631
|
case "txt":
|
|
16291
16632
|
case "md":
|
|
16292
16633
|
let content = buffer.toString();
|
|
@@ -16300,6 +16641,15 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
|
|
|
16300
16641
|
}]
|
|
16301
16642
|
};
|
|
16302
16643
|
break;
|
|
16644
|
+
case "pdf":
|
|
16645
|
+
result = await processPdf(buffer, paths, config, verbose);
|
|
16646
|
+
break;
|
|
16647
|
+
case "docx":
|
|
16648
|
+
result = await processDocx(buffer);
|
|
16649
|
+
break;
|
|
16650
|
+
case "doc":
|
|
16651
|
+
result = await processWord(buffer);
|
|
16652
|
+
break;
|
|
16303
16653
|
// Todo other file types with docx and officeparser
|
|
16304
16654
|
default:
|
|
16305
16655
|
throw new Error(`[EXULU] Unsupported file type: ${fileType}`);
|
|
@@ -16314,8 +16664,8 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
|
|
|
16314
16664
|
}
|
|
16315
16665
|
async function processPdf(buffer, paths, config, verbose = false) {
|
|
16316
16666
|
try {
|
|
16317
|
-
let json;
|
|
16318
|
-
if (config?.docling) {
|
|
16667
|
+
let json = [];
|
|
16668
|
+
if (config?.processor.name === "docling") {
|
|
16319
16669
|
console.log(`[EXULU] Validating Python environment...`);
|
|
16320
16670
|
const validation = await validatePythonEnvironment(void 0, true);
|
|
16321
16671
|
if (!validation.valid) {
|
|
@@ -16356,7 +16706,7 @@ ${setupResult.output || ""}`);
|
|
|
16356
16706
|
}
|
|
16357
16707
|
const jsonContent = await fs2.promises.readFile(paths.json, "utf-8");
|
|
16358
16708
|
json = JSON.parse(jsonContent);
|
|
16359
|
-
} else {
|
|
16709
|
+
} else if (config?.processor.name === "officeparser") {
|
|
16360
16710
|
const text = await parseOfficeAsync2(buffer, {
|
|
16361
16711
|
outputErrorToConsole: false,
|
|
16362
16712
|
newlineDelimiter: "\n"
|
|
@@ -16366,15 +16716,69 @@ ${setupResult.output || ""}`);
|
|
|
16366
16716
|
content: text,
|
|
16367
16717
|
headings: []
|
|
16368
16718
|
}];
|
|
16719
|
+
} else if (config?.processor.name === "mistral") {
|
|
16720
|
+
if (!process.env.MISTRAL_API_KEY) {
|
|
16721
|
+
throw new Error("[EXULU] MISTRAL_API_KEY is not set, please set it in the environment variables.");
|
|
16722
|
+
}
|
|
16723
|
+
await new Promise((resolve3) => setTimeout(resolve3, Math.floor(Math.random() * 4e3) + 1e3));
|
|
16724
|
+
const base64Pdf = buffer.toString("base64");
|
|
16725
|
+
const client2 = new Mistral({ apiKey: process.env.MISTRAL_API_KEY });
|
|
16726
|
+
const ocrResponse = await withRetry(async () => {
|
|
16727
|
+
const ocrResponse2 = await client2.ocr.process({
|
|
16728
|
+
document: {
|
|
16729
|
+
type: "document_url",
|
|
16730
|
+
documentUrl: "data:application/pdf;base64," + base64Pdf
|
|
16731
|
+
},
|
|
16732
|
+
model: "mistral-ocr-latest",
|
|
16733
|
+
includeImageBase64: false
|
|
16734
|
+
});
|
|
16735
|
+
return ocrResponse2;
|
|
16736
|
+
}, 10);
|
|
16737
|
+
const parser = new LiteParse();
|
|
16738
|
+
const screenshots = await parser.screenshot(paths.source, void 0);
|
|
16739
|
+
await fs2.promises.mkdir(paths.images, { recursive: true });
|
|
16740
|
+
for (const screenshot of screenshots) {
|
|
16741
|
+
await fs2.promises.writeFile(
|
|
16742
|
+
path.join(
|
|
16743
|
+
paths.images,
|
|
16744
|
+
`${screenshot.pageNum}.png`
|
|
16745
|
+
),
|
|
16746
|
+
screenshot.imageBuffer
|
|
16747
|
+
);
|
|
16748
|
+
screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
|
|
16749
|
+
}
|
|
16750
|
+
json = ocrResponse.pages.map((page) => ({
|
|
16751
|
+
page: page.index + 1,
|
|
16752
|
+
content: page.markdown,
|
|
16753
|
+
image: screenshots.find((s) => s.pageNum === page.index + 1)?.imagePath,
|
|
16754
|
+
headings: []
|
|
16755
|
+
}));
|
|
16756
|
+
fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
|
|
16757
|
+
} else if (config?.processor.name === "liteparse") {
|
|
16758
|
+
const parser = new LiteParse();
|
|
16759
|
+
const result = await parser.parse(paths.source);
|
|
16760
|
+
const screenshots = await parser.screenshot(paths.source, void 0);
|
|
16761
|
+
console.log(`[EXULU] Liteparse screenshots: ${JSON.stringify(screenshots)}`);
|
|
16762
|
+
await fs2.promises.mkdir(paths.images, { recursive: true });
|
|
16763
|
+
for (const screenshot of screenshots) {
|
|
16764
|
+
await fs2.promises.writeFile(path.join(paths.images, `${screenshot.pageNum}.png`), screenshot.imageBuffer);
|
|
16765
|
+
screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
|
|
16766
|
+
}
|
|
16767
|
+
json = result.pages.map((page) => ({
|
|
16768
|
+
page: page.pageNum,
|
|
16769
|
+
content: page.text,
|
|
16770
|
+
image: screenshots.find((s) => s.pageNum === page.pageNum)?.imagePath
|
|
16771
|
+
}));
|
|
16772
|
+
fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
|
|
16369
16773
|
}
|
|
16370
16774
|
console.log(`[EXULU]
|
|
16371
16775
|
\u2713 Document processing completed successfully`);
|
|
16372
16776
|
console.log(`[EXULU] Total pages: ${json.length}`);
|
|
16373
16777
|
console.log(`[EXULU] Output file: ${paths.json}`);
|
|
16374
|
-
if (
|
|
16778
|
+
if (config?.vlm?.model) {
|
|
16375
16779
|
console.error("[EXULU] VLM validation is only supported when docling is enabled, skipping validation.");
|
|
16376
16780
|
}
|
|
16377
|
-
if (config?.
|
|
16781
|
+
if (config?.vlm?.model && json.length > 0) {
|
|
16378
16782
|
json = await validateWithVLM(
|
|
16379
16783
|
json,
|
|
16380
16784
|
config.vlm.model,
|
|
@@ -16402,29 +16806,37 @@ ${setupResult.output || ""}`);
|
|
|
16402
16806
|
"utf-8"
|
|
16403
16807
|
);
|
|
16404
16808
|
}
|
|
16405
|
-
const
|
|
16406
|
-
|
|
16407
|
-
|
|
16408
|
-
|
|
16409
|
-
|
|
16809
|
+
const markdownStream = fs2.createWriteStream(paths.markdown, { encoding: "utf-8" });
|
|
16810
|
+
for (let i = 0; i < json.length; i++) {
|
|
16811
|
+
const p = json[i];
|
|
16812
|
+
if (!p) continue;
|
|
16813
|
+
const content = p.vlm_corrected_text ?? p.content;
|
|
16814
|
+
markdownStream.write(content);
|
|
16815
|
+
if (i < json.length - 1) {
|
|
16816
|
+
markdownStream.write("\n\n\n<!-- END_OF_PAGE -->\n\n\n");
|
|
16410
16817
|
}
|
|
16411
|
-
}
|
|
16412
|
-
await
|
|
16413
|
-
|
|
16414
|
-
|
|
16415
|
-
|
|
16416
|
-
);
|
|
16818
|
+
}
|
|
16819
|
+
await new Promise((resolve3, reject) => {
|
|
16820
|
+
markdownStream.end(() => resolve3());
|
|
16821
|
+
markdownStream.on("error", reject);
|
|
16822
|
+
});
|
|
16417
16823
|
console.log(`[EXULU] Validated output saved to: ${paths.json}`);
|
|
16418
16824
|
console.log(`[EXULU] Validated markdown saved to: ${paths.markdown}`);
|
|
16825
|
+
const markdown = await fs2.promises.readFile(paths.markdown, "utf-8");
|
|
16826
|
+
const processedJson = json.map((e) => {
|
|
16827
|
+
const finalContent = e.vlm_corrected_text ?? e.content;
|
|
16828
|
+
return {
|
|
16829
|
+
page: e.page,
|
|
16830
|
+
content: finalContent
|
|
16831
|
+
};
|
|
16832
|
+
});
|
|
16833
|
+
json.length = 0;
|
|
16834
|
+
json = [];
|
|
16835
|
+
const memUsage = process.memoryUsage();
|
|
16836
|
+
console.log(`[EXULU] Memory after document processing: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`);
|
|
16419
16837
|
return {
|
|
16420
16838
|
markdown,
|
|
16421
|
-
json:
|
|
16422
|
-
const finalContent = e.vlm_corrected_text || e.content;
|
|
16423
|
-
return {
|
|
16424
|
-
page: e.page,
|
|
16425
|
-
content: finalContent
|
|
16426
|
-
};
|
|
16427
|
-
})
|
|
16839
|
+
json: processedJson
|
|
16428
16840
|
};
|
|
16429
16841
|
} catch (error) {
|
|
16430
16842
|
console.error("[EXULU] Error processing document:", error);
|
|
@@ -16437,9 +16849,9 @@ var loadFile = async (file, name, tempDir) => {
|
|
|
16437
16849
|
if (!fileType) {
|
|
16438
16850
|
throw new Error("[EXULU] File name does not include extension, extension is required for document processing.");
|
|
16439
16851
|
}
|
|
16852
|
+
const UUID = randomUUID6();
|
|
16440
16853
|
let buffer;
|
|
16441
16854
|
if (Buffer.isBuffer(file)) {
|
|
16442
|
-
const UUID = randomUUID6();
|
|
16443
16855
|
filePath = path.join(tempDir, `${UUID}.${fileType}`);
|
|
16444
16856
|
await fs2.promises.writeFile(filePath, file);
|
|
16445
16857
|
buffer = file;
|
|
@@ -16448,7 +16860,10 @@ var loadFile = async (file, name, tempDir) => {
|
|
|
16448
16860
|
if (filePath.startsWith("http")) {
|
|
16449
16861
|
const response = await fetch(filePath);
|
|
16450
16862
|
const array = await response.arrayBuffer();
|
|
16863
|
+
const tempFilePath = path.join(tempDir, `${UUID}.${fileType}`);
|
|
16864
|
+
await fs2.promises.writeFile(tempFilePath, Buffer.from(array));
|
|
16451
16865
|
buffer = Buffer.from(array);
|
|
16866
|
+
filePath = tempFilePath;
|
|
16452
16867
|
} else {
|
|
16453
16868
|
buffer = await fs2.promises.readFile(file);
|
|
16454
16869
|
}
|
|
@@ -16466,17 +16881,34 @@ async function documentProcessor({
|
|
|
16466
16881
|
}
|
|
16467
16882
|
const uuid = randomUUID6();
|
|
16468
16883
|
const tempDir = path.join(process.cwd(), "temp", uuid);
|
|
16884
|
+
const localFilesAndFoldersToDelete = [tempDir];
|
|
16469
16885
|
console.log(`[EXULU] Temporary directory for processing document ${name}: ${tempDir}`);
|
|
16470
16886
|
await fs2.promises.mkdir(tempDir, { recursive: true });
|
|
16887
|
+
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
16888
|
+
await fs2.promises.writeFile(path.join(tempDir, "created_at.txt"), timestamp);
|
|
16471
16889
|
try {
|
|
16472
16890
|
const {
|
|
16473
16891
|
filePath,
|
|
16474
16892
|
fileType,
|
|
16475
16893
|
buffer
|
|
16476
16894
|
} = await loadFile(file, name, tempDir);
|
|
16477
|
-
|
|
16895
|
+
let supportedTypes = [];
|
|
16896
|
+
switch (config?.processor.name) {
|
|
16897
|
+
case "docling":
|
|
16898
|
+
supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
|
|
16899
|
+
break;
|
|
16900
|
+
case "officeparser":
|
|
16901
|
+
supportedTypes = [];
|
|
16902
|
+
break;
|
|
16903
|
+
case "liteparse":
|
|
16904
|
+
supportedTypes = ["pdf", "doc", "docx", "docm", "odt", "rtf", "ppt", "pptx", "pptm", "odp", "xls", "xlsx", "xlsm", "ods", "csv", "tsv"];
|
|
16905
|
+
break;
|
|
16906
|
+
case "mistral":
|
|
16907
|
+
supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
|
|
16908
|
+
break;
|
|
16909
|
+
}
|
|
16478
16910
|
if (!supportedTypes.includes(fileType)) {
|
|
16479
|
-
throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor.`);
|
|
16911
|
+
throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor, the ${config?.processor.name} processor only supports the following file types: ${supportedTypes.join(", ")}.`);
|
|
16480
16912
|
}
|
|
16481
16913
|
const { content } = await processDocument(
|
|
16482
16914
|
filePath,
|
|
@@ -16489,9 +16921,19 @@ async function documentProcessor({
|
|
|
16489
16921
|
return content.json;
|
|
16490
16922
|
} catch (error) {
|
|
16491
16923
|
console.error("Error during chunking:", error);
|
|
16492
|
-
|
|
16924
|
+
throw error;
|
|
16493
16925
|
} finally {
|
|
16494
|
-
|
|
16926
|
+
if (config?.debugging?.deleteTempFiles !== false) {
|
|
16927
|
+
for (const file2 of localFilesAndFoldersToDelete) {
|
|
16928
|
+
try {
|
|
16929
|
+
await fs2.promises.rm(file2, { recursive: true });
|
|
16930
|
+
console.log(`[EXULU] Deleted file or folder: ${file2}`);
|
|
16931
|
+
} catch (error) {
|
|
16932
|
+
console.error(`[EXULU] Error deleting file or folder: ${file2}`, error);
|
|
16933
|
+
console.log(`[EXULU] File or folder still exists: ${file2}`);
|
|
16934
|
+
}
|
|
16935
|
+
}
|
|
16936
|
+
}
|
|
16495
16937
|
}
|
|
16496
16938
|
}
|
|
16497
16939
|
|