comfyui-node 1.4.4 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/README.md +21 -16
  2. package/dist/.tsbuildinfo +1 -1
  3. package/dist/call-wrapper.d.ts +141 -124
  4. package/dist/call-wrapper.d.ts.map +1 -1
  5. package/dist/call-wrapper.js +353 -64
  6. package/dist/call-wrapper.js.map +1 -1
  7. package/dist/client.d.ts +290 -290
  8. package/dist/client.d.ts.map +1 -1
  9. package/dist/client.js +78 -19
  10. package/dist/client.js.map +1 -1
  11. package/dist/index.d.ts +3 -2
  12. package/dist/index.d.ts.map +1 -1
  13. package/dist/index.js +1 -1
  14. package/dist/index.js.map +1 -1
  15. package/dist/pool/SmartPool.d.ts +144 -0
  16. package/dist/pool/SmartPool.d.ts.map +1 -0
  17. package/dist/pool/SmartPool.js +677 -0
  18. package/dist/pool/SmartPool.js.map +1 -0
  19. package/dist/pool/SmartPoolV2.d.ts +120 -0
  20. package/dist/pool/SmartPoolV2.d.ts.map +1 -0
  21. package/dist/pool/SmartPoolV2.js +587 -0
  22. package/dist/pool/SmartPoolV2.js.map +1 -0
  23. package/dist/pool/WorkflowPool.d.ts +32 -2
  24. package/dist/pool/WorkflowPool.d.ts.map +1 -1
  25. package/dist/pool/WorkflowPool.js +298 -66
  26. package/dist/pool/WorkflowPool.js.map +1 -1
  27. package/dist/pool/client/ClientManager.d.ts +4 -2
  28. package/dist/pool/client/ClientManager.d.ts.map +1 -1
  29. package/dist/pool/client/ClientManager.js +29 -9
  30. package/dist/pool/client/ClientManager.js.map +1 -1
  31. package/dist/pool/index.d.ts +2 -0
  32. package/dist/pool/index.d.ts.map +1 -1
  33. package/dist/pool/index.js +2 -0
  34. package/dist/pool/index.js.map +1 -1
  35. package/dist/pool/queue/QueueAdapter.d.ts +32 -30
  36. package/dist/pool/queue/QueueAdapter.d.ts.map +1 -1
  37. package/dist/pool/queue/adapters/memory.d.ts +22 -20
  38. package/dist/pool/queue/adapters/memory.d.ts.map +1 -1
  39. package/dist/pool/queue/adapters/memory.js +14 -2
  40. package/dist/pool/queue/adapters/memory.js.map +1 -1
  41. package/dist/pool/types/affinity.d.ts +6 -0
  42. package/dist/pool/types/affinity.d.ts.map +1 -0
  43. package/dist/pool/types/affinity.js +2 -0
  44. package/dist/pool/types/affinity.js.map +1 -0
  45. package/dist/pool/types/job.d.ts.map +1 -1
  46. package/dist/pool/utils/failure-analysis.d.ts +14 -0
  47. package/dist/pool/utils/failure-analysis.d.ts.map +1 -0
  48. package/dist/pool/utils/failure-analysis.js +224 -0
  49. package/dist/pool/utils/failure-analysis.js.map +1 -0
  50. package/dist/pool.d.ts +180 -180
  51. package/dist/types/error.d.ts +31 -1
  52. package/dist/types/error.d.ts.map +1 -1
  53. package/dist/types/error.js +30 -0
  54. package/dist/types/error.js.map +1 -1
  55. package/dist/workflow.d.ts.map +1 -1
  56. package/dist/workflow.js +4 -1
  57. package/dist/workflow.js.map +1 -1
  58. package/package.json +4 -3
@@ -9,6 +9,8 @@ import { ClientManager } from "./client/ClientManager.js";
9
9
  import { hashWorkflow } from "./utils/hash.js";
10
10
  import { cloneDeep } from "./utils/clone.js";
11
11
  import { JobProfiler } from "./profiling/JobProfiler.js";
12
+ import { analyzeWorkflowFailure } from "./utils/failure-analysis.js";
13
+ import { WorkflowNotSupportedError } from "../types/error.js";
12
14
  const DEFAULT_MAX_ATTEMPTS = 3;
13
15
  const DEFAULT_RETRY_DELAY = 1000;
14
16
  export class WorkflowPool extends TypedEventTarget {
@@ -17,9 +19,18 @@ export class WorkflowPool extends TypedEventTarget {
17
19
  clientManager;
18
20
  opts;
19
21
  jobStore = new Map();
22
+ jobFailureAnalysis = new Map();
23
+ affinities = new Map();
20
24
  initPromise;
21
25
  processing = false;
26
+ processQueued = false;
22
27
  activeJobs = new Map();
28
+ queueDebug = process.env.WORKFLOW_POOL_DEBUG === "1";
29
+ debugLog(...args) {
30
+ if (this.queueDebug) {
31
+ console.log(...args);
32
+ }
33
+ }
23
34
  constructor(clients, opts) {
24
35
  super();
25
36
  this.strategy = opts?.failoverStrategy ?? new SmartFailoverStrategy();
@@ -28,6 +39,11 @@ export class WorkflowPool extends TypedEventTarget {
28
39
  healthCheckIntervalMs: opts?.healthCheckIntervalMs ?? 30000
29
40
  });
30
41
  this.opts = opts ?? {};
42
+ if (opts?.workflowAffinities) {
43
+ for (const affinity of opts.workflowAffinities) {
44
+ this.affinities.set(affinity.workflowHash, affinity);
45
+ }
46
+ }
31
47
  this.clientManager.on("client:state", (ev) => {
32
48
  this.dispatchEvent(new CustomEvent("client:state", { detail: ev.detail }));
33
49
  });
@@ -51,6 +67,15 @@ export class WorkflowPool extends TypedEventTarget {
51
67
  async ready() {
52
68
  await this.initPromise;
53
69
  }
70
+ setAffinity(affinity) {
71
+ this.affinities.set(affinity.workflowHash, affinity);
72
+ }
73
+ removeAffinity(workflowHash) {
74
+ return this.affinities.delete(workflowHash);
75
+ }
76
+ getAffinities() {
77
+ return Array.from(this.affinities.values());
78
+ }
54
79
  async enqueue(workflowInput, options) {
55
80
  await this.ready();
56
81
  const workflowJson = this.normalizeWorkflow(workflowInput);
@@ -72,6 +97,13 @@ export class WorkflowPool extends TypedEventTarget {
72
97
  outputAliases: workflowInput.outputAliases ?? {}
73
98
  };
74
99
  }
100
+ const affinity = this.affinities.get(workflowHash);
101
+ const preferredClientIds = options?.preferredClientIds
102
+ ? [...options.preferredClientIds]
103
+ : (affinity?.preferredClientIds ? [...affinity.preferredClientIds] : []);
104
+ const excludeClientIds = options?.excludeClientIds
105
+ ? [...options.excludeClientIds]
106
+ : (affinity?.excludeClientIds ? [...affinity.excludeClientIds] : []);
75
107
  const payload = {
76
108
  jobId,
77
109
  workflow: workflowJson,
@@ -83,14 +115,20 @@ export class WorkflowPool extends TypedEventTarget {
83
115
  maxAttempts: options?.maxAttempts ?? DEFAULT_MAX_ATTEMPTS,
84
116
  retryDelayMs: options?.retryDelayMs ?? DEFAULT_RETRY_DELAY,
85
117
  priority: options?.priority ?? 0,
86
- preferredClientIds: options?.preferredClientIds ?? [],
87
- excludeClientIds: options?.excludeClientIds ?? [],
118
+ preferredClientIds: preferredClientIds,
119
+ excludeClientIds: excludeClientIds,
88
120
  metadata: options?.metadata ?? {},
89
121
  includeOutputs: options?.includeOutputs ?? []
90
122
  }
91
123
  };
92
124
  const record = {
93
125
  ...payload,
126
+ options: {
127
+ ...payload.options,
128
+ preferredClientIds: payload.options.preferredClientIds ? [...payload.options.preferredClientIds] : [],
129
+ excludeClientIds: payload.options.excludeClientIds ? [...payload.options.excludeClientIds] : [],
130
+ includeOutputs: payload.options.includeOutputs ? [...payload.options.includeOutputs] : []
131
+ },
94
132
  attachments: options?.attachments,
95
133
  status: "queued"
96
134
  };
@@ -113,6 +151,7 @@ export class WorkflowPool extends TypedEventTarget {
113
151
  if (removed) {
114
152
  record.status = "cancelled";
115
153
  record.completedAt = Date.now();
154
+ this.clearJobFailures(jobId);
116
155
  this.dispatchEvent(new CustomEvent("job:cancelled", { detail: { job: record } }));
117
156
  return true;
118
157
  }
@@ -122,6 +161,7 @@ export class WorkflowPool extends TypedEventTarget {
122
161
  await active.cancel();
123
162
  record.status = "cancelled";
124
163
  record.completedAt = Date.now();
164
+ this.clearJobFailures(jobId);
125
165
  this.dispatchEvent(new CustomEvent("job:cancelled", { detail: { job: record } }));
126
166
  return true;
127
167
  }
@@ -159,7 +199,7 @@ export class WorkflowPool extends TypedEventTarget {
159
199
  }
160
200
  }
161
201
  static fallbackId() {
162
- return (globalThis.crypto && "randomUUID" in globalThis.crypto)
202
+ return globalThis.crypto && "randomUUID" in globalThis.crypto
163
203
  ? globalThis.crypto.randomUUID()
164
204
  : `job_${Math.random().toString(36).slice(2, 10)}`;
165
205
  }
@@ -185,44 +225,215 @@ export class WorkflowPool extends TypedEventTarget {
185
225
  }
186
226
  return autoSeeds;
187
227
  }
228
+ rememberJobFailure(job, clientId, analysis) {
229
+ let map = this.jobFailureAnalysis.get(job.jobId);
230
+ if (!map) {
231
+ map = new Map();
232
+ this.jobFailureAnalysis.set(job.jobId, map);
233
+ }
234
+ map.set(clientId, analysis);
235
+ }
236
+ clearJobFailures(jobId) {
237
+ this.jobFailureAnalysis.delete(jobId);
238
+ }
239
+ collectFailureReasons(jobId) {
240
+ const map = this.jobFailureAnalysis.get(jobId);
241
+ if (!map) {
242
+ return {};
243
+ }
244
+ const reasons = {};
245
+ for (const [clientId, analysis] of map.entries()) {
246
+ reasons[clientId] = analysis.reason;
247
+ }
248
+ return reasons;
249
+ }
250
+ addPermanentExclusion(job, clientId) {
251
+ if (!job.options.excludeClientIds) {
252
+ job.options.excludeClientIds = [];
253
+ }
254
+ if (!job.options.excludeClientIds.includes(clientId)) {
255
+ job.options.excludeClientIds.push(clientId);
256
+ }
257
+ }
258
+ hasRetryPath(job) {
259
+ const map = this.jobFailureAnalysis.get(job.jobId);
260
+ const exclude = new Set(job.options.excludeClientIds ?? []);
261
+ const preferred = job.options.preferredClientIds?.length ? new Set(job.options.preferredClientIds) : null;
262
+ for (const client of this.clientManager.list()) {
263
+ if (preferred && !preferred.has(client.id)) {
264
+ continue;
265
+ }
266
+ if (exclude.has(client.id)) {
267
+ continue;
268
+ }
269
+ const analysis = map?.get(client.id);
270
+ if (analysis?.blockClient === "permanent") {
271
+ continue;
272
+ }
273
+ return true;
274
+ }
275
+ return false;
276
+ }
277
+ createWorkflowNotSupportedError(job, cause) {
278
+ const reasons = this.collectFailureReasons(job.jobId);
279
+ const message = `Workflow ${job.workflowHash} is not supported by any connected clients`;
280
+ return new WorkflowNotSupportedError(message, {
281
+ workflowHash: job.workflowHash,
282
+ reasons,
283
+ cause
284
+ });
285
+ }
188
286
  async processQueue() {
287
+ this.debugLog("[processQueue] Called");
189
288
  if (this.processing) {
289
+ this.debugLog("[processQueue] Already processing, returning early");
290
+ this.processQueued = true;
190
291
  return;
191
292
  }
192
293
  this.processing = true;
193
294
  try {
295
+ // Continue processing until no more jobs can be assigned
296
+ let iteration = 0;
194
297
  while (true) {
195
- const reservation = await this.queue.reserve();
196
- if (!reservation) {
197
- break;
298
+ iteration++;
299
+ this.debugLog(`[processQueue] Iteration ${iteration}`);
300
+ const idleClients = this.clientManager.list().filter(c => this.clientManager.isClientStable(c));
301
+ this.debugLog(`[processQueue] Idle clients: [${idleClients.map(c => c.id).join(", ")}] (${idleClients.length})`);
302
+ if (!idleClients.length) {
303
+ this.debugLog("[processQueue] No idle clients, breaking");
304
+ break; // No idle clients available
198
305
  }
199
- const job = this.jobStore.get(reservation.payload.jobId);
200
- if (!job) {
201
- await this.queue.commit(reservation.reservationId);
202
- continue;
306
+ const waitingJobs = await this.queue.peek(100); // Peek at top 100 jobs
307
+ this.debugLog(`[processQueue] Waiting jobs in queue: ${waitingJobs.length}`);
308
+ if (!waitingJobs.length) {
309
+ this.debugLog("[processQueue] No waiting jobs, breaking");
310
+ break; // No jobs in queue
203
311
  }
204
- const lease = this.clientManager.claim(job);
205
- if (!lease) {
206
- await this.queue.retry(reservation.reservationId, { delayMs: job.options.retryDelayMs });
207
- this.scheduleProcess(job.options.retryDelayMs);
208
- break;
312
+ const leasedClientIds = new Set();
313
+ const reservedJobIds = new Set();
314
+ const jobMatchInfos = [];
315
+ for (const jobPayload of waitingJobs) {
316
+ const job = this.jobStore.get(jobPayload.jobId);
317
+ if (!job) {
318
+ this.debugLog(`[processQueue] Job ${jobPayload.jobId} not in jobStore, skipping`);
319
+ continue;
320
+ }
321
+ const compatibleClients = idleClients
322
+ .filter(client => {
323
+ const canRun = this.clientManager.canClientRunJob(client, job);
324
+ if (!canRun) {
325
+ this.debugLog(`[processQueue] Job ${job.jobId.substring(0, 8)}... NOT compatible with ${client.id}. Checking why...`);
326
+ this.debugLog(`[processQueue] - preferredClientIds: ${JSON.stringify(job.options.preferredClientIds)}`);
327
+ this.debugLog(`[processQueue] - excludeClientIds: ${JSON.stringify(job.options.excludeClientIds)}`);
328
+ this.debugLog(`[processQueue] - client.id: ${client.id}`);
329
+ }
330
+ return canRun;
331
+ })
332
+ .map(client => client.id);
333
+ this.debugLog(`[processQueue] Job ${job.jobId.substring(0, 8)}... compatible with: [${compatibleClients.join(", ")}] (selectivity=${compatibleClients.length})`);
334
+ if (compatibleClients.length > 0) {
335
+ jobMatchInfos.push({
336
+ jobPayload,
337
+ job,
338
+ compatibleClients,
339
+ selectivity: compatibleClients.length
340
+ });
341
+ }
209
342
  }
210
- this.runJob({ reservation, job, clientId: lease.clientId, release: lease.release }).catch((error) => {
211
- console.error("[WorkflowPool] Unhandled job error", error);
343
+ this.debugLog(`[processQueue] Found ${jobMatchInfos.length} compatible job matches`);
344
+ if (jobMatchInfos.length === 0) {
345
+ this.debugLog("[processQueue] No compatible jobs for idle clients, breaking");
346
+ break; // No compatible jobs for idle clients
347
+ }
348
+ // Sort jobs by priority first, then selectivity, to maximize throughput
349
+ // 1. Higher priority jobs execute first (explicit user priority)
350
+ // 2. More selective jobs (fewer compatible clients) assigned first within same priority
351
+ // 3. Earlier queue position as final tiebreaker
352
+ jobMatchInfos.sort((a, b) => {
353
+ // Primary: priority (higher priority = higher precedence)
354
+ const aPriority = a.job.options.priority ?? 0;
355
+ const bPriority = b.job.options.priority ?? 0;
356
+ if (aPriority !== bPriority) {
357
+ return bPriority - aPriority; // Higher priority first
358
+ }
359
+ // Secondary: selectivity (fewer compatible clients = higher precedence)
360
+ if (a.selectivity !== b.selectivity) {
361
+ return a.selectivity - b.selectivity;
362
+ }
363
+ // Tertiary: maintain queue order (earlier jobs first)
364
+ const aIndex = waitingJobs.indexOf(a.jobPayload);
365
+ const bIndex = waitingJobs.indexOf(b.jobPayload);
366
+ return aIndex - bIndex;
212
367
  });
368
+ // Assign jobs to clients using the selectivity-based ordering
369
+ let assignedAnyJob = false;
370
+ for (const matchInfo of jobMatchInfos) {
371
+ if (reservedJobIds.has(matchInfo.job.jobId))
372
+ continue;
373
+ // Find first available compatible client
374
+ const availableClient = matchInfo.compatibleClients.find(clientId => !leasedClientIds.has(clientId));
375
+ if (!availableClient) {
376
+ this.debugLog(`[processQueue] No available client for job ${matchInfo.job.jobId.substring(0, 8)}...`);
377
+ continue; // No available clients for this job
378
+ }
379
+ this.debugLog(`[processQueue] Reserving job ${matchInfo.job.jobId.substring(0, 8)}... for client ${availableClient}`);
380
+ const reservation = await this.queue.reserveById(matchInfo.job.jobId);
381
+ if (reservation) {
382
+ // Mark as leased/reserved for this cycle
383
+ leasedClientIds.add(availableClient);
384
+ reservedJobIds.add(matchInfo.job.jobId);
385
+ assignedAnyJob = true;
386
+ // Get the lease (which marks the client as busy)
387
+ const lease = this.clientManager.claim(matchInfo.job, availableClient);
388
+ if (lease) {
389
+ this.debugLog(`[processQueue] Starting job ${matchInfo.job.jobId.substring(0, 8)}... on client ${availableClient}`);
390
+ this.runJob({ reservation, job: matchInfo.job, clientId: lease.clientId, release: lease.release }).catch((error) => {
391
+ console.error("[WorkflowPool] Unhandled job error", error);
392
+ });
393
+ }
394
+ else {
395
+ // This should not happen since we checked canClientRunJob, but handle defensively
396
+ console.error(`[processQueue.processQueue] CRITICAL: Failed to claim client ${availableClient} for job ${matchInfo.job.jobId} after successful check.`);
397
+ await this.queue.retry(reservation.reservationId, { delayMs: matchInfo.job.options.retryDelayMs });
398
+ }
399
+ }
400
+ else {
401
+ this.debugLog(`[processQueue] Failed to reserve job ${matchInfo.job.jobId.substring(0, 8)}...`);
402
+ }
403
+ }
404
+ this.debugLog(`[processQueue] Assigned any job in this iteration: ${assignedAnyJob}`);
405
+ // If we didn't assign any jobs this iteration, no point continuing
406
+ if (!assignedAnyJob) {
407
+ this.debugLog("[processQueue] No jobs assigned, breaking");
408
+ break;
409
+ }
213
410
  }
214
411
  }
215
412
  finally {
413
+ this.debugLog("[processQueue] Exiting, setting processing = false");
216
414
  this.processing = false;
415
+ if (this.processQueued) {
416
+ this.debugLog("[processQueue] Pending rerun detected, draining queue again");
417
+ this.processQueued = false;
418
+ void this.processQueue();
419
+ }
217
420
  }
218
421
  }
219
422
  async runJob(ctx) {
220
423
  const { reservation, job, clientId, release } = ctx;
424
+ let released = false;
425
+ const safeRelease = (opts) => {
426
+ if (released) {
427
+ return;
428
+ }
429
+ released = true;
430
+ release(opts);
431
+ };
221
432
  const managed = this.clientManager.getClient(clientId);
222
433
  const client = managed?.client;
223
434
  if (!client) {
224
435
  await this.queue.retry(reservation.reservationId, { delayMs: job.options.retryDelayMs });
225
- release({ success: false });
436
+ safeRelease({ success: false });
226
437
  return;
227
438
  }
228
439
  job.status = "running";
@@ -257,9 +468,9 @@ export class WorkflowPool extends TypedEventTarget {
257
468
  // Use stored metadata if available (from Workflow instance), otherwise extract from recreated instance
258
469
  const outputNodeIds = reservation.payload.workflowMeta?.outputNodeIds ??
259
470
  wfInstance.outputNodeIds ??
260
- job.options.includeOutputs ?? [];
261
- const outputAliases = reservation.payload.workflowMeta?.outputAliases ??
262
- wfInstance.outputAliases ?? {};
471
+ job.options.includeOutputs ??
472
+ [];
473
+ const outputAliases = reservation.payload.workflowMeta?.outputAliases ?? wfInstance.outputAliases ?? {};
263
474
  let promptBuilder = new PromptBuilder(wfInstance.json, wfInstance.inputPaths ?? [], outputNodeIds);
264
475
  for (const nodeId of outputNodeIds) {
265
476
  const alias = outputAliases[nodeId] ?? nodeId;
@@ -267,9 +478,7 @@ export class WorkflowPool extends TypedEventTarget {
267
478
  }
268
479
  const wrapper = new CallWrapper(client, promptBuilder);
269
480
  // Setup profiling if enabled
270
- const profiler = this.opts.enableProfiling
271
- ? new JobProfiler(job.enqueuedAt, workflowPayload)
272
- : undefined;
481
+ const profiler = this.opts.enableProfiling ? new JobProfiler(job.enqueuedAt, workflowPayload) : undefined;
273
482
  // Setup node execution timeout tracking
274
483
  const nodeExecutionTimeout = this.opts.nodeExecutionTimeoutMs ?? 300000; // 5 minutes default
275
484
  let nodeTimeoutId;
@@ -285,9 +494,10 @@ export class WorkflowPool extends TypedEventTarget {
285
494
  currentExecutingNode = nodeName || null;
286
495
  nodeTimeoutId = setTimeout(() => {
287
496
  const elapsed = Date.now() - (lastNodeStartTime || 0);
288
- const nodeInfo = currentExecutingNode ? ` (node: ${currentExecutingNode})` : '';
289
- rejectCompletion?.(new Error(`Node execution timeout: took longer than ${nodeExecutionTimeout}ms${nodeInfo}. ` +
290
- `Actual time: ${elapsed}ms. Server may be stuck or node is too slow for configured timeout.`));
497
+ const nodeInfo = currentExecutingNode ? ` (node: ${currentExecutingNode})` : "";
498
+ completionError = new Error(`Node execution timeout: took longer than ${nodeExecutionTimeout}ms${nodeInfo}. ` +
499
+ `Actual time: ${elapsed}ms. Server may be stuck or node is too slow for configured timeout.`);
500
+ resolveCompletion?.();
291
501
  }, nodeExecutionTimeout);
292
502
  }
293
503
  };
@@ -326,20 +536,20 @@ export class WorkflowPool extends TypedEventTarget {
326
536
  const onExecutionError = (event) => {
327
537
  const detail = event.detail || {};
328
538
  if (detail.node !== undefined) {
329
- profiler.onNodeError(String(detail.node), detail.exception_message || 'Execution error');
539
+ profiler.onNodeError(String(detail.node), detail.exception_message || "Execution error");
330
540
  }
331
541
  };
332
542
  // Attach listeners to client
333
- client.addEventListener('execution_start', onExecutionStart);
334
- client.addEventListener('execution_cached', onExecutionCached);
335
- client.addEventListener('executing', onExecuting);
336
- client.addEventListener('execution_error', onExecutionError);
543
+ client.addEventListener("execution_start", onExecutionStart);
544
+ client.addEventListener("execution_cached", onExecutionCached);
545
+ client.addEventListener("executing", onExecuting);
546
+ client.addEventListener("execution_error", onExecutionError);
337
547
  // Cleanup function to remove listeners
338
548
  const cleanupProfiler = () => {
339
- client.removeEventListener('execution_start', onExecutionStart);
340
- client.removeEventListener('execution_cached', onExecutionCached);
341
- client.removeEventListener('executing', onExecuting);
342
- client.removeEventListener('execution_error', onExecutionError);
549
+ client.removeEventListener("execution_start", onExecutionStart);
550
+ client.removeEventListener("execution_cached", onExecutionCached);
551
+ client.removeEventListener("executing", onExecuting);
552
+ client.removeEventListener("execution_error", onExecutionError);
343
553
  };
344
554
  // Ensure cleanup happens when job finishes
345
555
  wrapper.onFinished(() => cleanupProfiler());
@@ -365,19 +575,19 @@ export class WorkflowPool extends TypedEventTarget {
365
575
  };
366
576
  const onExecutionStarted = (event) => {
367
577
  // Execution started - reset timeout for first node
368
- resetNodeTimeout('execution_start');
578
+ resetNodeTimeout("execution_start");
369
579
  };
370
580
  if (nodeExecutionTimeout > 0) {
371
- client.addEventListener('execution_start', onExecutionStarted);
372
- client.addEventListener('executing', onNodeExecuting);
373
- client.addEventListener('progress', onNodeProgress);
581
+ client.addEventListener("execution_start", onExecutionStarted);
582
+ client.addEventListener("executing", onNodeExecuting);
583
+ client.addEventListener("progress", onNodeProgress);
374
584
  }
375
585
  const cleanupNodeTimeout = () => {
376
586
  clearNodeTimeout();
377
587
  if (nodeExecutionTimeout > 0) {
378
- client.removeEventListener('execution_start', onExecutionStarted);
379
- client.removeEventListener('executing', onNodeExecuting);
380
- client.removeEventListener('progress', onNodeProgress);
588
+ client.removeEventListener("execution_start", onExecutionStarted);
589
+ client.removeEventListener("executing", onNodeExecuting);
590
+ client.removeEventListener("progress", onNodeProgress);
381
591
  }
382
592
  };
383
593
  let pendingSettled = false;
@@ -398,10 +608,11 @@ export class WorkflowPool extends TypedEventTarget {
398
608
  };
399
609
  });
400
610
  let resolveCompletion;
401
- let rejectCompletion;
402
- const completionPromise = new Promise((resolve, reject) => {
611
+ let completionError;
612
+ // completionPromise is used to track when the wrapper completes (success or failure)
613
+ // It's resolved in onFinished and onFailed handlers
614
+ const completionPromise = new Promise((resolve) => {
403
615
  resolveCompletion = resolve;
404
- rejectCompletion = reject;
405
616
  });
406
617
  let jobStartedDispatched = false;
407
618
  wrapper.onProgress((progress, promptId) => {
@@ -493,16 +704,20 @@ export class WorkflowPool extends TypedEventTarget {
493
704
  }
494
705
  job.result = resultPayload;
495
706
  job.completedAt = Date.now();
707
+ this.clearJobFailures(job.jobId);
496
708
  // Cleanup timeouts
497
709
  cleanupNodeTimeout();
498
710
  // Attach profiling stats if profiling was enabled
499
711
  if (profiler) {
500
712
  job.profileStats = profiler.getStats();
501
713
  }
714
+ completionError = undefined;
502
715
  this.dispatchEvent(new CustomEvent("job:completed", { detail: { job } }));
716
+ safeRelease({ success: true });
503
717
  resolveCompletion?.();
504
718
  });
505
719
  wrapper.onFailed((error, promptId) => {
720
+ this.debugLog("[debug] wrapper.onFailed", job.jobId, error.name);
506
721
  if (!job.promptId && promptId) {
507
722
  job.promptId = promptId;
508
723
  }
@@ -510,9 +725,13 @@ export class WorkflowPool extends TypedEventTarget {
510
725
  // Cleanup timeouts
511
726
  cleanupNodeTimeout();
512
727
  rejectPending?.(error);
513
- rejectCompletion?.(error);
728
+ completionError = error;
729
+ this.debugLog("[debug] resolveCompletion available", Boolean(resolveCompletion));
730
+ safeRelease({ success: false });
731
+ resolveCompletion?.();
514
732
  });
515
733
  try {
734
+ // Start the workflow execution
516
735
  const exec = wrapper.run();
517
736
  // Add timeout for execution start to prevent jobs getting stuck
518
737
  const executionStartTimeout = this.opts.executionStartTimeoutMs ?? 5000;
@@ -528,18 +747,21 @@ export class WorkflowPool extends TypedEventTarget {
528
747
  })
529
748
  ]);
530
749
  await pendingWithTimeout;
531
- clearTimeout(pendingTimeoutId);
532
750
  }
533
751
  else {
534
752
  await pendingPromise;
535
753
  }
754
+ if (executionStartTimeout > 0) {
755
+ clearTimeout(pendingTimeoutId);
756
+ }
536
757
  this.activeJobs.set(job.jobId, {
537
758
  reservation,
538
759
  job,
539
760
  clientId,
540
- release,
761
+ release: (opts) => safeRelease(opts),
541
762
  cancel: async () => {
542
763
  try {
764
+ wrapper.cancel("workflow pool cancel");
543
765
  if (job.promptId) {
544
766
  await client.ext.queue.interrupt(job.promptId);
545
767
  }
@@ -547,36 +769,41 @@ export class WorkflowPool extends TypedEventTarget {
547
769
  finally {
548
770
  this.activeJobs.delete(job.jobId);
549
771
  await this.queue.discard(reservation.reservationId, new Error("cancelled"));
550
- release({ success: false });
772
+ safeRelease({ success: false });
551
773
  }
552
774
  }
553
775
  });
554
776
  const result = await exec;
777
+ // Wait for the wrapper to complete (onFinished or onFailed callback)
778
+ await completionPromise;
555
779
  if (result === false) {
556
- // Execution failed - try to get the error from completionPromise rejection
557
- try {
558
- await completionPromise;
559
- }
560
- catch (err) {
561
- throw err;
562
- }
563
- throw job.lastError ?? new Error("Execution failed");
780
+ const errorToThrow = (completionError instanceof Error ? completionError : undefined) ??
781
+ (job.lastError instanceof Error ? job.lastError : undefined) ??
782
+ new Error("Execution failed");
783
+ throw errorToThrow;
564
784
  }
565
- await completionPromise;
566
785
  await this.queue.commit(reservation.reservationId);
567
- release({ success: true });
786
+ safeRelease({ success: true });
568
787
  }
569
788
  catch (error) {
789
+ // Immediately release the client on any failure
790
+ safeRelease({ success: false });
570
791
  const latestStatus = this.jobStore.get(job.jobId)?.status;
571
792
  if (latestStatus === "cancelled") {
572
- release({ success: false });
573
793
  return;
574
794
  }
575
795
  job.lastError = error;
576
796
  job.status = "failed";
577
- this.clientManager.recordFailure(clientId, job, error);
578
797
  const remainingAttempts = job.options.maxAttempts - job.attempts;
579
- const willRetry = remainingAttempts > 0;
798
+ const failureAnalysis = analyzeWorkflowFailure(error);
799
+ this.rememberJobFailure(job, clientId, failureAnalysis);
800
+ if (failureAnalysis.blockClient === "permanent") {
801
+ this.addPermanentExclusion(job, clientId);
802
+ reservation.payload.options.excludeClientIds = [...(job.options.excludeClientIds ?? [])];
803
+ }
804
+ this.clientManager.recordFailure(clientId, job, error);
805
+ const hasRetryPath = this.hasRetryPath(job);
806
+ const willRetry = failureAnalysis.retryable && remainingAttempts > 0 && hasRetryPath;
580
807
  this.dispatchEvent(new CustomEvent("job:failed", {
581
808
  detail: { job, willRetry }
582
809
  }));
@@ -589,19 +816,24 @@ export class WorkflowPool extends TypedEventTarget {
589
816
  job.startedAt = undefined;
590
817
  job.completedAt = undefined;
591
818
  job.result = undefined;
819
+ reservation.payload.options.excludeClientIds = [...(job.options.excludeClientIds ?? [])];
592
820
  await this.queue.retry(reservation.reservationId, { delayMs: delay });
593
821
  this.dispatchEvent(new CustomEvent("job:queued", { detail: { job } }));
594
822
  this.scheduleProcess(delay);
595
- release({ success: false });
596
823
  }
597
824
  else {
598
825
  job.completedAt = Date.now();
599
- await this.queue.discard(reservation.reservationId, error);
600
- release({ success: false });
826
+ const finalError = !hasRetryPath && failureAnalysis.type === "client_incompatible" && this.jobFailureAnalysis.has(job.jobId)
827
+ ? this.createWorkflowNotSupportedError(job, error)
828
+ : error;
829
+ job.lastError = finalError;
830
+ await this.queue.discard(reservation.reservationId, finalError);
831
+ this.clearJobFailures(job.jobId);
601
832
  }
602
833
  }
603
834
  finally {
604
835
  this.activeJobs.delete(job.jobId);
836
+ this.debugLog(`[runJob.finally] Job ${job.jobId.substring(0, 8)}... completed, calling processQueue()`);
605
837
  void this.processQueue();
606
838
  }
607
839
  }