@onlineapps/conn-orch-orchestrator 1.0.43 → 1.0.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/WorkflowOrchestrator.js +142 -36
package/package.json
CHANGED
(+1 -1; hunk not shown in this extract)
package/src/WorkflowOrchestrator.js
CHANGED
|
@@ -37,6 +37,14 @@ class WorkflowOrchestrator {
|
|
|
37
37
|
this.logger = config.logger || console;
|
|
38
38
|
this.defaultTimeout = config.defaultTimeout || 30000;
|
|
39
39
|
|
|
40
|
+
// Retry configuration for DLQ mechanism
|
|
41
|
+
this.retryConfig = {
|
|
42
|
+
maxAttempts: config.maxRetryAttempts || 3,
|
|
43
|
+
baseDelay: config.retryBaseDelay || 1000,
|
|
44
|
+
maxDelay: config.retryMaxDelay || 30000,
|
|
45
|
+
backoffMultiplier: config.retryBackoffMultiplier || 2
|
|
46
|
+
};
|
|
47
|
+
|
|
40
48
|
// Create cookbook router - createRouter is an exported function, not a method
|
|
41
49
|
const { createRouter } = this.cookbook;
|
|
42
50
|
if (typeof createRouter === 'function') {
|
|
@@ -230,61 +238,159 @@ class WorkflowOrchestrator {
|
|
|
230
238
|
};
|
|
231
239
|
|
|
232
240
|
} catch (error) {
|
|
233
|
-
//
|
|
241
|
+
// DLQ/Retry mechanism: retry with exponential backoff before sending to DLQ
|
|
242
|
+
const dlqHistory = context._dlq_history || [];
|
|
243
|
+
const stepAttempts = dlqHistory.filter(h => h.step_id === current_step && h.attempt).length;
|
|
244
|
+
const attemptNumber = stepAttempts + 1;
|
|
245
|
+
|
|
246
|
+
// Calculate step_index for failed step
|
|
247
|
+
let failedStepIndex = 0;
|
|
248
|
+
try {
|
|
249
|
+
const stepsArr = this._getStepsArray(cookbookDef);
|
|
250
|
+
const idx = stepsArr.findIndex(s => this._getStepId(s) === current_step);
|
|
251
|
+
if (idx >= 0) failedStepIndex = idx;
|
|
252
|
+
} catch (e) {
|
|
253
|
+
// Ignore - use default 0
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
// IMPORTANT: Publish progress event for FAILED step (so it appears in trace)
|
|
257
|
+
// This ensures every cookbook step is visible in monitoring, even failed ones
|
|
258
|
+
const { publishToMonitoringWorkflow } = require('@onlineapps/mq-client-core').monitoring;
|
|
259
|
+
try {
|
|
260
|
+
await publishToMonitoringWorkflow(this.mqClient, {
|
|
261
|
+
event_type: 'progress',
|
|
262
|
+
workflow_id: workflow_id,
|
|
263
|
+
service_name: serviceName,
|
|
264
|
+
step_index: failedStepIndex,
|
|
265
|
+
step_id: current_step,
|
|
266
|
+
cookbook: cookbookDef,
|
|
267
|
+
context: context,
|
|
268
|
+
output: null,
|
|
269
|
+
error: {
|
|
270
|
+
message: error.message,
|
|
271
|
+
attempt: attemptNumber
|
|
272
|
+
},
|
|
273
|
+
status: 'failed',
|
|
274
|
+
timestamp: new Date().toISOString()
|
|
275
|
+
}, this.logger, { workflow_id, step_id: current_step });
|
|
276
|
+
} catch (monitoringError) {
|
|
277
|
+
// Don't fail workflow if monitoring publish fails
|
|
278
|
+
this.logger.warn('Failed to publish failed step progress to monitoring', {
|
|
279
|
+
workflow_id,
|
|
280
|
+
error: monitoringError.message
|
|
281
|
+
});
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
// Add attempt to DLQ history
|
|
285
|
+
const attemptRecord = {
|
|
286
|
+
step_id: current_step,
|
|
287
|
+
step_index: failedStepIndex,
|
|
288
|
+
attempt: attemptNumber,
|
|
289
|
+
service: serviceName,
|
|
290
|
+
error: error.message,
|
|
291
|
+
at: new Date().toISOString()
|
|
292
|
+
};
|
|
293
|
+
|
|
294
|
+
const updatedDlqHistory = [...dlqHistory, attemptRecord];
|
|
295
|
+
const updatedContext = {
|
|
296
|
+
...context,
|
|
297
|
+
_dlq_history: updatedDlqHistory
|
|
298
|
+
};
|
|
299
|
+
|
|
300
|
+
this.logger.warn(`[WorkflowOrchestrator] Step failed, attempt ${attemptNumber}/${this.retryConfig.maxAttempts}`, {
|
|
301
|
+
workflow_id,
|
|
302
|
+
current_step,
|
|
303
|
+
service: serviceName,
|
|
304
|
+
error: error.message,
|
|
305
|
+
attempt: attemptNumber
|
|
306
|
+
});
|
|
307
|
+
|
|
308
|
+
// Check if we should retry
|
|
309
|
+
if (attemptNumber < this.retryConfig.maxAttempts) {
|
|
310
|
+
// Calculate delay with exponential backoff
|
|
311
|
+
const delay = Math.min(
|
|
312
|
+
this.retryConfig.baseDelay * Math.pow(this.retryConfig.backoffMultiplier, attemptNumber - 1),
|
|
313
|
+
this.retryConfig.maxDelay
|
|
314
|
+
);
|
|
315
|
+
|
|
316
|
+
this.logger.info(`[WorkflowOrchestrator] Scheduling retry ${attemptNumber + 1} in ${delay}ms`, {
|
|
317
|
+
workflow_id,
|
|
318
|
+
current_step,
|
|
319
|
+
delay
|
|
320
|
+
});
|
|
321
|
+
|
|
322
|
+
// Wait for delay
|
|
323
|
+
await new Promise(resolve => setTimeout(resolve, delay));
|
|
324
|
+
|
|
325
|
+
// Republish to same service queue for retry
|
|
326
|
+
const serviceQueue = `${serviceName}.workflow`;
|
|
327
|
+
await this.mqClient.publish(serviceQueue, {
|
|
328
|
+
workflow_id,
|
|
329
|
+
cookbook: cookbookDef,
|
|
330
|
+
current_step,
|
|
331
|
+
context: updatedContext
|
|
332
|
+
});
|
|
333
|
+
|
|
334
|
+
this.logger.info(`[WorkflowOrchestrator] Message republished for retry`, {
|
|
335
|
+
workflow_id,
|
|
336
|
+
current_step,
|
|
337
|
+
queue: serviceQueue,
|
|
338
|
+
nextAttempt: attemptNumber + 1
|
|
339
|
+
});
|
|
340
|
+
|
|
341
|
+
// Don't throw - message will be retried
|
|
342
|
+
return { retrying: true, attempt: attemptNumber, nextAttempt: attemptNumber + 1 };
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
// Max retries exhausted - send to workflow.failed (DLQ entry)
|
|
346
|
+
const dlqEntryRecord = {
|
|
347
|
+
dlq_entered: true,
|
|
348
|
+
step_id: current_step,
|
|
349
|
+
step_index: failedStepIndex,
|
|
350
|
+
service: serviceName,
|
|
351
|
+
reason: 'max_retries_exceeded',
|
|
352
|
+
total_attempts: attemptNumber,
|
|
353
|
+
at: new Date().toISOString()
|
|
354
|
+
};
|
|
355
|
+
|
|
356
|
+
const finalDlqHistory = [...updatedDlqHistory, dlqEntryRecord];
|
|
357
|
+
const finalContext = {
|
|
358
|
+
...context,
|
|
359
|
+
_dlq_history: finalDlqHistory
|
|
360
|
+
};
|
|
361
|
+
|
|
234
362
|
try {
|
|
235
|
-
console.log(`[WorkflowOrchestrator] [
|
|
236
|
-
this.logger.error(`[WorkflowOrchestrator] [
|
|
363
|
+
console.log(`[WorkflowOrchestrator] [DLQ] Max retries exhausted, publishing to workflow.failed for ${workflow_id}`);
|
|
364
|
+
this.logger.error(`[WorkflowOrchestrator] [DLQ] Workflow entering DLQ after ${attemptNumber} attempts: ${error.message}`, {
|
|
237
365
|
workflow_id,
|
|
238
366
|
current_step,
|
|
367
|
+
attempts: attemptNumber,
|
|
239
368
|
error: error.stack
|
|
240
369
|
});
|
|
241
370
|
|
|
242
|
-
// Calculate step_index for failed step
|
|
243
|
-
let failedStepIndex = 0;
|
|
244
|
-
try {
|
|
245
|
-
const stepsArr = this._getStepsArray(cookbookDef);
|
|
246
|
-
const idx = stepsArr.findIndex(s => this._getStepId(s) === current_step);
|
|
247
|
-
if (idx >= 0) failedStepIndex = idx;
|
|
248
|
-
} catch (e) {
|
|
249
|
-
// Ignore - use default 0
|
|
250
|
-
}
|
|
251
|
-
|
|
252
371
|
await this.mqClient.publish('workflow.failed', {
|
|
253
372
|
workflow_id,
|
|
254
373
|
current_step,
|
|
255
|
-
step_id: current_step,
|
|
256
|
-
step_index: failedStepIndex,
|
|
257
|
-
service: serviceName,
|
|
258
|
-
cookbook: cookbookDef,
|
|
259
|
-
context:
|
|
374
|
+
step_id: current_step,
|
|
375
|
+
step_index: failedStepIndex,
|
|
376
|
+
service: serviceName,
|
|
377
|
+
cookbook: cookbookDef,
|
|
378
|
+
context: finalContext,
|
|
260
379
|
error: error.message,
|
|
261
380
|
errorStack: error.stack,
|
|
381
|
+
_dlq_history: finalDlqHistory,
|
|
262
382
|
failed_at: new Date().toISOString()
|
|
263
383
|
});
|
|
264
384
|
|
|
265
|
-
console.log(`[WorkflowOrchestrator] [
|
|
266
|
-
this.logger.info(`[WorkflowOrchestrator] [
|
|
385
|
+
console.log(`[WorkflowOrchestrator] [DLQ] ✓ Successfully published workflow.failed for ${workflow_id}`);
|
|
386
|
+
this.logger.info(`[WorkflowOrchestrator] [DLQ] ✓ Published workflow.failed: ${workflow_id}`);
|
|
267
387
|
} catch (publishError) {
|
|
268
|
-
console.error(`[WorkflowOrchestrator] [
|
|
269
|
-
this.logger.error(`[WorkflowOrchestrator] [
|
|
388
|
+
console.error(`[WorkflowOrchestrator] [DLQ] ✗ Failed to publish workflow.failed for ${workflow_id}:`, publishError.message);
|
|
389
|
+
this.logger.error(`[WorkflowOrchestrator] [DLQ] ✗ Failed to publish workflow.failed`, {
|
|
270
390
|
workflow_id,
|
|
271
391
|
originalError: error.message,
|
|
272
392
|
publishError: publishError.message
|
|
273
393
|
});
|
|
274
|
-
// Don't throw - original error is more important
|
|
275
|
-
}
|
|
276
|
-
|
|
277
|
-
// Handle error
|
|
278
|
-
if (this.errorHandler) {
|
|
279
|
-
await this.errorHandler.handleError(error, {
|
|
280
|
-
workflow_id,
|
|
281
|
-
current_step,
|
|
282
|
-
service: serviceName,
|
|
283
|
-
context
|
|
284
|
-
});
|
|
285
|
-
} else {
|
|
286
|
-
// Fallback to router DLQ
|
|
287
|
-
await this.router.routeToDLQ(cookbookDef, context, error, serviceName);
|
|
288
394
|
}
|
|
289
395
|
|
|
290
396
|
throw error;
|