@unrdf/self-healing-workflows 26.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +284 -0
- package/examples/basic-usage.mjs +99 -0
- package/examples/recovery-strategies.mjs +142 -0
- package/package.json +46 -0
- package/src/circuit-breaker.mjs +262 -0
- package/src/error-classifier.mjs +203 -0
- package/src/health-monitor.mjs +301 -0
- package/src/index.mjs +46 -0
- package/src/recovery-actions.mjs +272 -0
- package/src/retry-strategy.mjs +241 -0
- package/src/schemas.mjs +185 -0
- package/src/self-healing-engine.mjs +354 -0
- package/test/self-healing.test.mjs +772 -0
- package/vitest.config.mjs +20 -0
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file Retry strategy with exponential backoff
|
|
3
|
+
* @module @unrdf/self-healing-workflows/retry
|
|
4
|
+
* @description Implements retry logic with exponential backoff and jitter
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { RetryStrategySchema } from './schemas.mjs';
|
|
8
|
+
|
|
9
|
+
/**
 * Retry strategy with exponential backoff.
 *
 * Holds a configuration validated by `RetryStrategySchema` and uses it to
 * re-run failing async operations. The pause between attempts grows by
 * `backoffMultiplier` per attempt, is capped at `maxDelay`, and is optionally
 * randomized with jitter.
 */
export class RetryStrategy {
  /**
   * Creates a new retry strategy
   * @param {Object} [config] - Retry configuration
   * @param {number} [config.maxAttempts=3] - Maximum retry attempts
   * @param {number} [config.initialDelay=1000] - Initial delay in ms
   * @param {number} [config.maxDelay=30000] - Maximum delay in ms
   * @param {number} [config.backoffMultiplier=2] - Backoff multiplier
   * @param {boolean} [config.jitter=true] - Add random jitter
   * @param {Array<string>} [config.retryableErrors] - Error categories to retry
   * @throws {Error} If `config` is rejected by `RetryStrategySchema.parse`
   */
  constructor(config = {}) {
    this.config = RetryStrategySchema.parse(config);
  }

  /**
   * Executes an operation with retry logic
   *
   * The operation is invoked up to `maxAttempts` times. After each failure
   * that still leaves attempts available, `shouldRetry` (if given) decides
   * whether to continue; a falsy result stops immediately.
   *
   * @param {Function} operation - Async operation to execute
   * @param {Object} [options] - Execution options
   * @param {Function} [options.onRetry] - Callback on retry, called as
   *   `(error, attempt, delay)` before each wait
   * @param {Function} [options.shouldRetry] - Custom retry predicate, called
   *   as `(error, attempt)`; only consulted while attempts remain
   * @returns {Promise<any>} Operation result
   * @throws {Error} If all retries exhausted or `shouldRetry` declines; the
   *   thrown error carries `cause` (the last failure) and `attempts`
   * @example
   * const retry = new RetryStrategy({ maxAttempts: 3 });
   * const result = await retry.execute(async () => {
   *   return await fetch('https://api.example.com');
   * });
   */
  async execute(operation, options = {}) {
    const { onRetry, shouldRetry } = options;
    let lastError;
    let attempt = 0;

    while (attempt < this.config.maxAttempts) {
      try {
        return await operation();
      } catch (error) {
        lastError = error;
        attempt++;

        // Check the attempt budget first: once it is spent there is nothing
        // left to retry, so neither the predicate, the onRetry callback nor
        // the backoff sleep should run after the final failure.
        const attemptsRemain = attempt < this.config.maxAttempts;
        const canRetry =
          attemptsRemain && (!shouldRetry || shouldRetry(error, attempt));

        if (!canRetry) {
          break;
        }

        const delay = this.calculateDelay(attempt);

        if (onRetry) {
          onRetry(error, attempt, delay);
        }

        await this.sleep(delay);
      }
    }

    // All retries exhausted (or the predicate declined to continue)
    const error = new Error(
      `Operation failed after ${attempt} attempts: ${describeThrown(lastError)}`
    );
    error.cause = lastError;
    error.attempts = attempt;
    throw error;
  }

  /**
   * Calculates delay for a given attempt with exponential backoff
   * @param {number} attempt - Current attempt number (1-based)
   * @returns {number} Delay in milliseconds, a non-negative integer that
   *   never exceeds `maxDelay`
   */
  calculateDelay(attempt) {
    const { initialDelay, maxDelay, backoffMultiplier, jitter } = this.config;

    // Exponential backoff: delay = initial * (multiplier ^ (attempt - 1))
    let delay = initialDelay * Math.pow(backoffMultiplier, attempt - 1);

    // Cap at max delay
    delay = Math.min(delay, maxDelay);

    // Add jitter to prevent thundering herd
    if (jitter) {
      const jitterAmount = delay * 0.2; // ±20% jitter
      delay = delay + (Math.random() * 2 - 1) * jitterAmount;
      // Jitter may push a capped delay above the limit; re-apply the cap so
      // maxDelay remains a true upper bound.
      delay = Math.min(delay, maxDelay);
    }

    return Math.max(0, Math.floor(delay));
  }

  /**
   * Sleeps for specified duration
   * @param {number} ms - Milliseconds to sleep
   * @returns {Promise<void>} Resolves once the timer fires
   */
  sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Executes operation with retry and returns detailed result
   *
   * Never rejects for operation failures: the final error is returned in the
   * `error` field instead.
   *
   * @param {Function} operation - Async operation to execute
   * @param {Object} [options] - Execution options (same as `execute`); a
   *   caller-supplied `onRetry` is still invoked
   * @returns {Promise<Object>} Result with metadata:
   *   `{ success, result, error, attempts, duration, retryHistory }`
   */
  async executeWithMetadata(operation, options = {}) {
    const startTime = Date.now();
    const attempts = [];
    let result;
    let success = false;

    try {
      result = await this.execute(operation, {
        ...options,
        // Record every retry, then forward to the caller's own callback
        onRetry: (error, attempt, delay) => {
          attempts.push({
            attempt,
            error: describeThrown(error),
            delay,
            timestamp: Date.now()
          });
          if (options.onRetry) {
            options.onRetry(error, attempt, delay);
          }
        }
      });
      success = true;
    } catch (error) {
      result = error;
      success = false;
    }

    return {
      success,
      result: success ? result : undefined,
      error: success ? undefined : result,
      // One initial attempt plus one per recorded retry
      attempts: attempts.length + 1,
      duration: Date.now() - startTime,
      retryHistory: attempts
    };
  }

  /**
   * Creates a retryable version of an async function
   * @param {Function} fn - Function to wrap
   * @param {Object} [options] - Retry options passed to `execute`
   * @returns {Function} Wrapped function; its arguments are forwarded to `fn`
   */
  wrap(fn, options = {}) {
    return async (...args) => {
      return this.execute(() => fn(...args), options);
    };
  }

  /**
   * Checks if an error is retryable based on configuration
   * @param {Object} classifiedError - Classified error object; its
   *   `retryable` flag and `category` are read
   * @returns {boolean} True if the error is flagged retryable and its
   *   category is listed in `config.retryableErrors`
   */
  isRetryable(classifiedError) {
    if (!classifiedError.retryable) {
      return false;
    }
    return this.config.retryableErrors.includes(classifiedError.category);
  }

  /**
   * Updates retry configuration
   *
   * The merged configuration is re-validated through `RetryStrategySchema`.
   *
   * @param {Object} updates - Configuration updates
   * @returns {void}
   */
  updateConfig(updates) {
    this.config = RetryStrategySchema.parse({
      ...this.config,
      ...updates
    });
  }

  /**
   * Gets current configuration
   * @returns {Object} Copy of the current configuration; mutating it (or its
   *   `retryableErrors` array) does not affect this strategy
   */
  getConfig() {
    return {
      ...this.config,
      // Copy the array too, otherwise callers share the internal list
      retryableErrors: [...this.config.retryableErrors]
    };
  }
}

/**
 * Produces a printable description of a thrown value.
 * Uses `message` when the value has one, otherwise falls back to `String()`,
 * so non-Error rejections (strings, null, undefined) are reported sensibly
 * instead of crashing or printing "undefined".
 * @param {*} thrown - Value caught from a failed operation
 * @returns {string} Human-readable description
 */
function describeThrown(thrown) {
  if (thrown != null && thrown.message !== undefined) {
    return thrown.message;
  }
  return String(thrown);
}
|
|
204
|
+
|
|
205
|
+
/**
 * Factory for retry strategies.
 *
 * Equivalent to calling `new RetryStrategy(config)`.
 *
 * @param {Object} [config] - Retry configuration, forwarded unchanged to the
 *   `RetryStrategy` constructor
 * @returns {RetryStrategy} Newly constructed retry strategy
 */
export function createRetryStrategy(config) {
  const strategy = new RetryStrategy(config);
  return strategy;
}
|
|
213
|
+
|
|
214
|
+
/**
 * Runs an operation with immediate retries.
 *
 * Builds a one-off `RetryStrategy` configured for 3 attempts with a zero
 * initial delay and a backoff multiplier of 1, then executes the operation
 * through it.
 *
 * @param {Function} operation - Operation to execute
 * @returns {Promise<any>} Whatever the strategy's `execute` resolves with
 */
export async function immediateRetry(operation) {
  const noBackoffConfig = {
    maxAttempts: 3,
    initialDelay: 0,
    backoffMultiplier: 1
  };
  return new RetryStrategy(noBackoffConfig).execute(operation);
}
|
|
227
|
+
|
|
228
|
+
/**
 * Runs an operation with exponential-backoff retries.
 *
 * Builds a one-off `RetryStrategy` configured for 5 attempts, a 1000 ms
 * initial delay, a 30000 ms delay cap and a backoff multiplier of 2, then
 * executes the operation through it.
 *
 * @param {Function} operation - Operation to execute
 * @returns {Promise<any>} Whatever the strategy's `execute` resolves with
 */
export async function exponentialRetry(operation) {
  const backoffConfig = {
    maxAttempts: 5,
    initialDelay: 1000,
    maxDelay: 30000,
    backoffMultiplier: 2
  };
  return new RetryStrategy(backoffConfig).execute(operation);
}
|
package/src/schemas.mjs
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file Zod schemas for self-healing workflows
|
|
3
|
+
* @module @unrdf/self-healing-workflows/schemas
|
|
4
|
+
* @description Validation schemas for error recovery, retry strategies, and health monitoring
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { z } from 'zod';
|
|
8
|
+
|
|
9
|
+
/**
 * Error severity levels.
 *
 * - `critical`: unrecoverable, requires manual intervention
 * - `high`: severe but potentially recoverable
 * - `medium`: standard errors, retry likely to succeed
 * - `low`: minor errors, safe to retry
 */
export const ErrorSeveritySchema = z.enum(['critical', 'high', 'medium', 'low']);
|
|
18
|
+
|
|
19
|
+
/**
 * Error categories used for classification.
 *
 * - `network`: network connectivity issues
 * - `timeout`: operation timeout
 * - `validation`: data validation failure
 * - `resource`: resource unavailable (memory, disk, etc.)
 * - `dependency`: external dependency failure
 * - `business-logic`: business rule violation
 * - `unknown`: unclassified error
 */
export const ErrorCategorySchema = z.enum([
  'network',
  'timeout',
  'validation',
  'resource',
  'dependency',
  'business-logic',
  'unknown'
]);
|
|
31
|
+
|
|
32
|
+
/**
 * Recovery action types.
 *
 * - `retry`: retry the operation
 * - `skip`: skip and continue
 * - `compensate`: execute compensating transaction
 * - `restart`: restart the workflow
 * - `fallback`: use fallback strategy
 * - `manual`: require manual intervention
 */
export const RecoveryActionTypeSchema = z.enum([
  'retry',
  'skip',
  'compensate',
  'restart',
  'fallback',
  'manual'
]);
|
|
43
|
+
|
|
44
|
+
/**
 * Retry strategy configuration.
 *
 * Every field is optional on input and falls back to the default shown:
 * - `maxAttempts`: positive integer, default 3
 * - `initialDelay`: non-negative number, default 1000
 * - `maxDelay`: positive number, default 30000
 * - `backoffMultiplier`: positive number, default 2
 * - `jitter`: boolean, default true
 * - `retryableErrors`: list of error categories, default
 *   `['network', 'timeout', 'resource']`
 */
export const RetryStrategySchema = z.object({
  maxAttempts: z.number().int().positive().default(3),
  initialDelay: z.number().nonnegative().default(1000),
  maxDelay: z.number().positive().default(30000),
  backoffMultiplier: z.number().positive().default(2),
  jitter: z.boolean().default(true),
  retryableErrors: z
    .array(ErrorCategorySchema)
    .default(['network', 'timeout', 'resource'])
});
|
|
55
|
+
|
|
56
|
+
/**
 * Circuit breaker configuration.
 *
 * Every field is optional on input and falls back to the default shown:
 * - `failureThreshold`: positive integer, default 5
 * - `successThreshold`: positive integer, default 2
 * - `timeout`: positive number, default 60000
 * - `resetTimeout`: positive number, default 30000
 * - `monitoringPeriod`: positive number, default 10000
 *
 * NOTE(review): the three duration-like fields are presumably milliseconds —
 * confirm against the circuit breaker implementation.
 */
export const CircuitBreakerConfigSchema = z.object({
  failureThreshold: z.number().int().positive().default(5),
  successThreshold: z.number().int().positive().default(2),
  timeout: z.number().positive().default(60000),
  resetTimeout: z.number().positive().default(30000),
  monitoringPeriod: z.number().positive().default(10000)
});
|
|
66
|
+
|
|
67
|
+
/**
 * Circuit breaker states.
 *
 * - `closed`: normal operation, requests allowed
 * - `open`: failure threshold exceeded, requests blocked
 * - `half-open`: testing if service recovered
 */
export const CircuitBreakerStateSchema = z.enum(['closed', 'open', 'half-open']);
|
|
75
|
+
|
|
76
|
+
/**
 * Error pattern used for classification.
 *
 * - `name`: non-empty string
 * - `category`: one of the error categories
 * - `severity`: one of the error severities
 * - `pattern`: either a string or a `RegExp` instance
 * - `metadata`: optional record of arbitrary values
 */
export const ErrorPatternSchema = z.object({
  name: z.string().min(1),
  category: ErrorCategorySchema,
  severity: ErrorSeveritySchema,
  pattern: z.union([z.string(), z.instanceof(RegExp)]),
  metadata: z.record(z.unknown()).optional()
});
|
|
89
|
+
|
|
90
|
+
/**
 * Classified error.
 *
 * - `originalError`: an `Error` instance
 * - `category`: one of the error categories
 * - `severity`: one of the error severities
 * - `matchedPattern`: optional string
 * - `retryable`: boolean flag
 * - `timestamp`: number
 * - `metadata`: optional record of arbitrary values
 */
export const ClassifiedErrorSchema = z.object({
  originalError: z.instanceof(Error),
  category: ErrorCategorySchema,
  severity: ErrorSeveritySchema,
  matchedPattern: z.string().optional(),
  retryable: z.boolean(),
  timestamp: z.number(),
  metadata: z.record(z.unknown()).optional()
});
|
|
102
|
+
|
|
103
|
+
/**
 * Recovery action.
 *
 * - `type`: one of the recovery action types
 * - `name`: non-empty string
 * - `execute`: function
 * - `condition`: optional function
 * - `priority`: integer from 0 to 100 inclusive, default 50
 * - `metadata`: optional record of arbitrary values
 */
export const RecoveryActionSchema = z.object({
  type: RecoveryActionTypeSchema,
  name: z.string().min(1),
  execute: z.function(),
  condition: z.function().optional(),
  priority: z.number().int().min(0).max(100).default(50),
  metadata: z.record(z.unknown()).optional()
});
|
|
114
|
+
|
|
115
|
+
/**
 * Recovery result.
 *
 * - `success`: boolean outcome
 * - `action`: one of the recovery action types
 * - `attempts`: non-negative integer
 * - `duration`: non-negative number
 * - `error`: optional `Error` instance
 * - `metadata`: optional record of arbitrary values
 */
export const RecoveryResultSchema = z.object({
  success: z.boolean(),
  action: RecoveryActionTypeSchema,
  attempts: z.number().int().nonnegative(),
  duration: z.number().nonnegative(),
  error: z.instanceof(Error).optional(),
  metadata: z.record(z.unknown()).optional()
});
|
|
126
|
+
|
|
127
|
+
/**
 * Health check configuration.
 *
 * Every field is optional on input and falls back to the default shown:
 * - `interval`: positive number, default 30000
 * - `timeout`: positive number, default 5000
 * - `unhealthyThreshold`: positive integer, default 3
 * - `healthyThreshold`: positive integer, default 2
 *
 * NOTE(review): `interval` and `timeout` are presumably milliseconds —
 * confirm against the health monitor implementation.
 */
export const HealthCheckConfigSchema = z.object({
  interval: z.number().positive().default(30000),
  timeout: z.number().positive().default(5000),
  unhealthyThreshold: z.number().int().positive().default(3),
  healthyThreshold: z.number().int().positive().default(2)
});
|
|
136
|
+
|
|
137
|
+
/**
 * Health status: one of `healthy`, `degraded` or `unhealthy`.
 */
export const HealthStatusSchema = z.enum(['healthy', 'degraded', 'unhealthy']);
|
|
145
|
+
|
|
146
|
+
/**
 * Health check result.
 *
 * - `status`: overall health status
 * - `timestamp`: number
 * - `checks`: array of individual check entries, each with a `name`, a
 *   `status`, an optional `message` and a non-negative `duration`
 * - `metadata`: optional record of arbitrary values
 */
export const HealthCheckResultSchema = z.object({
  status: HealthStatusSchema,
  timestamp: z.number(),
  checks: z.array(
    z.object({
      name: z.string(),
      status: HealthStatusSchema,
      message: z.string().optional(),
      duration: z.number().nonnegative()
    })
  ),
  metadata: z.record(z.unknown()).optional()
});
|
|
160
|
+
|
|
161
|
+
/**
 * Self-healing engine configuration.
 *
 * - `retry`: optional retry strategy configuration
 * - `circuitBreaker`: optional circuit breaker configuration
 * - `healthCheck`: optional health check configuration
 * - `errorPatterns`: array of error patterns, default `[]`
 * - `recoveryActions`: array of recovery actions, default `[]`
 * - `enableOtel`: boolean, default true
 * - `maxConcurrentRecoveries`: positive integer, default 10
 */
export const SelfHealingConfigSchema = z.object({
  retry: RetryStrategySchema.optional(),
  circuitBreaker: CircuitBreakerConfigSchema.optional(),
  healthCheck: HealthCheckConfigSchema.optional(),
  errorPatterns: z.array(ErrorPatternSchema).default([]),
  recoveryActions: z.array(RecoveryActionSchema).default([]),
  enableOtel: z.boolean().default(true),
  maxConcurrentRecoveries: z.number().int().positive().default(10)
});
|
|
173
|
+
|
|
174
|
+
/**
 * Recovery statistics.
 *
 * - `totalAttempts`, `successfulRecoveries`, `failedRecoveries`:
 *   non-negative integers
 * - `averageRecoveryTime`: non-negative number
 * - `successRate`: number between 0 and 1 inclusive
 * - `errorsByCategory`: record keyed by error category with non-negative
 *   integer values
 * - `actionsByType`: record keyed by recovery action type with non-negative
 *   integer values
 */
export const RecoveryStatsSchema = z.object({
  totalAttempts: z.number().int().nonnegative(),
  successfulRecoveries: z.number().int().nonnegative(),
  failedRecoveries: z.number().int().nonnegative(),
  averageRecoveryTime: z.number().nonnegative(),
  successRate: z.number().min(0).max(1),
  errorsByCategory: z.record(
    ErrorCategorySchema,
    z.number().int().nonnegative()
  ),
  actionsByType: z.record(
    RecoveryActionTypeSchema,
    z.number().int().nonnegative()
  )
});
|