mcp-agent-foundry 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -2
- package/dist/cli/setup-wizard.d.ts.map +1 -1
- package/dist/cli/setup-wizard.js +873 -8
- package/dist/cli/setup-wizard.js.map +1 -1
- package/dist/cli/test-connection.d.ts +28 -0
- package/dist/cli/test-connection.d.ts.map +1 -1
- package/dist/cli/test-connection.js +335 -1
- package/dist/cli/test-connection.js.map +1 -1
- package/dist/cli.d.ts +13 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +169 -1
- package/dist/cli.js.map +1 -1
- package/dist/config/validator.d.ts +113 -0
- package/dist/config/validator.d.ts.map +1 -1
- package/dist/config/validator.js +113 -0
- package/dist/config/validator.js.map +1 -1
- package/dist/failover/health-tracker.d.ts +175 -0
- package/dist/failover/health-tracker.d.ts.map +1 -0
- package/dist/failover/health-tracker.js +350 -0
- package/dist/failover/health-tracker.js.map +1 -0
- package/dist/failover/index.d.ts +9 -0
- package/dist/failover/index.d.ts.map +1 -0
- package/dist/failover/index.js +9 -0
- package/dist/failover/index.js.map +1 -0
- package/dist/failover/orchestrator.d.ts +189 -0
- package/dist/failover/orchestrator.d.ts.map +1 -0
- package/dist/failover/orchestrator.js +488 -0
- package/dist/failover/orchestrator.js.map +1 -0
- package/dist/failover/pricing.d.ts +115 -0
- package/dist/failover/pricing.d.ts.map +1 -0
- package/dist/failover/pricing.js +283 -0
- package/dist/failover/pricing.js.map +1 -0
- package/dist/persistence/state-schema.d.ts +50 -0
- package/dist/persistence/state-schema.d.ts.map +1 -1
- package/dist/persistence/state-schema.js +2 -0
- package/dist/persistence/state-schema.js.map +1 -1
- package/dist/providers/fireworks.d.ts +23 -0
- package/dist/providers/fireworks.d.ts.map +1 -0
- package/dist/providers/fireworks.js +31 -0
- package/dist/providers/fireworks.js.map +1 -0
- package/dist/providers/groq.d.ts +23 -0
- package/dist/providers/groq.d.ts.map +1 -0
- package/dist/providers/groq.js +31 -0
- package/dist/providers/groq.js.map +1 -0
- package/dist/providers/kimi-code.d.ts +32 -0
- package/dist/providers/kimi-code.d.ts.map +1 -0
- package/dist/providers/kimi-code.js +46 -0
- package/dist/providers/kimi-code.js.map +1 -0
- package/dist/providers/kimi.d.ts +1 -1
- package/dist/providers/kimi.js +1 -1
- package/dist/providers/openrouter.d.ts +23 -0
- package/dist/providers/openrouter.d.ts.map +1 -0
- package/dist/providers/openrouter.js +31 -0
- package/dist/providers/openrouter.js.map +1 -0
- package/dist/providers/perplexity.d.ts +29 -0
- package/dist/providers/perplexity.d.ts.map +1 -0
- package/dist/providers/perplexity.js +51 -0
- package/dist/providers/perplexity.js.map +1 -0
- package/dist/providers/together.d.ts +23 -0
- package/dist/providers/together.d.ts.map +1 -0
- package/dist/providers/together.js +31 -0
- package/dist/providers/together.js.map +1 -0
- package/dist/router/engine.d.ts +21 -0
- package/dist/router/engine.d.ts.map +1 -1
- package/dist/router/engine.js +81 -21
- package/dist/router/engine.js.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +49 -0
- package/dist/server.js.map +1 -1
- package/dist/types.d.ts +52 -1
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +14 -0
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Failover Orchestrator
|
|
3
|
+
*
|
|
4
|
+
* Wraps provider operations with intelligent retry and failover logic.
|
|
5
|
+
* Integrates health tracking, circuit breakers, and cost-aware routing.
|
|
6
|
+
*/
|
|
7
|
+
import type { Logger } from '../observability/logger.js';
|
|
8
|
+
import type { ProviderManager } from '../providers/manager.js';
|
|
9
|
+
import type { Config, CompletionResponse, Message } from '../types.js';
|
|
10
|
+
import { type CircuitBreakerOptions } from '../utils/circuit-breaker.js';
|
|
11
|
+
import { ProviderHealthTracker, type ProviderHealth } from './health-tracker.js';
|
|
12
|
+
import { PricingService } from './pricing.js';
|
|
13
|
+
import type { PersistedFailoverEvent, PersistedProviderHealth } from '../persistence/state-schema.js';
|
|
14
|
+
/**
 * Per-request tuning knobs accepted by `FailoverOrchestrator.executeWithFailover`.
 *
 * Every field is optional; the values are forwarded unchanged to the
 * provider's completion request.
 */
export interface FailoverCompletionOptions {
    /** Sampling temperature forwarded to the provider. */
    temperature?: number;

    /** Upper bound on the number of tokens the provider may generate. */
    max_tokens?: number;

    /** Request timeout, in milliseconds, forwarded to the provider. */
    timeout_ms?: number;
}
|
|
25
|
+
/**
 * Outcome of a completion request, together with bookkeeping about how the
 * orchestrator got there.
 */
export interface FailoverCompletionResult {
    /** Completion returned by the provider that answered. */
    response: CompletionResponse;

    /** Name of the provider that answered the request. */
    provider: string;

    /** Model that was requested from that provider. */
    model: string;

    /** Retries performed against the provider that answered. */
    retryCount: number;

    /**
     * Number of failovers that happened before a provider answered
     * (0 when the first attempted provider succeeded).
     */
    failoverCount: number;

    /** Wall-clock time for the whole call, in milliseconds, including failed attempts. */
    latencyMs: number;

    /** `true` when at least one failover happened, i.e. `failoverCount > 0`. */
    usedFailover: boolean;
}
|
|
44
|
+
/**
 * Optional settings for `FailoverOrchestrator`. Anything left out falls back
 * to the orchestrator's built-in defaults.
 */
export interface FailoverOrchestratorConfig {
    /**
     * Master switch (default: true). When false, requests go to the role's
     * primary provider only, with no retry or failover.
     */
    enabled?: boolean;

    /**
     * Retry budget shared by all providers of one request (default: 6).
     * The orchestrator stops walking the provider chain once this many
     * provider attempts have failed.
     */
    maxTotalRetries?: number;

    /** Interval between health checks, in milliseconds (default: 60000). */
    healthCheckIntervalMs?: number;

    /** Cooldown period handed to the health tracker, in milliseconds (default: 300000). */
    cooldownMs?: number;

    /**
     * When true, `getAvailableProviders()` orders its result by cost using
     * the pricing service (default: false).
     */
    preferCostEfficient?: boolean;

    /** Options used for each per-provider circuit breaker. */
    circuitBreaker?: Partial<CircuitBreakerOptions>;
}
|
|
61
|
+
/**
 * One recorded hand-over from a failing provider to another provider.
 * Kept in memory for diagnostics and written out by `serializeState()`.
 */
export interface FailoverEvent {
    /** When the event was recorded. */
    timestamp: Date;

    /** Role whose request was being served. */
    role: string;

    /** Provider that failed. */
    fromProvider: string;

    /** Provider the request was handed to. */
    toProvider: string;

    /** Message of the error that caused the failover. */
    reason: string;

    /** HTTP-style status code of the error, when one is known. */
    errorCode?: number;

    /** Message of the error that caused the failover (same text as `reason`). */
    errorMessage?: string;
}
|
|
73
|
+
/**
 * Runs completion requests for a role against an ordered chain of providers
 * and moves on to the next provider when the current one fails with an error
 * that qualifies for failover.
 *
 * What it brings together:
 * - Per-provider retry (delegated to the shared retry helper)
 * - A primary → fallback provider chain built from the role configuration
 * - Health tracking with cooldowns
 * - One circuit breaker per provider
 * - Optional cost-based ordering of available providers
 * - A bounded in-memory history of failover events
 *
 * @example
 * ```typescript
 * const orchestrator = new FailoverOrchestrator(providers, config, logger);
 *
 * // Execute with automatic failover
 * const result = await orchestrator.executeWithFailover(
 *   'coder',
 *   messages,
 *   { temperature: 0.7 }
 * );
 * ```
 */
export declare class FailoverOrchestrator {
    private readonly providers;
    private readonly appConfig;
    private readonly logger;
    private readonly config;
    private readonly healthTracker;
    private readonly pricingService;
    private readonly circuitBreakers;
    private readonly failoverEvents;
    private readonly maxEventHistory;

    /**
     * @param providers Registry used to look up provider instances by name.
     * @param appConfig Application configuration; `roles` supplies each role's provider chain.
     * @param logger Logger used for diagnostics.
     * @param config Optional orchestrator settings; omitted fields use defaults.
     */
    constructor(
        providers: ProviderManager,
        appConfig: Config,
        logger: Logger,
        config?: FailoverOrchestratorConfig
    );

    /**
     * Send a completion request for `role`, walking the role's provider chain
     * until one provider answers.
     *
     * @throws Error when `role` is not present in the application configuration.
     * @throws FailoverExhaustedError when the chain is walked to the end (or the
     *   retry budget runs out) without any provider answering.
     * @throws The provider's own error when it does not qualify for failover or
     *   no untried provider is left.
     */
    executeWithFailover(
        role: string,
        messages: Message[],
        options: FailoverCompletionOptions
    ): Promise<FailoverCompletionResult>;

    /**
     * List the role's chain entries whose provider is configured and currently
     * available according to the health tracker. The list is ordered by cost
     * when `preferCostEfficient` is enabled, otherwise it keeps chain order.
     * Unknown roles yield an empty list.
     */
    getAvailableProviders(role: string): { provider: string; model: string }[];

    /** Health records for every provider the health tracker knows about. */
    getProviderHealth(): Map<string, ProviderHealth>;

    /** Copy of the recorded failover events, oldest first. */
    getFailoverEvents(): FailoverEvent[];

    /** Refresh pricing data, then start the periodic provider health checks. */
    initialize(): Promise<void>;

    /**
     * Stop the periodic health checks. State is not persisted here; use
     * `serializeState()` for that.
     */
    shutdown(): void;

    /** Snapshot of provider health and failover events in persistable form. */
    serializeState(): {
        providerHealth: PersistedProviderHealth[];
        failoverEvents: PersistedFailoverEvent[];
    };

    /**
     * Load a previously serialized snapshot. Each part is optional; a part that
     * is present replaces the corresponding in-memory state.
     */
    restoreState(state: {
        providerHealth?: PersistedProviderHealth[];
        failoverEvents?: PersistedFailoverEvent[];
    }): void;

    /** Pricing service owned by this orchestrator. */
    getPricingService(): PricingService;

    /** Health tracker owned by this orchestrator. */
    getHealthTracker(): ProviderHealthTracker;

    /** Assemble the ordered provider/model list for a role (primary first). */
    private buildProviderChain;

    /** Run one provider through its circuit breaker, with retries. */
    private executeWithRetry;

    /** Call the role's primary provider directly; used when failover is disabled. */
    private executeSingleProvider;

    /** Decide whether an error qualifies for moving to another provider. */
    private shouldTriggerFailover;

    /** First chain entry whose provider has not been attempted yet. */
    private getNextProvider;

    /** Look up the provider's circuit breaker, creating it on first use. */
    private getOrCreateCircuitBreaker;

    /** Append an entry to the failover history, trimming old entries. */
    private recordFailoverEvent;
}
|
|
189
|
+
//# sourceMappingURL=orchestrator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"orchestrator.d.ts","sourceRoot":"","sources":["../../src/failover/orchestrator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,4BAA4B,CAAC;AACzD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAC/D,OAAO,KAAK,EAAE,MAAM,EAAqB,kBAAkB,EAAE,OAAO,EAAmC,MAAM,aAAa,CAAC;AAG3H,OAAO,EAAoC,KAAK,qBAAqB,EAAE,MAAM,6BAA6B,CAAC;AAC3G,OAAO,EAAE,qBAAqB,EAAE,KAAK,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACjF,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,KAAK,EAAE,sBAAsB,EAAE,uBAAuB,EAAE,MAAM,gCAAgC,CAAC;AAMtG;;GAEG;AACH,MAAM,WAAW,yBAAyB;IACxC,0BAA0B;IAC1B,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,iCAAiC;IACjC,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,4BAA4B;IAC5B,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,8BAA8B;IAC9B,QAAQ,EAAE,kBAAkB,CAAC;IAC7B,wCAAwC;IACxC,QAAQ,EAAE,MAAM,CAAC;IACjB,iBAAiB;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,+BAA+B;IAC/B,UAAU,EAAE,MAAM,CAAC;IACnB,gCAAgC;IAChC,aAAa,EAAE,MAAM,CAAC;IACtB,0BAA0B;IAC1B,SAAS,EAAE,MAAM,CAAC;IAClB,qCAAqC;IACrC,YAAY,EAAE,OAAO,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,0BAA0B;IACzC,sCAAsC;IACtC,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,8DAA8D;IAC9D,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,mDAAmD;IACnD,qBAAqB,CAAC,EAAE,MAAM,CAAC;IAC/B,8CAA8C;IAC9C,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,wEAAwE;IACxE,mBAAmB,CAAC,EAAE,OAAO,CAAC;IAC9B,2CAA2C;IAC3C,cAAc,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;CACjD;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,SAAS,EAAE,IAAI,CAAC;IAChB,IAAI,EAAE,MAAM,CAAC;IACb,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AA4BD;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AACH,qBAAa,oBAAoB;IAC/B,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAkB;IAC5C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAS;IAChC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAuC;IAC9D,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAwB;IACtD,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAiB;IAChD,OAAO,CAAC,QAAQ,CAAC,eAAe,CAA0C;IAC1E,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAuB;IACtD,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAO;gBAGrC,SAAS,EAAE,eAAe,EAC1B,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE
,MAAM,EACd,MAAM,CAAC,EAAE,0BAA0B;IAiBrC;;OAEG;IACG,mBAAmB,CACvB,IAAI,EAAE,MAAM,EACZ,QAAQ,EAAE,OAAO,EAAE,EACnB,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,wBAAwB,CAAC;IAgHpC;;OAEG;IACH,qBAAqB,CAAC,IAAI,EAAE,MAAM,GAAG,KAAK,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC;IAuB/E;;OAEG;IACH,iBAAiB,IAAI,GAAG,CAAC,MAAM,EAAE,cAAc,CAAC;IAIhD;;OAEG;IACH,iBAAiB,IAAI,aAAa,EAAE;IAIpC;;OAEG;IACG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAajC;;OAEG;IACH,QAAQ,IAAI,IAAI;IAKhB;;OAEG;IACH,cAAc,IAAI;QAChB,cAAc,EAAE,uBAAuB,EAAE,CAAC;QAC1C,cAAc,EAAE,sBAAsB,EAAE,CAAC;KAC1C;IAeD;;OAEG;IACH,YAAY,CAAC,KAAK,EAAE;QAClB,cAAc,CAAC,EAAE,uBAAuB,EAAE,CAAC;QAC3C,cAAc,CAAC,EAAE,sBAAsB,EAAE,CAAC;KAC3C,GAAG,IAAI;IAuBR;;OAEG;IACH,iBAAiB,IAAI,cAAc;IAInC;;OAEG;IACH,gBAAgB,IAAI,qBAAqB;IAQzC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAuC1B;;OAEG;YACW,gBAAgB;IAiF9B;;OAEG;YACW,qBAAqB;IAoCnC;;OAEG;IACH,OAAO,CAAC,qBAAqB;IAwB7B;;OAEG;IACH,OAAO,CAAC,eAAe;IAYvB;;OAEG;IACH,OAAO,CAAC,yBAAyB;IAoBjC;;OAEG;IACH,OAAO,CAAC,mBAAmB;CA4B5B"}
|
|
@@ -0,0 +1,488 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Failover Orchestrator
|
|
3
|
+
*
|
|
4
|
+
* Wraps provider operations with intelligent retry and failover logic.
|
|
5
|
+
* Integrates health tracking, circuit breakers, and cost-aware routing.
|
|
6
|
+
*/
|
|
7
|
+
import { ProviderError, RateLimitError, TimeoutError, FailoverExhaustedError } from '../types.js';
|
|
8
|
+
import { retry, isRetryableError } from '../utils/retry.js';
|
|
9
|
+
import { CircuitBreaker, CircuitOpenError } from '../utils/circuit-breaker.js';
|
|
10
|
+
import { ProviderHealthTracker } from './health-tracker.js';
|
|
11
|
+
import { PricingService } from './pricing.js';
|
|
12
|
+
// ============================================================================
|
|
13
|
+
// Constants
|
|
14
|
+
// ============================================================================
|
|
15
|
+
/**
 * Orchestrator settings used for anything the caller does not override.
 * Durations are in milliseconds.
 */
const DEFAULT_CONFIG = {
    enabled: true,
    maxTotalRetries: 6,
    healthCheckIntervalMs: 60 * 1000, // one minute
    cooldownMs: 5 * 60 * 1000, // five minutes
    preferCostEfficient: false,
    circuitBreaker: {
        failureThreshold: 5,
        successThreshold: 2,
        timeout: 30 * 1000, // thirty seconds
    },
};
/**
 * Status codes that trigger failover when a role does not list its own.
 */
const DEFAULT_FAILOVER_ERRORS = [429, 500, 502, 503, 504];
|
|
31
|
+
// ============================================================================
|
|
32
|
+
// FailoverOrchestrator Class
|
|
33
|
+
// ============================================================================
|
|
34
|
+
/**
|
|
35
|
+
* Orchestrates provider requests with intelligent failover.
|
|
36
|
+
*
|
|
37
|
+
* Features:
|
|
38
|
+
* - Automatic retry with exponential backoff
|
|
39
|
+
* - Multi-provider failover chain
|
|
40
|
+
* - Health tracking and cooldown management
|
|
41
|
+
* - Circuit breaker per provider
|
|
42
|
+
* - Cost-aware provider selection
|
|
43
|
+
* - Failover event logging for diagnostics
|
|
44
|
+
*
|
|
45
|
+
* @example
|
|
46
|
+
* ```typescript
|
|
47
|
+
* const orchestrator = new FailoverOrchestrator(
|
|
48
|
+
* providers,
|
|
49
|
+
* config,
|
|
50
|
+
* logger
|
|
51
|
+
* );
|
|
52
|
+
*
|
|
53
|
+
* // Execute with automatic failover
|
|
54
|
+
* const result = await orchestrator.executeWithFailover(
|
|
55
|
+
* 'coder',
|
|
56
|
+
* messages,
|
|
57
|
+
* { temperature: 0.7 }
|
|
58
|
+
* );
|
|
59
|
+
* ```
|
|
60
|
+
*/
|
|
61
|
+
export class FailoverOrchestrator {
|
|
62
|
+
providers;
|
|
63
|
+
appConfig;
|
|
64
|
+
logger;
|
|
65
|
+
config;
|
|
66
|
+
healthTracker;
|
|
67
|
+
pricingService;
|
|
68
|
+
circuitBreakers = new Map();
|
|
69
|
+
failoverEvents = [];
|
|
70
|
+
maxEventHistory = 100;
|
|
71
|
+
constructor(providers, appConfig, logger, config) {
|
|
72
|
+
this.providers = providers;
|
|
73
|
+
this.appConfig = appConfig;
|
|
74
|
+
this.logger = logger;
|
|
75
|
+
this.config = { ...DEFAULT_CONFIG, ...config };
|
|
76
|
+
// Initialize health tracker
|
|
77
|
+
this.healthTracker = new ProviderHealthTracker(logger, {
|
|
78
|
+
cooldownMs: this.config.cooldownMs,
|
|
79
|
+
healthCheckIntervalMs: this.config.healthCheckIntervalMs,
|
|
80
|
+
});
|
|
81
|
+
// Initialize pricing service
|
|
82
|
+
this.pricingService = new PricingService(logger);
|
|
83
|
+
}
|
|
84
|
+
/**
|
|
85
|
+
* Execute a completion request with automatic failover.
|
|
86
|
+
*/
|
|
87
|
+
async executeWithFailover(role, messages, options) {
|
|
88
|
+
if (!this.config.enabled) {
|
|
89
|
+
// Failover disabled, use primary provider only
|
|
90
|
+
return this.executeSingleProvider(role, messages, options);
|
|
91
|
+
}
|
|
92
|
+
const startTime = Date.now();
|
|
93
|
+
const roleConfig = this.appConfig.roles[role];
|
|
94
|
+
if (!roleConfig) {
|
|
95
|
+
throw new Error(`Unknown role: ${role}`);
|
|
96
|
+
}
|
|
97
|
+
// Build the chain of providers to try
|
|
98
|
+
const providerChain = this.buildProviderChain(role, roleConfig);
|
|
99
|
+
const errors = new Map();
|
|
100
|
+
const attemptedProviders = [];
|
|
101
|
+
let totalRetries = 0;
|
|
102
|
+
let failoverCount = 0;
|
|
103
|
+
// Get failover error codes from config or use defaults
|
|
104
|
+
const failoverErrors = roleConfig.fallback_chain?.on_errors ?? DEFAULT_FAILOVER_ERRORS;
|
|
105
|
+
// Get retry config from role or use defaults
|
|
106
|
+
const retryConfig = roleConfig.fallback_chain?.retry ?? {};
|
|
107
|
+
for (const { provider, model } of providerChain) {
|
|
108
|
+
if (totalRetries >= this.config.maxTotalRetries) {
|
|
109
|
+
this.logger.warn('Max total retries reached', {
|
|
110
|
+
role,
|
|
111
|
+
totalRetries,
|
|
112
|
+
attemptedProviders,
|
|
113
|
+
});
|
|
114
|
+
break;
|
|
115
|
+
}
|
|
116
|
+
// Check if provider is available
|
|
117
|
+
if (!this.healthTracker.isAvailable(provider)) {
|
|
118
|
+
const cooldown = this.healthTracker.getCooldownRemaining(provider);
|
|
119
|
+
this.logger.debug('Skipping provider in cooldown', {
|
|
120
|
+
provider,
|
|
121
|
+
cooldownRemainingMs: cooldown,
|
|
122
|
+
});
|
|
123
|
+
continue;
|
|
124
|
+
}
|
|
125
|
+
// Check circuit breaker
|
|
126
|
+
const breaker = this.getOrCreateCircuitBreaker(provider);
|
|
127
|
+
if (breaker.getState() === 'OPEN') {
|
|
128
|
+
this.logger.debug('Skipping provider with open circuit', { provider });
|
|
129
|
+
continue;
|
|
130
|
+
}
|
|
131
|
+
attemptedProviders.push(provider);
|
|
132
|
+
try {
|
|
133
|
+
const result = await this.executeWithRetry(provider, model, messages, options, retryConfig, failoverErrors);
|
|
134
|
+
const latencyMs = Date.now() - startTime;
|
|
135
|
+
return {
|
|
136
|
+
response: result.response,
|
|
137
|
+
provider,
|
|
138
|
+
model,
|
|
139
|
+
retryCount: result.retryCount,
|
|
140
|
+
failoverCount,
|
|
141
|
+
latencyMs,
|
|
142
|
+
usedFailover: failoverCount > 0,
|
|
143
|
+
};
|
|
144
|
+
}
|
|
145
|
+
catch (error) {
|
|
146
|
+
const err = error instanceof Error ? error : new Error(String(error));
|
|
147
|
+
errors.set(provider, err);
|
|
148
|
+
totalRetries++;
|
|
149
|
+
// Check if this error should trigger failover
|
|
150
|
+
const shouldFailover = this.shouldTriggerFailover(err, failoverErrors);
|
|
151
|
+
if (shouldFailover && providerChain.length > attemptedProviders.length) {
|
|
152
|
+
const nextProvider = this.getNextProvider(providerChain, attemptedProviders);
|
|
153
|
+
if (nextProvider) {
|
|
154
|
+
this.recordFailoverEvent(role, provider, nextProvider.provider, err);
|
|
155
|
+
failoverCount++;
|
|
156
|
+
this.logger.info('Failing over to next provider', {
|
|
157
|
+
role,
|
|
158
|
+
fromProvider: provider,
|
|
159
|
+
toProvider: nextProvider.provider,
|
|
160
|
+
errorMessage: err.message,
|
|
161
|
+
});
|
|
162
|
+
continue;
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
// No more providers or not a failover-triggering error
|
|
166
|
+
throw err;
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
// All providers exhausted
|
|
170
|
+
throw new FailoverExhaustedError(role, attemptedProviders, errors);
|
|
171
|
+
}
|
|
172
|
+
/**
|
|
173
|
+
* Get available providers for a role, sorted by health and optionally cost.
|
|
174
|
+
*/
|
|
175
|
+
getAvailableProviders(role) {
|
|
176
|
+
const roleConfig = this.appConfig.roles[role];
|
|
177
|
+
if (!roleConfig) {
|
|
178
|
+
return [];
|
|
179
|
+
}
|
|
180
|
+
const chain = this.buildProviderChain(role, roleConfig);
|
|
181
|
+
// Filter to only configured and available providers
|
|
182
|
+
const available = chain.filter(({ provider }) => {
|
|
183
|
+
const isConfigured = this.providers.isConfigured(provider);
|
|
184
|
+
const isAvailable = this.healthTracker.isAvailable(provider);
|
|
185
|
+
return isConfigured && isAvailable;
|
|
186
|
+
});
|
|
187
|
+
// Optionally sort by cost
|
|
188
|
+
if (this.config.preferCostEfficient) {
|
|
189
|
+
return this.pricingService.sortByCost(available);
|
|
190
|
+
}
|
|
191
|
+
return available;
|
|
192
|
+
}
|
|
193
|
+
/**
|
|
194
|
+
* Get health status for all tracked providers.
|
|
195
|
+
*/
|
|
196
|
+
getProviderHealth() {
|
|
197
|
+
return this.healthTracker.getAllHealth();
|
|
198
|
+
}
|
|
199
|
+
/**
|
|
200
|
+
* Get recent failover events.
|
|
201
|
+
*/
|
|
202
|
+
getFailoverEvents() {
|
|
203
|
+
return [...this.failoverEvents];
|
|
204
|
+
}
|
|
205
|
+
/**
|
|
206
|
+
* Initialize the orchestrator (fetch pricing, start health checks).
|
|
207
|
+
*/
|
|
208
|
+
async initialize() {
|
|
209
|
+
// Fetch pricing data
|
|
210
|
+
await this.pricingService.refresh();
|
|
211
|
+
// Start health check loop
|
|
212
|
+
this.healthTracker.startHealthCheckLoop(async (provider) => {
|
|
213
|
+
const providerInstance = this.providers.get(provider);
|
|
214
|
+
await providerInstance.healthCheck();
|
|
215
|
+
});
|
|
216
|
+
this.logger.info('Failover orchestrator initialized');
|
|
217
|
+
}
|
|
218
|
+
/**
|
|
219
|
+
* Shutdown the orchestrator (stop health checks, persist state).
|
|
220
|
+
*/
|
|
221
|
+
shutdown() {
|
|
222
|
+
this.healthTracker.stopHealthCheckLoop();
|
|
223
|
+
this.logger.info('Failover orchestrator shut down');
|
|
224
|
+
}
|
|
225
|
+
/**
|
|
226
|
+
* Serialize state for persistence.
|
|
227
|
+
*/
|
|
228
|
+
serializeState() {
|
|
229
|
+
return {
|
|
230
|
+
providerHealth: this.healthTracker.serialize(),
|
|
231
|
+
failoverEvents: this.failoverEvents.map((e) => ({
|
|
232
|
+
timestamp: e.timestamp.getTime(),
|
|
233
|
+
role: e.role,
|
|
234
|
+
fromProvider: e.fromProvider,
|
|
235
|
+
toProvider: e.toProvider,
|
|
236
|
+
reason: e.reason,
|
|
237
|
+
errorCode: e.errorCode,
|
|
238
|
+
errorMessage: e.errorMessage,
|
|
239
|
+
})),
|
|
240
|
+
};
|
|
241
|
+
}
|
|
242
|
+
/**
|
|
243
|
+
* Restore state from persistence.
|
|
244
|
+
*/
|
|
245
|
+
restoreState(state) {
|
|
246
|
+
if (state.providerHealth) {
|
|
247
|
+
this.healthTracker.restore(state.providerHealth);
|
|
248
|
+
}
|
|
249
|
+
if (state.failoverEvents) {
|
|
250
|
+
this.failoverEvents.length = 0;
|
|
251
|
+
for (const e of state.failoverEvents) {
|
|
252
|
+
this.failoverEvents.push({
|
|
253
|
+
timestamp: new Date(e.timestamp),
|
|
254
|
+
role: e.role,
|
|
255
|
+
fromProvider: e.fromProvider,
|
|
256
|
+
toProvider: e.toProvider,
|
|
257
|
+
reason: e.reason,
|
|
258
|
+
errorCode: e.errorCode,
|
|
259
|
+
errorMessage: e.errorMessage,
|
|
260
|
+
});
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
this.logger.info('Failover orchestrator state restored');
|
|
264
|
+
}
|
|
265
|
+
/**
|
|
266
|
+
* Get the pricing service for external use.
|
|
267
|
+
*/
|
|
268
|
+
getPricingService() {
|
|
269
|
+
return this.pricingService;
|
|
270
|
+
}
|
|
271
|
+
/**
|
|
272
|
+
* Get the health tracker for external use.
|
|
273
|
+
*/
|
|
274
|
+
getHealthTracker() {
|
|
275
|
+
return this.healthTracker;
|
|
276
|
+
}
|
|
277
|
+
// ==========================================================================
|
|
278
|
+
// Private Methods
|
|
279
|
+
// ==========================================================================
|
|
280
|
+
/**
|
|
281
|
+
* Build the ordered chain of providers to try for a role.
|
|
282
|
+
*/
|
|
283
|
+
buildProviderChain(role, roleConfig) {
|
|
284
|
+
const chain = [];
|
|
285
|
+
// Primary provider
|
|
286
|
+
chain.push({
|
|
287
|
+
provider: roleConfig.provider,
|
|
288
|
+
model: roleConfig.model,
|
|
289
|
+
});
|
|
290
|
+
// Fallback chain (extended)
|
|
291
|
+
if (roleConfig.fallback_chain?.providers) {
|
|
292
|
+
for (const fb of roleConfig.fallback_chain.providers) {
|
|
293
|
+
chain.push({
|
|
294
|
+
provider: fb.provider,
|
|
295
|
+
model: fb.model,
|
|
296
|
+
});
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
// Legacy single fallback
|
|
300
|
+
if (roleConfig.fallback) {
|
|
301
|
+
// Only add if not already in chain
|
|
302
|
+
const exists = chain.some((p) => p.provider === roleConfig.fallback.provider && p.model === roleConfig.fallback.model);
|
|
303
|
+
if (!exists) {
|
|
304
|
+
chain.push({
|
|
305
|
+
provider: roleConfig.fallback.provider,
|
|
306
|
+
model: roleConfig.fallback.model,
|
|
307
|
+
});
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
return chain;
|
|
311
|
+
}
|
|
312
|
+
/**
|
|
313
|
+
* Execute a single provider request with retry logic.
|
|
314
|
+
*/
|
|
315
|
+
async executeWithRetry(provider, model, messages, options, retryConfig, _failoverErrors) {
|
|
316
|
+
const breaker = this.getOrCreateCircuitBreaker(provider);
|
|
317
|
+
let retryCount = 0;
|
|
318
|
+
const retryOptions = {
|
|
319
|
+
maxAttempts: retryConfig.max_attempts ?? 2,
|
|
320
|
+
initialDelayMs: retryConfig.initial_delay_ms ?? 1000,
|
|
321
|
+
maxDelayMs: retryConfig.max_delay_ms ?? 30000,
|
|
322
|
+
shouldRetry: (error) => isRetryableError(error),
|
|
323
|
+
onRetry: (error, attempt, delayMs) => {
|
|
324
|
+
retryCount++;
|
|
325
|
+
this.logger.debug('Retrying provider request', {
|
|
326
|
+
provider,
|
|
327
|
+
attempt,
|
|
328
|
+
delayMs,
|
|
329
|
+
error: error.message,
|
|
330
|
+
});
|
|
331
|
+
},
|
|
332
|
+
};
|
|
333
|
+
try {
|
|
334
|
+
const response = await retry(async () => {
|
|
335
|
+
const startTime = Date.now();
|
|
336
|
+
try {
|
|
337
|
+
// Execute through circuit breaker
|
|
338
|
+
const result = await breaker.execute(async () => {
|
|
339
|
+
const providerInstance = this.providers.get(provider);
|
|
340
|
+
const request = {
|
|
341
|
+
model,
|
|
342
|
+
messages,
|
|
343
|
+
temperature: options.temperature,
|
|
344
|
+
max_tokens: options.max_tokens,
|
|
345
|
+
timeout_ms: options.timeout_ms,
|
|
346
|
+
};
|
|
347
|
+
return providerInstance.complete(request);
|
|
348
|
+
});
|
|
349
|
+
// Record success
|
|
350
|
+
const latencyMs = Date.now() - startTime;
|
|
351
|
+
this.healthTracker.markSuccess(provider, latencyMs);
|
|
352
|
+
return result;
|
|
353
|
+
}
|
|
354
|
+
catch (error) {
|
|
355
|
+
const latencyMs = Date.now() - startTime;
|
|
356
|
+
const err = error instanceof Error ? error : new Error(String(error));
|
|
357
|
+
// Extract status code if available
|
|
358
|
+
let statusCode;
|
|
359
|
+
if (error instanceof ProviderError) {
|
|
360
|
+
statusCode = error.statusCode;
|
|
361
|
+
}
|
|
362
|
+
else if (error instanceof RateLimitError) {
|
|
363
|
+
statusCode = 429;
|
|
364
|
+
}
|
|
365
|
+
else if (error instanceof CircuitOpenError) {
|
|
366
|
+
// Circuit is open, don't record as failure
|
|
367
|
+
throw error;
|
|
368
|
+
}
|
|
369
|
+
// Record failure
|
|
370
|
+
this.healthTracker.markFailure(provider, err, statusCode);
|
|
371
|
+
throw error;
|
|
372
|
+
}
|
|
373
|
+
}, retryOptions);
|
|
374
|
+
return { response, retryCount };
|
|
375
|
+
}
|
|
376
|
+
catch (error) {
|
|
377
|
+
// Final failure after retries
|
|
378
|
+
throw error;
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
/**
|
|
382
|
+
* Execute without failover (single provider only).
|
|
383
|
+
*/
|
|
384
|
+
async executeSingleProvider(role, messages, options) {
|
|
385
|
+
const startTime = Date.now();
|
|
386
|
+
const roleConfig = this.appConfig.roles[role];
|
|
387
|
+
if (!roleConfig) {
|
|
388
|
+
throw new Error(`Unknown role: ${role}`);
|
|
389
|
+
}
|
|
390
|
+
const providerInstance = this.providers.get(roleConfig.provider);
|
|
391
|
+
const request = {
|
|
392
|
+
model: roleConfig.model,
|
|
393
|
+
messages,
|
|
394
|
+
temperature: options.temperature,
|
|
395
|
+
max_tokens: options.max_tokens,
|
|
396
|
+
timeout_ms: options.timeout_ms,
|
|
397
|
+
};
|
|
398
|
+
const response = await providerInstance.complete(request);
|
|
399
|
+
const latencyMs = Date.now() - startTime;
|
|
400
|
+
return {
|
|
401
|
+
response,
|
|
402
|
+
provider: roleConfig.provider,
|
|
403
|
+
model: roleConfig.model,
|
|
404
|
+
retryCount: 0,
|
|
405
|
+
failoverCount: 0,
|
|
406
|
+
latencyMs,
|
|
407
|
+
usedFailover: false,
|
|
408
|
+
};
|
|
409
|
+
}
|
|
410
|
+
/**
|
|
411
|
+
* Check if an error should trigger failover.
|
|
412
|
+
*/
|
|
413
|
+
shouldTriggerFailover(error, failoverErrors) {
|
|
414
|
+
// Rate limit always triggers failover
|
|
415
|
+
if (error instanceof RateLimitError) {
|
|
416
|
+
return true;
|
|
417
|
+
}
|
|
418
|
+
// Timeout triggers failover
|
|
419
|
+
if (error instanceof TimeoutError) {
|
|
420
|
+
return true;
|
|
421
|
+
}
|
|
422
|
+
// Circuit open triggers failover
|
|
423
|
+
if (error instanceof CircuitOpenError) {
|
|
424
|
+
return true;
|
|
425
|
+
}
|
|
426
|
+
// Check status code
|
|
427
|
+
if (error instanceof ProviderError && error.statusCode !== undefined) {
|
|
428
|
+
return failoverErrors.includes(error.statusCode);
|
|
429
|
+
}
|
|
430
|
+
return false;
|
|
431
|
+
}
|
|
432
|
+
/**
|
|
433
|
+
* Get the next provider in the chain that hasn't been tried.
|
|
434
|
+
*/
|
|
435
|
+
getNextProvider(chain, attempted) {
|
|
436
|
+
for (const p of chain) {
|
|
437
|
+
if (!attempted.includes(p.provider)) {
|
|
438
|
+
return p;
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
return undefined;
|
|
442
|
+
}
|
|
443
|
+
/**
|
|
444
|
+
* Get or create a circuit breaker for a provider.
|
|
445
|
+
*/
|
|
446
|
+
getOrCreateCircuitBreaker(provider) {
|
|
447
|
+
let breaker = this.circuitBreakers.get(provider);
|
|
448
|
+
if (!breaker) {
|
|
449
|
+
breaker = new CircuitBreaker(this.config.circuitBreaker);
|
|
450
|
+
// Log state changes
|
|
451
|
+
breaker.onStateChange((prev, next, meta) => {
|
|
452
|
+
this.logger.info('Circuit breaker state change', {
|
|
453
|
+
provider,
|
|
454
|
+
previousState: prev,
|
|
455
|
+
newState: next,
|
|
456
|
+
failureCount: meta.failureCount,
|
|
457
|
+
});
|
|
458
|
+
});
|
|
459
|
+
this.circuitBreakers.set(provider, breaker);
|
|
460
|
+
}
|
|
461
|
+
return breaker;
|
|
462
|
+
}
|
|
463
|
+
/**
|
|
464
|
+
* Record a failover event for diagnostics.
|
|
465
|
+
*/
|
|
466
|
+
recordFailoverEvent(role, fromProvider, toProvider, error) {
|
|
467
|
+
const event = {
|
|
468
|
+
timestamp: new Date(),
|
|
469
|
+
role,
|
|
470
|
+
fromProvider,
|
|
471
|
+
toProvider,
|
|
472
|
+
reason: error.message,
|
|
473
|
+
};
|
|
474
|
+
if (error instanceof ProviderError) {
|
|
475
|
+
event.errorCode = error.statusCode;
|
|
476
|
+
}
|
|
477
|
+
else if (error instanceof RateLimitError) {
|
|
478
|
+
event.errorCode = 429;
|
|
479
|
+
}
|
|
480
|
+
event.errorMessage = error.message;
|
|
481
|
+
this.failoverEvents.push(event);
|
|
482
|
+
// Trim history
|
|
483
|
+
while (this.failoverEvents.length > this.maxEventHistory) {
|
|
484
|
+
this.failoverEvents.shift();
|
|
485
|
+
}
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
//# sourceMappingURL=orchestrator.js.map
|