ai-inference-stepper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +169 -0
- package/.eslintrc.cjs +23 -0
- package/.github/workflows/ci.yml +51 -0
- package/.github/workflows/keep-alive.yml +22 -0
- package/.github/workflows/publish.yml +34 -0
- package/ARCHITECTURE.md +594 -0
- package/Dockerfile +16 -0
- package/LICENSE +28 -0
- package/README.md +261 -0
- package/dist/alerts/discord.d.ts +19 -0
- package/dist/alerts/discord.d.ts.map +1 -0
- package/dist/alerts/discord.js +70 -0
- package/dist/alerts/discord.js.map +1 -0
- package/dist/cache/redisCache.d.ts +45 -0
- package/dist/cache/redisCache.d.ts.map +1 -0
- package/dist/cache/redisCache.js +171 -0
- package/dist/cache/redisCache.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +8 -0
- package/dist/cli.js.map +1 -0
- package/dist/config.d.ts +6 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +251 -0
- package/dist/config.js.map +1 -0
- package/dist/fallback/templateFallback.d.ts +7 -0
- package/dist/fallback/templateFallback.d.ts.map +1 -0
- package/dist/fallback/templateFallback.js +29 -0
- package/dist/fallback/templateFallback.js.map +1 -0
- package/dist/index.d.ts +121 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +198 -0
- package/dist/index.js.map +1 -0
- package/dist/logging.d.ts +10 -0
- package/dist/logging.d.ts.map +1 -0
- package/dist/logging.js +44 -0
- package/dist/logging.js.map +1 -0
- package/dist/metrics/metrics.d.ts +22 -0
- package/dist/metrics/metrics.d.ts.map +1 -0
- package/dist/metrics/metrics.js +78 -0
- package/dist/metrics/metrics.js.map +1 -0
- package/dist/providers/factory.d.ts +11 -0
- package/dist/providers/factory.d.ts.map +1 -0
- package/dist/providers/factory.js +52 -0
- package/dist/providers/factory.js.map +1 -0
- package/dist/providers/hfSpace.adapter.d.ts +21 -0
- package/dist/providers/hfSpace.adapter.d.ts.map +1 -0
- package/dist/providers/hfSpace.adapter.js +110 -0
- package/dist/providers/hfSpace.adapter.js.map +1 -0
- package/dist/providers/httpTemplate.adapter.d.ts +42 -0
- package/dist/providers/httpTemplate.adapter.d.ts.map +1 -0
- package/dist/providers/httpTemplate.adapter.js +98 -0
- package/dist/providers/httpTemplate.adapter.js.map +1 -0
- package/dist/providers/promptBuilder.d.ts +34 -0
- package/dist/providers/promptBuilder.d.ts.map +1 -0
- package/dist/providers/promptBuilder.js +315 -0
- package/dist/providers/promptBuilder.js.map +1 -0
- package/dist/providers/provider.interface.d.ts +45 -0
- package/dist/providers/provider.interface.d.ts.map +1 -0
- package/dist/providers/provider.interface.js +47 -0
- package/dist/providers/provider.interface.js.map +1 -0
- package/dist/providers/specs.d.ts +18 -0
- package/dist/providers/specs.d.ts.map +1 -0
- package/dist/providers/specs.js +326 -0
- package/dist/providers/specs.js.map +1 -0
- package/dist/providers/unified.adapter.d.ts +37 -0
- package/dist/providers/unified.adapter.d.ts.map +1 -0
- package/dist/providers/unified.adapter.js +141 -0
- package/dist/providers/unified.adapter.js.map +1 -0
- package/dist/queue/producer.d.ts +30 -0
- package/dist/queue/producer.d.ts.map +1 -0
- package/dist/queue/producer.js +87 -0
- package/dist/queue/producer.js.map +1 -0
- package/dist/queue/worker.d.ts +9 -0
- package/dist/queue/worker.d.ts.map +1 -0
- package/dist/queue/worker.js +137 -0
- package/dist/queue/worker.js.map +1 -0
- package/dist/server/app.d.ts +4 -0
- package/dist/server/app.d.ts.map +1 -0
- package/dist/server/app.js +394 -0
- package/dist/server/app.js.map +1 -0
- package/dist/server/start.d.ts +16 -0
- package/dist/server/start.d.ts.map +1 -0
- package/dist/server/start.js +45 -0
- package/dist/server/start.js.map +1 -0
- package/dist/stepper/orchestrator.d.ts +22 -0
- package/dist/stepper/orchestrator.d.ts.map +1 -0
- package/dist/stepper/orchestrator.js +333 -0
- package/dist/stepper/orchestrator.js.map +1 -0
- package/dist/types.d.ts +216 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +14 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/redaction.d.ts +9 -0
- package/dist/utils/redaction.d.ts.map +1 -0
- package/dist/utils/redaction.js +41 -0
- package/dist/utils/redaction.js.map +1 -0
- package/dist/utils/safeRequest.d.ts +38 -0
- package/dist/utils/safeRequest.d.ts.map +1 -0
- package/dist/utils/safeRequest.js +104 -0
- package/dist/utils/safeRequest.js.map +1 -0
- package/dist/validation/report.schema.d.ts +48 -0
- package/dist/validation/report.schema.d.ts.map +1 -0
- package/dist/validation/report.schema.js +72 -0
- package/dist/validation/report.schema.js.map +1 -0
- package/dist/webhooks/delivery.d.ts +31 -0
- package/dist/webhooks/delivery.d.ts.map +1 -0
- package/dist/webhooks/delivery.js +102 -0
- package/dist/webhooks/delivery.js.map +1 -0
- package/docs/assets/architecture.png +0 -0
- package/package.json +75 -0
- package/render.yaml +25 -0
- package/src/alerts/README.md +25 -0
- package/src/alerts/discord.ts +86 -0
- package/src/cache/How redis caching works in package stepper.md +971 -0
- package/src/cache/README.md +51 -0
- package/src/cache/redisCache.ts +194 -0
- package/src/ci/deploy.sh +36 -0
- package/src/cli.ts +9 -0
- package/src/config.ts +265 -0
- package/src/fallback/templateFallback.ts +32 -0
- package/src/index.ts +246 -0
- package/src/logging.ts +46 -0
- package/src/metrics/README.md +24 -0
- package/src/metrics/metrics.ts +84 -0
- package/src/providers/How the providers interact.md +121 -0
- package/src/providers/README.md +121 -0
- package/src/providers/factory.ts +57 -0
- package/src/providers/hfSpace.adapter.ts +119 -0
- package/src/providers/httpTemplate.adapter.ts +138 -0
- package/src/providers/promptBuilder.ts +330 -0
- package/src/providers/provider.interface.ts +73 -0
- package/src/providers/specs.ts +366 -0
- package/src/providers/unified.adapter.ts +172 -0
- package/src/queue/How queue works in package stepper.md +149 -0
- package/src/queue/README.md +41 -0
- package/src/queue/producer.ts +108 -0
- package/src/queue/worker.ts +170 -0
- package/src/server/app.ts +451 -0
- package/src/server/start.ts +68 -0
- package/src/stepper/Dockerfile +48 -0
- package/src/stepper/How orchestrator works in package stepper.md +746 -0
- package/src/stepper/README.md +43 -0
- package/src/stepper/orchestrator.ts +437 -0
- package/src/types.ts +238 -0
- package/src/utils/redaction.ts +50 -0
- package/src/utils/safeRequest.ts +140 -0
- package/src/validation/README.md +25 -0
- package/src/validation/report.schema.ts +96 -0
- package/src/webhooks/delivery.ts +162 -0
- package/tests/integration/full-flow.test.ts +192 -0
- package/tests/unit/alerts/discord.test.ts +119 -0
- package/tests/unit/cache.test.ts +87 -0
- package/tests/unit/orchestrator-fallback.test.ts +92 -0
- package/tests/unit/orchestrator.test.ts +105 -0
- package/tests/unit/providers/factory.test.ts +161 -0
- package/tests/unit/providers/unified.adapter.test.ts +206 -0
- package/tests/unit/utils/redaction.test.ts +140 -0
- package/tests/unit/utils/safeRequest.test.ts +164 -0
- package/tsconfig.json +26 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# 🎼 Inference Orchestrator
|
|
2
|
+
|
|
3
|
+
The Orchestrator is the "Head Chef" of the package. It coordinates the various AI providers, handles retries, and ensures that the user always receives a report, even if multiple providers fail.
|
|
4
|
+
|
|
5
|
+
## 🎯 Purpose
|
|
6
|
+
|
|
7
|
+
- **Reliability**: Implements a rotation of providers. If one fails, it tries the next.
|
|
8
|
+
- **Resilience**: Uses **Circuit Breakers** and **Exponential Backoff**.
|
|
9
|
+
- **Efficiency**: Respects rate limits via **Bottleneck** (e.g., max 5 requests per minute).
|
|
10
|
+
|
|
11
|
+
## 🛡️ Resilience Strategies
|
|
12
|
+
|
|
13
|
+
### 1. Circuit Breaker
|
|
14
|
+
|
|
15
|
+
If a provider fails more than 50% of the time, the "circuit flips open." The Orchestrator will stop sending requests to that provider for 5 minutes to give it time to recover.
|
|
16
|
+
|
|
17
|
+
### 2. Smart Retries
|
|
18
|
+
|
|
19
|
+
When a provider fails with a temporary error (like a network blip), the Orchestrator waits before trying again:
|
|
20
|
+
|
|
21
|
+
- **Base Delay**: 40 seconds.
|
|
22
|
+
- **Exponential Backoff**: Each retry waits longer than the last.
|
|
23
|
+
- **Jitter**: Adds randomness to avoid "thundering herd" problems.
|
|
24
|
+
|
|
25
|
+
### 3. Rate Limiting
|
|
26
|
+
|
|
27
|
+
Controls the flow of requests.
|
|
28
|
+
|
|
29
|
+
- **Requests Per Minute (RPM)**: Default is 5.
|
|
30
|
+
- **Concurrency**: Default is 2 simultaneous requests.
|
|
31
|
+
|
|
32
|
+
### 4. Fail-safe Fallback
|
|
33
|
+
|
|
34
|
+
If _all_ AI providers are down or time out (after 1 minute), the Orchestrator generates a generic, high-quality template report based on the commit message. This ensures the user is never left with an empty result.
|
|
35
|
+
|
|
36
|
+
## 📋 Core Functions
|
|
37
|
+
|
|
38
|
+
| Function | Description |
|
|
39
|
+
| ----------------------- | ----------------------------------------------------------------------- |
|
|
40
|
+
| `generateReportNow()` | The high-level entry point that manages the entire multi-provider flow. |
|
|
41
|
+
| `initializeProviders()` | Sets up the rate limiters and circuit breakers for each service. |
|
|
42
|
+
| `callWithRetries()` | Handles the low-level retry logic for a single provider. |
|
|
43
|
+
| `getProviderHealth()` | Returns the current status (Healthy/Broken) of all AI services. |
|
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
|
|
2
|
+
// packages/stepper/src/stepper/orchestrator.ts
|
|
3
|
+
|
|
4
|
+
import Bottleneck from 'bottleneck';
|
|
5
|
+
import CircuitBreaker from 'opossum';
|
|
6
|
+
import { ProviderAdapter, ProviderError, AuthError, RateLimitError } from '../providers/provider.interface.js';
|
|
7
|
+
import { createProviderAdapter } from '../providers/factory.js';
|
|
8
|
+
import { PromptInput, ReportOutput, ProviderResult, ProviderAttemptMeta, StepperCallbacks, ProviderConfig, WebhookCallback } from '../types.js';
|
|
9
|
+
import { config } from '../config.js';
|
|
10
|
+
import { logger, createChildLogger } from '../logging.js';
|
|
11
|
+
import { generateTemplateFallback } from '../fallback/templateFallback.js';
|
|
12
|
+
import { recordProviderAttempt, recordProviderSuccess, recordProviderFailure } from '../metrics/metrics.js';
|
|
13
|
+
import { isRetryableError } from '../utils/safeRequest.js';
|
|
14
|
+
import { alertProviderFailure, alertCircuitOpen } from '../alerts/discord.js';
|
|
15
|
+
|
|
16
|
+
/**
 * Runtime bundle kept for one enabled provider: the adapter together with the
 * rate limiter, circuit breaker and bookkeeping the orchestrator uses for it.
 */
interface ProviderWithLimiter {
  /** Adapter returned by createProviderAdapter(); its call() is the circuit's action. */
  adapter: ProviderAdapter;
  /** Bottleneck limiter through which calls to this provider are scheduled. */
  limiter: Bottleneck;
  /** Opossum circuit breaker wrapping `adapter.call(input)`. */
  circuit: CircuitBreaker;
  /** The configuration entry this bundle was built from. */
  config: ProviderConfig;
  /** Failed generateReportNow() attempts since this provider last succeeded. */
  consecutiveErrors: number;
}
|
|
23
|
+
|
|
24
|
+
// Enabled providers in the order they are tried; replaced by initializeProviders().
let providers: ProviderWithLimiter[] = [];
// Lifecycle callbacks; registerCallbacks() merges new handlers into this object.
let callbacks: StepperCallbacks = {};
|
|
26
|
+
|
|
27
|
+
/**
 * Build the module-level provider list from configuration.
 *
 * Each enabled entry receives its own adapter, Bottleneck limiter and Opossum
 * circuit breaker. The existing list is only replaced after every entry has
 * been constructed, so a thrown error leaves the previous list untouched.
 *
 * @param providerConfigs - Provider definitions; defaults to `config.providers`.
 * @throws Error when the factory yields no adapter for an enabled provider.
 */
export function initializeProviders(providerConfigs: ProviderConfig[] = config.providers): void {
  const built: ProviderWithLimiter[] = [];

  for (const pc of providerConfigs) {
    if (!pc.enabled) {
      continue;
    }

    const adapter = createProviderAdapter(pc);
    if (!adapter) {
      throw new Error(`Failed to create adapter for provider ${pc.name}`);
    }

    // Minimum spacing between requests in ms. RPS takes precedence when set;
    // otherwise RPM is used (5 when unset), e.g. 5 RPM -> 60000 / 5 = 12000ms.
    let minTime: number;
    if (pc.rateLimitRPS) {
      minTime = 1000 / pc.rateLimitRPS;
    } else {
      minTime = 60000 / (pc.rateLimitRPM || 5);
    }

    const limiter = new Bottleneck({
      maxConcurrent: pc.concurrency,
      minTime: Math.ceil(minTime),
    });

    // The breaker's action is a single adapter call; thresholds come from config.circuit.
    const circuit = new CircuitBreaker(async (input: PromptInput) => adapter.call(input), {
      timeout: pc.timeout || 15000,
      errorThresholdPercentage: 50,
      resetTimeout: config.circuit.cooldownSeconds * 1000,
      volumeThreshold: config.circuit.failureThreshold,
      rollingCountTimeout: config.circuit.windowSeconds * 1000,
    });

    // State transitions are logged; only "open" additionally raises an alert.
    circuit.on('open', () => {
      logger.warn({ provider: pc.name }, 'Circuit breaker opened');
      void alertCircuitOpen(pc.name);
    });
    circuit.on('halfOpen', () => {
      logger.info({ provider: pc.name }, 'Circuit breaker half-open, trying probe');
    });
    circuit.on('close', () => {
      logger.info({ provider: pc.name }, 'Circuit breaker closed');
    });

    built.push({ adapter, limiter, circuit, config: pc, consecutiveErrors: 0 });
  }

  providers = built;

  logger.info({ providerCount: providers.length, names: providers.map((p) => p.config.name) }, 'Providers initialized');
}
|
|
82
|
+
|
|
83
|
+
/**
 * Merge lifecycle callbacks into the module-level registry.
 *
 * Handlers supplied here override previously registered handlers of the same
 * name; handlers not mentioned are kept.
 *
 * @param cbs - Callbacks to add or replace.
 */
export function registerCallbacks(cbs: StepperCallbacks): void {
  const merged: StepperCallbacks = Object.assign({}, callbacks, cbs);
  callbacks = merged;
}
|
|
89
|
+
|
|
90
|
+
/**
 * Compute the wait before the next retry: exponential backoff plus jitter.
 *
 * @param attempt - Zero-based attempt index; the base delay is doubled once per attempt.
 * @returns `config.retry.baseDelayMs * 2^attempt` plus a random whole number of
 *          milliseconds below `config.retry.maxJitterMs`.
 */
function getBackoffDelay(attempt: number): number {
  const exponential = config.retry.baseDelayMs * Math.pow(2, attempt);
  const randomExtra = Math.floor(Math.random() * config.retry.maxJitterMs);
  return exponential + randomExtra;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Resolve after the given number of milliseconds.
 *
 * @param ms - Delay passed to setTimeout.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
|
105
|
+
|
|
106
|
+
/**
 * Outcome of delivering a payload to one webhook callback URL.
 */
interface CallbackResult {
  /** The callback URL the payload was sent to. */
  url: string;
  /** True only when the endpoint answered with an OK (2xx) response. */
  success: boolean;
  /** HTTP status of the last response; absent when no response was received. */
  statusCode?: number;
  /** Error message when the request itself failed (e.g. network error or timeout). */
  error?: string;
}
|
|
115
|
+
|
|
116
|
+
/**
 * POST `payload` as JSON to a single webhook URL, retrying transient failures.
 *
 * Stepper remains agnostic: the raw payload is serialized and sent as-is.
 * A response with status >= 500 or 429, or a thrown request error (network
 * failure, 10s timeout), is retried with exponential backoff until
 * `maxAttempts` is reached. Any other non-OK status fails immediately.
 *
 * The response body is never needed, so it is cancelled as soon as the status
 * has been read; leaving it unread can keep the underlying connection busy.
 *
 * @param callback - Target URL, optional extra headers (these override the
 *                   defaults) and optional retry settings (default: 3 attempts,
 *                   1000ms initial backoff).
 * @param payload  - Value serialized with JSON.stringify as the request body.
 * @returns The delivery outcome; this function reports failures in the result
 *          instead of throwing.
 */
async function sendCallback(
  callback: WebhookCallback,
  payload: unknown
): Promise<CallbackResult> {
  const maxAttempts = callback.retry?.maxAttempts ?? 3;
  const backoffMs = callback.retry?.backoffMs ?? 1000;

  // Delay before the retry that follows 1-based `attempt`: backoffMs, 2x, 4x, ...
  const retryDelay = (attempt: number): number => backoffMs * Math.pow(2, attempt - 1);

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      const response = await fetch(callback.url, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'User-Agent': 'Stepper/1.0',
          'X-Stepper-Timestamp': Date.now().toString(),
          ...callback.headers,
        },
        body: JSON.stringify(payload),
        signal: AbortSignal.timeout(10000),
      });

      const { ok, status } = response;

      // Only the status matters here; release the unread body right away.
      // A failure to cancel must not turn a delivered callback into an error.
      await response.body?.cancel().catch(() => undefined);

      if (ok) {
        logger.info({ url: callback.url, attempt }, 'Callback succeeded');
        return { url: callback.url, success: true, statusCode: status };
      }

      // Retry on server errors or rate limits
      if ((status >= 500 || status === 429) && attempt < maxAttempts) {
        const delay = retryDelay(attempt);
        logger.warn({ url: callback.url, status, delay }, 'Retrying callback');
        await sleep(delay);
        continue;
      }

      logger.error({ url: callback.url, status }, 'Callback failed');
      return { url: callback.url, success: false, statusCode: status };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);

      if (attempt < maxAttempts) {
        const delay = retryDelay(attempt);
        logger.warn({ url: callback.url, error: message, delay }, 'Callback error, retrying');
        await sleep(delay);
        continue;
      }

      return {
        url: callback.url,
        success: false,
        error: message,
      };
    }
  }

  // Only reachable when maxAttempts is less than 1 and the loop never runs.
  return { url: callback.url, success: false, error: 'Max attempts exceeded' };
}
|
|
173
|
+
|
|
174
|
+
/**
 * Deliver the raw result payload to each configured webhook, in order.
 *
 * Delivery is sequential. When a callback fails and it is not marked
 * `continueOnFailure`, the remaining callbacks are not attempted.
 *
 * @param callbacks - Webhooks to notify, in delivery order.
 * @param payload   - Result envelope sent unchanged to every webhook.
 * @returns One result per callback that was actually attempted.
 */
async function executeCallbacks(
  callbacks: WebhookCallback[],
  payload: {
    success: boolean;
    result?: ReportOutput;
    error?: string;
    metadata: {
      jobId: string;
      userId: string;
      commitSha: string;
      repo: string;
      provider?: string;
      generationTimeMs?: number;
      timestamp: string;
    };
  }
): Promise<CallbackResult[]> {
  const outcomes: CallbackResult[] = [];

  for (const hook of callbacks) {
    const outcome = await sendCallback(hook, payload);
    outcomes.push(outcome);

    // Keep going when delivery worked or the hook tolerates failure.
    if (outcome.success || hook.continueOnFailure) {
      continue;
    }

    logger.warn({ url: hook.url }, 'Callback failed, stopping chain');
    break;
  }

  return outcomes;
}
|
|
209
|
+
|
|
210
|
+
/**
 * Call one provider through its circuit breaker, retrying transient failures.
 *
 * Up to `config.retry.maxAttemptsPerProvider` attempts are made:
 * - AuthError is rethrown immediately (never retried).
 * - RateLimitError waits `retryAfter` seconds (or the configured fallback)
 *   and retries, unless this was the last attempt.
 * - Errors accepted by isRetryableError() are retried after getBackoffDelay().
 * - Anything else, or any error on the last attempt, is rethrown.
 *
 * @param provider - Provider bundle whose circuit is fired.
 * @param input    - Prompt input forwarded to the provider.
 * @param jobId    - Job identifier, used for log context only.
 * @returns The provider's report and the duration of the successful attempt.
 * @throws The last provider error, or Error('Max retries exceeded') when the
 *         loop finishes without returning or throwing (maxAttempts < 1).
 */
async function callWithRetries(
  provider: ProviderWithLimiter,
  input: PromptInput,
  jobId: string
): Promise<{ result: ReportOutput; durationMs: number }> {
  const maxAttempts = config.retry.maxAttemptsPerProvider;
  const log = createChildLogger({ provider: provider.config.name, jobId });

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    // Timed per attempt, so durationMs excludes earlier attempts and sleeps.
    const startTime = Date.now();

    try {
      // Use circuit breaker - the result type from Opossum is unknown, but we know
      // it returns ReportOutput since the circuit wraps adapter.call(input)
      const result = await provider.circuit.fire(input) as ReportOutput;
      const durationMs = Date.now() - startTime;

      log.debug({ attempt, durationMs }, 'Provider call succeeded');
      return { result, durationMs };
    } catch (error) {
      const durationMs = Date.now() - startTime;
      log.warn({ attempt, error: error instanceof Error ? error.message : String(error), durationMs }, 'Provider call failed');

      // Don't retry auth errors
      if (error instanceof AuthError) {
        log.error({ error: error.message }, 'Auth error - stopping retries');
        throw error;
      }

      // Handle rate limits with Retry-After
      if (error instanceof RateLimitError) {
        // Use the wait time (seconds) carried by the error, or fall back to
        // config.retry.rateLimitFallbackSeconds when it is missing or 0.
        // NOTE(review): the configured default is not visible here - confirm in config.ts.
        const retryAfter = error.retryAfter || config.retry.rateLimitFallbackSeconds;
        log.info({ retryAfterSeconds: retryAfter, attempt }, 'Rate limited, backing off');

        // Only wait when another attempt remains; otherwise surface the error.
        if (attempt < maxAttempts - 1) {
          await sleep(retryAfter * 1000);
          continue;
        }
        throw error;
      }

      // Retry on retryable errors
      if (isRetryableError(error) && attempt < maxAttempts - 1) {
        const delay = getBackoffDelay(attempt);
        log.debug({ delay, attempt }, 'Retrying after backoff');
        await sleep(delay);
        continue;
      }

      // Non-retryable error, or retryable error on the final attempt.
      throw error;
    }
  }

  throw new Error('Max retries exceeded');
}
|
|
269
|
+
|
|
270
|
+
/**
 * Invoke a registered lifecycle callback without letting it break the caller.
 *
 * Does nothing when no handler is registered under `name`. A handler that
 * throws or rejects is logged and otherwise ignored.
 *
 * @param name - Key of the lifecycle callback to run.
 * @param args - Arguments forwarded to the handler.
 */
async function invokeCallback<T extends keyof StepperCallbacks>(
  name: T,
  ...args: Parameters<NonNullable<StepperCallbacks[T]>>
): Promise<void> {
  const handler = callbacks[name];

  if (handler) {
    try {
      await (handler as (...args: unknown[]) => void | Promise<void>)(...args);
    } catch (error) {
      logger.error({ callback: name, error }, 'Callback threw error');
    }
  }
}
|
|
286
|
+
|
|
287
|
+
/**
 * Generate a report by trying each configured provider in order.
 *
 * Providers whose circuit is open are skipped. The first provider that
 * succeeds wins: its result is returned and any webhooks in `input.callbacks`
 * are fired in the background (not awaited). When every provider fails or is
 * skipped, a template fallback report is returned if `config.fallback.enabled`
 * is set; otherwise an error is thrown.
 *
 * Lifecycle callbacks fired: onStart, onProviderAttempt, onSuccess, onFallback.
 *
 * @param input - Commit data for the report, plus optional webhook callbacks.
 * @param jobId - Identifier used in logs, callbacks and webhook metadata.
 * @returns The report together with provider/attempt metadata and timings.
 * @throws Error when all providers fail and fallback is disabled.
 */
export async function generateReportNow(input: PromptInput, jobId: string = 'immediate'): Promise<ProviderResult> {
  const log = createChildLogger({ jobId, userId: input.userId, commitSha: input.commitSha });
  const startTime = Date.now();
  const providersAttempted: ProviderAttemptMeta[] = [];

  await invokeCallback('onStart', jobId, input);

  // Ensure providers are initialized
  if (providers.length === 0) {
    initializeProviders();
  }

  // Try each provider in order
  for (const provider of providers) {
    const providerName = provider.config.name;

    // Check circuit breaker state
    if (provider.circuit.opened) {
      log.info({ provider: providerName }, 'Skipping provider - circuit open');
      providersAttempted.push({
        provider: providerName,
        attemptNumber: 0,
        skipped: 'circuit_open',
      });
      continue;
    }

    // Each provider is tried once at this level; per-provider retries happen
    // inside callWithRetries().
    const attemptNumber = 1;

    try {
      log.info({ provider: providerName, attempt: attemptNumber }, 'Attempting provider');

      await invokeCallback('onProviderAttempt', jobId, providerName, attemptNumber, {
        provider: providerName,
        attemptNumber,
      });

      recordProviderAttempt(providerName);

      // Schedule with rate limiter and call with retries
      const { result, durationMs } = await provider.limiter.schedule(() =>
        callWithRetries(provider, input, jobId)
      );

      // Success!
      provider.consecutiveErrors = 0;
      const totalMs = Date.now() - startTime;
      log.info({ provider: providerName, totalMs, providerMs: durationMs }, 'Report generated successfully');

      recordProviderSuccess(providerName, durationMs);
      providersAttempted.push({
        provider: providerName,
        attemptNumber,
        durationMs,
      });

      await invokeCallback('onSuccess', jobId, providerName, result, {
        timings: { totalMs, providerMs: durationMs },
      });

      // Execute configured callbacks immediately after success
      // This ensures delivery even if subsequent DB operations fail
      if (input.callbacks && input.callbacks.length > 0) {
        const callbackPayload = {
          success: true,
          result,
          metadata: {
            jobId,
            userId: input.userId,
            commitSha: input.commitSha,
            repo: input.repo,
            provider: providerName,
            generationTimeMs: totalMs,
            timestamp: new Date().toISOString(),
          },
        };

        // Deliberately not awaited: webhook delivery must not delay the result.
        executeCallbacks(input.callbacks, callbackPayload)
          .then((callbackResults: CallbackResult[]) => {
            log.info({ callbackResults: callbackResults.map(r => ({ url: r.url, success: r.success })) }, 'Callbacks executed');
          })
          .catch((err: unknown) => {
            log.error({ error: err instanceof Error ? err.message : String(err) }, 'Callbacks execution error');
          });
      }

      return {
        result,
        usedProvider: providerName,
        providersAttempted,
        fallback: false,
        timings: { totalMs, providerMs: durationMs },
      };
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      const errorCode = error instanceof ProviderError ? error.type : 'UNKNOWN';

      log.warn({ provider: providerName, error: errorMessage, errorCode }, 'Provider failed');

      // Update consecutive errors
      provider.consecutiveErrors += 1;

      recordProviderFailure(providerName, errorCode);
      providersAttempted.push({
        provider: providerName,
        attemptNumber,
        error: errorMessage,
        errorCode,
      });

      // Alert on failure
      void alertProviderFailure(providerName, provider.consecutiveErrors, error);

      // Fall through to the next provider in the list.
    }
  }

  // All providers failed (or were skipped because their circuit is open)
  const totalMs = Date.now() - startTime;

  if (!config.fallback.enabled) {
    log.error({ totalMs, providersAttempted: providersAttempted.length }, 'All providers failed, job will be retried');
    throw new Error(`All ${providersAttempted.length} provider(s) failed. Job will retry.`);
  }

  // With fallback enabled a template report is returned instead of throwing,
  // so the log must not claim that the job will be retried.
  log.error({ totalMs, providersAttempted: providersAttempted.length }, 'All providers failed, using template fallback');

  const fallbackResult = generateTemplateFallback(input);
  await invokeCallback('onFallback', jobId, fallbackResult, { providersAttempted });
  return {
    result: fallbackResult,
    usedProvider: 'fallback',
    providersAttempted,
    fallback: true,
    timings: { totalMs },
  };
}
|
|
427
|
+
|
|
428
|
+
/**
 * Report the circuit state of every initialized provider.
 *
 * @returns One entry per provider; `healthy` is the negation of `circuitOpen`.
 */
export function getProviderHealth(): Array<{ name: string; circuitOpen: boolean; healthy: boolean }> {
  const report: Array<{ name: string; circuitOpen: boolean; healthy: boolean }> = [];

  for (const entry of providers) {
    const isOpen = entry.circuit.opened;
    report.push({
      name: entry.config.name,
      circuitOpen: isOpen,
      healthy: !isOpen,
    });
  }

  return report;
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
// types.ts - Stepper Type Definitions
|
|
2
|
+
|
|
3
|
+
/**
 * Generic webhook callback configuration.
 * Stepper sends raw results to these URLs - callers handle transformation.
 */
export interface WebhookCallback {
  /** Callback URL to send results */
  url: string;
  /**
   * Custom headers (auth tokens, content-type, etc.).
   * In the orchestrator these are spread after the default headers, so they
   * override defaults with the same name.
   */
  headers?: Record<string, string>;
  /** Continue to next callback even if this one fails */
  continueOnFailure?: boolean;
  /** Retry configuration */
  retry?: {
    /** Total number of delivery attempts (the orchestrator uses 3 when `retry` is omitted). */
    maxAttempts: number;
    /** Delay before the first retry in ms, doubled for each further retry (1000 when `retry` is omitted). */
    backoffMs: number;
  };
}
|
|
20
|
+
|
|
21
|
+
/**
 * Input to generate a commit report.
 */
export interface PromptInput {
  /** User the report is generated for; included in logs and webhook metadata. */
  userId: string;
  /** Commit SHA the report describes; included in logs and webhook metadata. */
  commitSha: string;
  /** Repository identifier; included in webhook metadata. */
  repo: string;
  /** Commit message. */
  message: string;
  /** Files touched by the commit. NOTE(review): presumably paths - confirm with caller. */
  files: string[];
  /** Components affected by the commit. */
  components: string[];
  /** Summary of the diff. */
  diffSummary: string;
  /** Optional template. NOTE(review): its use is not visible in this file - confirm in promptBuilder. */
  template?: string;

  /**
   * Multiple webhook callbacks for resilience.
   * Stepper will call each in order, sending the raw result.
   * Use continueOnFailure: true to ensure all callbacks are attempted.
   */
  callbacks?: WebhookCallback[];
}
|
|
41
|
+
|
|
42
|
+
/**
 * Structured report output from AI providers.
 * The template fallback produces the same shape.
 */
export interface ReportOutput {
  /** Report title. */
  title: string;
  /** Summary of the commit. */
  summary: string;
  /** Individual changes, one entry per item. */
  changes: string[];
  /** Why the change was made. */
  rationale: string;
  /** Impact of the change and related testing notes, as a single text field. */
  impact_and_tests: string;
  /** Suggested follow-up actions, one entry per item. */
  next_steps: string[];
  /** Tags as a single string. NOTE(review): separator format is not visible here - confirm. */
  tags: string;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Result of a report generation run across the configured providers.
 */
export interface ProviderResult {
  /** The generated report (provider output or template fallback). */
  result: ReportOutput;
  /** Name of the provider that produced `result`; 'fallback' for the template fallback. */
  usedProvider: string;
  /** Every provider tried or skipped during the run, in order. */
  providersAttempted: ProviderAttemptMeta[];
  /** True when `result` comes from the template fallback instead of a provider. */
  fallback: boolean;
  timings: {
    /** Wall-clock time of the whole run in ms. */
    totalMs: number;
    /** Duration of the successful provider call in ms; absent on fallback. */
    providerMs?: number;
  };
}
|
|
68
|
+
|
|
69
|
+
/**
 * Metadata for each provider attempt.
 */
export interface ProviderAttemptMeta {
  /** Provider name. */
  provider: string;
  /** Attempt counter; the orchestrator records 0 for providers it skipped. */
  attemptNumber: number;
  /** Error message when the attempt failed. */
  error?: string;
  /** Error classification: the ProviderError type, or 'UNKNOWN' for other errors. */
  errorCode?: string;
  /** Duration of the successful call in ms. */
  durationMs?: number;
  /** Reason the provider was not called, e.g. 'circuit_open'. */
  skipped?: string;
}
|
|
80
|
+
|
|
81
|
+
/**
 * Cache entry structure.
 * NOTE(review): the exact meaning of each status is defined by the cache
 * module, which is not visible here - confirm against redisCache.ts.
 */
export interface CacheEntry {
  /** Lifecycle state of the entry. */
  status: 'hydrated' | 'dehydrated' | 'failed';
  /** Cached report, when one is available. */
  result?: ReportOutput;
  /** Job associated with this entry, if any. */
  jobId?: string;
  /** Provider attempts recorded for the job that produced this entry. */
  providersAttempted?: ProviderAttemptMeta[];
  timestamps: {
    /** Creation time. NOTE(review): presumably an ISO-8601 string - confirm. */
    created: string;
    /** Last update time, same format as `created`. */
    updated: string;
  };
  /** Time-to-live. NOTE(review): presumably seconds, matching cache.ttlSeconds - confirm. */
  ttl?: number;
  /** Entity tag for the cached value. */
  etag?: string;
  /** True when the cached result came from the template fallback. */
  fallback?: boolean;
  /** Error message for a failed entry. */
  error?: string;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Job data for BullMQ.
 */
export interface ReportJobData {
  /** Identifier of the queued job. */
  jobId: string;
  /** Input the report is generated from. */
  input: PromptInput;
  /** Cache key under which the job's result is stored. */
  cacheKey: string;
  /** Optional queue priority. NOTE(review): ordering semantics are the queue's - confirm in producer. */
  priority?: number;
  /** Optional single callback URL; separate from `input.callbacks`. */
  callbackUrl?: string;
}
|
|
109
|
+
|
|
110
|
+
/**
 * Provider error types.
 * String-valued so the value can be used directly in logs and metrics labels.
 */
export enum ProviderErrorType {
  /** The provider rejected the call because of rate limiting. */
  RateLimit = 'RATE_LIMIT',
  /** Authentication or authorization failed. */
  Auth = 'AUTH_ERROR',
  /** The call did not complete in time. */
  Timeout = 'TIMEOUT',
  /** The provider could not be reached or is down. */
  Unavailable = 'UNAVAILABLE',
  /** The provider answered, but the response could not be used. */
  InvalidResponse = 'INVALID_RESPONSE',
  /** Any error that fits none of the categories above. */
  Unknown = 'UNKNOWN',
}
|
|
121
|
+
|
|
122
|
+
/**
 * Lifecycle callbacks for stepper events.
 * Every handler is optional and may be synchronous or return a promise.
 */
export interface StepperCallbacks {
  /** NOTE(review): presumably fired when a job is queued - not invoked by the orchestrator; confirm in producer. */
  onEnqueue?: (jobId: string, meta: { input: PromptInput; cacheKey: string }) => void | Promise<void>;
  /** Fired at the start of report generation, before any provider is tried. */
  onStart?: (jobId: string, input: PromptInput) => void | Promise<void>;
  /** Fired before a provider is called. */
  onProviderAttempt?: (
    jobId: string,
    providerName: string,
    attemptNumber: number,
    meta: ProviderAttemptMeta
  ) => void | Promise<void>;
  /** Fired after a provider returned a report. */
  onSuccess?: (
    jobId: string,
    providerName: string,
    result: ReportOutput,
    meta: { timings: { totalMs: number; providerMs?: number } }
  ) => void | Promise<void>;
  /** Fired when all providers failed and the template fallback is returned. */
  onFallback?: (
    jobId: string,
    result: ReportOutput,
    meta: { providersAttempted: ProviderAttemptMeta[] }
  ) => void | Promise<void>;
  /** NOTE(review): presumably fired when a job fails for good - not invoked by the orchestrator; confirm in worker. */
  onFailure?: (
    jobId: string,
    errors: ProviderAttemptMeta[],
    meta: { lastError?: string }
  ) => void | Promise<void>;
}
|
|
151
|
+
|
|
152
|
+
/**
 * Provider configuration.
 */
export interface ProviderConfig {
  /** Provider name; used in logs, metrics and alerts. */
  name: string;
  /** Only enabled providers are initialized by the orchestrator. */
  enabled: boolean;
  /** Base URL of the provider API. */
  baseUrl?: string;
  /** Model to request from the provider. */
  modelName?: string;
  /** API key value. */
  apiKey?: string;
  /** Name of the environment variable holding the API key. */
  apiKeyEnvVar?: string;
  /** Requests Per Minute; the orchestrator uses 5 when neither RPM nor RPS is set. */
  rateLimitRPM?: number; // Requests Per Minute
  /** Requests Per Second; takes precedence over rateLimitRPM when set. */
  rateLimitRPS?: number; // Requests Per Second
  /** Maximum number of concurrent calls to this provider. */
  concurrency: number;
  /** Per-call timeout in ms for the circuit breaker; 15000 when unset. */
  timeout?: number;
}
|
|
167
|
+
|
|
168
|
+
/**
 * Stepper configuration.
 * Top-level shape of the `config` object consumed across the package.
 */
export interface StepperConfig {
  /** Provider definitions; the orchestrator tries enabled ones in this order. */
  providers: ProviderConfig[];
  /** NOTE(review): relationship to `providers` is not visible here - confirm in config.ts. */
  providerConfigs?: ProviderConfig[];
  fallback: {
    /** When true, a template report is returned if all providers fail; otherwise the orchestrator throws. */
    enabled: boolean;
  };
  redis: {
    url: string;
    keyPrefix: string;
  };
  cache: {
    ttlSeconds: number;
    staleThresholdSeconds: number;
    enableStaleWhileRevalidate: boolean;
  };
  queue: {
    name: string;
    concurrency: number;
  };
  webhook: {
    enabled: boolean;
    secret: string;
    maxRetries: number;
    retryDelayMs: number;
  };
  retry: {
    /** Maximum calls to one provider before moving on. */
    maxAttemptsPerProvider: number;
    /** Base backoff in ms; doubled for each subsequent attempt. */
    baseDelayMs: number;
    /** Upper bound (exclusive) of the random jitter added to each backoff, in ms. */
    maxJitterMs: number;
    /** Seconds to wait after a rate-limit error that carries no retry-after value. */
    rateLimitFallbackSeconds: number;
  };
  circuit: {
    /** Passed to the circuit breaker as `volumeThreshold`. */
    failureThreshold: number;
    /** Rolling statistics window in seconds (`rollingCountTimeout`). */
    windowSeconds: number;
    /** Seconds an open circuit waits before probing again (`resetTimeout`). */
    cooldownSeconds: number;
  };
  security: {
    redactBeforeSend: boolean;
    // CORS configuration
    cors: {
      enabled: boolean;
      allowedOrigins: string[];
      allowCredentials: boolean;
    };
    // Rate limiting configuration
    rateLimit: {
      enabled: boolean;
      windowMs: number; // Time window in milliseconds
      maxRequests: number; // Max requests per window per IP
      maxRequestsPerUser: number; // Max requests per window per userId
      skipHealthEndpoints: boolean; // Skip rate limiting for /health and /metrics
    };
    // Helmet security headers
    helmet: {
      enabled: boolean;
    };
    // API Key authentication
    apiKey: {
      enabled: boolean;
      headerName: string; // e.g., 'x-api-key'
      skipHealthEndpoints: boolean; // Skip auth for /health and /metrics
    };
  };
  server: {
    port: number;
    metricsPort?: number;
  };
}
|