claude-sdk-proxy 2.3.2 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/trace.ts ADDED
@@ -0,0 +1,633 @@
1
+ import { logInfo, logWarn, logError, logDebug, dumpError } from "./logger"
2
+
3
+ // ── Request Trace ────────────────────────────────────────────────────────────
4
+ // Captures the full lifecycle of a single API request with timing milestones,
5
+ // so you can see exactly WHERE time was spent and WHERE failures occurred.
6
+
7
/**
 * Lifecycle phase of a traced request.
 *
 * A trace starts in "received" when it is created. `TraceStore.phase()` moves it
 * between phases and stamps the matching timing milestone for "queued",
 * "acquired", "sdk_starting", "sdk_streaming", "sdk_done" and "completed".
 */
export type TracePhase =
  | "received" // HTTP request received, body parsed
  | "validated" // Request validated, model resolved
  | "queued" // Waiting for concurrency slot
  | "acquired" // Concurrency slot acquired
  | "sdk_starting" // About to call SDK query()
  | "sdk_streaming" // Receiving events from SDK
  | "sdk_done" // SDK query() iterator finished
  | "responding" // Building/sending HTTP response
  | "completed" // Successfully sent response
  | "error" // Failed at some point
18
+
19
/**
 * Coarse outcome of a trace: "active" from creation until it is finished,
 * then "completed" (set by `complete()`) or "error" (set by `fail()`).
 */
export type TraceStatus = "active" | "completed" | "error"
20
+
21
/** Error details attached to a trace by `TraceStore.fail()`. */
export interface TraceError {
  type: string // Result of classifyError(): "stall_timeout", "queue_timeout", "parse_error", "unknown_error", etc.
  message: string
  stack?: string
  phase: TracePhase // Which phase the error occurred in
}
27
+
28
/**
 * Full record of one API request: request metadata, timing milestones,
 * output counters and (for failures) the classified error.
 *
 * All timestamps are epoch milliseconds taken from Date.now().
 */
export interface RequestTrace {
  reqId: string
  startedAt: number // Date.now() when request received

  // Request metadata
  model: string // "haiku" | "sonnet" | "opus"
  requestedModel: string // Original model string from caller (e.g. "claude-haiku-4-5")
  stream: boolean
  hasTools: boolean
  thinking?: string // "enabled" | "disabled" | "adaptive"
  promptLen: number // Character length of serialized prompt
  systemLen: number // Character length of system prompt
  msgCount: number // Number of messages in request
  bodyBytes: number // Raw HTTP body size in bytes

  // Client info
  clientIp?: string
  userAgent?: string

  // Timing milestones (all Date.now() values)
  queuedAt?: number // When we started waiting for a slot
  acquiredAt?: number // When we got a concurrency slot
  sdkStartedAt?: number // When query() was called
  firstTokenAt?: number // When first content event arrived (or when the "sdk_streaming" phase was first entered)
  sdkEndedAt?: number // When query() iterator finished
  completedAt?: number // When the trace was completed or failed

  // Phase tracking
  phase: TracePhase
  status: TraceStatus

  // Output metrics
  sdkEventCount: number
  outputLen: number // Character length of generated text
  toolCallCount: number
  stallCount: number // Number of stall checks that reported at least 30s without progress

  // SDK event type distribution
  eventTypes: Record<string, number> // e.g. { "content_block_delta": 500, "content_block_start": 1, ... }

  // Last event tracking (for stall diagnostics)
  lastEventAt: number // Date.now() of last SDK event (initialised to startedAt)
  lastEventType?: string // Type of last SDK event received (subtype when one was given)

  // Termination reason (more specific than just error/completed)
  endReason?: "completed" | "client_disconnect" | "stall_timeout" | "queue_timeout" | "sdk_error" | "abort" | "unknown"

  // Error info
  error?: TraceError

  // SDK debug log path (from DEBUG_CLAUDE_AGENT_SDK)
  sdkDebugLogPath?: string
}
81
+
82
+ // ── Per-model stats ──────────────────────────────────────────────────────────
83
+
84
/**
 * Running totals for one model, updated whenever a trace for that model is
 * completed or failed. Averages are derived from these sums in `getStats()`.
 */
interface ModelStats {
  total: number // finished requests (completed + failed)
  errors: number // failed requests
  totalDurationMs: number // sum of request durations
  totalTimeToFirstTokenMs: number // sum of time-to-first-token over requests that produced one
  firstTokenCount: number // requests where we got a first token (for avg calc)
  maxDurationMs: number // longest request duration seen
  lastErrorAt?: number // Date.now() of the most recent failure
  lastErrorReqId?: string // reqId of the most recent failure
}
94
+
95
/** Create an empty per-model accumulator: every counter and sum starts at zero. */
function newModelStats(): ModelStats {
  const zeroed: ModelStats = {
    total: 0,
    errors: 0,
    totalDurationMs: 0,
    totalTimeToFirstTokenMs: 0,
    firstTokenCount: 0,
    maxDurationMs: 0,
  }
  return zeroed
}
98
+
99
+ // ── Trace Store ──────────────────────────────────────────────────────────────
100
+ // In-memory ring buffer of recent traces + aggregate stats.
101
+
102
// Maximum number of finished traces kept in memory; the oldest is dropped first.
const BUFFER_SIZE = 200
// Maximum number of failed traces kept in the separate error buffer.
const ERROR_BUFFER_SIZE = 50
104
+
105
/**
 * In-memory store for request traces.
 *
 * Holds the traces that are currently active (keyed by reqId), a bounded
 * buffer of recently finished traces, a smaller bounded buffer of failed
 * traces, and aggregate counters (overall and per model). Nothing is
 * persisted except the error dump written through `dumpError` in `fail()`.
 */
class TraceStore {
  private traces: RequestTrace[] = []
  private errorTraces: RequestTrace[] = []
  private activeTraces = new Map<string, RequestTrace>()
  private stats = {
    totalRequests: 0,
    // Requests that reached complete() or fail(); the denominator for avgDurationMs.
    finishedRequests: 0,
    totalErrors: 0,
    totalDurationMs: 0,
    startedAt: Date.now(),
  }
  private modelStats = new Map<string, ModelStats>()

  /**
   * Create a new trace for a request and register it as active.
   *
   * @param init Request metadata captured when the request was received.
   * @returns The live trace object (also retrievable by `init.reqId`).
   */
  create(init: {
    reqId: string
    model: string
    requestedModel: string
    stream: boolean
    hasTools: boolean
    thinking?: string
    promptLen: number
    systemLen: number
    msgCount: number
    bodyBytes: number
    clientIp?: string
    userAgent?: string
  }): RequestTrace {
    const now = Date.now()
    const trace: RequestTrace = {
      ...init,
      startedAt: now,
      phase: "received",
      status: "active",
      sdkEventCount: 0,
      outputLen: 0,
      toolCallCount: 0,
      stallCount: 0,
      eventTypes: {},
      // Seeded with the start time so "time since last event" is defined before any event arrives.
      lastEventAt: now,
    }
    this.activeTraces.set(init.reqId, trace)
    this.stats.totalRequests++

    logInfo("trace.created", {
      reqId: init.reqId,
      model: init.model,
      requestedModel: init.requestedModel,
      stream: init.stream,
      hasTools: init.hasTools,
      thinking: init.thinking,
      promptLen: init.promptLen,
      systemLen: init.systemLen,
      msgCount: init.msgCount,
      bodyBytes: init.bodyBytes,
      clientIp: init.clientIp,
      userAgent: init.userAgent,
    })

    return trace
  }

  /**
   * Update the phase of an active trace and stamp the matching timing
   * milestone. Logs the transition. Does nothing for unknown reqIds.
   *
   * @param extra Additional fields merged into the debug log entry.
   */
  phase(reqId: string, phase: TracePhase, extra?: Record<string, unknown>) {
    const trace = this.activeTraces.get(reqId)
    if (!trace) return

    const now = Date.now()
    trace.phase = phase

    switch (phase) {
      case "queued":
        trace.queuedAt = now
        break
      case "acquired":
        trace.acquiredAt = now
        break
      case "sdk_starting":
        trace.sdkStartedAt = now
        break
      case "sdk_streaming":
        // Only the first transition into streaming counts as the first token.
        if (!trace.firstTokenAt) trace.firstTokenAt = now
        break
      case "sdk_done":
        trace.sdkEndedAt = now
        break
      case "completed":
        trace.completedAt = now
        break
    }

    const elapsed = now - trace.startedAt
    logDebug("trace.phase", { reqId, phase, elapsedMs: elapsed, ...extra })
  }

  /**
   * Record an SDK event. Tracks timing, event type distribution, and
   * first-token detection. Does nothing for unknown reqIds.
   *
   * @param eventNum Running event number supplied by the caller; stored as the trace's event count.
   * @param eventType Top-level event type.
   * @param subtype Optional subtype; when present it is the key used for the distribution.
   */
  sdkEvent(reqId: string, eventNum: number, eventType: string, subtype?: string) {
    const trace = this.activeTraces.get(reqId)
    if (!trace) return

    const now = Date.now()
    trace.sdkEventCount = eventNum
    trace.lastEventAt = now

    // Track event type distribution
    const key = subtype ?? eventType
    trace.lastEventType = key
    trace.eventTypes[key] = (trace.eventTypes[key] ?? 0) + 1

    // Mark first content event
    if (!trace.firstTokenAt && (subtype === "content_block_delta" || subtype === "content_block_start")) {
      trace.firstTokenAt = now
      const ttft = now - trace.startedAt
      const ttftFromSdk = trace.sdkStartedAt ? now - trace.sdkStartedAt : undefined

      logInfo("trace.first_token", {
        reqId,
        ttftMs: ttft,
        ttftFromSdkMs: ttftFromSdk,
        eventNum,
        model: trace.model,
      })
    }

    // Log first 5 events, then every 200th, plus every thinking event
    if (eventNum <= 5 || eventNum % 200 === 0 || subtype === "thinking") {
      logDebug("trace.sdk_event", {
        reqId,
        n: eventNum,
        type: eventType,
        subtype,
        elapsedMs: now - trace.startedAt,
        outputLen: trace.outputLen,
      })
    }
  }

  /**
   * Record a stall check reported by the caller (the original note says it
   * is invoked every 15s). Gaps under 30s are only debug-logged; gaps of 30s
   * or more increment `stallCount` and are logged as a warning, or as an
   * error once the gap exceeds 60s.
   */
  stall(reqId: string, stallMs: number) {
    const trace = this.activeTraces.get(reqId)
    if (!trace) return

    // Only count meaningful stalls (>= 30s idle)
    if (stallMs < 30_000) {
      // Short gap: debug log only, not a real stall
      logDebug("trace.stall_check", {
        reqId,
        stallMs,
        sdkEventCount: trace.sdkEventCount,
        phase: trace.phase,
      })
      return
    }

    trace.stallCount++
    const level = stallMs > 60_000 ? "error" : "warn"
    const log = level === "error" ? logError : logWarn

    log("trace.stall", {
      reqId,
      stallMs,
      stallCount: trace.stallCount,
      sdkEventCount: trace.sdkEventCount,
      outputLen: trace.outputLen,
      elapsedMs: Date.now() - trace.startedAt,
      phase: trace.phase,
      model: trace.model,
      lastEventType: trace.lastEventType,
      eventTypes: trace.eventTypes,
    })
  }

  /**
   * Mark a trace as successfully completed, update aggregate stats and move
   * it from the active set to the recent-traces buffer.
   *
   * @param extra Final output metrics that override the values tracked so far.
   */
  complete(reqId: string, extra?: { outputLen?: number; toolCallCount?: number }) {
    const trace = this.activeTraces.get(reqId)
    if (!trace) return

    const now = Date.now()
    trace.completedAt = now
    trace.phase = "completed"
    trace.status = "completed"
    trace.endReason = "completed"
    if (extra?.outputLen !== undefined) trace.outputLen = extra.outputLen
    if (extra?.toolCallCount !== undefined) trace.toolCallCount = extra.toolCallCount

    const duration = now - trace.startedAt
    const timings = this.computeTimings(trace)

    // Compute throughput (chars/sec) over the streaming period
    const streamDuration = trace.sdkStartedAt ? now - trace.sdkStartedAt : duration
    const charsPerSec = streamDuration > 0 ? Math.round((trace.outputLen / streamDuration) * 1000) : 0
    const eventsPerSec = streamDuration > 0 ? Math.round((trace.sdkEventCount / streamDuration) * 1000) : 0

    logInfo("trace.completed", {
      reqId,
      model: trace.model,
      requestedModel: trace.requestedModel,
      durationMs: duration,
      ...timings,
      sdkEventCount: trace.sdkEventCount,
      outputLen: trace.outputLen,
      toolCallCount: trace.toolCallCount,
      stallCount: trace.stallCount,
      charsPerSec,
      eventsPerSec,
      eventTypes: trace.eventTypes,
    })

    // Update stats
    this.stats.finishedRequests++
    this.stats.totalDurationMs += duration
    const ms = this.getModelStats(trace.model)
    ms.total++
    ms.totalDurationMs += duration
    if (ms.maxDurationMs < duration) ms.maxDurationMs = duration
    if (timings.ttftMs !== undefined) {
      ms.totalTimeToFirstTokenMs += timings.ttftMs
      ms.firstTokenCount++
    }

    this.archive(trace)
  }

  /**
   * Mark a trace as failed: classify the error, log it, update stats, dump
   * the error context through `dumpError`, and archive the trace in both the
   * error buffer and the recent-traces buffer.
   *
   * @param error The failure; strings are wrapped in an Error.
   * @param phase Phase to record for the failure; defaults to the trace's current phase.
   * @param extra Extra fields merged into the log entry and the dump. A truthy
   *   `clientDisconnect` forces the end reason to "client_disconnect".
   */
  fail(reqId: string, error: Error | string, phase?: TracePhase, extra?: Record<string, unknown>) {
    const trace = this.activeTraces.get(reqId)
    if (!trace) {
      // No trace found: log the error anyway
      logError("trace.fail.no_trace", { reqId, error: String(error), phase })
      return
    }

    const now = Date.now()
    trace.completedAt = now
    trace.phase = phase ?? trace.phase
    trace.status = "error"

    const err = error instanceof Error ? error : new Error(String(error))
    const errorType = classifyError(err)

    // Determine specific end reason
    trace.endReason = this.resolveEndReason(err, errorType, extra)

    trace.error = {
      type: errorType,
      message: err.message,
      stack: err.stack,
      phase: trace.phase,
    }

    const duration = now - trace.startedAt
    const timings = this.computeTimings(trace)
    const timeSinceLastEvent = now - trace.lastEventAt

    // Compute throughput (chars/sec) over the streaming period
    const streamDuration = trace.sdkStartedAt ? now - trace.sdkStartedAt : duration
    const charsPerSec = streamDuration > 0 ? Math.round((trace.outputLen / streamDuration) * 1000) : 0

    logError("trace.failed", {
      reqId,
      model: trace.model,
      requestedModel: trace.requestedModel,
      endReason: trace.endReason,
      errorType,
      error: err.message,
      phase: trace.phase,
      durationMs: duration,
      ...timings,
      sdkEventCount: trace.sdkEventCount,
      outputLen: trace.outputLen,
      stallCount: trace.stallCount,
      charsPerSec,
      timeSinceLastEventMs: timeSinceLastEvent,
      lastEventType: trace.lastEventType,
      eventTypes: trace.eventTypes,
      ...extra,
    })

    // Update stats
    this.stats.finishedRequests++
    this.stats.totalErrors++
    this.stats.totalDurationMs += duration
    const ms = this.getModelStats(trace.model)
    ms.total++
    ms.errors++
    ms.totalDurationMs += duration
    if (ms.maxDurationMs < duration) ms.maxDurationMs = duration
    ms.lastErrorAt = now
    ms.lastErrorReqId = reqId
    if (timings.ttftMs !== undefined) {
      ms.totalTimeToFirstTokenMs += timings.ttftMs
      ms.firstTokenCount++
    }

    // Dump full error context to file. The dump is best-effort: if it throws,
    // the trace must still be archived or it would stay in the active set forever.
    try {
      const dumpPath = dumpError(reqId, {
        trace: this.serializeTrace(trace),
        error: { type: errorType, message: err.message, stack: err.stack, phase: trace.phase },
        endReason: trace.endReason,
        timeSinceLastEventMs: timeSinceLastEvent,
        lastEventType: trace.lastEventType,
        eventTypes: trace.eventTypes,
        charsPerSec,
        ...extra,
      })
      logInfo("trace.error_dumped", { reqId, path: dumpPath })
    } catch (dumpFailure: unknown) {
      logWarn("trace.error_dump_failed", {
        reqId,
        error: dumpFailure instanceof Error ? dumpFailure.message : String(dumpFailure),
      })
    }

    // Store in error buffer
    this.errorTraces.push({ ...trace })
    if (this.errorTraces.length > ERROR_BUFFER_SIZE) {
      this.errorTraces.shift()
    }

    this.archive(trace)
  }

  /** Update output length on a live trace (during streaming). */
  updateOutput(reqId: string, outputLen: number) {
    const trace = this.activeTraces.get(reqId)
    if (trace) trace.outputLen = outputLen
  }

  /** Set the SDK debug log path for a trace. */
  setSdkDebugLog(reqId: string, path: string) {
    const trace = this.activeTraces.get(reqId)
    if (trace) trace.sdkDebugLogPath = path
  }

  // ── Query methods (for debug endpoints) ──────────────────────────────────

  /**
   * Get aggregate stats: uptime, overall request counters, per-model
   * averages, and a summary of every currently active request.
   */
  getStats() {
    const now = Date.now()
    const uptimeMs = now - this.stats.startedAt
    // Durations are only accumulated when a request finishes, so average over
    // finished requests; dividing by all received requests would count
    // still-active ones as zero-length and skew the average low.
    const avgDurationMs = this.stats.finishedRequests > 0
      ? Math.round(this.stats.totalDurationMs / this.stats.finishedRequests)
      : 0

    const byModel: Record<string, {
      total: number
      errors: number
      avgDurationMs: number
      avgTtftMs: number
      maxDurationMs: number
      lastErrorAt?: string
      lastErrorReqId?: string
    }> = {}
    for (const [model, ms] of this.modelStats) {
      byModel[model] = {
        total: ms.total,
        errors: ms.errors,
        avgDurationMs: ms.total > 0 ? Math.round(ms.totalDurationMs / ms.total) : 0,
        avgTtftMs: ms.firstTokenCount > 0 ? Math.round(ms.totalTimeToFirstTokenMs / ms.firstTokenCount) : 0,
        maxDurationMs: ms.maxDurationMs,
        ...(ms.lastErrorAt ? { lastErrorAt: new Date(ms.lastErrorAt).toISOString() } : {}),
        ...(ms.lastErrorReqId ? { lastErrorReqId: ms.lastErrorReqId } : {}),
      }
    }

    return {
      uptimeMs,
      uptimeHuman: humanDuration(uptimeMs),
      requests: {
        total: this.stats.totalRequests,
        errors: this.stats.totalErrors,
        active: this.activeTraces.size,
        avgDurationMs,
        errorRate: this.stats.totalRequests > 0
          ? `${((this.stats.totalErrors / this.stats.totalRequests) * 100).toFixed(1)}%`
          : "0%",
      },
      byModel,
      activeRequests: Array.from(this.activeTraces.values()).map(t => ({
        reqId: t.reqId,
        model: t.model,
        requestedModel: t.requestedModel,
        phase: t.phase,
        stream: t.stream,
        hasTools: t.hasTools,
        thinking: t.thinking,
        elapsedMs: now - t.startedAt,
        timeSinceLastEventMs: now - t.lastEventAt,
        lastEventType: t.lastEventType,
        sdkEventCount: t.sdkEventCount,
        outputLen: t.outputLen,
        stallCount: t.stallCount,
        promptLen: t.promptLen,
        systemLen: t.systemLen,
        bodyBytes: t.bodyBytes,
        clientIp: t.clientIp,
      })),
    }
  }

  /**
   * Get recent finished traces (most recent first).
   *
   * @param limit Maximum number of traces; zero, negative or NaN yields an empty list.
   */
  getRecentTraces(limit = 20): ReturnType<typeof this.serializeTrace>[] {
    return this.takeRecent(this.traces, limit).map(t => this.serializeTrace(t))
  }

  /** Get a specific trace by reqId, or null when it is not (or no longer) stored. */
  getTrace(reqId: string): ReturnType<typeof this.serializeTrace> | null {
    // Check active first
    const active = this.activeTraces.get(reqId)
    if (active) return this.serializeTrace(active)
    // Check buffer
    const archived = this.traces.find(t => t.reqId === reqId)
    if (archived) return this.serializeTrace(archived)
    // Check error buffer
    const err = this.errorTraces.find(t => t.reqId === reqId)
    if (err) return this.serializeTrace(err)
    return null
  }

  /**
   * Get recent error traces (most recent first).
   *
   * @param limit Maximum number of traces; zero, negative or NaN yields an empty list.
   */
  getRecentErrors(limit = 10): ReturnType<typeof this.serializeTrace>[] {
    return this.takeRecent(this.errorTraces, limit).map(t => this.serializeTrace(t))
  }

  // ── Internal ─────────────────────────────────────────────────────────────

  /**
   * Return up to `limit` newest entries of `buffer`, newest first.
   * Guards against `slice(-0)`, which would return the whole buffer, and
   * against negative limits, which would slice from the wrong end.
   */
  private takeRecent(buffer: readonly RequestTrace[], limit: number): RequestTrace[] {
    const count = Math.floor(limit)
    if (!(count > 0)) return []
    return buffer.slice(-count).reverse()
  }

  /** Map a classified error onto the trace's termination reason. */
  private resolveEndReason(
    err: Error,
    errorType: string,
    extra?: Record<string, unknown>,
  ): NonNullable<RequestTrace["endReason"]> {
    // An explicit caller flag wins; a "client_disconnect" classification is honoured
    // as well instead of being folded into "sdk_error".
    if (extra?.clientDisconnect || errorType === "client_disconnect") return "client_disconnect"
    if (errorType === "stall_timeout" || errorType === "timeout") return "stall_timeout"
    if (errorType === "queue_timeout") return "queue_timeout"
    // NOTE(review): classifyError in this file maps AbortError to "stall_timeout",
    // so this branch looks unreachable today; confirm whether "abort" should be reported.
    if (err.name === "AbortError") return "abort"
    return "sdk_error"
  }

  /** Move a finished trace from the active set into the bounded recent-traces buffer. */
  private archive(trace: RequestTrace) {
    this.activeTraces.delete(trace.reqId)
    this.traces.push(trace)
    if (this.traces.length > BUFFER_SIZE) {
      this.traces.shift()
    }
  }

  /** Get the stats accumulator for a model, creating it on first use. */
  private getModelStats(model: string): ModelStats {
    let ms = this.modelStats.get(model)
    if (!ms) {
      ms = newModelStats()
      this.modelStats.set(model, ms)
    }
    return ms
  }

  /**
   * Derive durations from whichever milestones are set on the trace.
   * A key is only present when both of its endpoints were recorded.
   */
  private computeTimings(trace: RequestTrace) {
    const result: Record<string, number | undefined> = {}
    if (trace.queuedAt && trace.acquiredAt) {
      result.queueWaitMs = trace.acquiredAt - trace.queuedAt
    }
    if (trace.firstTokenAt) {
      result.ttftMs = trace.firstTokenAt - trace.startedAt
      if (trace.sdkStartedAt) {
        result.ttftFromSdkMs = trace.firstTokenAt - trace.sdkStartedAt
      }
    }
    if (trace.sdkStartedAt && trace.sdkEndedAt) {
      result.sdkDurationMs = trace.sdkEndedAt - trace.sdkStartedAt
    }
    if (trace.completedAt) {
      result.totalDurationMs = trace.completedAt - trace.startedAt
    }
    return result
  }

  /**
   * Build the plain-object view of a trace returned by the query methods.
   * Finished traces carry `completedAt`; unfinished ones carry live
   * `elapsedMs` and `timeSinceLastEventMs` instead.
   */
  private serializeTrace(trace: RequestTrace) {
    const now = Date.now()
    const timings = this.computeTimings(trace)
    const duration = (trace.completedAt ?? now) - trace.startedAt
    const streamDuration = trace.sdkStartedAt ? (trace.completedAt ?? now) - trace.sdkStartedAt : duration
    const charsPerSec = streamDuration > 0 ? Math.round((trace.outputLen / streamDuration) * 1000) : 0
    const timeSinceLastEvent = now - trace.lastEventAt

    return {
      reqId: trace.reqId,
      model: trace.model,
      requestedModel: trace.requestedModel,
      stream: trace.stream,
      hasTools: trace.hasTools,
      thinking: trace.thinking,
      promptLen: trace.promptLen,
      systemLen: trace.systemLen,
      msgCount: trace.msgCount,
      bodyBytes: trace.bodyBytes,
      clientIp: trace.clientIp,
      phase: trace.phase,
      status: trace.status,
      endReason: trace.endReason,
      sdkEventCount: trace.sdkEventCount,
      outputLen: trace.outputLen,
      toolCallCount: trace.toolCallCount,
      stallCount: trace.stallCount,
      charsPerSec,
      eventTypes: trace.eventTypes,
      lastEventType: trace.lastEventType,
      startedAt: new Date(trace.startedAt).toISOString(),
      ...(trace.completedAt
        ? { completedAt: new Date(trace.completedAt).toISOString() }
        : { elapsedMs: now - trace.startedAt, timeSinceLastEventMs: timeSinceLastEvent }),
      ...timings,
      ...(trace.error ? { error: trace.error } : {}),
      ...(trace.sdkDebugLogPath ? { sdkDebugLogPath: trace.sdkDebugLogPath } : {}),
    }
  }
}
604
+
605
+ // ── Error classification ─────────────────────────────────────────────────────
606
+
607
/**
 * Classify an error into a short category string based on its name and
 * message. Checks run top to bottom and the first match wins.
 *
 * @param err The error to classify. A missing or non-string `message` is
 *   treated as empty rather than throwing.
 * @returns One of "stall_timeout", "sdk_aborted", "queue_timeout",
 *   "client_disconnect", "sdk_killed", "sdk_spawn_error", "parse_error",
 *   "connection_error", "broken_pipe", "oom_error", "rate_limit" or
 *   "unknown_error".
 */
export function classifyError(err: Error): string {
  const message = typeof err.message === "string" ? err.message : ""
  const has = (needle: string): boolean => message.includes(needle)

  if (err.name === "AbortError") return "stall_timeout"
  // Must precede the generic "aborted" check, which would otherwise swallow it.
  if (has("process aborted")) return "sdk_aborted"
  if (has("aborted")) return "stall_timeout"
  if (has("Queue timeout")) return "queue_timeout"
  if (has("client disconnect") || has("cancel")) return "client_disconnect"
  if (has("SIGTERM") || has("SIGKILL")) return "sdk_killed"
  if (has("spawn") || has("ENOENT")) return "sdk_spawn_error"
  if (has("JSON")) return "parse_error"
  if (has("ECONNREFUSED") || has("ECONNRESET")) return "connection_error"
  if (has("EPIPE") || has("broken pipe")) return "broken_pipe"
  if (has("memory") || has("OOM")) return "oom_error"
  if (has("rate limit") || has("429")) return "rate_limit"
  return "unknown_error"
}
621
+
622
/**
 * Format a millisecond duration for humans: "850ms", "12.3s", "4m 5s" or "2h 7m".
 *
 * @param ms Duration in milliseconds.
 * @returns Milliseconds below 1s, seconds with one decimal below 1min,
 *   whole minutes and seconds below 1h, otherwise whole hours and minutes.
 */
function humanDuration(ms: number): string {
  if (ms < 1000) return `${ms}ms`
  if (ms < 60000) {
    const seconds = (ms / 1000).toFixed(1)
    // toFixed rounds values just under a minute up to "60.0"; report those
    // as a full minute instead of the out-of-range "60.0s".
    return seconds === "60.0" ? "1m 0s" : `${seconds}s`
  }
  if (ms < 3600000) {
    const minutes = Math.floor(ms / 60000)
    const seconds = Math.floor((ms % 60000) / 1000)
    return `${minutes}m ${seconds}s`
  }
  const h = Math.floor(ms / 3600000)
  const m = Math.floor((ms % 3600000) / 60000)
  return `${h}h ${m}m`
}
630
+
631
+ // ── Singleton ────────────────────────────────────────────────────────────────
632
+
633
/** Shared TraceStore instance, created once when this module is loaded. */
export const traceStore = new TraceStore()