@monotykamary/pi-tps 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,701 @@
1
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import type { AssistantMessage } from '@earendil-works/pi-ai';
import { createTestFixture, activateExtension, tick } from './helpers';
4
+
5
+ describe('pi-tps extension — precision timing (performance.now())', () => {
6
// Shared per-test fixture; reassigned before every test so no state leaks between cases.
let fixture: ReturnType<typeof createTestFixture>;

// Build a brand-new fixture and activate the extension against it.
beforeEach(async () => {
  const fresh = createTestFixture();
  fixture = fresh;
  await activateExtension(fresh);
});

// Remove any spies (notably on performance.now()) that a test installed.
afterEach(() => {
  vi.restoreAllMocks();
});
16
+
17
/**
 * Replays one complete turn through the fixture's handlers while
 * performance.now() returns a scripted sequence of timestamps.
 *
 * Scripting the clock avoids real-timer flakiness and lets tests use
 * fractional-millisecond values that Date.now() cannot represent.
 *
 * Events dispatched, in order: turn_start, message_start, one TTFT
 * message_update, one message_update per `streamUpdates` entry, message_end,
 * turn_end. The script assumes the extension reads the clock twice on
 * turn_start and once on every other event; reads past the end of the script
 * keep returning the final value.
 *
 * Judging by the assertions in this suite, the extension appears to report an
 * inter-update TPS only when there are at least 5 post-TTFT updates over a
 * long-enough span; otherwise it falls back to a generation-time based rate,
 * or reports null when generation time is too short as well.
 *
 * @param clocks - Scripted performance.now() values in milliseconds.
 *   `turnEnd` defaults to `messageEnd` when omitted.
 * @returns The fixture's `notifySpy` and `appendEntrySpy`, for assertions.
 */
function driveTurn(clocks: {
  turnStart: number;
  messageStart: number;
  firstUpdate: number;
  streamUpdates: number[];
  messageEnd: number;
  turnEnd?: number;
}) {
  const { handlers, notifySpy, appendEntrySpy } = fixture;

  // One entry per expected performance.now() read, in call order.
  const timestamps = [
    clocks.turnStart, // turn_start: turn start time
    clocks.turnStart, // turn_start: last-update time (same instant)
    clocks.messageStart, // message_start
    clocks.firstUpdate, // first message_update (TTFT)
    ...clocks.streamUpdates, // remaining message_update events
    clocks.messageEnd, // message_end
    clocks.turnEnd ?? clocks.messageEnd, // turn_end
  ];

  let callIdx = 0;
  const spy = vi.spyOn(performance, 'now').mockImplementation(() => {
    // Clamp so that unexpected extra reads reuse the final timestamp.
    return timestamps[Math.min(callIdx++, timestamps.length - 1)];
  });

  const assistantMessage: AssistantMessage = {
    role: 'assistant',
    content: [{ type: 'text', text: 'Short reply' }],
    api: 'openai-completions',
    provider: 'openai',
    model: 'gpt-4',
    usage: {
      input: 50,
      output: 20,
      cacheRead: 0,
      cacheWrite: 0,
      totalTokens: 70,
      cost: { input: 0.001, output: 0.002, cacheRead: 0, cacheWrite: 0, total: 0.003 },
    },
    stopReason: 'stop',
    timestamp: Date.now(),
  };

  // Dispatches a message_update with a freshly built event object each time.
  const emitUpdate = () =>
    handlers['message_update']?.({
      type: 'message_update',
      message: assistantMessage,
      assistantMessageEvent: { type: 'text_delta', delta: 't' },
    });

  try {
    handlers['turn_start']?.({ type: 'turn_start', turnIndex: 0, timestamp: Date.now() });
    handlers['message_start']?.({ type: 'message_start', message: assistantMessage });
    // TTFT update
    emitUpdate();
    // One non-TTFT update per scripted streaming timestamp.
    clocks.streamUpdates.forEach(() => {
      emitUpdate();
    });
    handlers['message_end']?.({ type: 'message_end', message: assistantMessage });
    handlers['turn_end']?.(
      { type: 'turn_end', turnIndex: 0, message: assistantMessage, toolResults: [] },
      fixture.mockCtx
    );
  } finally {
    // Always put the real clock back, even when a handler throws mid-turn.
    spy.mockRestore();
  }

  return { notifySpy, appendEntrySpy };
}
99
+
100
it('should produce realistic TPS with sufficient streaming updates (≥5)', () => {
  // Five post-TTFT updates, 100ms apart, spanning 400ms → 800ms.
  const run = driveTurn({
    turnStart: 0,
    messageStart: 200,
    firstUpdate: 200.123,
    streamUpdates: [400, 500, 600, 700, 800],
    messageEnd: 900,
  });

  expect(run.notifySpy).toHaveBeenCalledOnce();
  const message = run.notifySpy.mock.calls[0][0] as string;
  const rateMatch = message.match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();
  // 20 output tokens over the 0.4s stream span (800 - 400) → 50 tok/s.
  const rate = parseFloat(rateMatch![1]);
  expect(rate).toBeGreaterThanOrEqual(40);
  expect(rate).toBeLessThanOrEqual(60);

  const { timing } = run.appendEntrySpy.mock.calls[0][1];
  expect(timing.generationMs).toBeGreaterThanOrEqual(690);
  expect(timing.ttftMs).toBeGreaterThanOrEqual(190);
  expect(timing.streamMs).toBe(400); // 800 - 400
});
123
+
124
it('should capture sub-millisecond TTFT precision', () => {
  // Fractional timestamps: the first update lands at 23.579ms on the scripted clock.
  const { appendEntrySpy } = driveTurn({
    turnStart: 0,
    messageStart: 23.456,
    firstUpdate: 23.579,
    streamUpdates: [100, 200, 300, 400, 523],
    messageEnd: 523.456,
  });

  const entry = appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.ttftMs).toBeGreaterThanOrEqual(23);
  expect(entry.timing.ttftMs).toBeLessThanOrEqual(24);
});
137
+
138
it('should produce null TPS when all streaming updates arrive in a burst (≤4 updates)', () => {
  // Read-command style turn: a single post-TTFT update that shares the TTFT
  // timestamp, so the stream span is zero and the update count is below the
  // 5-update gate.
  const run = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.05,
    streamUpdates: [100.05], // 1 post-TTFT update
    messageEnd: 100.5,
  });

  expect(run.notifySpy).toHaveBeenCalledOnce();
  const message = run.notifySpy.mock.calls[0][0] as string;
  // A missing rate is rendered as a dash in the notification.
  expect(message).toContain('TPS —');

  const entry = run.appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.generationMs).toBeGreaterThan(0);
  expect(entry.timing.streamMs).toBe(0);
  expect(entry.tps).toBeNull();
});
159
+
160
it('should use performance.now() consistently across all timing events', () => {
  const { handlers, appendEntrySpy } = fixture;

  // Scripted clock, one value per expected read (11 in total):
  //   [0-1] turn_start (two reads), [2] message_start, [3] TTFT update,
  //   [4-8] five streaming updates, [9] message_end, [10] turn_end.
  const script = [0, 0, 100, 100.001, 100.5, 101, 101.2, 101.4, 101.8, 102, 102];
  let reads = 0;
  const clockSpy = vi.spyOn(performance, 'now');
  clockSpy.mockImplementation(() => script[Math.min(reads++, script.length - 1)]);

  const assistantMessage: AssistantMessage = {
    role: 'assistant',
    content: [{ type: 'text', text: 'Hi world test example' }],
    api: 'openai-completions',
    provider: 'openai',
    model: 'gpt-4',
    usage: { input: 10, output: 5, cacheRead: 0, cacheWrite: 0, totalTokens: 15 },
    stopReason: 'stop',
    timestamp: Date.now(),
  };

  handlers['turn_start']?.({ type: 'turn_start', turnIndex: 0, timestamp: Date.now() });
  handlers['message_start']?.({ type: 'message_start', message: assistantMessage });
  // First update of the message (TTFT).
  handlers['message_update']?.({
    type: 'message_update',
    message: assistantMessage,
    assistantMessageEvent: { type: 'text_delta', delta: 'H' },
  });
  // Five further updates — exactly the minimum the streaming gate requires.
  for (let sent = 0; sent < 5; sent += 1) {
    handlers['message_update']?.({
      type: 'message_update',
      message: assistantMessage,
      assistantMessageEvent: { type: 'text_delta', delta: 'i' },
    });
  }
  handlers['message_end']?.({ type: 'message_end', message: assistantMessage });
  handlers['turn_end']?.(
    { type: 'turn_end', turnIndex: 0, message: assistantMessage, toolResults: [] },
    fixture.mockCtx
  );

  // Capture the read count before the spy (and its call log) is torn down.
  const totalReads = clockSpy.mock.calls.length;
  clockSpy.mockRestore();

  expect(totalReads).toBeGreaterThanOrEqual(4);

  const { timing } = appendEntrySpy.mock.calls[0][1];
  expect(timing.generationMs).toBeGreaterThan(1);
  // Span between the first (100.5) and last (101.8) streaming update = 1.3ms.
  expect(timing.streamMs).toBeCloseTo(1.3, 1);
});
214
+
215
+ // ─── Compound gate tests (MIN_STREAM_UPDATES + generationMs fallback) ───
216
+
217
it('should fallback to generationMs TPS when few chunks but generation time >> burst span', () => {
  // Only 2 post-TTFT updates (below the 5-update gate), but the message lasts
  // 200ms (50 → 250), which clears the 50ms generation-time floor.
  const run = driveTurn({
    turnStart: 0,
    messageStart: 50,
    firstUpdate: 50.1,
    streamUpdates: [50.15, 50.3],
    messageEnd: 250,
  });

  expect(run.notifySpy).toHaveBeenCalledOnce();
  const entry = run.appendEntrySpy.mock.calls[0][1];
  const message = run.notifySpy.mock.calls[0][0] as string;
  expect(message).not.toContain('TPS —');

  // Generation-time rate: 20 tokens / 0.2s = 100 tok/s.
  const rateMatch = message.match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();
  const rate = parseFloat(rateMatch![1]);
  expect(rate).toBeGreaterThanOrEqual(70);
  expect(rate).toBeLessThanOrEqual(130);

  expect(entry.tps).not.toBeNull();
});
241
+
242
it('should produce null TPS for fast burst where generationMs ≈ streamMs', () => {
  // Two post-TTFT updates 0.15ms apart (100.15 → 100.3); the whole message
  // lasts only 0.4ms (100 → 100.4). Too few chunks and too little elapsed
  // time for any trustworthy rate, so null is expected.
  const run = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [100.15, 100.3],
    messageEnd: 100.4,
  });

  expect(run.notifySpy).toHaveBeenCalledOnce();
  const message = run.notifySpy.mock.calls[0][0] as string;
  expect(message).toContain('TPS —');

  const entry = run.appendEntrySpy.mock.calls[0][1];
  expect(entry.tps).toBeNull();
  // The timings themselves are still recorded; only the rate is withheld.
  expect(entry.timing.streamMs).toBeGreaterThan(0);
  expect(entry.timing.generationMs).toBeLessThan(5);
});
262
+
263
it('should return null TPS for exactly 4 post-TTFT updates (just below gate)', () => {
  // One update short of the 5-update gate, and the message lasts only 5ms.
  const run = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [101, 102, 103, 104],
    messageEnd: 105,
  });

  expect(run.notifySpy).toHaveBeenCalledOnce();
  const message = run.notifySpy.mock.calls[0][0] as string;
  expect(message).toContain('TPS —');

  const entry = run.appendEntrySpy.mock.calls[0][1];
  expect(entry.tps).toBeNull();
  expect(entry.timing.streamMs).toBe(3); // 104 - 101
});
280
+
281
it('should return realistic TPS for exactly 5 post-TTFT updates (at gate)', () => {
  // Exactly at the gate: 5 updates, 50ms apart, spanning 150ms → 350ms.
  const run = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [150, 200, 250, 300, 350],
    messageEnd: 400,
  });

  expect(run.notifySpy).toHaveBeenCalledOnce();
  const message = run.notifySpy.mock.calls[0][0] as string;
  expect(message).not.toContain('TPS —');

  // 20 tokens over the 0.2s stream span (350 - 150) → 100 tok/s.
  const rateMatch = message.match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();
  const rate = parseFloat(rateMatch![1]);
  expect(rate).toBeGreaterThanOrEqual(90);
  expect(rate).toBeLessThanOrEqual(110);

  const entry = run.appendEntrySpy.mock.calls[0][1];
  expect(entry.tps).not.toBeNull();
  expect(entry.timing.streamMs).toBe(200); // 350 - 150
});
305
+
306
it('should return null TPS when streaming span is <50ms even with genuine high-speed generation', () => {
  // Five updates across 4ms (101.1 → 105.1) with 20 tokens would read as
  // 5000 tok/s. A 4ms span is under the 50ms reliability floor, so genuine
  // fast generation cannot be told apart from a buffer flush of tokens that
  // were already generated; reporting null avoids overshooting.
  //
  // The generation-time fallback does not apply either: the message lasts
  // 5.5ms (100 → 105.5), also under 50ms.
  const run = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [101.1, 102.1, 103.1, 104.1, 105.1],
    messageEnd: 105.5,
  });

  expect(run.notifySpy).toHaveBeenCalledOnce();
  const message = run.notifySpy.mock.calls[0][0] as string;
  expect(message).toContain('TPS —');

  const entry = run.appendEntrySpy.mock.calls[0][1];
  expect(entry.tps).toBeNull();
});
330
+
331
it('should fallback to effective-genMs TPS when stall dominates stream window (stall-before-stream)', () => {
  // Regression scenario: a long stall sits between the TTFT update and the
  // first streaming update, so the stream window only covers the short burst
  // after the stall. Dividing output by that window alone would give a wildly
  // inflated rate.
  //
  // Timeline: TTFT at 2600ms, ~4200ms of silence, then 10 updates within 90ms
  // (6800 → 6890). Expected handling per the assertions below:
  //   streamMs ≈ 90ms, stallMs ≈ 4200ms → stall dominates, primary path skipped
  //   fallback: generationMs (6900) - stallMs (4200) = 2700ms
  //   TPS = 20 / 2.7 ≈ 7.4 tok/s
  const run = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 2600, // TTFT
    streamUpdates: [
      // First streaming update arrives only after the ~4200ms stall.
      6800, 6810, 6820, 6830, 6840, 6850, 6860, 6870, 6880, 6890,
    ],
    messageEnd: 7000,
    turnEnd: 7000,
  });

  expect(run.notifySpy).toHaveBeenCalledOnce();
  const message = run.notifySpy.mock.calls[0][0] as string;
  expect(message).not.toContain('TPS —');

  const rateMatch = message.match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();
  const rate = parseFloat(rateMatch![1]);

  // Must stay in single/low double digits, nowhere near a burst-only rate.
  expect(rate).toBeLessThan(30);
  expect(rate).toBeGreaterThan(3);

  const entry = run.appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.stallMs).toBeGreaterThanOrEqual(4000);
  expect(entry.timing.streamMs).toBeLessThanOrEqual(100);
  // The persisted rate must agree with the notification: fallback, not burst.
  expect(entry.tps).toBeLessThan(30);
});
375
+
376
it('should compute generation TPS with stall subtraction via fallback when stalls dominate active time', () => {
  // Here the stall sits INSIDE the streaming window (600 → 2600, ~2000ms).
  // streamMs = 2800ms, so only ~800ms of it is active; the stall dominates
  // and the primary path is expected to be skipped. The fallback rate is
  // based on generation time minus stall, which includes TTFT and therefore
  // underestimates rather than overshoots.
  const run = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    // Five updates, a 2s gap, then five more.
    streamUpdates: [200, 300, 400, 500, 600, 2600, 2700, 2800, 2900, 3000],
    messageEnd: 3100,
  });

  expect(run.notifySpy).toHaveBeenCalledOnce();
  const entry = run.appendEntrySpy.mock.calls[0][1];
  // Wall-clock span, stall included: 3000 - 200.
  expect(entry.timing.streamMs).toBe(2800);
  // The ~2000ms gap must have been counted as stall time.
  expect(entry.timing.stallMs).toBeGreaterThanOrEqual(1900);

  // Fallback: generationMs (3000) - stallMs (2000) = 1000ms → 20 / 1.0 = 20 tok/s.
  const rateMatch = (run.notifySpy.mock.calls[0][0] as string).match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();
  const rate = parseFloat(rateMatch![1]);
  expect(rate).toBeGreaterThanOrEqual(15);
  expect(rate).toBeLessThanOrEqual(30);
});
405
+
406
it('should return null TPS when avg inter-chunk gap < 1ms (buffer-flush signature)', () => {
  // Five updates across 0.8ms (100.2 → 101.0), i.e. a 0.2ms average gap.
  // The update count passes the gate, but the spacing looks like a buffer
  // flush rather than token-by-token generation.
  const run = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [100.2, 100.4, 100.6, 100.8, 101.0],
    messageEnd: 101.5,
  });

  expect(run.notifySpy).toHaveBeenCalledOnce();
  const message = run.notifySpy.mock.calls[0][0] as string;
  // Dispatch overhead dominates such timings, so no rate should be shown.
  expect(message).toContain('TPS —');

  const entry = run.appendEntrySpy.mock.calls[0][1];
  expect(entry.tps).toBeNull();
});
425
+
426
+ // ─── Stall guard edge cases ────────────────────────────────────────────────
427
+
428
it('should use wall-clock streamMs (no stall subtraction) when stallMs is zero', () => {
  // Baseline without stalls: evenly spaced updates, 100ms apart.
  const { appendEntrySpy } = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [200, 300, 400, 500, 600],
    messageEnd: 700,
  });

  const entry = appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.streamMs).toBe(400); // 600 - 200
  expect(entry.timing.stallMs).toBe(0);
  // 20 tokens / 0.4s = 50 tok/s, straight from the raw stream span.
  expect(entry.tps).toBe(50);
});
445
+
446
it('should fallback when effectiveStreamMs < 50ms even though stallMs < streamMs', () => {
  // Edge case: the stall is shorter than the stream window, yet what remains
  // after subtracting it is tiny.
  //   streamMs          = 1148.1 - 110.1 = 1038ms
  //   stallMs           ≈ 998ms (the 140.1 → 1138.1 gap)
  //   effectiveStreamMs ≈ 1038 - 998 = 40ms  (< 50ms floor)
  // Those 40ms could just be a flush of tokens generated during the stall, so
  // the generation-time fallback is expected instead of the primary path.
  const run = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    // Gaps from TTFT onward: 10, 10, 10, 10, 998 (stall), 10 ms.
    streamUpdates: [110.1, 120.1, 130.1, 140.1, 1138.1, 1148.1],
    messageEnd: 1200,
  });

  const entry = run.appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.streamMs).toBeGreaterThan(1000);
  expect(entry.timing.stallMs).toBeGreaterThanOrEqual(900);

  // Fallback: generationMs (1100) - stallMs (998) = 102ms → 20 / 0.102 ≈ 196 tok/s.
  const rateMatch = (run.notifySpy.mock.calls[0][0] as string).match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();
  const rate = parseFloat(rateMatch![1]);
  expect(rate).toBeLessThan(300); // not inflated (not 500+)
  expect(rate).toBeGreaterThan(0);
});
474
+
475
it('should fallback when effectiveStreamMs < 50ms at stallMs ≈ streamMs boundary', () => {
  // streamMs = 1130.1 - 600.1 = 530ms, with a 500ms gap (600.1 → 1100.1)
  // inside it, leaving roughly 30ms of active streaming (< 50ms floor).
  // The generation-time fallback is expected; it includes TTFT and so
  // underestimates rather than inflates.
  const { appendEntrySpy } = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [600.1, 1100.1, 1110.1, 1120.1, 1130.1],
    messageEnd: 1200,
  });

  const entry = appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.stallMs).toBeGreaterThanOrEqual(400);
  // A burst-only rate would be 600+; the fallback must stay well below that.
  expect(entry.tps).toBeLessThan(200);
  expect(entry.tps).not.toBeNull();
});
492
+
493
it('should produce null TPS when both primary and fallback conditions fail', () => {
  // Too few updates for the primary path AND a 0.5ms message (10 → 10.5),
  // far below the 50ms needed for the generation-time fallback.
  const run = driveTurn({
    turnStart: 0,
    messageStart: 10,
    firstUpdate: 10.1,
    streamUpdates: [10.15, 10.2], // only 2 updates
    messageEnd: 10.5,
  });

  expect(run.notifySpy).toHaveBeenCalledOnce();
  const message = run.notifySpy.mock.calls[0][0] as string;
  expect(message).toContain('TPS —');

  const entry = run.appendEntrySpy.mock.calls[0][1];
  expect(entry.tps).toBeNull();
});
510
+
511
it('should compute generation TPS via PRIMARY branch when stallMs < effectiveStreamMs', () => {
  // A moderate stall inside the streaming window that does not dominate it.
  //   streamMs          = 2300 - 200 = 2100ms
  //   stallMs           ≈ 500ms (the 1000 → 1500 gap)
  //   effectiveStreamMs ≈ 2100 - 500 = 1600ms
  // stallMs < effectiveStreamMs, so the primary (stream-based) rate applies:
  //   20 / 1.6 = 12.5 tok/s
  const run = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    // Five updates 200ms apart, one 500ms gap, then five more 200ms apart.
    streamUpdates: [200, 400, 600, 800, 1000, 1500, 1700, 1900, 2100, 2300],
    messageEnd: 2500,
  });

  expect(run.notifySpy).toHaveBeenCalledOnce();
  const entry = run.appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.streamMs).toBe(2100); // 2300 - 200
  expect(entry.timing.stallMs).toBeGreaterThanOrEqual(400);

  const rateMatch = (run.notifySpy.mock.calls[0][0] as string).match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();
  const rate = parseFloat(rateMatch![1]);
  expect(rate).toBeGreaterThanOrEqual(10);
  expect(rate).toBeLessThanOrEqual(20);

  // Cross-check against the recorded timings: output / (effectiveStreamMs / 1000).
  const effectiveStreamMs = entry.timing.streamMs - entry.timing.stallMs;
  expect(entry.tps).toBeCloseTo(20 / (effectiveStreamMs / 1000), 0);
});
544
+
545
it('should fallback when stallMs exactly equals effectiveStreamMs (50/50 boundary)', () => {
  // Intended boundary: streamMs = 2000 with half of it stalled, so that
  // stallMs is NOT strictly less than effectiveStreamMs and the fallback is
  // used. This keeps buffer-flush dispatches from being counted as generation.
  //
  // NOTE(review): with the timestamps below the long gap is 500 → 1600
  // (1100ms), not 1000ms, and the first stream timestamp (100) precedes the
  // TTFT timestamp (100.1). The scenario is therefore stall-dominated rather
  // than exactly 50/50 — confirm whether the data or the title should change.
  const { appendEntrySpy } = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    // streamMs = 2100 - 100 = 2000ms
    streamUpdates: [100, 200, 300, 400, 500, 1600, 1700, 1800, 1900, 2100],
    messageEnd: 2200,
  });

  const entry = appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.streamMs).toBe(2000);
  expect(entry.timing.stallMs).toBeGreaterThanOrEqual(900);
  // A rate is still produced (fallback, TTFT included) and it is not inflated.
  expect(entry.tps).not.toBeNull();
  expect(entry.tps!).toBeLessThan(50);
});
566
+
567
it('should handle stall-before-stream with zero streamMs (all updates in one tick after stall)', () => {
  // TTFT at 5s, then silence, then every streaming update stamped with the
  // same instant (6000). The stream span collapses to ~0, so the primary path
  // cannot apply; the 6000ms generation time lets the fallback produce a rate.
  const { appendEntrySpy } = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 5000, // TTFT at 5s
    // All buffered updates are delivered at once after the stall.
    streamUpdates: [6000, 6000, 6000, 6000, 6000],
    messageEnd: 6100,
  });

  const entry = appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.streamMs).toBeLessThan(5);
  expect(entry.tps).not.toBeNull();
  expect(entry.tps).toBeLessThan(100); // no inflation
});
586
+
587
it('should produce consistent TPS for multi-message turn with stalls', () => {
  // One turn containing two assistant messages, each with a 500ms gap before
  // its final streaming update. Stall tracking is expected to restart at each
  // message_start while the totals accumulate across the turn.
  const { handlers, notifySpy, appendEntrySpy } = fixture;

  // Both messages are identical apart from their text and stop reason.
  const makeMessage = (
    text: string,
    stopReason: AssistantMessage['stopReason']
  ): AssistantMessage => ({
    role: 'assistant',
    content: [{ type: 'text', text }],
    api: 'openai-completions',
    provider: 'openai',
    model: 'gpt-4',
    usage: {
      input: 50,
      output: 100,
      cacheRead: 0,
      cacheWrite: 0,
      totalTokens: 150,
      cost: { input: 0.001, output: 0.002, cacheRead: 0, cacheWrite: 0, total: 0.003 },
    },
    stopReason,
    timestamp: Date.now(),
  });
  const msg1 = makeMessage('First', 'toolUse');
  const msg2 = makeMessage('Second', 'stop');

  // Scripted clock, one value per expected performance.now() read.
  const timestamps = [
    0, // turn_start (turn start time)
    0, // turn_start (last-update time)
    100, // message 1: message_start
    200, // message 1: TTFT update
    300, // message 1: stream 1
    400, // message 1: stream 2
    500, // message 1: stream 3
    600, // message 1: stream 4
    700, // message 1: stream 5
    1200, // message 1: stream 6, after a 500ms gap
    1300, // message 1: message_end
    1400, // message 2: message_start (stall tracking restarts)
    1500, // message 2: TTFT update
    1600, // message 2: stream 1
    1700, // message 2: stream 2
    1800, // message 2: stream 3
    1900, // message 2: stream 4
    2000, // message 2: stream 5
    2500, // message 2: stream 6, after a 500ms gap
    2600, // message 2: message_end
    2600, // turn_end
  ];
  let callIdx = 0;
  const spy = vi
    .spyOn(performance, 'now')
    .mockImplementation(() => timestamps[Math.min(callIdx++, timestamps.length - 1)]);

  // Plays one message: start, TTFT update, six streaming updates, end.
  const playMessage = (message: AssistantMessage) => {
    handlers['message_start']?.({ type: 'message_start', message });
    for (let update = 0; update < 7; update += 1) {
      // update 0 is the TTFT update; 1..6 are streaming updates.
      handlers['message_update']?.({
        type: 'message_update',
        message,
        assistantMessageEvent: { type: 'text_delta', delta: 't' },
      });
    }
    handlers['message_end']?.({ type: 'message_end', message });
  };

  handlers['turn_start']?.({ type: 'turn_start', turnIndex: 0, timestamp: Date.now() });
  playMessage(msg1);
  playMessage(msg2);
  handlers['turn_end']?.(
    { type: 'turn_end', turnIndex: 0, message: msg2, toolResults: [] },
    fixture.mockCtx
  );
  spy.mockRestore();

  expect(notifySpy).toHaveBeenCalledOnce();
  const entry = appendEntrySpy.mock.calls[0][1];
  // 100 output tokens per message, two messages.
  expect(entry.tokens.output).toBe(200);
  expect(entry.timing.messageCount).toBe(2);
  // At least one stall per message must have been recorded.
  expect(entry.timing.stallCount).toBeGreaterThanOrEqual(2);
  // The rate must stay sane (not in the thousands).
  expect(entry.tps).toBeLessThan(200);
});
701
+ });