@monotykamary/pi-tps 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/FUNDING.yml +4 -0
- package/.github/workflows/test.yml +55 -0
- package/.pi/autoresearch/session-id +1 -0
- package/.prettierrc +7 -0
- package/LICENSE +21 -0
- package/README.md +237 -0
- package/commitlint.config.cjs +1 -0
- package/extensions/pi-tps/__tests__/export-command.test.ts +307 -0
- package/extensions/pi-tps/__tests__/extension-setup.test.ts +41 -0
- package/extensions/pi-tps/__tests__/format-duration.test.ts +83 -0
- package/extensions/pi-tps/__tests__/helpers.ts +154 -0
- package/extensions/pi-tps/__tests__/precision-timing.test.ts +701 -0
- package/extensions/pi-tps/__tests__/rehydration.test.ts +266 -0
- package/extensions/pi-tps/__tests__/session-export.test.ts +204 -0
- package/extensions/pi-tps/__tests__/stall-detection.test.ts +209 -0
- package/extensions/pi-tps/__tests__/stall-reduction.test.ts +139 -0
- package/extensions/pi-tps/__tests__/telemetry-flow.test.ts +654 -0
- package/extensions/pi-tps/index.ts +734 -0
- package/knip.json +10 -0
- package/npm-shrinkwrap.json +6923 -0
- package/package.json +54 -0
- package/tsconfig.json +24 -0
- package/vitest.config.ts +15 -0
|
@@ -0,0 +1,701 @@
|
|
|
1
|
+
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
|
2
|
+
import type { AssistantMessage } from '@earendil-works/pi-ai';
|
|
3
|
+
import { createTestFixture, activateExtension, tick } from './helpers';
|
|
4
|
+
|
|
5
|
+
describe('pi-tps extension — precision timing (performance.now())', () => {
|
|
6
|
+
// Per-test fixture. It is rebuilt before every test so that spies and handler
// state never leak from one test into the next.
let fixture: ReturnType<typeof createTestFixture>;

// Build a fresh fixture and activate the extension against it.
beforeEach(async () => {
  const fresh = createTestFixture();
  fixture = fresh;
  await activateExtension(fresh);
});

// Restore every spy, including any performance.now() mock a test left installed.
afterEach(() => {
  vi.restoreAllMocks();
});
|
|
16
|
+
|
|
17
|
+
/**
 * Replays one complete turn (turn_start, message_start, one TTFT
 * message_update, N streaming message_updates, message_end, turn_end) through
 * the fixture's handlers while performance.now() is mocked with a scripted
 * series of timestamps.
 *
 * Scripting the clock avoids real-timer flakiness and lets tests use
 * fractional-millisecond values that Date.now() (1ms floor) would lose.
 *
 * The mock returns the scripted values in call order and keeps returning the
 * last value once the script is exhausted. The script is laid out for two
 * clock reads during turn_start and one read for every later event.
 *
 * The tests in this file expect at least MIN_STREAM_UPDATES (5) entries in
 * `streamUpdates`, with a non-zero span, for the inter-update TPS path. With
 * fewer entries they expect either a generationMs-based fallback or a null
 * TPS. NOTE(review): sibling tests describe that fallback as gated by a 50ms
 * floor on generationMs — confirm against the implementation.
 *
 * @param clocks.turnStart      value returned for both reads during turn_start
 * @param clocks.messageStart   value returned during message_start
 * @param clocks.firstUpdate    value returned during the first (TTFT) message_update
 * @param clocks.streamUpdates  one value per subsequent message_update
 * @param clocks.messageEnd     value returned during message_end
 * @param clocks.turnEnd        value returned during turn_end; defaults to messageEnd
 * @returns the fixture's notify and appendEntry spies, for assertions
 */
function driveTurn(clocks: {
  turnStart: number;
  messageStart: number;
  firstUpdate: number;
  streamUpdates: number[];
  messageEnd: number;
  turnEnd?: number;
}) {
  const { handlers, notifySpy, appendEntrySpy } = fixture;

  // performance.now() return values, in the order the handlers read the clock.
  const scriptedClock = [
    clocks.turnStart, // turn_start: turnStartMs
    clocks.turnStart, // turn_start: lastUpdateMs (same moment)
    clocks.messageStart, // message_start
    clocks.firstUpdate, // first message_update (TTFT)
    ...clocks.streamUpdates, // streaming message_update events
    clocks.messageEnd, // message_end
    clocks.turnEnd ?? clocks.messageEnd, // turn_end
  ];
  const lastSlot = scriptedClock.length - 1;
  let reads = 0;
  const nowSpy = vi.spyOn(performance, 'now').mockImplementation(() => {
    // Clamp so any extra reads repeat the final scripted value.
    const slot = Math.min(reads, lastSlot);
    reads += 1;
    return scriptedClock[slot];
  });

  const assistantMessage: AssistantMessage = {
    role: 'assistant',
    content: [{ type: 'text', text: 'Short reply' }],
    api: 'openai-completions',
    provider: 'openai',
    model: 'gpt-4',
    usage: {
      input: 50,
      output: 20,
      cacheRead: 0,
      cacheWrite: 0,
      totalTokens: 70,
      cost: { input: 0.001, output: 0.002, cacheRead: 0, cacheWrite: 0, total: 0.003 },
    },
    stopReason: 'stop',
    timestamp: Date.now(),
  };

  // Dispatches one text_delta message_update with a freshly built event object.
  const emitTextDelta = () =>
    handlers['message_update']?.({
      type: 'message_update',
      message: assistantMessage,
      assistantMessageEvent: { type: 'text_delta', delta: 't' },
    });

  handlers['turn_start']?.({ type: 'turn_start', turnIndex: 0, timestamp: Date.now() });
  handlers['message_start']?.({ type: 'message_start', message: assistantMessage });
  emitTextDelta(); // TTFT update
  clocks.streamUpdates.forEach(() => {
    emitTextDelta(); // one non-TTFT update per scripted stream timestamp
  });
  handlers['message_end']?.({ type: 'message_end', message: assistantMessage });
  handlers['turn_end']?.(
    { type: 'turn_end', turnIndex: 0, message: assistantMessage, toolResults: [] },
    fixture.mockCtx
  );

  nowSpy.mockRestore();
  return { notifySpy, appendEntrySpy };
}
|
|
99
|
+
|
|
100
|
+
it('should produce realistic TPS with sufficient streaming updates (≥5)', () => {
  // Five stream updates spanning 400 → 800 give a 400ms inter-update window.
  const turn = driveTurn({
    turnStart: 0,
    messageStart: 200,
    firstUpdate: 200.123,
    streamUpdates: [400, 500, 600, 700, 800],
    messageEnd: 900,
  });

  expect(turn.notifySpy).toHaveBeenCalledOnce();
  const banner = turn.notifySpy.mock.calls[0][0] as string;
  const rateMatch = banner.match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();

  // 20 output tokens / 0.4s = 50 tok/s; the bounds leave ±10 of slack.
  const shownTps = parseFloat(rateMatch![1]);
  expect(shownTps).toBeGreaterThanOrEqual(40);
  expect(shownTps).toBeLessThanOrEqual(60);

  const entry = turn.appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.generationMs).toBeGreaterThanOrEqual(690);
  expect(entry.timing.ttftMs).toBeGreaterThanOrEqual(190);
  expect(entry.timing.streamMs).toBe(400); // 800 - 400
});
|
|
123
|
+
|
|
124
|
+
it('should capture sub-millisecond TTFT precision', () => {
  // Fractional clock values are scripted for message_start and the TTFT update.
  const { appendEntrySpy: appended } = driveTurn({
    turnStart: 0,
    messageStart: 23.456,
    firstUpdate: 23.579,
    streamUpdates: [100, 200, 300, 400, 523],
    messageEnd: 523.456,
  });

  // The recorded TTFT must land between 23ms and 24ms.
  const entry = appended.mock.calls[0][1];
  expect(entry.timing.ttftMs).toBeGreaterThanOrEqual(23);
  expect(entry.timing.ttftMs).toBeLessThanOrEqual(24);
});
|
|
137
|
+
|
|
138
|
+
it('should produce null TPS when all streaming updates arrive in a burst (≤4 updates)', () => {
  // Models the read-command case: a handful of chunks fired in a quick burst.
  // A single post-TTFT update means updateCount=1 < MIN_STREAM_UPDATES=5.
  const turn = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.05,
    streamUpdates: [100.05], // the only post-TTFT update
    messageEnd: 100.5,
  });

  expect(turn.notifySpy).toHaveBeenCalledOnce();
  const banner = turn.notifySpy.mock.calls[0][0] as string;
  // Too few chunks for a meaningful rate, so the TPS slot shows a dash.
  expect(banner).toContain('TPS —');

  const entry = turn.appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.generationMs).toBeGreaterThan(0);
  expect(entry.timing.streamMs).toBe(0);
  expect(entry.tps).toBeNull();
});
|
|
159
|
+
|
|
160
|
+
it('should use performance.now() consistently across all timing events', () => {
  const { handlers, appendEntrySpy } = fixture;

  // Scripted clock, one value per expected performance.now() read (11 total):
  //   [0-1] turn_start (two reads)   [2] message_start   [3] TTFT update
  //   [4-8] five streaming updates   [9] message_end     [10] turn_end
  const scriptedClock = [0, 0, 100, 100.001, 100.5, 101, 101.2, 101.4, 101.8, 102, 102];
  let reads = 0;
  const nowSpy = vi.spyOn(performance, 'now');
  nowSpy.mockImplementation(() => scriptedClock[Math.min(reads++, scriptedClock.length - 1)]);

  const assistantMessage: AssistantMessage = {
    role: 'assistant',
    content: [{ type: 'text', text: 'Hi world test example' }],
    api: 'openai-completions',
    provider: 'openai',
    model: 'gpt-4',
    usage: { input: 10, output: 5, cacheRead: 0, cacheWrite: 0, totalTokens: 15 },
    stopReason: 'stop',
    timestamp: Date.now(),
  };

  // Dispatches one text_delta message_update carrying the given delta.
  const emitDelta = (delta: string) =>
    handlers['message_update']?.({
      type: 'message_update',
      message: assistantMessage,
      assistantMessageEvent: { type: 'text_delta', delta },
    });

  handlers['turn_start']?.({ type: 'turn_start', turnIndex: 0, timestamp: Date.now() });
  handlers['message_start']?.({ type: 'message_start', message: assistantMessage });
  emitDelta('H'); // TTFT update
  // Five streaming updates (= MIN_STREAM_UPDATES)
  Array.from({ length: 5 }).forEach(() => {
    emitDelta('i');
  });
  handlers['message_end']?.({ type: 'message_end', message: assistantMessage });
  handlers['turn_end']?.(
    { type: 'turn_end', turnIndex: 0, message: assistantMessage, toolResults: [] },
    fixture.mockCtx
  );

  // Read the call count before mockRestore() clears the spy's recorded calls.
  const observedReads = nowSpy.mock.calls.length;
  nowSpy.mockRestore();

  expect(observedReads).toBeGreaterThanOrEqual(4);

  const entry = appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.generationMs).toBeGreaterThan(1);
  // Inter-update span: last stream (101.8) - first stream (100.5) = 1.3ms
  expect(entry.timing.streamMs).toBeCloseTo(1.3, 1);
});
|
|
214
|
+
|
|
215
|
+
// ─── Compound gate tests (MIN_STREAM_UPDATES + generationMs fallback) ───
|
|
216
|
+
|
|
217
|
+
it('should fallback to generationMs TPS when few chunks but generation time >> burst span', () => {
  // Two post-TTFT updates (updateCount=2) fall below the update-count gate, but
  // generationMs (250 - 50 = 200ms) clears the 50ms floor.
  const turn = driveTurn({
    turnStart: 0,
    messageStart: 50,
    firstUpdate: 50.1,
    streamUpdates: [50.15, 50.3],
    messageEnd: 250,
  });

  expect(turn.notifySpy).toHaveBeenCalledOnce();
  const entry = turn.appendEntrySpy.mock.calls[0][1];

  // Expected fallback rate: 20 tokens / 0.2s = 100 tok/s.
  const banner = turn.notifySpy.mock.calls[0][0] as string;
  expect(banner).not.toContain('TPS —');

  const rateMatch = banner.match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();
  const shownTps = parseFloat(rateMatch![1]);
  expect(shownTps).toBeGreaterThanOrEqual(70);
  expect(shownTps).toBeLessThanOrEqual(130);

  expect(entry.tps).not.toBeNull();
});
|
|
241
|
+
|
|
242
|
+
it('should produce null TPS for fast burst where generationMs ≈ streamMs', () => {
  // Two post-TTFT updates. With this data generationMs = 100.4 - 100 = 0.4ms
  // and streamMs = 100.3 - 100.15 = 0.15ms: both are sub-millisecond, so no
  // reliable timebase exists.
  // NOTE(review): presumably rejected by the 50ms generationMs floor that
  // sibling tests mention — confirm against the implementation.
  const turn = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [100.15, 100.3],
    messageEnd: 100.4,
  });

  expect(turn.notifySpy).toHaveBeenCalledOnce();
  const banner = turn.notifySpy.mock.calls[0][0] as string;
  expect(banner).toContain('TPS —');

  const entry = turn.appendEntrySpy.mock.calls[0][1];
  expect(entry.tps).toBeNull();
  // Structurally unidentifiable: too few chunks and a tiny window.
  expect(entry.timing.streamMs).toBeGreaterThan(0);
  expect(entry.timing.generationMs).toBeLessThan(5);
});
|
|
262
|
+
|
|
263
|
+
it('should return null TPS for exactly 4 post-TTFT updates (just below gate)', () => {
  // Four stream updates: one short of the 5-update gate.
  const turn = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [101, 102, 103, 104],
    messageEnd: 105,
  });

  expect(turn.notifySpy).toHaveBeenCalledOnce();
  const banner = turn.notifySpy.mock.calls[0][0] as string;
  expect(banner).toContain('TPS —');

  const entry = turn.appendEntrySpy.mock.calls[0][1];
  expect(entry.tps).toBeNull();
  expect(entry.timing.streamMs).toBe(3); // 104 - 101
});
|
|
280
|
+
|
|
281
|
+
it('should return realistic TPS for exactly 5 post-TTFT updates (at gate)', () => {
  // Exactly five stream updates, spanning 150 → 350 (200ms).
  const turn = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [150, 200, 250, 300, 350],
    messageEnd: 400,
  });

  expect(turn.notifySpy).toHaveBeenCalledOnce();
  const banner = turn.notifySpy.mock.calls[0][0] as string;
  expect(banner).not.toContain('TPS —');

  // 20 tokens / 0.2s (streamMs: 350 - 150) = 100 tok/s
  const rateMatch = banner.match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();
  const shownTps = parseFloat(rateMatch![1]);
  expect(shownTps).toBeGreaterThanOrEqual(90);
  expect(shownTps).toBeLessThanOrEqual(110);

  const entry = turn.appendEntrySpy.mock.calls[0][1];
  expect(entry.tps).not.toBeNull();
  expect(entry.timing.streamMs).toBe(200); // 350 - 150
});
|
|
305
|
+
|
|
306
|
+
it('should return null TPS when streaming span is <50ms even with genuine high-speed generation', () => {
  // Five updates across 4ms (1ms apart) with 20 tokens would read as 5000 tok/s.
  // A 4ms effective stream window is under the 50ms reliability floor: genuine
  // 5000 tok/s generation cannot be told apart from a buffer flush of
  // pre-generated tokens, so null is preferred over an overshoot.
  //
  // The fallback is ruled out too: generationMs (105.5 - 100 = 5.5ms) < 50ms.
  const turn = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [101.1, 102.1, 103.1, 104.1, 105.1],
    messageEnd: 105.5,
  });

  expect(turn.notifySpy).toHaveBeenCalledOnce();
  const banner = turn.notifySpy.mock.calls[0][0] as string;
  // Span too short to trust as a generation speed, so a dash is shown.
  expect(banner).toContain('TPS —');

  const entry = turn.appendEntrySpy.mock.calls[0][1];
  expect(entry.tps).toBeNull();
});
|
|
330
|
+
|
|
331
|
+
it('should fallback to effective-genMs TPS when stall dominates stream window (stall-before-stream)', () => {
  // Regression for a real-world bug: when a stall sits between TTFT and the
  // first stream update, firstStreamUpdateMs is recorded AFTER the stall, so
  // streamMs only spans the post-stall burst. Without a stall guard,
  // output / streamMs is wildly inflated (e.g. 1934 tok/s from a 121ms burst
  // inside a 5843ms generation window).
  //
  // Timeline: TTFT at 2600ms, a ~4200ms stall, then 10 updates within 90ms.
  //   firstStreamUpdateMs = 6800 (post-stall), streamMs = 6890 - 6800 = 90ms
  //   stallMs (~4200) > streamMs (90)          → primary branch skipped
  //   fallback: effectiveGenMs = 6900 - 4200 = 2700ms
  //   TPS = 20 / 2.7 ≈ 7.4 tok/s (sane, versus 20 / 0.09 ≈ 222)
  const turn = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 2600, // TTFT
    streamUpdates: [
      // The first stream update lands after the ~4200ms stall.
      6800, 6810, 6820, 6830, 6840, 6850, 6860, 6870, 6880, 6890,
    ],
    messageEnd: 7000,
    turnEnd: 7000,
  });

  expect(turn.notifySpy).toHaveBeenCalledOnce();
  const banner = turn.notifySpy.mock.calls[0][0] as string;
  expect(banner).not.toContain('TPS —');

  const rateMatch = banner.match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();
  const shownTps = parseFloat(rateMatch![1]);

  // Must stay in single/low double digits: 20 / 2.7 ≈ 7.4 tok/s via the fallback.
  expect(shownTps).toBeLessThan(30);
  expect(shownTps).toBeGreaterThan(3);

  const entry = turn.appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.stallMs).toBeGreaterThanOrEqual(4000);
  expect(entry.timing.streamMs).toBeLessThanOrEqual(100);
  // stallMs > streamMs → primary branch skipped, fallback used
  expect(entry.tps).toBeLessThan(30);
});
|
|
375
|
+
|
|
376
|
+
it('should compute generation TPS with stall subtraction via fallback when stalls dominate active time', () => {
  // The stall sits INSIDE the streaming window, between two updates.
  // stallMs (2000) > effectiveStreamMs (800), so the stall dominates and the
  // primary branch is skipped. The fallback rate is based on effective genMs,
  // which includes TTFT: it underestimates but never overshoots.
  const turn = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    // 10 updates: five 100ms apart, a 2s stall, then five more 100ms apart
    streamUpdates: [200, 300, 400, 500, 600, 2600, 2700, 2800, 2900, 3000],
    messageEnd: 3100,
  });

  expect(turn.notifySpy).toHaveBeenCalledOnce();
  const entry = turn.appendEntrySpy.mock.calls[0][1];
  // streamMs = 3000 - 200 = 2800ms (the 2s stall is inside this window)
  expect(entry.timing.streamMs).toBe(2800);
  // stallMs should account for the ~2000ms gap (600 → 2600)
  expect(entry.timing.stallMs).toBeGreaterThanOrEqual(1900);

  // Primary skipped; fallback: effectiveGenMs = max(3000 - 2000, 50) = 1000ms
  // TPS = 20 / 1.0 = 20 tok/s (includes TTFT, so it underestimates gen speed)
  const banner = turn.notifySpy.mock.calls[0][0] as string;
  const rateMatch = banner.match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();
  const shownTps = parseFloat(rateMatch![1]);
  expect(shownTps).toBeGreaterThanOrEqual(15);
  expect(shownTps).toBeLessThanOrEqual(30);
});
|
|
405
|
+
|
|
406
|
+
it('should return null TPS when avg inter-chunk gap < 1ms (buffer-flush signature)', () => {
  // Five updates across 0.8ms (100.2 → 101.0, 0.2ms apart): the update count
  // is sufficient, but the spacing looks like a buffer flush.
  const turn = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [100.2, 100.4, 100.6, 100.8, 101.0],
    messageEnd: 101.5,
  });

  expect(turn.notifySpy).toHaveBeenCalledOnce();
  const banner = turn.notifySpy.mock.calls[0][0] as string;
  // Dispatch overhead dominates, so this cannot be read as generation timing.
  expect(banner).toContain('TPS —');

  const entry = turn.appendEntrySpy.mock.calls[0][1];
  expect(entry.tps).toBeNull();
});
|
|
425
|
+
|
|
426
|
+
// ─── Stall guard edge cases ────────────────────────────────────────────────
|
|
427
|
+
|
|
428
|
+
it('should use wall-clock streamMs (no stall subtraction) when stallMs is zero', () => {
  // Baseline with no stalls: the primary branch works from raw streamMs.
  const { appendEntrySpy: appended } = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [200, 300, 400, 500, 600],
    messageEnd: 700,
  });

  const entry = appended.mock.calls[0][1];
  // streamMs = 600 - 200 = 400ms, with nothing to subtract
  expect(entry.timing.streamMs).toBe(400);
  expect(entry.timing.stallMs).toBe(0);
  // 20 tokens / 0.4s = 50 tok/s
  expect(entry.tps).toBe(50);
});
|
|
445
|
+
|
|
446
|
+
it('should fallback when effectiveStreamMs < 50ms even though stallMs < streamMs', () => {
  // Critical edge case: streamMs = 1038ms, stallMs ≈ 998ms → effectiveStreamMs ≈ 40ms.
  // The small remainder could be a buffer flush of pre-generated tokens after
  // the stall rather than sustained inference. The 50ms floor catches it:
  // effectiveStreamMs < 50ms → fall through to the genMs-based fallback.
  const turn = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    // Six updates; gaps after TTFT: 10, 10, 10, 10, 998 (stall), 10 ms
    //   streamMs          = 1148.1 - 110.1 = 1038ms
    //   stallMs           ≈ 998ms
    //   effectiveStreamMs = 1038 - 998 = 40ms < 50ms → FALLBACK
    streamUpdates: [110.1, 120.1, 130.1, 140.1, 1138.1, 1148.1],
    messageEnd: 1200,
  });

  const entry = turn.appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.streamMs).toBeGreaterThan(1000);
  expect(entry.timing.stallMs).toBeGreaterThanOrEqual(900);

  // Fallback: effectiveGenMs = max(1100 - 998, 50) = 102ms
  // 20 / 0.102 ≈ 196 tok/s (includes TTFT, so it underestimates gen speed)
  const banner = turn.notifySpy.mock.calls[0][0] as string;
  const rateMatch = banner.match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();
  const shownTps = parseFloat(rateMatch![1]);
  expect(shownTps).toBeLessThan(300); // not inflated (not 500+)
  expect(shownTps).toBeGreaterThan(0);
});
|
|
474
|
+
|
|
475
|
+
it('should fallback when effectiveStreamMs < 50ms at stallMs ≈ streamMs boundary', () => {
  // streamMs = 1130.1 - 600.1 = 530ms, of which the 600.1 → 1100.1 gap is
  // 500ms, leaving ≈30ms of active streaming (< 50ms). A further 500ms gap
  // precedes the first stream update; the assertion below only requires
  // stallMs ≥ 400. Expected path: fallback via effective genMs (includes TTFT,
  // so it underestimates).
  const { appendEntrySpy: appended } = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    streamUpdates: [600.1, 1100.1, 1110.1, 1120.1, 1130.1],
    messageEnd: 1200,
  });

  const entry = appended.mock.calls[0][1];
  expect(entry.timing.stallMs).toBeGreaterThanOrEqual(400);
  // Must not be inflated (20 tokens over ~30ms would read as 600+ tok/s)
  expect(entry.tps).toBeLessThan(200);
  expect(entry.tps).not.toBeNull();
});
|
|
492
|
+
|
|
493
|
+
it('should produce null TPS when both primary and fallback conditions fail', () => {
  // Too few updates for the primary branch AND too little generation time
  // (10.5 - 10 = 0.5ms) for the fallback.
  const turn = driveTurn({
    turnStart: 0,
    messageStart: 10,
    firstUpdate: 10.1,
    streamUpdates: [10.15, 10.2], // only 2 updates, genMs < 50ms
    messageEnd: 10.5,
  });

  expect(turn.notifySpy).toHaveBeenCalledOnce();
  const banner = turn.notifySpy.mock.calls[0][0] as string;
  expect(banner).toContain('TPS —');

  const entry = turn.appendEntrySpy.mock.calls[0][1];
  expect(entry.tps).toBeNull();
});
|
|
510
|
+
|
|
511
|
+
it('should compute generation TPS via PRIMARY branch when stallMs < effectiveStreamMs', () => {
  // A moderate stall falls inside the streaming window without dominating it:
  // a ~500ms stall in a 2100ms window → effectiveStreamMs ≈ 1600ms.
  // stallMs (≈500) < effectiveStreamMs (≈1600) → PRIMARY branch fires.
  // Generation TPS = 20 / 1.6 = 12.5 tok/s.
  const turn = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    // Five updates 200ms apart, one 500ms gap (1000 → 1500), then five more
    // 200ms apart:
    //   streamMs          = 2300 - 200 = 2100ms
    //   stallMs           ≈ 500ms
    //   effectiveStreamMs = 2100 - 500 = 1600ms
    streamUpdates: [200, 400, 600, 800, 1000, 1500, 1700, 1900, 2100, 2300],
    messageEnd: 2500,
  });

  expect(turn.notifySpy).toHaveBeenCalledOnce();
  const entry = turn.appendEntrySpy.mock.calls[0][1];
  expect(entry.timing.streamMs).toBe(2100); // 2300 - 200
  expect(entry.timing.stallMs).toBeGreaterThanOrEqual(400);

  // PRIMARY: 20 / 1.6 = 12.5 tok/s
  const banner = turn.notifySpy.mock.calls[0][0] as string;
  const rateMatch = banner.match(/TPS (\d+(?:\.\d+)?) tok\/s/);
  expect(rateMatch).toBeTruthy();
  const shownTps = parseFloat(rateMatch![1]);
  expect(shownTps).toBeGreaterThanOrEqual(10);
  expect(shownTps).toBeLessThanOrEqual(20);

  // Cross-check: the stored tps equals output / (effectiveStreamMs / 1000).
  const activeStreamMs = entry.timing.streamMs - entry.timing.stallMs;
  expect(entry.tps).toBeCloseTo(20 / (activeStreamMs / 1000), 0);
});
|
|
544
|
+
|
|
545
|
+
it('should fallback when stallMs exactly equals effectiveStreamMs (50/50 boundary)', () => {
  // Intended boundary: streamMs = 2000, stallMs = 1000, effectiveStreamMs = 1000,
  // where "stallMs < effectiveStreamMs" is false (equal) → FALLBACK. This stops
  // buffer-flush dispatches from being counted as generation.
  // NOTE(review): in the data below the long gap is 500 → 1600 (1100ms, there
  // is no update at 600), so the split is closer to 1100/900 than an exact
  // 1000/1000 — confirm whether the exact-equality case is really exercised.
  const { appendEntrySpy: appended } = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 100.1,
    // streamMs = 2100 - 100 = 2000ms
    streamUpdates: [100, 200, 300, 400, 500, 1600, 1700, 1800, 1900, 2100],
    messageEnd: 2200,
  });

  const entry = appended.mock.calls[0][1];
  expect(entry.timing.streamMs).toBe(2000);
  expect(entry.timing.stallMs).toBeGreaterThanOrEqual(900);
  // Stall not smaller than the active window → FALLBACK (includes TTFT)
  expect(entry.tps).not.toBeNull();
  expect(entry.tps!).toBeLessThan(50); // no inflation
});
|
|
566
|
+
|
|
567
|
+
it('should handle stall-before-stream with zero streamMs (all updates in one tick after stall)', () => {
  // TTFT, a long stall, then EVERY stream update lands on the same clock value
  // → streamMs is 0 (or near zero), so the primary branch cannot apply and the
  // fallback is expected to take over.
  const { appendEntrySpy: appended } = driveTurn({
    turnStart: 0,
    messageStart: 100,
    firstUpdate: 5000, // TTFT at 5s
    // All stream updates share one timestamp (buffered after the stall)
    streamUpdates: [6000, 6000, 6000, 6000, 6000],
    messageEnd: 6100,
  });

  const entry = appended.mock.calls[0][1];
  // streamMs ≈ 0 → primary fails (streamMs < MIN_STREAM_MS)
  // Fallback: generationMs >= 50ms → rate based on effective genMs
  expect(entry.timing.streamMs).toBeLessThan(5);
  expect(entry.tps).not.toBeNull();
  expect(entry.tps).toBeLessThan(100); // no inflation
});
|
|
586
|
+
|
|
587
|
+
it('should produce consistent TPS for multi-message turn with stalls', () => {
  // One turn containing two assistant messages, each with a 500ms stall gap.
  // Stall tracking is expected to restart on every message_start while the
  // totals accumulate across the whole turn.
  const { handlers, notifySpy, appendEntrySpy } = fixture;

  // Builds an assistant message reporting 100 output tokens.
  const buildMessage = (
    text: string,
    stopReason: AssistantMessage['stopReason']
  ): AssistantMessage => ({
    role: 'assistant',
    content: [{ type: 'text', text }],
    api: 'openai-completions',
    provider: 'openai',
    model: 'gpt-4',
    usage: {
      input: 50,
      output: 100,
      cacheRead: 0,
      cacheWrite: 0,
      totalTokens: 150,
      cost: { input: 0.001, output: 0.002, cacheRead: 0, cacheWrite: 0, total: 0.003 },
    },
    stopReason,
    timestamp: Date.now(),
  });
  const msg1 = buildMessage('First', 'toolUse');
  const msg2 = buildMessage('Second', 'stop');

  // performance.now() script, one value per expected clock read.
  const scriptedClock = [
    // turn_start: turnStartMs, lastUpdateMs
    0, 0,
    // message 1: message_start, TTFT update
    100, 200,
    // message 1: stream updates 1-5, then update 6 after a 500ms stall gap
    300, 400, 500, 600, 700, 1200,
    // message 1: message_end
    1300,
    // message 2: message_start (stall tracking resets), TTFT update
    1400, 1500,
    // message 2: stream updates 1-5, then update 6 after a 500ms stall gap
    1600, 1700, 1800, 1900, 2000, 2500,
    // message 2: message_end, then turn_end
    2600, 2600,
  ];
  let reads = 0;
  const nowSpy = vi
    .spyOn(performance, 'now')
    .mockImplementation(() => scriptedClock[Math.min(reads++, scriptedClock.length - 1)]);

  // Replays message_start, one TTFT update, six stream updates and message_end.
  const replayMessage = (message: AssistantMessage) => {
    const emitDelta = () =>
      handlers['message_update']?.({
        type: 'message_update',
        message,
        assistantMessageEvent: { type: 'text_delta', delta: 't' },
      });

    handlers['message_start']?.({ type: 'message_start', message });
    emitDelta(); // TTFT
    for (let n = 0; n < 6; n += 1) {
      emitDelta(); // stream
    }
    handlers['message_end']?.({ type: 'message_end', message });
  };

  handlers['turn_start']?.({ type: 'turn_start', turnIndex: 0, timestamp: Date.now() });
  replayMessage(msg1);
  replayMessage(msg2);
  handlers['turn_end']?.(
    { type: 'turn_end', turnIndex: 0, message: msg2, toolResults: [] },
    fixture.mockCtx
  );
  nowSpy.mockRestore();

  expect(notifySpy).toHaveBeenCalledOnce();
  const entry = appendEntrySpy.mock.calls[0][1];
  // Two messages at 100 output tokens each → 200 in total
  expect(entry.tokens.output).toBe(200);
  expect(entry.timing.messageCount).toBe(2);
  // At least one stall from each message
  expect(entry.timing.stallCount).toBeGreaterThanOrEqual(2);
  // TPS must stay sane (not in the thousands)
  expect(entry.tps).toBeLessThan(200);
});
|
|
701
|
+
});
|