@monotykamary/pi-tps 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,734 @@
1
+ /**
2
+ * pi-tps — Tokens-per-second tracker for pi
3
+ *
4
+ * Captures structured telemetry at every LLM turn (per-API-call).
5
+ * Tracks real-time TPS via token-by-token updates, detects inference
6
+ * stalls (GPU queuing / request queuing pauses), and persists telemetry
7
+ * as custom entries in the session JSONL for provider debugging.
8
+ *
9
+ * Originally from: https://github.com/badlogic/pi-mono/blob/main/.pi/extensions/tps.ts
10
+ */
11
+
12
import { execFileSync, execSync } from 'child_process';
import { writeFileSync, mkdirSync } from 'fs';
import { homedir } from 'os';
import { join } from 'path';
16
+
17
+ import type { AssistantMessage } from '@earendil-works/pi-ai';
18
+ import type {
19
+ ExtensionAPI,
20
+ ExtensionCommandContext,
21
+ ExtensionContext,
22
+ } from '@earendil-works/pi-coding-agent';
23
+
24
+ // ─── Event types (not exported from pi's public API) ────────────────────────
25
+ // These mirror the internal types in @earendil-works/pi-coding-agent's
26
+ // dist/core/extensions/types.d.ts. When pi starts exporting them, replace
27
+ // these local definitions with imports.
28
+
29
/**
 * Local mirror of pi's internal `turn_start` event payload.
 * The extension only uses the event as a signal to begin timing a turn;
 * none of the fields below are read by this file.
 */
interface TurnStartEvent {
  /** Discriminant identifying the event. */
  type: 'turn_start';
  /** Index of the turn as reported by pi. */
  turnIndex: number;
  /** Timestamp supplied by pi (unit/epoch not visible from this file — TODO confirm). */
  timestamp: number;
}
34
+
35
/**
 * Local mirror of pi's internal `turn_end` event payload.
 * The extension treats the event purely as the "finalize telemetry" signal
 * and does not read its fields.
 */
interface TurnEndEvent {
  /** Discriminant identifying the event. */
  type: 'turn_end';
  /** Index of the turn as reported by pi. */
  turnIndex: number;
  /** Message attached to the turn; left opaque because this file never inspects it. */
  message: unknown;
  /** Tool results attached to the turn; left opaque for the same reason. */
  toolResults: unknown[];
}
41
+
42
/** Local mirror of pi's internal `message_start` event payload. */
interface MessageStartEvent {
  /** Discriminant identifying the event. */
  type: 'message_start';
  /** The message being started; narrowed at runtime before any field is read. */
  message: unknown;
}
46
+
47
/** Local mirror of pi's internal `message_update` (streaming) event payload. */
interface MessageUpdateEvent {
  /** Discriminant identifying the event. */
  type: 'message_update';
  /** The message being streamed; narrowed at runtime before any field is read. */
  message: unknown;
}
51
+
52
/** Local mirror of pi's internal `message_end` event payload. */
interface MessageEndEvent {
  /** Discriminant identifying the event. */
  type: 'message_end';
  /** The completed message; narrowed at runtime before any field is read. */
  message: unknown;
}
56
+
57
/**
 * Local mirror of pi's internal `session_tree` event payload, which this
 * extension listens to in order to refresh its cached entries after `/tree`
 * navigation. The leaf ids themselves are not read by this file.
 */
interface SessionTreeEvent {
  /** Discriminant identifying the event. */
  type: 'session_tree';
  /** Id of the leaf after navigation, or null. */
  newLeafId: string | null;
  /** Id of the leaf before navigation, or null. */
  oldLeafId: string | null;
}
62
+
63
+ // ─── Constants ──────────────────────────────────────────────────────────────
64
+
65
/**
 * Stall detection gate, in milliseconds: a gap between two consecutive
 * streaming updates that is greater than or equal to this value is counted
 * as a stall (the whole gap is then added to the stall time).
 */
const STALL_THRESHOLD_MS = 500;
67
+
68
+ // ─── Data types ─────────────────────────────────────────────────────────────
69
+
70
/**
 * Structured telemetry record for one LLM turn. One record is appended to the
 * session as a custom `tps` entry at the end of every turn that produced output.
 */
interface TurnTelemetry {
  /** Provider and model id taken from the first assistant message that reports both. */
  model: { provider: string; modelId: string };
  /** Token usage summed over every assistant message of the turn. */
  tokens: { input: number; output: number; cacheRead: number; cacheWrite: number; total: number };
  /** Timing measurements; all durations are in milliseconds. */
  timing: {
    /** Time to first token: turn start → first streaming update. */
    ttftMs: number | null;
    /** Wall clock for the whole turn: turn_start → turn_end. */
    totalMs: number;
    /** Summed per-message wall clock: message_start → message_end. */
    generationMs: number;
    /** Span from the first to the last streaming update after the TTFT update, or null if there was none. */
    streamMs: number | null;
    /** Accumulated inter-update gaps that reached STALL_THRESHOLD_MS. */
    stallMs: number;
    /** Number of discrete stall events (consecutive stalled gaps count once). */
    stallCount: number;
    /** Assistant messages started during this turn. */
    messageCount: number;
  };
  /**
   * Output tokens per second over the estimated active generation time,
   * rounded to one decimal; null when the timing data is too degenerate
   * (e.g. a single burst) to derive a rate.
   */
  tps: number | null;
  /** Cost summed over the turn's messages, or null when no message reported a cost. */
  cost: {
    input: number;
    output: number;
    cacheRead: number;
    cacheWrite: number;
    total: number;
  } | null;
  /** Epoch milliseconds (`Date.now()`) at which the record was built. */
  timestamp: number;
}
93
+
94
/**
 * Mutable, in-memory bookkeeping for the turn currently in flight.
 * All `*Ms` timestamps are `performance.now()` readings; durations are in
 * milliseconds.
 */
interface TurnTiming {
  /** When the turn started. */
  turnStartMs: number;
  /** Reference point for the next gap measurement (last update, message start or message end). */
  lastUpdateMs: number;
  /** When the first streaming update of the turn arrived; null until then. */
  firstTokenMs: number | null;
  /** When the message currently streaming started; null between messages. */
  currentMessageStartMs: number | null;
  /** Completed assistant messages of this turn, kept so their usage can be summed at turn end. */
  assistantMessages: AssistantMessage[];
  /** Sum of message_start → message_end durations for completed messages. */
  totalGenerationMs: number;

  // Inter-update tracking: the streaming span between the first and the last
  // streaming update that came after the TTFT update.
  /** Number of streaming updates received after the first (TTFT) one. */
  updateCount: number;
  /** Timestamp of the first post-TTFT update; null until one arrives. */
  firstStreamUpdateMs: number | null;
  /** Timestamp of the most recent post-TTFT update. */
  lastStreamUpdateMs: number;

  /** Accumulated stall time. */
  stallMs: number;
  /** Number of discrete stall events. */
  stallCount: number;
  /** True while the most recent gap was a stall, so back-to-back stalled gaps count as one event. */
  inStall: boolean;
  /** Assistant messages started during this turn. */
  messageCount: number;
}
112
+
113
+ // ─── Helpers ────────────────────────────────────────────────────────────────
114
+
115
/**
 * Runtime type guard for assistant messages.
 *
 * Accepts only non-null objects whose `role` is `'assistant'` and whose
 * `usage` is a non-null object carrying numeric `input` and `output` counts,
 * so downstream code can read those fields without further checks.
 *
 * @param message - Arbitrary event payload.
 * @returns True when the payload satisfies the checks above.
 */
function isAssistantMessage(message: unknown): message is AssistantMessage {
  if (typeof message !== 'object' || message === null) return false;

  const candidate = message as Record<string, unknown>;
  if (candidate.role !== 'assistant') return false;

  // `usage` must be present before its counters can be inspected.
  const usageField = candidate.usage;
  if (typeof usageField !== 'object' || usageField === null) return false;

  const counters = usageField as Record<string, unknown>;
  return typeof counters.input === 'number' && typeof counters.output === 'number';
}
125
+
126
/**
 * Format a number with human-readable scaling (K, M, B).
 * - Values < 1000: returned unscaled via `String(num)` (e.g. 567 → "567").
 * - Values ≥ 1000: scaled with one decimal, dropping ".0"
 *   (e.g. 1234 → "1.2K", 2000 → "2K").
 * - Handles up to billions (e.g. 1_234_567_890 → "1.2B"); larger values stay
 *   in the "B" unit.
 *
 * The unit is chosen from the value as it will be displayed (rounded to one
 * decimal), so a number just below a unit boundary rolls over to the next
 * unit instead of rendering as "1000K" / "1000M" (e.g. 999_999 → "1M").
 *
 * @param num - The number to format.
 * @returns The compact display string.
 */
function formatNumber(num: number): string {
  if (num < 1_000) return String(num);

  const suffixes = ['K', 'M', 'B'];
  let unitIndex = 0;
  let value = num / 1_000;

  // Promote while the one-decimal rendering would reach 1000 of the current
  // unit and a larger unit is still available.
  while (unitIndex < suffixes.length - 1 && Number(value.toFixed(1)) >= 1_000) {
    value /= 1_000;
    unitIndex++;
  }

  const suffix = suffixes[unitIndex] ?? '';
  const formatted = value.toFixed(1);
  // Drop a trailing ".0" for clean display.
  return formatted.endsWith('.0') ? `${formatted.slice(0, -2)}${suffix}` : `${formatted}${suffix}`;
}
153
+
154
/**
 * Format a duration given in seconds as a human-readable string.
 *
 * - Values that display as less than a minute are shown in seconds with one
 *   decimal (e.g. "2.3s"), which keeps TTFT readings precise.
 * - Everything else is rounded to whole seconds and rendered with the two
 *   most significant units out of y / mo / w / d / h / m / s, without
 *   decimals (e.g. "1h 1m"). A year is 365 days and a month is 30 days.
 * - When only one unit is non-zero, a zero-valued smaller unit is appended
 *   (e.g. "1h 0m"); months and years pad with days ("1mo 0d", "1y 0d").
 *
 * The sub-minute check is made on the value as displayed, so 59.97 is
 * rendered as "1m 0s" rather than "60.0s".
 *
 * @param totalSeconds - Duration in seconds.
 * @returns The formatted duration.
 * @internal Exported for testing only.
 */
export function formatDuration(totalSeconds: number): string {
  const oneDecimal = totalSeconds.toFixed(1);
  if (Number(oneDecimal) < 60) {
    return `${oneDecimal}s`;
  }

  const units = [
    { label: 'y', seconds: 365 * 24 * 60 * 60 }, // 365 days
    { label: 'mo', seconds: 30 * 24 * 60 * 60 }, // 30 days
    { label: 'w', seconds: 7 * 24 * 60 * 60 },
    { label: 'd', seconds: 24 * 60 * 60 },
    { label: 'h', seconds: 60 * 60 },
    { label: 'm', seconds: 60 },
    { label: 's', seconds: 1 },
  ];

  // Greedy decomposition, largest unit first; zero-valued units are skipped.
  const parts: { value: number; label: string }[] = [];
  let remaining = Math.round(totalSeconds);
  for (const unit of units) {
    if (remaining >= unit.seconds) {
      parts.push({ value: Math.floor(remaining / unit.seconds), label: unit.label });
      remaining %= unit.seconds;
    }
  }

  // A lone unit gets a zero-valued companion so the output always has two
  // units where possible.
  const primary = parts[0];
  if (parts.length === 1 && primary !== undefined) {
    let companionIndex = units.findIndex((u) => u.label === primary.label) + 1;
    if (primary.label === 'mo') {
      companionIndex += 1; // skip weeks, land on days
    } else if (primary.label === 'y') {
      companionIndex += 2; // skip months and weeks, land on days
    }
    const companion = units[companionIndex];
    if (companion !== undefined) {
      parts.push({ value: 0, label: companion.label });
    }
  }

  // Keep only the two most significant units.
  return parts
    .slice(0, 2)
    .map((p) => `${p.value}${p.label}`)
    .join(' ');
}
212
+
213
/**
 * Render a telemetry record as the one-line notification text.
 *
 * Segments, joined with " · ": TPS (or "TPS —" when unknown), TTFT (only when
 * known), total turn duration, input tokens, output tokens, and — only when
 * any stall time was recorded — the stall duration with its event count.
 *
 * @param t - The telemetry record to describe.
 * @returns The display string.
 */
function composeDisplayString(t: TurnTelemetry): string {
  const tpsSegment = t.tps !== null ? `TPS ${t.tps.toFixed(1)} tok/s` : 'TPS —';

  const ttftSegment =
    t.timing.ttftMs !== null ? [`TTFT ${formatDuration(t.timing.ttftMs / 1000)}`] : [];

  const stallSegment =
    t.timing.stallMs > 0
      ? [`stall ${formatDuration(t.timing.stallMs / 1000)}×${t.timing.stallCount}`]
      : [];

  const segments = [
    tpsSegment,
    ...ttftSegment,
    formatDuration(t.timing.totalMs / 1000),
    `in ${formatNumber(t.tokens.input)}`,
    `out ${formatNumber(t.tokens.output)}`,
    ...stallSegment,
  ];
  return segments.join(' · ');
}
231
+
232
/**
 * Turn the timing state accumulated during one turn into a persistable
 * telemetry record.
 *
 * Usage and cost are summed over the turn's assistant messages; the model is
 * taken from the first message that reports both a provider and a model.
 *
 * TPS is derived through a three-way gate:
 *  - Primary: the post-TTFT streaming span is trustworthy — at least
 *    MIN_STREAM_UPDATES updates, an average inter-chunk gap of at least
 *    MIN_INTER_CHUNK_MS (smaller gaps look like a buffer flush rather than
 *    live generation), and stalls that neither cover the span nor exceed the
 *    remaining active time, which itself must be at least MIN_GENERATION_MS.
 *    TPS = output / (streamMs − stallMs).
 *  - Fallback: at least two updates and MIN_GENERATION_MS of generation time.
 *    Uses the message_start → message_end window, which includes the wait for
 *    the first token and therefore tends to underestimate.
 *  - Otherwise: null.
 *
 * @param timing - State accumulated by the event handlers during the turn.
 * @param turnEndMs - `performance.now()` reading taken at turn end.
 * @returns The telemetry record, or null when the turn produced no output
 *          tokens, never received a first token, or no model could be identified.
 */
function buildTelemetry(timing: TurnTiming, turnEndMs: number): TurnTelemetry | null {
  const tokens = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 };
  const costSum = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 };
  let sawCost = false;
  let resolvedModel: { provider: string; modelId: string } | null = null;

  for (const message of timing.assistantMessages) {
    const usage = message.usage;
    tokens.input += usage.input || 0;
    tokens.output += usage.output || 0;
    tokens.cacheRead += usage.cacheRead || 0;
    tokens.cacheWrite += usage.cacheWrite || 0;
    tokens.total += usage.totalTokens || 0;

    if (usage.cost) {
      costSum.input += usage.cost.input || 0;
      costSum.output += usage.cost.output || 0;
      costSum.cacheRead += usage.cost.cacheRead || 0;
      costSum.cacheWrite += usage.cost.cacheWrite || 0;
      costSum.total += usage.cost.total || 0;
      sawCost = true;
    }

    // First message that names both provider and model wins.
    if (resolvedModel === null && message.provider && message.model) {
      resolvedModel = { provider: message.provider, modelId: message.model };
    }
  }

  // Nothing meaningful to report for this turn.
  if (tokens.output <= 0) return null;
  if (!timing.firstTokenMs) return null;
  if (!resolvedModel) return null;

  // Gate thresholds (milliseconds unless noted).
  const MIN_STREAM_MS = 1;
  const MIN_STREAM_UPDATES = 5; // count
  const MIN_INTER_CHUNK_MS = 1;
  const MIN_GENERATION_MS = 50;
  const ACTIVE_TIME_THRESHOLD_MS = 200;
  const STALL_REDUCTION_DENOM = 2; // divisor
  const STALL_DOMINANCE_RATIO = 0.85; // fraction of generation time

  const roundToTenth = (rate: number): number => Math.round(rate * 10) / 10;

  // Span between the first and the last post-TTFT update.
  const streamMs =
    timing.updateCount > 0 && timing.firstStreamUpdateMs !== null
      ? timing.lastStreamUpdateMs - timing.firstStreamUpdateMs
      : null;

  const avgInterChunkGap =
    streamMs !== null && timing.updateCount > 1 ? streamMs / (timing.updateCount - 1) : 0;

  // Streaming span with known stalls removed.
  const activeStreamMs = streamMs === null ? null : streamMs - timing.stallMs;

  let tps: number | null = null;
  if (
    streamMs !== null &&
    activeStreamMs !== null &&
    streamMs >= MIN_STREAM_MS &&
    timing.updateCount >= MIN_STREAM_UPDATES &&
    avgInterChunkGap >= MIN_INTER_CHUNK_MS &&
    timing.stallMs < streamMs && // stalls must not cover the whole span
    activeStreamMs >= MIN_GENERATION_MS && // active span must be measurable
    timing.stallMs < activeStreamMs // stall time must stay below active time
  ) {
    // Primary: the streaming span already excludes TTFT; with stalls removed
    // it approximates the time spent actually producing tokens.
    tps = roundToTenth(tokens.output / (activeStreamMs / 1000));
  } else if (timing.updateCount >= 2 && timing.totalGenerationMs >= MIN_GENERATION_MS) {
    // Fallback: message_start → message_end time minus stalls.
    const netGenerationMs = timing.totalGenerationMs - timing.stallMs;
    const stallsDominate =
      netGenerationMs < ACTIVE_TIME_THRESHOLD_MS ||
      timing.stallMs > timing.totalGenerationMs * STALL_DOMINANCE_RATIO;

    // When stalls dominate, only a fraction of the stall time is subtracted;
    // in both cases the divisor is floored at MIN_GENERATION_MS.
    const effectiveGenMs = Math.max(
      stallsDominate
        ? timing.totalGenerationMs - timing.stallMs / STALL_REDUCTION_DENOM
        : netGenerationMs,
      MIN_GENERATION_MS
    );
    tps = roundToTenth(tokens.output / (effectiveGenMs / 1000));
  }

  return {
    model: resolvedModel,
    tokens,
    timing: {
      ttftMs: timing.firstTokenMs - timing.turnStartMs,
      totalMs: turnEndMs - timing.turnStartMs,
      generationMs: timing.totalGenerationMs,
      streamMs,
      stallMs: timing.stallMs,
      stallCount: timing.stallCount,
      messageCount: timing.messageCount,
    },
    tps,
    cost: sawCost ? costSum : null,
    timestamp: Date.now(),
  };
}
401
+
402
+ // ─── Extension ──────────────────────────────────────────────────────────────
403
+
404
/**
 * pi extension entry point.
 *
 * Registers event handlers that time every LLM turn (TTFT, streaming span,
 * stalls), persists the resulting telemetry as a custom `tps` session entry,
 * emits it on the `tps:telemetry` event, and shows it as a notification.
 * Also registers two commands that export session data as JSONL files under
 * the user's cache directory: `tps-export` and `session-export`.
 *
 * @param pi - The extension API handed in by pi.
 */
export default function tpsExtension(pi: ExtensionAPI) {
  // Timing state of the turn currently in flight; null between turns.
  let currentTiming: TurnTiming | null = null;

  // Snapshot of the session entries, used for argument completion and for
  // restoring the last notification. Refreshed on session_start / session_tree.
  let cachedEntries: Array<{ type?: string; customType?: string; data?: unknown }> = [];

  const FULL_FLAG_COMPLETION = {
    value: '--full',
    label: '--full (all branches, not just current)',
  };

  /**
   * Refresh the entry cache from the session manager.
   * The array is copied because this extension later pushes into the cache;
   * whether getEntries() hands out its own backing array is not visible from
   * here, so the copy guarantees the session manager's data is never mutated.
   */
  function refreshCachedEntries(ctx: ExtensionContext): void {
    cachedEntries = [...ctx.sessionManager.getEntries()];
  }

  // ── Rehydration ─────────────────────────────────────────────────────────

  /**
   * Re-show the most recent TPS notification (e.g. after a resume).
   * Understands both the structured format (a TurnTelemetry record, detected
   * by its `model` field) and the legacy `{ message, timestamp }` format.
   * Entries that cannot be rendered are skipped in favour of older ones, so a
   * malformed entry never aborts the calling event handler.
   * The notification is deferred with setTimeout so it survives the TUI's
   * clear + rebuild.
   */
  function restoreTPSNotification(ctx: ExtensionContext) {
    if (!ctx.hasUI) return;
    const entries = cachedEntries.length > 0 ? cachedEntries : ctx.sessionManager.getEntries();

    for (let i = entries.length - 1; i >= 0; i--) {
      const entry = entries[i];
      if (!entry || entry.type !== 'custom' || entry.customType !== 'tps') continue;

      const data = entry.data as Record<string, unknown> | null | undefined;
      if (!data) continue;

      let text: string | null = null;
      if (data.model) {
        // Structured format (current). The record comes from disk and is not
        // validated, so rendering may throw on incomplete data.
        try {
          text = composeDisplayString(data as unknown as TurnTelemetry);
        } catch {
          continue;
        }
      } else if (typeof data.message === 'string') {
        // Legacy format: { message: string, timestamp: number }
        text = data.message;
      }
      if (text === null) continue;

      const notification = text;
      setTimeout(() => {
        ctx.ui.notify(notification, 'info');
      }, 0);
      return;
    }
  }

  // Restore the notification on session start/resume. A brand-new session has
  // no `tps` entries, so nothing is shown for it.
  pi.on('session_start', (_event, ctx) => {
    refreshCachedEntries(ctx);
    restoreTPSNotification(ctx);
  });

  // Restore the notification after /tree navigation (same session, different branch).
  pi.on('session_tree', (_event: SessionTreeEvent, ctx: ExtensionContext) => {
    refreshCachedEntries(ctx);
    restoreTPSNotification(ctx);
  });

  // ── Turn timing ─────────────────────────────────────────────────────────

  // A turn starts when the request is sent to the LLM.
  pi.on('turn_start', (_event: TurnStartEvent) => {
    const now = performance.now();
    currentTiming = {
      turnStartMs: now,
      lastUpdateMs: now,
      firstTokenMs: null,
      currentMessageStartMs: null,
      assistantMessages: [],
      totalGenerationMs: 0,
      updateCount: 0,
      firstStreamUpdateMs: null,
      lastStreamUpdateMs: 0,
      stallMs: 0,
      stallCount: 0,
      inStall: false,
      messageCount: 0,
    };
  });

  // In pi, message_start fires at stream creation (before any tokens), so
  // TTFT is deferred to the first message_update, which carries the first
  // real token.
  pi.on('message_start', (event: MessageStartEvent) => {
    if (!currentTiming) return;
    if (!isAssistantMessage(event.message)) return;

    const now = performance.now();

    // Remember when THIS message started streaming (for generation time).
    currentTiming.currentMessageStartMs = now;
    currentTiming.messageCount++;

    // Reset the stall clock so tool-execution gaps between messages are not
    // counted as inference stalls.
    currentTiming.lastUpdateMs = now;
    currentTiming.inStall = false;
  });

  // Token-by-token updates during streaming: stream span and stall detection.
  pi.on('message_update', (event: MessageUpdateEvent) => {
    if (!currentTiming) return;
    if (!isAssistantMessage(event.message)) return;

    const now = performance.now();

    // First update of the turn: record TTFT, seed the stall clock and stop.
    // The gap from message_start to this update is not treated as a stall.
    if (currentTiming.firstTokenMs === null) {
      currentTiming.firstTokenMs = now;
      currentTiming.lastUpdateMs = now;
      return;
    }

    // Extend the post-TTFT streaming span.
    currentTiming.updateCount++;
    if (currentTiming.firstStreamUpdateMs === null) {
      currentTiming.firstStreamUpdateMs = now;
    }
    currentTiming.lastStreamUpdateMs = now;

    // Stall detection: the threshold is only a gate — once reached, the full
    // gap counts as stall time. Consecutive stalled gaps are one stall event.
    const gap = now - currentTiming.lastUpdateMs;
    if (gap >= STALL_THRESHOLD_MS) {
      if (!currentTiming.inStall) {
        currentTiming.stallCount++;
      }
      currentTiming.inStall = true;
      currentTiming.stallMs += gap;
    } else {
      currentTiming.inStall = false;
    }

    currentTiming.lastUpdateMs = now;
  });

  // A message finished streaming.
  pi.on('message_end', (event: MessageEndEvent) => {
    if (!currentTiming) return;
    if (!isAssistantMessage(event.message)) return;

    const now = performance.now();

    // Add this message's message_start → message_end time to the turn total.
    if (currentTiming.currentMessageStartMs !== null) {
      currentTiming.totalGenerationMs += now - currentTiming.currentMessageStartMs;
      currentTiming.currentMessageStartMs = null;
    }

    // Keep the message so its token usage can be summed at turn end.
    currentTiming.assistantMessages.push(event.message);
    currentTiming.lastUpdateMs = now;
  });

  // ── Persist telemetry ───────────────────────────────────────────────────

  // Compute, persist, broadcast and display telemetry at the end of each turn.
  pi.on('turn_end', (_event: TurnEndEvent, ctx: ExtensionContext) => {
    if (!currentTiming) return;

    const timing = currentTiming;
    currentTiming = null;

    const telemetry = buildTelemetry(timing, performance.now());
    if (!telemetry) return;

    // Persist structured telemetry to the session for export and rehydration.
    pi.appendEntry('tps', telemetry);

    // Let other extensions react to new telemetry.
    pi.events.emit('tps:telemetry', telemetry);

    // Show the notification only when a UI is available.
    if (ctx.hasUI) {
      ctx.ui.notify(composeDisplayString(telemetry), 'info');
    }

    // Keep the local cache in sync; the payload is included so the cache can
    // also serve notification restoration.
    cachedEntries.push({ type: 'custom', customType: 'tps', data: telemetry });
  });

  // ── Export helpers ──────────────────────────────────────────────────────

  /** Best-effort: reveal `dir` in the platform's file manager. */
  function openInFileManager(dir: string): void {
    try {
      const opener = process.platform === 'darwin' ? 'open' : 'xdg-open';
      // execFileSync passes `dir` as a single argument without going through
      // a shell, so characters such as `$` or backticks are never interpreted.
      execFileSync(opener, [dir], { stdio: 'ignore' });
    } catch {
      // Opener missing or failed — the export itself already succeeded.
    }
  }

  /**
   * Serialize `rows` as JSONL into `<cache>/<dirName>/` and reveal the
   * directory. `<cache>` is $XDG_CACHE_HOME or ~/.cache. The file name is
   * `<filePrefix>-<first 8 chars of session id>-<ISO timestamp>.jsonl`.
   *
   * @returns The path of the written file.
   * @throws Whatever mkdirSync / writeFileSync throw (e.g. on permission errors).
   */
  function writeJsonlExport(
    ctx: ExtensionCommandContext,
    dirName: string,
    filePrefix: string,
    rows: readonly unknown[]
  ): string {
    const cacheBase = process.env.XDG_CACHE_HOME || join(homedir(), '.cache');
    const dir = join(cacheBase, dirName);
    mkdirSync(dir, { recursive: true });

    const sessionId = ctx.sessionManager.getSessionId?.() ?? 'unknown';
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
    const filepath = join(dir, `${filePrefix}-${sessionId.slice(0, 8)}-${timestamp}.jsonl`);

    writeFileSync(filepath, rows.map((row) => JSON.stringify(row)).join('\n') + '\n');
    openInFileManager(dir);
    return filepath;
  }

  // ── Export commands ─────────────────────────────────────────────────────

  pi.registerCommand('tps-export', {
    description:
      'Export telemetry + session structure (model changes, branch points) as JSONL (--full for all branches, filter by customType)',
    getArgumentCompletions: (argumentPrefix: string) => {
      // Offer the --full flag and every known customType that match the
      // prefix (an empty prefix therefore lists all of them).
      const completions: Array<{ value: string; label: string }> = [];
      if ('--full'.startsWith(argumentPrefix)) {
        completions.push(FULL_FLAG_COMPLETION);
      }

      const customTypes = new Set<string>();
      for (const entry of cachedEntries) {
        if (entry.type === 'custom' && entry.customType) {
          customTypes.add(entry.customType);
        }
      }
      for (const customType of customTypes) {
        if (customType.startsWith(argumentPrefix)) {
          completions.push({ value: customType, label: customType });
        }
      }
      return completions;
    },
    handler: async (args: string, ctx: ExtensionCommandContext) => {
      // Default: current branch. --full: entire session. Any remaining
      // tokens form the customType filter.
      const tokens = args.trim().split(/\s+/).filter(Boolean);
      const full = tokens.includes('--full');
      const filterType = tokens.filter((t) => t !== '--full').join(' ') || null;

      // Exported: custom entries (optionally filtered by customType) plus
      // structural entries (model_change, branch_summary), which are always
      // kept so model switches and branch points remain visible.
      const entries = full ? ctx.sessionManager.getEntries() : ctx.sessionManager.getBranch();
      const isStructural = (e: { type: string }) =>
        e.type === 'model_change' || e.type === 'branch_summary';

      const exportedEntries = entries.filter(
        (e) =>
          isStructural(e) || (e.type === 'custom' && (!filterType || e.customType === filterType))
      );

      if (exportedEntries.length === 0) {
        const scope = full ? 'all-entries' : 'current-branch';
        ctx.ui.notify(`No matching entries found in ${scope}`, 'warning');
        return;
      }

      // Re-chain parentIds so the exported entries form a self-contained
      // tree: original parents are often entries that are not exported, so
      // each entry is re-attached to its nearest exported ancestor.
      const byId = new Map<string, (typeof entries)[number]>(entries.map((e) => [e.id, e]));
      const exportedIds = new Set(exportedEntries.map((e) => e.id));

      const rechainParentId = (entry: (typeof exportedEntries)[number]): string | null => {
        // `visited` guards against a malformed (cyclic) parent chain.
        const visited = new Set<string>();
        let current: string | null = entry.parentId;
        while (current && !visited.has(current)) {
          if (exportedIds.has(current)) return current;
          visited.add(current);
          current = byId.get(current)?.parentId ?? null;
        }
        return null;
      };

      const rechained = exportedEntries.map((e) => ({
        ...e,
        parentId: rechainParentId(e),
      }));

      // The filter is user input; keep only filename-safe characters so a
      // customType containing e.g. '/' cannot break or redirect the write.
      const safeFilter = filterType ? filterType.replace(/[^A-Za-z0-9._-]+/g, '_') : null;
      const scope = [full ? 'full' : 'branch', safeFilter].filter(Boolean).join('-');
      const filepath = writeJsonlExport(ctx, 'pi-telemetry', `pi-telemetry-${scope}`, rechained);

      const structuralCount = exportedEntries.filter((e) => isStructural(e)).length;
      const customCount = exportedEntries.length - structuralCount;
      const parts: string[] = [];
      if (customCount > 0) parts.push(`${customCount} telemetry`);
      if (structuralCount > 0) parts.push(`${structuralCount} structural`);
      const summary = parts.length > 0 ? parts.join(' + ') : `${exportedEntries.length} entries`;

      ctx.ui.notify(`Exported ${summary} → ${filepath}`, 'info');
    },
  });

  // ── Session export command ─────────────────────────────────────────────────

  pi.registerCommand('session-export', {
    description:
      'Export full session JSONL (all entry types, current branch only; --full for all branches)',
    getArgumentCompletions: (argumentPrefix: string) =>
      '--full'.startsWith(argumentPrefix) ? [FULL_FLAG_COMPLETION] : [],
    handler: async (args: string, ctx: ExtensionCommandContext) => {
      const full = args.trim().split(/\s+/).includes('--full');
      const entries = full ? ctx.sessionManager.getEntries() : ctx.sessionManager.getBranch();

      if (entries.length === 0) {
        const scope = full ? 'all-entries' : 'current-branch';
        ctx.ui.notify(`No entries found in ${scope}`, 'warning');
        return;
      }

      const scope = full ? 'full' : 'branch';
      const filepath = writeJsonlExport(ctx, 'pi-sessions', `pi-session-${scope}`, entries);

      ctx.ui.notify(`Exported ${entries.length} entries → ${filepath}`, 'info');
    },
  });
}