pi-lilac-provider 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.ts ADDED
@@ -0,0 +1,672 @@
1
+ /**
2
+ * Lilac Provider Extension
3
+ *
4
+ * Registers Lilac (getlilac.com) as a custom provider using the openai-completions API.
5
+ * Base URL: https://api.getlilac.com/v1
6
+ *
7
+ * Lilac serves models via a customized fork of vLLM tuned for idle-GPU scheduling
8
+ * and shared warm endpoints. All models use chat_template_kwargs to toggle reasoning:
9
+ *
10
+ * - Kimi K2.6: reasoning ON by default, honors `thinking` key
11
+ * - GLM 5.1: reasoning ON by default, honors `enable_thinking` key
12
+ * - Gemma 4: reasoning OFF by default, honors `enable_thinking` key
13
+ *
14
+ * The forward-compatible approach is to send both `thinking` and `enable_thinking`
15
+ * in chat_template_kwargs — pi's `qwen-chat-template` thinkingFormat does this.
16
+ *
17
+ * Key API notes:
18
+ * - Uses `max_completion_tokens` (preferred for reasoning models)
19
+ * - All reasoning models return chain-of-thought in `reasoning` field
20
+ * - Developer role is NOT supported by GLM, Kimi, or MiniMax chat templates;
21
+ * prompts with role: "developer" are silently dropped. Only Gemma 4 handles it.
22
+ * supportsDeveloperRole is set to false for affected models via patch.json.
23
+ * - Context caching supported on Kimi K2.6 and GLM 5.1 (cacheRead pricing)
24
+ * - Gemma 4 does NOT support cache read pricing
25
+ * - `store` parameter is NOT supported
26
+ *
27
+ * GLM 5.1 caveats:
28
+ * - vLLM's streaming parser intermittently omits `delta.tool_calls` when the
29
+ * model decides to call tools, finishing with `finish_reason: "tool_calls"` but
30
+ * an empty delta. Even with `tool_stream: true` set via `zaiToolStream`, this
31
+ * can still occur intermittently. The `message_end` handler converts the
32
+ * resulting `stopReason: "toolUse"` with zero toolCall blocks into a retryable
33
+ * error (matching pi's auto-retry pattern) so the agent re-prompts automatically.
34
+ * - GLM's chat template does not handle the `developer` role — prompts sent
35
+ * with `role: "developer"` are silently dropped. `supportsDeveloperRole: false`
36
+ * in models.json forces pi to use `role: "system"` instead.
37
+ * - On current vLLM builds, disabling reasoning may still leak chain-of-thought
38
+ * into `content` terminated by a ``` marker. Clients that require
39
+ * hard-suppressed output should post-process accordingly.
40
+ * See: https://github.com/vllm-project/vllm/issues/31319
41
+ *
42
+ * Kimi K2.6 / MiniMax M2.7 caveat: Their chat templates also do not handle the
43
+ * `developer` role — prompts are silently dropped. `supportsDeveloperRole: false`
44
+ * is set for these models as well.
45
+ *
46
+ * Gemma 4 caveat: vLLM's reasoning parser can fail to populate the `reasoning`
47
+ * field when special tokens are stripped. Combining `enable_thinking: false`
48
+ * with `response_format: json_schema` can silently disable structured output.
49
+ * See: https://github.com/vllm-project/vllm/issues/38855
50
+ * See: https://github.com/vllm-project/vllm/issues/39130
51
+ *
52
+ * Model resolution strategy: Stale-While-Revalidate
53
+ * 1. Serve stale immediately: disk cache → embedded models.json (zero-latency)
54
+ * 2. Revalidate in background: live API /models → merge with embedded → cache → hot-swap
55
+ * 3. patch.json + custom-models.json applied on top of whichever source won
56
+ *
57
+ * Merge order: [live|cache|embedded] → apply patch.json → merge custom-models.json
58
+ *
59
+ * Usage:
60
+ * # Option 1: Store in auth.json (recommended)
61
+ * # Add to ~/.pi/agent/auth.json:
62
+ * # "lilac": { "type": "api_key", "key": "your-api-key" }
63
+ *
64
+ * # Option 2: Set as environment variable
65
+ * export LILAC_API_KEY=your-api-key
66
+ *
67
+ * # Run pi with the extension
68
+ * pi -e /path/to/pi-lilac-provider
69
+ *
70
+ * Then use /model to select from available models
71
+ */
72
+
73
+ import type { ExtensionAPI, ModelRegistry } from "@earendil-works/pi-coding-agent";
74
+ import modelsData from "./models.json" with { type: "json" };
75
+ import customModelsData from "./custom-models.json" with { type: "json" };
76
+ import patchData from "./patch.json" with { type: "json" };
77
+ import fs from "fs";
78
+ import os from "os";
79
+ import path from "path";
80
+
81
+ // ─── Types ────────────────────────────────────────────────────────────────────
82
+
83
/**
 * Subscription discount state for one model, as parsed from the Lilac
 * `/status` endpoint and persisted in the discount cache file.
 */
interface JsonDiscount {
  /** Supply state label; the loaders substitute "unknown" when it is absent. */
  supplyState: string;
  /** Discount percentage shown in the status line (informational). */
  discountPercent: number;
  /** Price factor multiplied into list prices by applyDiscounts. */
  creditMultiplier: number;
}
88
+
89
/**
 * Maps each of pi's thinking levels (off, minimal, low, medium, high, xhigh)
 * to the provider-specific effort string sent on the wire.
 *
 * Every level is optional. A `null` value marks the level as unsupported, so
 * clampThinkingLevel skips it when resolving the user's selection. The shape
 * mirrors pi-ai's ThinkingLevelMap.
 */
type ThinkingLevelMap = Partial<
  Record<"off" | "minimal" | "low" | "medium" | "high" | "xhigh", string | null>
>;
101
+
102
/**
 * Model definition as stored in models.json / custom-models.json, written to
 * the model cache, and handed to `pi.registerProvider`.
 */
interface JsonModel {
  /** Model identifier; used as the key when merging, patching and discounting. */
  id: string;
  /** Display name. */
  name: string;
  /** Whether the model is treated as a reasoning model. */
  reasoning: boolean;
  /** Accepted input kinds, e.g. "text" and "image". */
  input: string[];
  /** Prices; transformApiModel converts Lilac's per-token prices to per-million-token values. */
  cost: {
    input: number;
    output: number;
    cacheRead: number;
    cacheWrite: number;
  };
  /** Context window size in tokens. */
  contextWindow: number;
  /** Maximum number of completion tokens. */
  maxTokens: number;
  /** Optional mapping from pi thinking levels to provider effort strings. */
  thinkingLevelMap?: ThinkingLevelMap;
  /** Optional provider compatibility switches. */
  compat?: {
    supportsDeveloperRole?: boolean;
    supportsStore?: boolean;
    maxTokensField?: "max_completion_tokens" | "max_tokens";
    thinkingFormat?: "openai" | "zai" | "qwen" | "qwen-chat-template";
    supportsReasoningEffort?: boolean;
  };
  /** Discount metadata attached by applyDiscounts when one is known for this model. */
  discount?: JsonDiscount;
}
125
+
126
/**
 * One patch.json entry: a partial override for a single model. Only the
 * fields that are present are applied by applyPatch.
 */
interface PatchEntry {
  name?: string;
  reasoning?: boolean;
  input?: string[];
  /** Per-field price overrides; omitted prices keep the model's value. */
  cost?: Partial<JsonModel["cost"]>;
  contextWindow?: number;
  maxTokens?: number;
  /** Replaces the model's thinking level map as a whole. */
  thinkingLevelMap?: ThinkingLevelMap;
  /** Merged key-by-key over the model's existing compat flags. */
  compat?: Record<string, unknown>;
}

/** The whole patch.json document: patch entries keyed by model id. */
type PatchData = Record<string, PatchEntry>;
143
+
144
+ // ─── Patch Application ────────────────────────────────────────────────────────
145
+
146
/**
 * Applies a patch entry on top of a model and returns the patched copy.
 *
 * Scalar fields and `thinkingLevelMap` are replaced when present in the
 * patch; `cost` and `compat` are merged field-by-field. Afterwards, reasoning
 * metadata (`compat.thinkingFormat`, `thinkingLevelMap`) is stripped from
 * non-reasoning models and an empty `compat` object is dropped.
 *
 * The input `model` is never modified.
 *
 * @param model - Model to patch.
 * @param patch - Overrides to apply.
 * @returns A new model object with the patch applied.
 */
function applyPatch(model: JsonModel, patch: PatchEntry): JsonModel {
  const result = { ...model };
  // The spread above is shallow. Clone `compat` so the `delete` below cannot
  // reach into the caller's (possibly shared) model object.
  if (model.compat) result.compat = { ...model.compat };

  if (patch.name !== undefined) result.name = patch.name;
  if (patch.reasoning !== undefined) result.reasoning = patch.reasoning;
  if (patch.input !== undefined) result.input = patch.input;
  if (patch.contextWindow !== undefined) result.contextWindow = patch.contextWindow;
  if (patch.maxTokens !== undefined) result.maxTokens = patch.maxTokens;

  if (patch.cost) {
    result.cost = {
      input: patch.cost.input ?? result.cost.input,
      output: patch.cost.output ?? result.cost.output,
      cacheRead: patch.cost.cacheRead ?? result.cost.cacheRead,
      cacheWrite: patch.cost.cacheWrite ?? result.cost.cacheWrite,
    };
  }
  if (patch.compat) {
    result.compat = { ...(result.compat || {}), ...patch.compat };
  }
  if (patch.thinkingLevelMap !== undefined) {
    result.thinkingLevelMap = patch.thinkingLevelMap;
  }

  // Non-reasoning models must not advertise reasoning-related settings.
  if (!result.reasoning && result.compat?.thinkingFormat) {
    delete result.compat.thinkingFormat;
  }
  if (!result.reasoning && result.thinkingLevelMap) {
    delete result.thinkingLevelMap;
  }
  if (result.compat && Object.keys(result.compat).length === 0) {
    delete result.compat;
  }

  return result;
}
182
+
183
/**
 * Full pipeline: base models → patch → custom → result.
 *
 * Patch entries are applied to base models with a matching id. Custom models
 * then replace or extend the set by id; a custom model that has a patch entry
 * is itself patched before being stored.
 *
 * @param base - Base model list (live, cached or embedded).
 * @param custom - Models from custom-models.json.
 * @param patch - patch.json entries keyed by model id.
 * @returns The resulting models in first-insertion order.
 */
function buildModels(base: JsonModel[], custom: JsonModel[], patch: PatchData): JsonModel[] {
  const byId = new Map<string, JsonModel>();
  base.forEach((entry) => byId.set(entry.id, entry));

  Object.entries(patch).forEach(([id, overrides]) => {
    const target = byId.get(id);
    if (target) byId.set(id, applyPatch(target, overrides));
  });

  custom.forEach((entry) => {
    // A custom model always wins over a same-id base model; whether or not a
    // base model existed, the stored value only depends on the patch entry.
    const overrides = patch[entry.id];
    byId.set(entry.id, overrides ? applyPatch(entry, overrides) : entry);
  });

  return [...byId.values()];
}
214
+
215
+ // ─── Stale-While-Revalidate Model Sync ────────────────────────────────────────
216
+
217
/** Provider identifier; also the prefix of the cache file names. */
const PROVIDER_ID = "lilac";
/** Root of Lilac's OpenAI-compatible API. */
const BASE_URL = "https://api.getlilac.com/v1";
/** Endpoint queried for per-model discount data. */
const STATUS_URL = "https://api.getlilac.com/status";
/** Endpoint listing the live models. */
const MODELS_URL = BASE_URL + "/models";
/** Directory holding the model and discount cache files. */
const CACHE_DIR = path.join(os.homedir(), ".pi", "agent", "cache");
const CACHE_PATH = path.join(CACHE_DIR, PROVIDER_ID + "-models.json");
const DISCOUNT_CACHE_PATH = path.join(CACHE_DIR, PROVIDER_ID + "-discounts.json");
/** Timeout, in milliseconds, for the live model and status fetches. */
const LIVE_FETCH_TIMEOUT_MS = 8_000;
225
+
226
/**
 * Transforms one entry of the Lilac `/v1/models` response into a JsonModel.
 *
 * Per-token prices (e.g. "0.0000007" = $0.70/M tokens) are converted to
 * per-million-token prices rounded to two decimals. Missing or unparsable
 * prices become 0 instead of NaN.
 *
 * @param apiModel - Raw model object from the API (untrusted shape).
 * @returns The transformed model, or `null` when the entry has no usable
 *   string `id` (callers filter out nulls).
 */
function transformApiModel(apiModel: any): JsonModel | null {
  // Without an id the model cannot be keyed, merged or selected.
  if (!apiModel || typeof apiModel.id !== "string" || apiModel.id === "") return null;

  const features: unknown[] = Array.isArray(apiModel.supported_features)
    ? apiModel.supported_features
    : [];
  const modalities: unknown[] = Array.isArray(apiModel.architecture?.input_modalities)
    ? apiModel.architecture.input_modalities
    : [];
  const pricing = apiModel.pricing || {};
  const reasoning = features.includes("reasoning");

  const toPerM = (v: unknown): number => {
    const perToken = typeof v === "string" ? parseFloat(v) : typeof v === "number" ? v : 0;
    // parseFloat("abc") is NaN; never let that leak into displayed costs.
    if (!Number.isFinite(perToken)) return 0;
    return Math.round(perToken * 1_000_000 * 100) / 100;
  };

  // Video is sent as image frames, so there is no separate "video" input type.
  const inputTypes: string[] = ["text"];
  if (modalities.includes("image")) inputTypes.push("image");

  const model: JsonModel = {
    id: apiModel.id,
    name: apiModel.name || apiModel.id,
    reasoning,
    input: inputTypes,
    cost: {
      input: toPerM(pricing.prompt),
      output: toPerM(pricing.completion),
      cacheRead: toPerM(pricing.input_cache_read),
      cacheWrite: 0,
    },
    contextWindow: apiModel.context_length || 131072,
    maxTokens: apiModel.top_provider?.max_completion_tokens || apiModel.context_length || 131072,
  };

  // All Lilac models use chat_template_kwargs for the reasoning toggle, so
  // only reasoning models get a thinking format and reasoning-effort support.
  if (reasoning) {
    model.compat = {
      supportsDeveloperRole: true,
      supportsStore: false,
      maxTokensField: "max_completion_tokens",
      thinkingFormat: "qwen-chat-template",
      supportsReasoningEffort: true,
    };
  } else {
    model.compat = {
      supportsDeveloperRole: true,
      supportsStore: false,
      maxTokensField: "max_completion_tokens",
    };
  }

  return model;
}
274
+
275
/**
 * Fetches the live model list from the Lilac models endpoint.
 *
 * The request is bounded by LIVE_FETCH_TIMEOUT_MS and, when given, also by
 * the caller's abort signal. Any failure is reported as `null`.
 *
 * @param apiKey - Bearer token for the request.
 * @param signal - Optional abort signal combined with the timeout.
 * @returns Transformed models, or `null` on a non-OK response, an empty or
 *   non-array payload, or any thrown error.
 */
async function fetchLiveModels(apiKey: string, signal?: AbortSignal): Promise<JsonModel[] | null> {
  try {
    const timeoutSignal = AbortSignal.timeout(LIVE_FETCH_TIMEOUT_MS);
    const requestSignal = signal ? AbortSignal.any([timeoutSignal, signal]) : timeoutSignal;
    const response = await fetch(MODELS_URL, {
      headers: { Authorization: `Bearer ${apiKey}` },
      signal: requestSignal,
    });
    if (!response.ok) return null;

    // The payload is either a bare array or an object wrapping it in `data`.
    const payload = await response.json();
    const rawModels = Array.isArray(payload) ? payload : (payload.data || []);
    if (!Array.isArray(rawModels) || rawModels.length === 0) return null;

    const transformed: JsonModel[] = [];
    for (const raw of rawModels) {
      const model = transformApiModel(raw);
      if (model !== null) transformed.push(model);
    }
    return transformed;
  } catch {
    return null;
  }
}
290
+
291
/**
 * Reads the model cache file written by cacheModels.
 *
 * The file is external input, so entries are validated: anything that is not
 * an object with a string `id` and a `cost` object is discarded. Otherwise a
 * corrupt cache (e.g. `[null]`) would make later code that reads `m.id` or
 * `model.cost` throw while the extension loads.
 *
 * @returns The usable cached models (possibly an empty array), or `null`
 *   when the file is missing, unreadable, not valid JSON, or not an array.
 */
function loadCachedModels(): JsonModel[] | null {
  try {
    const data: unknown = JSON.parse(fs.readFileSync(CACHE_PATH, "utf8"));
    if (!Array.isArray(data)) return null;
    return data.filter((entry): entry is JsonModel =>
      typeof entry === "object" &&
      entry !== null &&
      typeof entry.id === "string" &&
      typeof entry.cost === "object" &&
      entry.cost !== null,
    );
  } catch {
    return null;
  }
}
299
+
300
/**
 * Writes the given models to the model cache file as pretty-printed JSON,
 * creating the cache directory if needed. Failures are ignored.
 *
 * @param models - Models to persist.
 */
function cacheModels(models: JsonModel[]): void {
  try {
    fs.mkdirSync(CACHE_DIR, { recursive: true });
    const serialized = `${JSON.stringify(models, null, 2)}\n`;
    fs.writeFileSync(CACHE_PATH, serialized);
  } catch {
    // Cache write failure is non-fatal
  }
}
308
+
309
/**
 * Merges live API models with the embedded model list.
 *
 * For ids present in both lists the embedded fields win, except
 * `contextWindow`, which prefers the live value and falls back to the
 * embedded one. Live-only models are kept as-is, and embedded models the API
 * did not return are appended at the end.
 *
 * @param liveModels - Models transformed from the live API.
 * @param embeddedModels - Models shipped with the extension.
 * @returns Live models (merged where possible) followed by embedded-only models.
 */
function mergeWithEmbedded(liveModels: JsonModel[], embeddedModels: JsonModel[]): JsonModel[] {
  const embeddedById = new Map<string, JsonModel>();
  for (const entry of embeddedModels) embeddedById.set(entry.id, entry);

  const liveIds = new Set<string>(liveModels.map((live) => live.id));

  const merged = liveModels.map((live): JsonModel => {
    const shipped = embeddedById.get(live.id);
    if (!shipped) return live;
    return {
      ...live,
      ...shipped,
      contextWindow: live.contextWindow || shipped.contextWindow,
    };
  });

  const embeddedOnly = embeddedModels.filter((entry) => !liveIds.has(entry.id));
  return merged.concat(embeddedOnly);
}
334
+
335
/**
 * Produces the "stale" base model list that can be served without network
 * access: the disk cache when it has entries, otherwise the embedded list.
 *
 * Embedded models missing from the cache (newly shipped models) are appended
 * to the cached list.
 *
 * @param embeddedModels - Models shipped with the extension.
 * @returns The cached list extended with missing embedded models, or
 *   `embeddedModels` itself when there is no usable cache.
 */
function loadStaleModels(embeddedModels: JsonModel[]): JsonModel[] {
  const cached = loadCachedModels();
  if (!cached || cached.length === 0) return embeddedModels;

  // Snapshot the cached ids before appending anything.
  const cachedIds = new Set<string>(cached.map((entry) => entry.id));
  for (const shipped of embeddedModels.filter((entry) => !cachedIds.has(entry.id))) {
    cached.push(shipped);
  }
  return cached;
}
348
+
349
/**
 * Fetches per-model subscription discount data from the status endpoint.
 *
 * The response is expected to carry a `models` array whose objects have
 * `id`, `current_subscription_supply_state`,
 * `current_subscription_discount_percent` and
 * `current_subscription_credit_multiplier`. Entries without an `id` are
 * skipped. A response without a `models` array yields an empty map.
 *
 * @param apiKey - Bearer token for the request.
 * @param signal - Optional abort signal combined with the fetch timeout.
 * @returns Discounts keyed by model id, or `null` on a non-OK response or
 *   any thrown error.
 */
async function fetchStatusDiscounts(apiKey: string, signal?: AbortSignal): Promise<Map<string, JsonDiscount> | null> {
  try {
    const timeoutSignal = AbortSignal.timeout(LIVE_FETCH_TIMEOUT_MS);
    const response = await fetch(STATUS_URL, {
      headers: { Authorization: `Bearer ${apiKey}` },
      signal: signal ? AbortSignal.any([timeoutSignal, signal]) : timeoutSignal,
    });
    if (!response.ok) return null;

    const body = await response.json() as Record<string, unknown>;
    const byModel = new Map<string, JsonDiscount>();
    const entries = body.models;
    if (!Array.isArray(entries)) return byModel;

    for (const entry of entries) {
      if (!entry || typeof entry !== "object" || !entry.id) continue;
      const supplyState = String(entry.current_subscription_supply_state || "unknown");
      const discountPercent = Number(entry.current_subscription_discount_percent ?? 0);
      const creditMultiplier = parseFloat(String(entry.current_subscription_credit_multiplier ?? "1"));
      byModel.set(entry.id, { supplyState, discountPercent, creditMultiplier });
    }
    return byModel;
  } catch {
    return null;
  }
}
377
+
378
/**
 * Returns the models with subscription discounts applied to their prices.
 *
 * `creditMultiplier` from /status is the effective price factor: "0.75"
 * means pay 75% of list price, "1.00" means no discount. `discountPercent`
 * is informational only. The factor is applied to positive input, output
 * and cacheRead prices (rounded to four decimals); cacheWrite is left as-is.
 *
 * A multiplier that is not a finite, non-negative number (for example NaN
 * from a malformed status payload) is treated as 1 so prices never become
 * NaN or negative. The discount metadata is still attached.
 *
 * @param models - Models with list prices.
 * @param discounts - Discounts keyed by model id, or `null` when unknown.
 * @returns `models` itself when there are no discounts; otherwise a new
 *   array in which discounted models are copies and the rest are unchanged.
 */
function applyDiscounts(models: JsonModel[], discounts: Map<string, JsonDiscount> | null): JsonModel[] {
  if (!discounts || discounts.size === 0) return models;
  return models.map(model => {
    const discount = discounts.get(model.id);
    if (!discount) return model;
    const multiplier = discount.creditMultiplier;
    const factor = Number.isFinite(multiplier) && multiplier >= 0 ? multiplier : 1;
    const applyFactor = (n: number) => n > 0 ? Math.round(n * factor * 10000) / 10000 : n;
    return {
      ...model,
      cost: {
        input: applyFactor(model.cost.input),
        output: applyFactor(model.cost.output),
        cacheRead: applyFactor(model.cost.cacheRead),
        cacheWrite: model.cost.cacheWrite,
      },
      discount,
    };
  });
}
400
+
401
/**
 * Persists the discount map to the discount cache file as a pretty-printed
 * JSON object keyed by model id, creating the cache directory if needed.
 * Failures are ignored.
 *
 * @param discounts - Discounts keyed by model id.
 */
function cacheDiscounts(discounts: Map<string, JsonDiscount>): void {
  try {
    fs.mkdirSync(CACHE_DIR, { recursive: true });
    const asObject = Object.fromEntries(discounts);
    fs.writeFileSync(DISCOUNT_CACHE_PATH, `${JSON.stringify(asObject, null, 2)}\n`);
  } catch {
    // non-fatal
  }
}
409
+
410
/**
 * Reads the discount cache file written by cacheDiscounts.
 *
 * Each object-valued entry is normalised: a missing supply state becomes
 * "unknown", a missing discount percent 0 and a missing credit multiplier 1.
 * Non-object entries are skipped.
 *
 * @returns Discounts keyed by model id, or `null` when the file cannot be
 *   read or parsed.
 */
function loadCachedDiscounts(): Map<string, JsonDiscount> | null {
  try {
    const raw = fs.readFileSync(DISCOUNT_CACHE_PATH, "utf8");
    const parsed = JSON.parse(raw) as Record<string, JsonDiscount>;
    const restored = new Map<string, JsonDiscount>();
    Object.entries(parsed).forEach(([modelId, stored]) => {
      if (!stored || typeof stored !== "object") return;
      restored.set(modelId, {
        supplyState: String(stored.supplyState || "unknown"),
        discountPercent: Number(stored.discountPercent ?? 0),
        creditMultiplier: Number(stored.creditMultiplier ?? 1),
      });
    });
    return restored;
  } catch {
    return null;
  }
}
428
+
429
/**
 * Builds the status-line text for a model from the latest known discounts.
 *
 * @param modelId - Id of the active model, if any.
 * @returns "supply: —" when there is no model id or no discount for it,
 *   "supply: checking…" while no discount data is loaded, otherwise the
 *   supply state and subscription discount percentage.
 */
function formatDiscountStatus(modelId?: string): string {
  const noData = "supply: —";
  if (!modelId) return noData;
  if (!latestDiscounts) return "supply: checking…";

  const entry = latestDiscounts.get(modelId);
  return entry
    ? `supply: ${entry.supplyState} · sub-discount: ${entry.discountPercent}%`
    : noData;
}
436
+
437
/**
 * Styles status text with the UI theme's "dim" foreground colour.
 *
 * @param ctx - Extension context; `ctx.ui.theme.fg` is used when available.
 * @param text - Text to style.
 * @returns The styled text, or `text` unchanged when styling throws (for
 *   example when the context has no theme).
 */
function dimStatus(ctx: any, text: string): string {
  let rendered: string;
  try {
    rendered = ctx.ui.theme.fg("dim", text);
  } catch {
    rendered = text;
  }
  return rendered;
}
444
+
445
/**
 * Reports whether two discount maps differ.
 *
 * @param a - Previous discounts, or `null` when none are known.
 * @param b - New discounts, or `null` when none are known.
 * @returns `true` when either map is missing, the sizes differ, a key of `a`
 *   is absent from `b`, or any field of a shared entry differs; `false`
 *   otherwise.
 */
function discountsChanged(
  a: Map<string, JsonDiscount> | null,
  b: Map<string, JsonDiscount> | null,
): boolean {
  if (!(a && b)) return true;
  if (a.size !== b.size) return true;

  return Array.from(a).some(([modelId, before]) => {
    const after = b.get(modelId);
    if (!after) return true;
    const same =
      before.supplyState === after.supplyState &&
      before.discountPercent === after.discountPercent &&
      before.creditMultiplier === after.creditMultiplier;
    return !same;
  });
}
460
+
461
+
462
+
463
+ // ─── API Key Resolution (via ModelRegistry) ────────────────────────────────────
464
+
465
/** API key set by resolveApiKey; `undefined` until resolved or when none is available. */
let cachedApiKey: string | undefined = undefined;
/** Controller for the current session's background revalidation. */
let revalidateAbort: AbortController | null = null;
/** Most recent discounts (cached, replayed or live); `null` until first loaded. */
let latestDiscounts: Map<string, JsonDiscount> | null = null;
/** Timestamp (ms since epoch) of the last successful status fetch; 0 if none yet. */
let lastDiscountFetchTime = 0;
/** Minimum interval, in milliseconds, between status fetches. */
const STATUS_CACHE_TTL_MS = 30_000;
470
+
471
/**
 * Looks up the "lilac" provider's API key through the model registry and
 * stores it in `cachedApiKey` (`undefined` when the registry has none).
 *
 * @param modelRegistry - Registry used to resolve the key.
 */
async function resolveApiKey(modelRegistry: ModelRegistry): Promise<void> {
  const resolved = await modelRegistry.getApiKeyForProvider("lilac");
  cachedApiKey = resolved ?? undefined;
}
474
+
475
+ // ─── Extension Entry Point ────────────────────────────────────────────────────
476
+
477
/**
 * Extension entry point.
 *
 * Registers the Lilac provider immediately from stale data (disk cache or
 * embedded models, with cached discounts), then wires event handlers that:
 * - revalidate models and discounts in the background on `session_start`;
 * - persist the active model's discount into the session on `turn_end`;
 * - refresh discounts (throttled by STATUS_CACHE_TTL_MS) before provider
 *   requests;
 * - keep the "lilac" status slot in sync on model/session changes;
 * - turn a tool-use stop without any tool calls into a retryable error.
 *
 * @param pi - Extension API used to register the provider and handlers.
 */
export default function (pi: ExtensionAPI) {
  const embeddedModels = modelsData as JsonModel[];
  const customModels = customModelsData as JsonModel[];
  const patches = patchData as PatchData;

  /** (Re-)registers the provider with `base` after patches, custom models and discounts. */
  const registerLilac = (base: JsonModel[], discounts: Map<string, JsonDiscount> | null): void => {
    pi.registerProvider("lilac", {
      baseUrl: BASE_URL,
      apiKey: "$LILAC_API_KEY",
      api: "openai-completions",
      models: applyDiscounts(buildModels(base, customModels, patches), discounts),
    });
  };

  /** Shows the discount line for `modelId` in the "lilac" status slot. */
  const showStatus = (ctx: any, modelId?: string): void => {
    ctx.ui.setStatus("lilac", dimStatus(ctx, formatDiscountStatus(modelId)));
  };

  // Serve stale data immediately so the provider is usable without network access.
  const staleBase = loadStaleModels(embeddedModels);
  latestDiscounts = loadCachedDiscounts();
  registerLilac(staleBase, latestDiscounts);

  const DISCOUNT_ENTRY_TYPE = "lilac-discount";

  interface DiscountEntry {
    modelId: string;
    supplyState: string;
    discountPercent: number;
    creditMultiplier: number;
  }

  // Time of the last failed status fetch, used to avoid re-trying (and
  // re-waiting for the timeout) on every request while /status is unavailable.
  let lastStatusFailureTime = 0;

  /** Rebuilds `latestDiscounts` from the disk cache plus discount entries on the session branch. */
  function replayDiscountEvents(ctx: any): void {
    const replayed = loadCachedDiscounts() ?? new Map<string, JsonDiscount>();
    latestDiscounts = replayed;
    for (const entry of ctx.sessionManager.getBranch()) {
      if (entry.type === "custom" && entry.customType === DISCOUNT_ENTRY_TYPE && entry.data) {
        const d = entry.data as DiscountEntry;
        replayed.set(d.modelId, {
          supplyState: d.supplyState,
          discountPercent: d.discountPercent,
          creditMultiplier: d.creditMultiplier,
        });
      }
    }
  }

  pi.on("session_start", async (_event, ctx) => {
    // Cancel any revalidation still running for a previous session.
    revalidateAbort?.abort();
    revalidateAbort = new AbortController();
    const signal = revalidateAbort.signal;

    // Replay persisted discount state from the session (synchronous, zero-latency).
    replayDiscountEvents(ctx);

    // Show status immediately with replayed/cached data — don't block pi startup.
    const model = ctx.model;
    if (model?.provider === "lilac") {
      showStatus(ctx, model.id);
    }

    // Resolve the API key, then fetch live data. Provider and status are
    // hot-swapped when results arrive.
    const revalidate = async (): Promise<void> => {
      await resolveApiKey(ctx.modelRegistry);
      const apiKey = cachedApiKey;
      if (!apiKey || signal.aborted) return;

      const [liveModels, discounts] = await Promise.all([
        fetchLiveModels(apiKey, signal),
        fetchStatusDiscounts(apiKey, signal),
      ]);
      if (signal.aborted) return;

      if (discounts) {
        lastDiscountFetchTime = Date.now();
        cacheDiscounts(discounts);
        latestDiscounts = discounts;
      }

      if (liveModels && liveModels.length > 0) {
        const merged = mergeWithEmbedded(liveModels, embeddedModels);
        cacheModels(merged);
        registerLilac(merged, latestDiscounts);
      } else if (discounts) {
        registerLilac(staleBase, latestDiscounts);
      }

      if (model?.provider === "lilac") {
        showStatus(ctx, model.id);
      }
    };

    // Fire-and-forget. The catch covers the whole chain, including a failing
    // API key lookup, so no rejection is left unhandled.
    void revalidate().catch(() => { /* background revalidation errors are non-fatal */ });
  });

  pi.on("turn_end", async (_event, ctx) => {
    if (!ctx.model || ctx.model.provider !== "lilac" || !latestDiscounts) return;
    const discount = latestDiscounts.get(ctx.model.id);
    if (!discount) return;
    pi.appendEntry(DISCOUNT_ENTRY_TYPE, {
      modelId: ctx.model.id,
      supplyState: discount.supplyState,
      discountPercent: discount.discountPercent,
      creditMultiplier: discount.creditMultiplier,
    } as DiscountEntry);
  });

  pi.on("before_provider_request", async (_event, ctx) => {
    if (ctx.model?.provider !== "lilac") return;

    // Always show status for the active lilac model.
    showStatus(ctx, ctx.model.id);

    if (!cachedApiKey) return;

    const now = Date.now();
    if (latestDiscounts && now - lastDiscountFetchTime < STATUS_CACHE_TTL_MS) {
      return;
    }
    // A recent attempt failed: wait out the TTL before trying again.
    if (lastStatusFailureTime !== 0 && now - lastStatusFailureTime < STATUS_CACHE_TTL_MS) {
      return;
    }

    const discounts = await fetchStatusDiscounts(cachedApiKey);
    if (!discounts) {
      lastStatusFailureTime = now;
      return;
    }
    lastStatusFailureTime = 0;
    lastDiscountFetchTime = now;

    if (!discountsChanged(latestDiscounts, discounts)) {
      showStatus(ctx, ctx.model.id);
      return;
    }

    cacheDiscounts(discounts);
    latestDiscounts = discounts;

    registerLilac(loadStaleModels(embeddedModels), discounts);
    showStatus(ctx, ctx.model.id);
  });

  pi.on("model_select", async (event, ctx) => {
    if (event.model.provider === "lilac") {
      showStatus(ctx, event.model.id);
    } else {
      // Clear the slot when another provider's model is selected.
      ctx.ui.setStatus("lilac", undefined);
    }
  });

  pi.on("session_tree", async (_event, ctx) => {
    replayDiscountEvents(ctx);
    const model = ctx.model;
    if (model?.provider === "lilac") {
      showStatus(ctx, model.id);
    }
  });

  // vLLM's streaming parser intermittently emits finish_reason: "tool_calls" without
  // any delta.tool_calls chunks — even with tool_stream: true (set via zaiToolStream
  // in compat). Pi maps that to stopReason: "toolUse" but there are zero toolCall
  // blocks to execute, so the agent loop ends with nothing to do ("abrupt stop").
  // The message_end handler converts this to a retryable error so pi's auto-retry
  // mechanism re-prompts the agent.
  pi.on("message_end", async (event, mctx) => {
    const message = event.message;
    if (message.role !== "assistant") return;
    if (message.provider !== "lilac" && mctx.model?.provider !== "lilac") return;
    if (message.stopReason !== "toolUse") return;

    const content = message.content;
    const hasToolCalls = Array.isArray(content) &&
      content.some((block: any) => block.type === "toolCall");

    if (hasToolCalls) return;

    // The error message matches the "stream ended before" pattern in
    // _isRetryableError, which triggers automatic backoff-and-retry.
    return {
      message: {
        ...message,
        stopReason: "error",
        errorMessage: "stream ended before tool_calls were received (vLLM phantom tool_use)",
      },
    };
  });

  pi.on("session_shutdown", () => {
    revalidateAbort?.abort();
  });
}
670
+
671
+ export { fetchStatusDiscounts, applyDiscounts, loadCachedDiscounts, cacheDiscounts };
672
+ export type { JsonDiscount, JsonModel, PatchEntry, PatchData };