@tokagent/tokagentos 2.0.21 → 2.0.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
package/templates/fullstack-app/plugins/plugin-tokagent-billing/src/routes/messages-proxy-routes.ts
CHANGED
|
@@ -112,19 +112,12 @@ async function proxyToLiteLLM(
|
|
|
112
112
|
return;
|
|
113
113
|
}
|
|
114
114
|
|
|
115
|
-
//
|
|
116
|
-
//
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
message:
|
|
122
|
-
"Streaming responses are not yet supported by this billing proxy. " +
|
|
123
|
-
"Set `stream: false` and retry.",
|
|
124
|
-
},
|
|
125
|
-
});
|
|
126
|
-
return;
|
|
127
|
-
}
|
|
115
|
+
// Detect streaming. plugin-openai (Vercel AI SDK) defaults to
|
|
116
|
+
// stream:true and there's no way to disable from the agent's chat flow,
|
|
117
|
+
// so we MUST support it. For non-stream we buffer the JSON response;
|
|
118
|
+
// for stream we pipe SSE bytes through and parse usage from the final
|
|
119
|
+
// chunk before committing billing.
|
|
120
|
+
const wantsStream = (body as Record<string, unknown>).stream === true;
|
|
128
121
|
|
|
129
122
|
// ---- Auth + reserve ----
|
|
130
123
|
const incoming = toIncomingMessage(req);
|
|
@@ -152,12 +145,27 @@ async function proxyToLiteLLM(
|
|
|
152
145
|
const upstreamUrl = `${litellmBaseUrl.replace(/\/$/, "")}${upstreamPath}`;
|
|
153
146
|
const upstreamHeaders = pickUpstreamHeaders(req, litellmApiKey);
|
|
154
147
|
|
|
148
|
+
// For streaming, request usage in the final SSE chunk (OpenAI's
|
|
149
|
+
// stream_options.include_usage convention — LiteLLM honors it). Without
|
|
150
|
+
// this we'd have no token counts and would commit zero, leaking PTON.
|
|
151
|
+
const upstreamBodyObj =
|
|
152
|
+
wantsStream
|
|
153
|
+
? {
|
|
154
|
+
...body,
|
|
155
|
+
stream_options: {
|
|
156
|
+
...((body as { stream_options?: Record<string, unknown> })
|
|
157
|
+
.stream_options ?? {}),
|
|
158
|
+
include_usage: true,
|
|
159
|
+
},
|
|
160
|
+
}
|
|
161
|
+
: body;
|
|
162
|
+
|
|
155
163
|
let upstreamRes: Response;
|
|
156
164
|
try {
|
|
157
165
|
upstreamRes = await fetch(upstreamUrl, {
|
|
158
166
|
method: "POST",
|
|
159
167
|
headers: upstreamHeaders,
|
|
160
|
-
body: JSON.stringify(body),
|
|
168
|
+
body: JSON.stringify(upstreamBodyObj),
|
|
161
169
|
});
|
|
162
170
|
} catch (err) {
|
|
163
171
|
await gate.release?.("released_error");
|
|
@@ -171,6 +179,136 @@ async function proxyToLiteLLM(
|
|
|
171
179
|
return;
|
|
172
180
|
}
|
|
173
181
|
|
|
182
|
+
// ---- STREAMING PATH ----
|
|
183
|
+
// For SSE we need raw write() access to the underlying ServerResponse.
|
|
184
|
+
// RouteResponse's .json()/.send() helpers buffer + close; we instead
|
|
185
|
+
// forward bytes as they arrive, parse data: lines to extract usage from
|
|
186
|
+
// the final chunk, then end the response and commit billing.
|
|
187
|
+
if (wantsStream) {
|
|
188
|
+
if (!upstreamRes.ok || !upstreamRes.body) {
|
|
189
|
+
await gate.release?.("released_error");
|
|
190
|
+
const errText = await upstreamRes.text().catch(() => "");
|
|
191
|
+
let errBody: unknown;
|
|
192
|
+
try {
|
|
193
|
+
errBody = errText ? JSON.parse(errText) : { error: "upstream_error" };
|
|
194
|
+
} catch {
|
|
195
|
+
errBody = { error: { type: "upstream_error", message: errText.slice(0, 500) } };
|
|
196
|
+
}
|
|
197
|
+
res.status(upstreamRes.status).json(errBody as object);
|
|
198
|
+
return;
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
// Bypass the .json()/.send() helpers — write SSE bytes directly to the
|
|
202
|
+
// underlying http.ServerResponse. The shim attaches helpers ON res so
|
|
203
|
+
// the native write/end/setHeader are still available beneath them.
|
|
204
|
+
const rawRes = res as unknown as {
|
|
205
|
+
statusCode?: number;
|
|
206
|
+
setHeader?: (n: string, v: string) => void;
|
|
207
|
+
write?: (chunk: string | Uint8Array) => boolean;
|
|
208
|
+
end?: () => void;
|
|
209
|
+
};
|
|
210
|
+
rawRes.statusCode = 200;
|
|
211
|
+
rawRes.setHeader?.("Content-Type", "text/event-stream; charset=utf-8");
|
|
212
|
+
rawRes.setHeader?.("Cache-Control", "no-cache, no-transform");
|
|
213
|
+
rawRes.setHeader?.("Connection", "keep-alive");
|
|
214
|
+
rawRes.setHeader?.("X-Accel-Buffering", "no");
|
|
215
|
+
|
|
216
|
+
const model =
|
|
217
|
+
typeof (body as Record<string, unknown>)["model"] === "string"
|
|
218
|
+
? ((body as Record<string, unknown>)["model"] as string)
|
|
219
|
+
: "unknown";
|
|
220
|
+
let lastUsage: Record<string, number> | null = null;
|
|
221
|
+
let buffer = "";
|
|
222
|
+
const decoder = new TextDecoder();
|
|
223
|
+
const reader = upstreamRes.body.getReader();
|
|
224
|
+
|
|
225
|
+
try {
|
|
226
|
+
while (true) {
|
|
227
|
+
const { value, done } = await reader.read();
|
|
228
|
+
if (done) break;
|
|
229
|
+
const chunkText = decoder.decode(value, { stream: true });
|
|
230
|
+
// Forward to client verbatim. plugin-openai's SDK parses the SSE
|
|
231
|
+
// event stream — we don't transform.
|
|
232
|
+
rawRes.write?.(chunkText);
|
|
233
|
+
// Parse for usage extraction. SSE events are separated by blank
|
|
234
|
+
// lines; within an event, `data: <json>` carries the payload.
|
|
235
|
+
// The final usage chunk (when include_usage=true) is the LAST
|
|
236
|
+
// data line before [DONE], with an empty `choices` array + usage set.
|
|
237
|
+
buffer += chunkText;
|
|
238
|
+
const events = buffer.split("\n\n");
|
|
239
|
+
buffer = events.pop() ?? ""; // keep last (possibly partial) event
|
|
240
|
+
for (const evt of events) {
|
|
241
|
+
for (const line of evt.split("\n")) {
|
|
242
|
+
if (!line.startsWith("data:")) continue;
|
|
243
|
+
const data = line.slice(5).trim();
|
|
244
|
+
if (!data || data === "[DONE]") continue;
|
|
245
|
+
try {
|
|
246
|
+
const parsed = JSON.parse(data) as { usage?: Record<string, number> };
|
|
247
|
+
if (parsed.usage && typeof parsed.usage === "object") {
|
|
248
|
+
lastUsage = parsed.usage;
|
|
249
|
+
}
|
|
250
|
+
} catch {
|
|
251
|
+
// Ignore malformed chunks — keep streaming.
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
} catch (err) {
|
|
257
|
+
// Stream interrupted — best-effort release and end the response.
|
|
258
|
+
await gate.release?.("released_error");
|
|
259
|
+
try {
|
|
260
|
+
rawRes.end?.();
|
|
261
|
+
} catch {
|
|
262
|
+
/* response already ended */
|
|
263
|
+
}
|
|
264
|
+
return;
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
// Flush any final buffered bytes (rare — usually [DONE] ends the
|
|
268
|
+
// stream cleanly with a trailing blank line).
|
|
269
|
+
if (buffer.length > 0) rawRes.write?.(buffer);
|
|
270
|
+
rawRes.end?.();
|
|
271
|
+
|
|
272
|
+
// ---- Commit billing from extracted usage ----
|
|
273
|
+
if (lastUsage) {
|
|
274
|
+
const inputTokens = Number(
|
|
275
|
+
lastUsage["prompt_tokens"] ?? lastUsage["input_tokens"] ?? 0,
|
|
276
|
+
);
|
|
277
|
+
const outputTokens = Number(
|
|
278
|
+
lastUsage["completion_tokens"] ?? lastUsage["output_tokens"] ?? 0,
|
|
279
|
+
);
|
|
280
|
+
let actualUsd = 0;
|
|
281
|
+
try {
|
|
282
|
+
actualUsd = computeActualCostUsd({
|
|
283
|
+
model,
|
|
284
|
+
usage: lastUsage as Record<string, number>,
|
|
285
|
+
});
|
|
286
|
+
} catch {
|
|
287
|
+
actualUsd = 0;
|
|
288
|
+
}
|
|
289
|
+
try {
|
|
290
|
+
await gate.commit?.(actualUsd, {
|
|
291
|
+
model,
|
|
292
|
+
inputTokens,
|
|
293
|
+
outputTokens,
|
|
294
|
+
status: "ok",
|
|
295
|
+
});
|
|
296
|
+
} catch {
|
|
297
|
+
/* commit failure is non-fatal — user already got their response */
|
|
298
|
+
}
|
|
299
|
+
} else {
|
|
300
|
+
// No usage chunk arrived — upstream didn't honor include_usage, or
|
|
301
|
+
// the stream ended abnormally. Commit zero so we don't double-charge
|
|
302
|
+
// a reservation that may have been zero-sized anyway.
|
|
303
|
+
try {
|
|
304
|
+
await gate.commit?.(0, { model, status: "ok" });
|
|
305
|
+
} catch {
|
|
306
|
+
/* swallow */
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
return;
|
|
310
|
+
}
|
|
311
|
+
|
|
174
312
|
// Parse the JSON body once — we both relay it to the client AND extract
|
|
175
313
|
// usage for billing commit.
|
|
176
314
|
const upstreamText = await upstreamRes.text();
|
|
@@ -261,6 +399,63 @@ async function handleChatCompletions(
|
|
|
261
399
|
return proxyToLiteLLM(req, res, "/v1/chat/completions");
|
|
262
400
|
}
|
|
263
401
|
|
|
402
|
+
/**
|
|
403
|
+
* OpenAI-compatible model catalog. plugin-openai (and many OpenAI SDKs)
|
|
404
|
+
* call GET /v1/models on startup to validate the API key — if this returns
|
|
405
|
+
* 401/404, the plugin marks the provider unhealthy and the agent's chat
|
|
406
|
+
* composer never gets an active backend.
|
|
407
|
+
*
|
|
408
|
+
* We return a static list of the models the gateway actually supports
|
|
409
|
+
* (currently glm-4.7 on Tokamak's LiteLLM). Two reasons static beats
|
|
410
|
+
* proxying upstream:
|
|
411
|
+
* 1. Tokamak's LiteLLM /v1/models requires the operator's key, not the
|
|
412
|
+
* user's sk-ai-* — proxying would either expose the operator key or
|
|
413
|
+
* require a separate auth path. Static avoids the leak.
|
|
414
|
+
* 2. The billing layer's allowlist is the source of truth for "what
|
|
415
|
+
* models a billing client can use"; the upstream catalog is the
|
|
416
|
+
* operator's concern. Decoupling them lets us add/remove allowlisted
|
|
417
|
+
* models without redeploying the upstream.
|
|
418
|
+
*
|
|
419
|
+
* Auth: still gated by resolveBillingIdentity so only authenticated clients see
|
|
420
|
+
* the list. Returns the same 401 envelope as the chat routes on bad auth.
|
|
421
|
+
*/
|
|
422
|
+
async function handleModels(
|
|
423
|
+
req: RouteRequest,
|
|
424
|
+
res: RouteResponse,
|
|
425
|
+
_runtime: IAgentRuntime,
|
|
426
|
+
): Promise<void> {
|
|
427
|
+
if (!isBillingStateInitialized()) return billingUnavailable(res);
|
|
428
|
+
const state = getBillingState();
|
|
429
|
+
if (!state.config.enabled) return billingUnavailable(res);
|
|
430
|
+
|
|
431
|
+
// Auth check — applyBillingGate is overkill here (no model/body to gate
|
|
432
|
+
// on), so resolve the identity directly while keeping the auth-error envelope consistent across routes.
|
|
433
|
+
const incoming = toIncomingMessage(req);
|
|
434
|
+
const { resolveBillingIdentity } = await import(
|
|
435
|
+
"../middleware/api-key-resolve.js"
|
|
436
|
+
);
|
|
437
|
+
const identity = await resolveBillingIdentity(incoming);
|
|
438
|
+
if (!identity) {
|
|
439
|
+
res.status(401).json({
|
|
440
|
+
error: { type: "invalid_auth", message: "Authentication required." },
|
|
441
|
+
});
|
|
442
|
+
return;
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
const now = Math.floor(Date.now() / 1000);
|
|
446
|
+
res.status(200).json({
|
|
447
|
+
object: "list",
|
|
448
|
+
data: [
|
|
449
|
+
{
|
|
450
|
+
id: "glm-4.7",
|
|
451
|
+
object: "model",
|
|
452
|
+
created: now,
|
|
453
|
+
owned_by: "tokamak",
|
|
454
|
+
},
|
|
455
|
+
],
|
|
456
|
+
});
|
|
457
|
+
}
|
|
458
|
+
|
|
264
459
|
export const messagesProxyRoutes: Route[] = [
|
|
265
460
|
{
|
|
266
461
|
type: "POST",
|
|
@@ -278,6 +473,14 @@ export const messagesProxyRoutes: Route[] = [
|
|
|
278
473
|
name: "billing-chat-completions-proxy",
|
|
279
474
|
handler: handleChatCompletions,
|
|
280
475
|
},
|
|
476
|
+
{
|
|
477
|
+
type: "GET",
|
|
478
|
+
path: "/v1/models",
|
|
479
|
+
rawPath: true,
|
|
480
|
+
public: true,
|
|
481
|
+
name: "billing-models-catalog",
|
|
482
|
+
handler: handleModels,
|
|
483
|
+
},
|
|
281
484
|
];
|
|
282
485
|
|
|
283
486
|
export function getMessagesProxyRoutes(mode: "server" | "client"): Route[] {
|