@tokagent/tokagentos 2.0.21 → 2.0.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tokagent/tokagentos",
3
- "version": "2.0.21",
3
+ "version": "2.0.23",
4
4
  "description": "tokagentOS CLI - Create and upgrade tokagentOS project templates",
5
5
  "type": "module",
6
6
  "bin": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tokagent/plugin-tokagent-billing",
3
- "version": "2.0.12",
3
+ "version": "2.0.14",
4
4
  "description": "elizaOS plugin: Web3 credit-billing routes and middleware for the tokagentos LLM gateway.",
5
5
  "type": "module",
6
6
  "publishConfig": { "access": "public" },
@@ -112,19 +112,12 @@ async function proxyToLiteLLM(
112
112
  return;
113
113
  }
114
114
 
115
- // Streaming requires duplex passthrough out of scope for this proxy
116
- // until we wire up SSE forwarding. Reject loudly so clients don't hang.
117
- if ((body as Record<string, unknown>).stream === true) {
118
- res.status(501).json({
119
- error: {
120
- type: "not_implemented",
121
- message:
122
- "Streaming responses are not yet supported by this billing proxy. " +
123
- "Set `stream: false` and retry.",
124
- },
125
- });
126
- return;
127
- }
115
+ // Detect streaming. plugin-openai (Vercel AI SDK) defaults to
116
+ // stream:true and there's no way to disable from the agent's chat flow,
117
+ // so we MUST support it. For non-stream we buffer the JSON response;
118
+ // for stream we pipe SSE bytes through and parse usage from the final
119
+ // chunk before committing billing.
120
+ const wantsStream = (body as Record<string, unknown>).stream === true;
128
121
 
129
122
  // ---- Auth + reserve ----
130
123
  const incoming = toIncomingMessage(req);
@@ -152,12 +145,27 @@ async function proxyToLiteLLM(
152
145
  const upstreamUrl = `${litellmBaseUrl.replace(/\/$/, "")}${upstreamPath}`;
153
146
  const upstreamHeaders = pickUpstreamHeaders(req, litellmApiKey);
154
147
 
148
+ // For streaming, request usage in the final SSE chunk (OpenAI's
149
+ // stream_options.include_usage convention — LiteLLM honors it). Without
150
+ // this we'd have no token counts and would commit zero, leaking PTON.
151
+ const upstreamBodyObj =
152
+ wantsStream
153
+ ? {
154
+ ...body,
155
+ stream_options: {
156
+ ...((body as { stream_options?: Record<string, unknown> })
157
+ .stream_options ?? {}),
158
+ include_usage: true,
159
+ },
160
+ }
161
+ : body;
162
+
155
163
  let upstreamRes: Response;
156
164
  try {
157
165
  upstreamRes = await fetch(upstreamUrl, {
158
166
  method: "POST",
159
167
  headers: upstreamHeaders,
160
- body: JSON.stringify(body),
168
+ body: JSON.stringify(upstreamBodyObj),
161
169
  });
162
170
  } catch (err) {
163
171
  await gate.release?.("released_error");
@@ -171,6 +179,136 @@ async function proxyToLiteLLM(
171
179
  return;
172
180
  }
173
181
 
182
+ // ---- STREAMING PATH ----
183
+ // For SSE we need raw write() access to the underlying ServerResponse.
184
+ // RouteResponse's .json()/.send() helpers buffer + close; we instead
185
+ // forward bytes as they arrive, parse data: lines to extract usage from
186
+ // the final chunk, then end the response and commit billing.
187
+ if (wantsStream) {
188
+ if (!upstreamRes.ok || !upstreamRes.body) {
189
+ await gate.release?.("released_error");
190
+ const errText = await upstreamRes.text().catch(() => "");
191
+ let errBody: unknown;
192
+ try {
193
+ errBody = errText ? JSON.parse(errText) : { error: "upstream_error" };
194
+ } catch {
195
+ errBody = { error: { type: "upstream_error", message: errText.slice(0, 500) } };
196
+ }
197
+ res.status(upstreamRes.status).json(errBody as object);
198
+ return;
199
+ }
200
+
201
+ // Bypass the .json()/.send() helpers — write SSE bytes directly to the
202
+ // underlying http.ServerResponse. The shim attaches helpers ON res so
203
+ // the native write/end/setHeader are still available beneath them.
204
+ const rawRes = res as unknown as {
205
+ statusCode?: number;
206
+ setHeader?: (n: string, v: string) => void;
207
+ write?: (chunk: string | Uint8Array) => boolean;
208
+ end?: () => void;
209
+ };
210
+ rawRes.statusCode = 200;
211
+ rawRes.setHeader?.("Content-Type", "text/event-stream; charset=utf-8");
212
+ rawRes.setHeader?.("Cache-Control", "no-cache, no-transform");
213
+ rawRes.setHeader?.("Connection", "keep-alive");
214
+ rawRes.setHeader?.("X-Accel-Buffering", "no");
215
+
216
+ const model =
217
+ typeof (body as Record<string, unknown>)["model"] === "string"
218
+ ? ((body as Record<string, unknown>)["model"] as string)
219
+ : "unknown";
220
+ let lastUsage: Record<string, number> | null = null;
221
+ let buffer = "";
222
+ const decoder = new TextDecoder();
223
+ const reader = upstreamRes.body.getReader();
224
+
225
+ try {
226
+ while (true) {
227
+ const { value, done } = await reader.read();
228
+ if (done) break;
229
+ const chunkText = decoder.decode(value, { stream: true });
230
+ // Forward to client verbatim. plugin-openai's SDK parses the SSE
231
+ // event stream — we don't transform.
232
+ rawRes.write?.(chunkText);
233
+ // Parse for usage extraction. SSE events are separated by blank
234
+ // lines; within an event, `data: <json>` carries the payload.
235
+ // The final usage chunk (when include_usage=true) is the LAST
236
+ // data line before [DONE], with an empty choices array + usage set.
237
+ buffer += chunkText;
238
+ const events = buffer.split("\n\n");
239
+ buffer = events.pop() ?? ""; // keep last (possibly partial) event
240
+ for (const evt of events) {
241
+ for (const line of evt.split("\n")) {
242
+ if (!line.startsWith("data:")) continue;
243
+ const data = line.slice(5).trim();
244
+ if (!data || data === "[DONE]") continue;
245
+ try {
246
+ const parsed = JSON.parse(data) as { usage?: Record<string, number> };
247
+ if (parsed.usage && typeof parsed.usage === "object") {
248
+ lastUsage = parsed.usage;
249
+ }
250
+ } catch {
251
+ // Ignore malformed chunks — keep streaming.
252
+ }
253
+ }
254
+ }
255
+ }
256
+ } catch (err) {
257
+ // Stream interrupted — best-effort release and end the response.
258
+ await gate.release?.("released_error");
259
+ try {
260
+ rawRes.end?.();
261
+ } catch {
262
+ /* response already ended */
263
+ }
264
+ return;
265
+ }
266
+
267
+ // Flush any final buffered bytes (rare — usually [DONE] ends the
268
+ // stream cleanly with a trailing blank line).
269
+ if (buffer.length > 0) rawRes.write?.(buffer);
270
+ rawRes.end?.();
271
+
272
+ // ---- Commit billing from extracted usage ----
273
+ if (lastUsage) {
274
+ const inputTokens = Number(
275
+ lastUsage["prompt_tokens"] ?? lastUsage["input_tokens"] ?? 0,
276
+ );
277
+ const outputTokens = Number(
278
+ lastUsage["completion_tokens"] ?? lastUsage["output_tokens"] ?? 0,
279
+ );
280
+ let actualUsd = 0;
281
+ try {
282
+ actualUsd = computeActualCostUsd({
283
+ model,
284
+ usage: lastUsage as Record<string, number>,
285
+ });
286
+ } catch {
287
+ actualUsd = 0;
288
+ }
289
+ try {
290
+ await gate.commit?.(actualUsd, {
291
+ model,
292
+ inputTokens,
293
+ outputTokens,
294
+ status: "ok",
295
+ });
296
+ } catch {
297
+ /* commit failure is non-fatal — user already got their response */
298
+ }
299
+ } else {
300
+ // No usage chunk arrived — upstream didn't honor include_usage, or
301
+ // the stream ended abnormally. Commit zero so we don't double-charge
302
+ // a reservation that may have been zero-sized anyway.
303
+ try {
304
+ await gate.commit?.(0, { model, status: "ok" });
305
+ } catch {
306
+ /* swallow */
307
+ }
308
+ }
309
+ return;
310
+ }
311
+
174
312
  // Parse the JSON body once — we both relay it to the client AND extract
175
313
  // usage for billing commit.
176
314
  const upstreamText = await upstreamRes.text();
@@ -261,6 +399,63 @@ async function handleChatCompletions(
261
399
  return proxyToLiteLLM(req, res, "/v1/chat/completions");
262
400
  }
263
401
 
402
+ /**
403
+ * OpenAI-compatible model catalog. plugin-openai (and many OpenAI SDKs)
404
+ * call GET /v1/models on startup to validate the API key — if this returns
405
+ * 401/404, the plugin marks the provider unhealthy and the agent's chat
406
+ * composer never gets an active backend.
407
+ *
408
+ * We return a static list of the models the gateway actually supports
409
+ * (currently glm-4.7 on Tokamak's LiteLLM). Two reasons static beats
410
+ * proxying upstream:
411
+ * 1. Tokamak's LiteLLM /v1/models requires the operator's key, not the
412
+ * user's sk-ai-* — proxying would either expose the operator key or
413
+ * require a separate auth path. Static avoids the leak.
414
+ * 2. The billing layer's allowlist is the source of truth for "what
415
+ * models a billing client can use"; the upstream catalog is the
416
+ * operator's concern. Decoupling them lets us add/remove allowlisted
417
+ * models without redeploying the upstream.
418
+ *
419
+ * Auth: gated by resolveBillingIdentity so only authenticated clients see
420
+ * the list. Responds 401 with an "invalid_auth" error envelope on bad auth.
421
+ */
422
+ async function handleModels(
423
+ req: RouteRequest,
424
+ res: RouteResponse,
425
+ _runtime: IAgentRuntime,
426
+ ): Promise<void> {
427
+ if (!isBillingStateInitialized()) return billingUnavailable(res);
428
+ const state = getBillingState();
429
+ if (!state.config.enabled) return billingUnavailable(res);
430
+
431
+ // Auth check — applyBillingGate is overkill here (no model/body to gate
432
+ // on), so resolve the caller's identity directly and answer 401 ourselves.
433
+ const incoming = toIncomingMessage(req);
434
+ const { resolveBillingIdentity } = await import(
435
+ "../middleware/api-key-resolve.js"
436
+ );
437
+ const identity = await resolveBillingIdentity(incoming);
438
+ if (!identity) {
439
+ res.status(401).json({
440
+ error: { type: "invalid_auth", message: "Authentication required." },
441
+ });
442
+ return;
443
+ }
444
+
445
+ const now = Math.floor(Date.now() / 1000);
446
+ res.status(200).json({
447
+ object: "list",
448
+ data: [
449
+ {
450
+ id: "glm-4.7",
451
+ object: "model",
452
+ created: now,
453
+ owned_by: "tokamak",
454
+ },
455
+ ],
456
+ });
457
+ }
458
+
264
459
  export const messagesProxyRoutes: Route[] = [
265
460
  {
266
461
  type: "POST",
@@ -278,6 +473,14 @@ export const messagesProxyRoutes: Route[] = [
278
473
  name: "billing-chat-completions-proxy",
279
474
  handler: handleChatCompletions,
280
475
  },
476
+ {
477
+ type: "GET",
478
+ path: "/v1/models",
479
+ rawPath: true,
480
+ public: true,
481
+ name: "billing-models-catalog",
482
+ handler: handleModels,
483
+ },
281
484
  ];
282
485
 
283
486
  export function getMessagesProxyRoutes(mode: "server" | "client"): Route[] {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "version": "1.0.0",
3
- "generatedAt": "2026-05-19T20:35:31.260Z",
3
+ "generatedAt": "2026-05-19T21:13:11.158Z",
4
4
  "repoUrl": "https://github.com/elizaos/eliza",
5
5
  "templates": [
6
6
  {