npm - @tokagent/tokagentos - Versions diffs - 2.0.21 → 2.0.23 - Mend

@tokagent/tokagentos 2.0.21 → 2.0.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tokagent/tokagentos",
-  "version": "2.0.21",
+  "version": "2.0.23",
   "description": "tokagentOS CLI - Create and upgrade tokagentOS project templates",
   "type": "module",
   "bin": {

package/templates/fullstack-app/plugins/plugin-tokagent-billing/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tokagent/plugin-tokagent-billing",
-  "version": "2.0.12",
+  "version": "2.0.14",
   "description": "elizaOS plugin: Web3 credit-billing routes and middleware for the tokagentos LLM gateway.",
   "type": "module",
   "publishConfig": { "access": "public" },

package/templates/fullstack-app/plugins/plugin-tokagent-billing/src/routes/messages-proxy-routes.ts CHANGED Viewed

@@ -112,19 +112,12 @@ async function proxyToLiteLLM(
     return;
   }
-  // Streaming requires duplex passthrough — out of scope for this proxy
-  // until we wire up SSE forwarding. Reject loudly so clients don't hang.
-  if ((body as Record<string, unknown>).stream === true) {
-    res.status(501).json({
-      error: {
-        type: "not_implemented",
-        message:
-          "Streaming responses are not yet supported by this billing proxy. " +
-          "Set `stream: false` and retry.",
-      },
-    });
-    return;
-  }
+  // Detect streaming. plugin-openai (Vercel AI SDK) defaults to
+  // stream:true and there's no way to disable from the agent's chat flow,
+  // so we MUST support it. For non-stream we buffer the JSON response;
+  // for stream we pipe SSE bytes through and parse usage from the final
+  // chunk before committing billing.
+  const wantsStream = (body as Record<string, unknown>).stream === true;
   // ---- Auth + reserve ----
   const incoming = toIncomingMessage(req);
@@ -152,12 +145,27 @@ async function proxyToLiteLLM(
   const upstreamUrl = `${litellmBaseUrl.replace(/\/$/, "")}${upstreamPath}`;
   const upstreamHeaders = pickUpstreamHeaders(req, litellmApiKey);
+  // For streaming, request usage in the final SSE chunk (OpenAI's
+  // stream_options.include_usage convention — LiteLLM honors it). Without
+  // this we'd have no token counts and would commit zero, leaking PTON.
+  const upstreamBodyObj =
+    wantsStream
+      ? {
+          ...body,
+          stream_options: {
+            ...((body as { stream_options?: Record<string, unknown> })
+              .stream_options ?? {}),
+            include_usage: true,
+          },
+        }
+      : body;
   let upstreamRes: Response;
   try {
     upstreamRes = await fetch(upstreamUrl, {
       method: "POST",
       headers: upstreamHeaders,
-      body: JSON.stringify(body),
+      body: JSON.stringify(upstreamBodyObj),
     });
   } catch (err) {
     await gate.release?.("released_error");
@@ -171,6 +179,136 @@ async function proxyToLiteLLM(
     return;
   }
+  // ---- STREAMING PATH ----
+  // For SSE we need raw write() access to the underlying ServerResponse.
+  // RouteResponse's .json()/.send() helpers buffer + close; we instead
+  // forward bytes as they arrive, parse data: lines to extract usage from
+  // the final chunk, then end the response and commit billing.
+  if (wantsStream) {
+    if (!upstreamRes.ok || !upstreamRes.body) {
+      await gate.release?.("released_error");
+      const errText = await upstreamRes.text().catch(() => "");
+      let errBody: unknown;
+      try {
+        errBody = errText ? JSON.parse(errText) : { error: "upstream_error" };
+      } catch {
+        errBody = { error: { type: "upstream_error", message: errText.slice(0, 500) } };
+      }
+      res.status(upstreamRes.status).json(errBody as object);
+      return;
+    }
+    // Bypass the .json()/.send() helpers — write SSE bytes directly to the
+    // underlying http.ServerResponse. The shim attaches helpers ON res so
+    // the native write/end/setHeader are still available beneath them.
+    const rawRes = res as unknown as {
+      statusCode?: number;
+      setHeader?: (n: string, v: string) => void;
+      write?: (chunk: string | Uint8Array) => boolean;
+      end?: () => void;
+    };
+    rawRes.statusCode = 200;
+    rawRes.setHeader?.("Content-Type", "text/event-stream; charset=utf-8");
+    rawRes.setHeader?.("Cache-Control", "no-cache, no-transform");
+    rawRes.setHeader?.("Connection", "keep-alive");
+    rawRes.setHeader?.("X-Accel-Buffering", "no");
+    const model =
+      typeof (body as Record<string, unknown>)["model"] === "string"
+        ? ((body as Record<string, unknown>)["model"] as string)
+        : "unknown";
+    let lastUsage: Record<string, number> | null = null;
+    let buffer = "";
+    const decoder = new TextDecoder();
+    const reader = upstreamRes.body.getReader();
+    try {
+      while (true) {
+        const { value, done } = await reader.read();
+        if (done) break;
+        const chunkText = decoder.decode(value, { stream: true });
+        // Forward to client verbatim. plugin-openai's SDK parses the SSE
+        // event stream — we don't transform.
+        rawRes.write?.(chunkText);
+        // Parse for usage extraction. SSE events are separated by blank
+        // lines; within an event, `data: <json>` carries the payload.
+        // The final usage chunk (when include_usage=true) is the LAST
+        // data line before [DONE], with an empty `choices` array and `usage` set.
+        buffer += chunkText;
+        const events = buffer.split("\n\n");
+        buffer = events.pop() ?? ""; // keep last (possibly partial) event
+        for (const evt of events) {
+          for (const line of evt.split("\n")) {
+            if (!line.startsWith("data:")) continue;
+            const data = line.slice(5).trim();
+            if (!data || data === "[DONE]") continue;
+            try {
+              const parsed = JSON.parse(data) as { usage?: Record<string, number> };
+              if (parsed.usage && typeof parsed.usage === "object") {
+                lastUsage = parsed.usage;
+              }
+            } catch {
+              // Ignore malformed chunks — keep streaming.
+            }
+          }
+        }
+      }
+    } catch (err) {
+      // Stream interrupted — best-effort release and end the response.
+      await gate.release?.("released_error");
+      try {
+        rawRes.end?.();
+      } catch {
+        /* response already ended */
+      }
+      return;
+    }
+    // Flush any final buffered bytes (rare — usually [DONE] ends the
+    // stream cleanly with a trailing blank line).
+    if (buffer.length > 0) rawRes.write?.(buffer);
+    rawRes.end?.();
+    // ---- Commit billing from extracted usage ----
+    if (lastUsage) {
+      const inputTokens = Number(
+        lastUsage["prompt_tokens"] ?? lastUsage["input_tokens"] ?? 0,
+      );
+      const outputTokens = Number(
+        lastUsage["completion_tokens"] ?? lastUsage["output_tokens"] ?? 0,
+      );
+      let actualUsd = 0;
+      try {
+        actualUsd = computeActualCostUsd({
+          model,
+          usage: lastUsage as Record<string, number>,
+        });
+      } catch {
+        actualUsd = 0;
+      }
+      try {
+        await gate.commit?.(actualUsd, {
+          model,
+          inputTokens,
+          outputTokens,
+          status: "ok",
+        });
+      } catch {
+        /* commit failure is non-fatal — user already got their response */
+      }
+    } else {
+      // No usage chunk arrived — upstream didn't honor include_usage, or
+      // the stream ended abnormally. Commit zero so we don't double-charge
+      // a reservation that may have been zero-sized anyway.
+      try {
+        await gate.commit?.(0, { model, status: "ok" });
+      } catch {
+        /* swallow */
+      }
+    }
+    return;
+  }
   // Parse the JSON body once — we both relay it to the client AND extract
   // usage for billing commit.
   const upstreamText = await upstreamRes.text();
@@ -261,6 +399,63 @@ async function handleChatCompletions(
   return proxyToLiteLLM(req, res, "/v1/chat/completions");
 }
+/**
+ * OpenAI-compatible model catalog. plugin-openai (and many OpenAI SDKs)
+ * call GET /v1/models on startup to validate the API key — if this returns
+ * 401/404, the plugin marks the provider unhealthy and the agent's chat
+ * composer never gets an active backend.
+ *
+ * We return a static list of the models the gateway actually supports
+ * (currently glm-4.7 on Tokamak's LiteLLM). Two reasons static beats
+ * proxying upstream:
+ *   1. Tokamak's LiteLLM /v1/models requires the operator's key, not the
+ *      user's sk-ai-* — proxying would either expose the operator key or
+ *      require a separate auth path. Static avoids the leak.
+ *   2. The billing layer's allowlist is the source of truth for "what
+ *      models a billing client can use"; the upstream catalog is the
+ *      operator's concern. Decoupling them lets us add/remove allowlisted
+ *      models without redeploying the upstream.
+ *
+ * Availability: responds via billingUnavailable(res) when billing state is
+ * not initialized or billing is disabled in config.
+ *
+ * Auth: the caller is identified with resolveBillingIdentity (this handler
+ * does NOT call applyBillingGate — there is no model/body to gate on).
+ * Requests without a resolvable identity get a 401 with an `invalid_auth`
+ * error envelope.
+ * NOTE(review): confirm this envelope matches what the chat routes emit on
+ * bad auth — that code is not visible from this handler.
+ *
+ * @param req - Incoming route request; converted with toIncomingMessage
+ *   before identity resolution.
+ * @param res - Route response used to send the JSON result.
+ * @param _runtime - Unused by this handler.
+ */
+async function handleModels(
+  req: RouteRequest,
+  res: RouteResponse,
+  _runtime: IAgentRuntime,
+): Promise<void> {
+  // Bail out early when billing is not set up or is switched off.
+  if (!isBillingStateInitialized()) return billingUnavailable(res);
+  const state = getBillingState();
+  if (!state.config.enabled) return billingUnavailable(res);
+  // Auth check — resolve the caller's billing identity directly.
+  // applyBillingGate would be overkill here (no model/body to gate on),
+  // so the 401 envelope below is built by hand.
+  const incoming = toIncomingMessage(req);
+  // The resolver module is loaded lazily, on each request, via dynamic import.
+  const { resolveBillingIdentity } = await import(
+    "../middleware/api-key-resolve.js"
+  );
+  const identity = await resolveBillingIdentity(incoming);
+  if (!identity) {
+    res.status(401).json({
+      error: { type: "invalid_auth", message: "Authentication required." },
+    });
+    return;
+  }
+  // Unix seconds at request time; reported as the model's `created` value,
+  // so it changes on every call rather than being a fixed release timestamp.
+  const now = Math.floor(Date.now() / 1000);
+  res.status(200).json({
+    object: "list",
+    data: [
+      {
+        id: "glm-4.7",
+        object: "model",
+        created: now,
+        owned_by: "tokamak",
+      },
+    ],
+  });
+}
 export const messagesProxyRoutes: Route[] = [
   {
     type: "POST",
@@ -278,6 +473,14 @@ export const messagesProxyRoutes: Route[] = [
     name: "billing-chat-completions-proxy",
     handler: handleChatCompletions,
   },
+  {
+    type: "GET",
+    path: "/v1/models",
+    rawPath: true,
+    public: true,
+    name: "billing-models-catalog",
+    handler: handleModels,
+  },
 ];
 export function getMessagesProxyRoutes(mode: "server" | "client"): Route[] {

package/templates-manifest.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "version": "1.0.0",
-  "generatedAt": "2026-05-19T20:35:31.260Z",
+  "generatedAt": "2026-05-19T21:13:11.158Z",
   "repoUrl": "https://github.com/elizaos/eliza",
   "templates": [
     {