@tokagent/tokagentos 2.0.22 → 2.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tokagent/tokagentos",
3
- "version": "2.0.22",
3
+ "version": "2.0.24",
4
4
  "description": "tokagentOS CLI - Create and upgrade tokagentOS project templates",
5
5
  "type": "module",
6
6
  "bin": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tokagent/plugin-tokagent-billing",
3
- "version": "2.0.13",
3
+ "version": "2.0.14",
4
4
  "description": "elizaOS plugin: Web3 credit-billing routes and middleware for the tokagentos LLM gateway.",
5
5
  "type": "module",
6
6
  "publishConfig": { "access": "public" },
@@ -112,19 +112,12 @@ async function proxyToLiteLLM(
112
112
  return;
113
113
  }
114
114
 
115
- // Streaming requires duplex passthrough out of scope for this proxy
116
- // until we wire up SSE forwarding. Reject loudly so clients don't hang.
117
- if ((body as Record<string, unknown>).stream === true) {
118
- res.status(501).json({
119
- error: {
120
- type: "not_implemented",
121
- message:
122
- "Streaming responses are not yet supported by this billing proxy. " +
123
- "Set `stream: false` and retry.",
124
- },
125
- });
126
- return;
127
- }
115
+ // Detect streaming. plugin-openai (Vercel AI SDK) defaults to
116
+ // stream:true and there's no way to disable from the agent's chat flow,
117
+ // so we MUST support it. For non-stream we buffer the JSON response;
118
+ // for stream we pipe SSE bytes through and parse usage from the final
119
+ // chunk before committing billing.
120
+ const wantsStream = (body as Record<string, unknown>).stream === true;
128
121
 
129
122
  // ---- Auth + reserve ----
130
123
  const incoming = toIncomingMessage(req);
@@ -152,12 +145,27 @@ async function proxyToLiteLLM(
152
145
  const upstreamUrl = `${litellmBaseUrl.replace(/\/$/, "")}${upstreamPath}`;
153
146
  const upstreamHeaders = pickUpstreamHeaders(req, litellmApiKey);
154
147
 
148
+ // For streaming, request usage in the final SSE chunk (OpenAI's
149
+ // stream_options.include_usage convention — LiteLLM honors it). Without
150
+ // this we'd have no token counts and would commit zero, leaking PTON.
151
+ const upstreamBodyObj =
152
+ wantsStream
153
+ ? {
154
+ ...body,
155
+ stream_options: {
156
+ ...((body as { stream_options?: Record<string, unknown> })
157
+ .stream_options ?? {}),
158
+ include_usage: true,
159
+ },
160
+ }
161
+ : body;
162
+
155
163
  let upstreamRes: Response;
156
164
  try {
157
165
  upstreamRes = await fetch(upstreamUrl, {
158
166
  method: "POST",
159
167
  headers: upstreamHeaders,
160
- body: JSON.stringify(body),
168
+ body: JSON.stringify(upstreamBodyObj),
161
169
  });
162
170
  } catch (err) {
163
171
  await gate.release?.("released_error");
@@ -171,6 +179,136 @@ async function proxyToLiteLLM(
171
179
  return;
172
180
  }
173
181
 
182
+ // ---- STREAMING PATH ----
183
+ // For SSE we need raw write() access to the underlying ServerResponse.
184
+ // RouteResponse's .json()/.send() helpers buffer + close; we instead
185
+ // forward bytes as they arrive, parse data: lines to extract usage from
186
+ // the final chunk, then end the response and commit billing.
187
+ if (wantsStream) {
188
+ if (!upstreamRes.ok || !upstreamRes.body) {
189
+ await gate.release?.("released_error");
190
+ const errText = await upstreamRes.text().catch(() => "");
191
+ let errBody: unknown;
192
+ try {
193
+ errBody = errText ? JSON.parse(errText) : { error: "upstream_error" };
194
+ } catch {
195
+ errBody = { error: { type: "upstream_error", message: errText.slice(0, 500) } };
196
+ }
197
+ res.status(upstreamRes.status).json(errBody as object);
198
+ return;
199
+ }
200
+
201
+ // Bypass the .json()/.send() helpers — write SSE bytes directly to the
202
+ // underlying http.ServerResponse. The shim attaches helpers ON res so
203
+ // the native write/end/setHeader are still available beneath them.
204
+ const rawRes = res as unknown as {
205
+ statusCode?: number;
206
+ setHeader?: (n: string, v: string) => void;
207
+ write?: (chunk: string | Uint8Array) => boolean;
208
+ end?: () => void;
209
+ };
210
+ rawRes.statusCode = 200;
211
+ rawRes.setHeader?.("Content-Type", "text/event-stream; charset=utf-8");
212
+ rawRes.setHeader?.("Cache-Control", "no-cache, no-transform");
213
+ rawRes.setHeader?.("Connection", "keep-alive");
214
+ rawRes.setHeader?.("X-Accel-Buffering", "no");
215
+
216
+ const model =
217
+ typeof (body as Record<string, unknown>)["model"] === "string"
218
+ ? ((body as Record<string, unknown>)["model"] as string)
219
+ : "unknown";
220
+ let lastUsage: Record<string, number> | null = null;
221
+ let buffer = "";
222
+ const decoder = new TextDecoder();
223
+ const reader = upstreamRes.body.getReader();
224
+
225
+ try {
226
+ while (true) {
227
+ const { value, done } = await reader.read();
228
+ if (done) break;
229
+ const chunkText = decoder.decode(value, { stream: true });
230
+ // Forward to client verbatim. plugin-openai's SDK parses the SSE
231
+ // event stream — we don't transform.
232
+ rawRes.write?.(chunkText);
233
+ // Parse for usage extraction. SSE events are separated by blank
234
+ // lines; within an event, `data: <json>` carries the payload.
235
+ // The final usage chunk (when include_usage=true) is the LAST
236
+ // data line before [DONE], with an empty top-level `choices` array + `usage` set.
237
+ buffer += chunkText;
238
+ const events = buffer.split("\n\n");
239
+ buffer = events.pop() ?? ""; // keep last (possibly partial) event
240
+ for (const evt of events) {
241
+ for (const line of evt.split("\n")) {
242
+ if (!line.startsWith("data:")) continue;
243
+ const data = line.slice(5).trim();
244
+ if (!data || data === "[DONE]") continue;
245
+ try {
246
+ const parsed = JSON.parse(data) as { usage?: Record<string, number> };
247
+ if (parsed.usage && typeof parsed.usage === "object") {
248
+ lastUsage = parsed.usage;
249
+ }
250
+ } catch {
251
+ // Ignore malformed chunks — keep streaming.
252
+ }
253
+ }
254
+ }
255
+ }
256
+ } catch (err) {
257
+ // Stream interrupted — best-effort release and end the response.
258
+ await gate.release?.("released_error");
259
+ try {
260
+ rawRes.end?.();
261
+ } catch {
262
+ /* response already ended */
263
+ }
264
+ return;
265
+ }
266
+
267
+ // NOTE(review): the read loop already forwarded these bytes verbatim, so the
268
+ // write below re-sends the unparsed tail (rare — [DONE] usually ends cleanly).
269
+ if (buffer.length > 0) rawRes.write?.(buffer);
270
+ rawRes.end?.();
271
+
272
+ // ---- Commit billing from extracted usage ----
273
+ if (lastUsage) {
274
+ const inputTokens = Number(
275
+ lastUsage["prompt_tokens"] ?? lastUsage["input_tokens"] ?? 0,
276
+ );
277
+ const outputTokens = Number(
278
+ lastUsage["completion_tokens"] ?? lastUsage["output_tokens"] ?? 0,
279
+ );
280
+ let actualUsd = 0;
281
+ try {
282
+ actualUsd = computeActualCostUsd({
283
+ model,
284
+ usage: lastUsage as Record<string, number>,
285
+ });
286
+ } catch {
287
+ actualUsd = 0;
288
+ }
289
+ try {
290
+ await gate.commit?.(actualUsd, {
291
+ model,
292
+ inputTokens,
293
+ outputTokens,
294
+ status: "ok",
295
+ });
296
+ } catch {
297
+ /* commit failure is non-fatal — user already got their response */
298
+ }
299
+ } else {
300
+ // No usage chunk arrived — upstream didn't honor include_usage, or
301
+ // the stream ended abnormally. Commit zero so we don't double-charge
302
+ // a reservation that may have been zero-sized anyway.
303
+ try {
304
+ await gate.commit?.(0, { model, status: "ok" });
305
+ } catch {
306
+ /* swallow */
307
+ }
308
+ }
309
+ return;
310
+ }
311
+
174
312
  // Parse the JSON body once — we both relay it to the client AND extract
175
313
  // usage for billing commit.
176
314
  const upstreamText = await upstreamRes.text();
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "version": "1.0.0",
3
- "generatedAt": "2026-05-19T20:51:16.832Z",
3
+ "generatedAt": "2026-05-19T21:32:47.891Z",
4
4
  "repoUrl": "https://github.com/elizaos/eliza",
5
5
  "templates": [
6
6
  {