@tollgateai/sdk 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Tollgate
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md CHANGED
@@ -2,13 +2,13 @@
2
2
 
3
3
  > Real-time gross-margin observability for AI agents. Track every LLM call's cost, attribute it to a customer, and see whether you're making money — before the invoice goes out.
4
4
 
5
- **v0.4.0** · [npm](https://www.npmjs.com/package/@tollgateai/sdk) · [Dashboard](https://tollgateai.vercel.app)
5
+ **v0.6.0** · [npm](https://www.npmjs.com/package/@tollgateai/sdk) · [Dashboard](https://tollgateai.vercel.app)
6
6
 
7
7
  ---
8
8
 
9
9
  ## Why Tollgate
10
10
 
11
- You sell an AI-powered product. Each customer interaction triggers LLM calls that cost you real money — input tokens, output tokens, reasoning tokens, cached tokens, tool calls. Tollgate captures that cost automatically from provider responses, joins it with the revenue your pricing model defines, and shows you per-customer, per-agent, per-run gross margin in real time.
11
+ You sell an AI-powered product. Each customer interaction triggers LLM calls that cost you real money — input tokens, output tokens, reasoning tokens, audio tokens, cached tokens, web searches, tool calls. Tollgate captures that cost automatically from provider responses, joins it with the revenue your pricing model defines, and shows you per-customer, per-agent, per-run gross margin in real time.
12
12
 
13
13
  ## Installation
14
14
 
@@ -34,11 +34,11 @@ const anthropic = wrapAnthropic(new Anthropic(), tollgate, {
34
34
  runId: 'ticket_8842',
35
35
  });
36
36
 
37
- // Every call is tracked automatically — tokens, cost, tool calls.
37
+ // Every call is tracked automatically — tokens, cost, latency, tool calls.
38
38
  const msg = await anthropic.messages.create({
39
39
  model: 'claude-sonnet-4-6',
40
40
  max_tokens: 1024,
41
- messages: [{ role: 'user', content: 'Resolve this billing dispute' }],
41
+ messages: [{ role: 'user', content: 'Resolve this billing dispute...' }],
42
42
  });
43
43
 
44
44
  // Close the run and book revenue.
@@ -52,12 +52,13 @@ await tollgate.resolve({
52
52
 
53
53
  ## Provider Support
54
54
 
55
- | Provider | Wrapper | Streaming | Tool-Call Tracking |
55
+ | Provider | Wrapper | Streaming | What Gets Extracted |
56
56
  |---|---|---|---|
57
- | Anthropic | `wrapAnthropic` | Automatic | Counts `tool_use` content blocks |
58
- | OpenAI | `wrapOpenAI` | Needs `stream_options: { include_usage: true }` | Counts `tool_calls` on choices |
59
- | OpenAI-compatible (Groq, OpenRouter, Together, Nebius, vLLM, …) | `wrapOpenAI` with `provider: 'openai_compatible'` | Same as OpenAI | Same as OpenAI |
60
- | AWS Bedrock | `wrapBedrock` | Automatic | Counts `toolUse` content blocks |
57
+ | **Anthropic** | `wrapAnthropic` | Automatic | Tokens, thinking/reasoning, cache (read + write by TTL), web search requests, tool calls, latency |
58
+ | **OpenAI** | `wrapOpenAI` | `stream_options: { include_usage: true }` | Tokens, reasoning, cached, audio in/out, text in/out, prediction tokens, service tier, tool calls, latency |
59
+ | **Google Gemini** | `wrapGemini` | Automatic | Tokens, thinking, cached, audio/image/video per-modality, web search (grounding), tool calls, latency |
60
+ | **OpenAI-compatible** | `wrapOpenAI` + `provider: 'openai_compatible'` | Same as OpenAI | Same as OpenAI |
61
+ | **AWS Bedrock** | `wrapBedrock` | Automatic | Tokens, cache (read + write), tool calls, latency |
61
62
 
62
63
  ## Configuration
63
64
 
@@ -81,7 +82,7 @@ const tollgate = createTollgateClient({
81
82
 
82
83
  ## Auto-Instrumentation
83
84
 
84
- Wrap your provider client once. Every `create` call reports usage in the background — non-blocking, fire-and-forget. Failures go to `onError` (default: `console.warn`) and never break your LLM call.
85
+ Wrap your provider client once. Every `create` / `generateContent` call reports usage in the background — non-blocking, fire-and-forget. Failures go to `onError` (default: `console.warn`) and never break your LLM call.
85
86
 
86
87
  ### Anthropic
87
88
 
@@ -98,7 +99,7 @@ const anthropic = wrapAnthropic(new Anthropic(), tollgate, {
98
99
  await anthropic.messages.create({
99
100
  model: 'claude-sonnet-4-6',
100
101
  max_tokens: 512,
101
- messages: [{ role: 'user', content: 'Summarize this ticket' }],
102
+ messages: [{ role: 'user', content: 'Summarize this ticket...' }],
102
103
  });
103
104
  ```
104
105
 
@@ -117,6 +118,23 @@ await openai.chat.completions.create({
117
118
  });
118
119
  ```
119
120
 
121
+ ### Google Gemini
122
+
123
+ ```ts
124
+ import { GoogleGenerativeAI } from '@google/generative-ai';
125
+ import { createTollgateClient, wrapGemini } from '@tollgateai/sdk';
126
+
127
+ const tollgate = createTollgateClient();
128
+ const genai = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
129
+ const model = wrapGemini(
130
+ genai.getGenerativeModel({ model: 'gemini-2.0-flash' }),
131
+ tollgate,
132
+ { customerId: 'cust_acme' },
133
+ );
134
+
135
+ const result = await model.generateContent('Explain quantum computing');
136
+ ```
137
+
120
138
  ### OpenAI-Compatible Gateways
121
139
 
122
140
  Point the OpenAI SDK at any compatible endpoint and set `provider: 'openai_compatible'`:
@@ -159,9 +177,9 @@ await bedrock.send(new ConverseCommand({
159
177
 
160
178
  ### Streaming
161
179
 
162
- Streaming is captured automatically iterate the stream as usual and usage is reported when the stream ends.
180
+ Streaming is captured automatically. Iterate the stream as usual — usage and latency are reported when the stream ends.
163
181
 
164
- **OpenAI / compatible** requires `stream_options: { include_usage: true }` for the final usage chunk. **Anthropic** and **Bedrock** need no extra flags.
182
+ **OpenAI / compatible** requires `stream_options: { include_usage: true }`. **Anthropic**, **Gemini**, and **Bedrock** need no extra flags.
165
183
 
166
184
  ```ts
167
185
  const stream = await openai.chat.completions.create({
@@ -171,38 +189,45 @@ const stream = await openai.chat.completions.create({
171
189
  messages: [{ role: 'user', content: 'Hello' }],
172
190
  });
173
191
  for await (const chunk of stream) { /* render to UI */ }
174
- // Usage reported automatically when stream ends.
192
+ // Usage + latency reported automatically when stream ends.
175
193
  ```
176
194
 
177
195
  ---
178
196
 
179
197
  ## What Gets Tracked
180
198
 
181
- Every auto-instrumented call captures the following from the provider response:
199
+ Every auto-instrumented call captures these fields from the provider response:
182
200
 
183
- | Field | Source | Description |
201
+ | Field | Providers | Description |
184
202
  |---|---|---|
185
- | `tokensIn` | `usage.input_tokens` / `prompt_tokens` | Input tokens consumed |
186
- | `tokensOut` | `usage.output_tokens` / `completion_tokens` | Output tokens generated |
187
- | `reasoningTokens` | `completion_tokens_details.reasoning_tokens` | Reasoning/chain-of-thought tokens (OpenAI) |
188
- | `cachedTokens` | `cache_read_input_tokens` / `cached_tokens` | Prompt cache read tokens |
189
- | `cacheWrite5mTokens` | `cache_creation_input_tokens` | 5-min TTL cache write tokens |
190
- | `cacheWrite1hTokens` | `cache_creation.ephemeral_1h_input_tokens` | 1-hour TTL cache write tokens |
191
- | `toolCalls` | Content block / choice inspection | Number of tool calls in the response |
192
- | `externalCostCents` | User-provided | Cost of external tools/services (image gen, sandbox, search) |
193
- | `provider` | Wrapper default or override | `anthropic`, `openai`, `openai_compatible`, `bedrock` |
194
- | `model` | Response object | Model identifier as reported by the provider |
195
-
196
- Cost is computed **server-side** from token counts and a rate card that auto-syncs daily from the public LiteLLM registry. Unknown models are priced at $0 and flagged in logs.
203
+ | `tokensIn` | All | Input tokens consumed |
204
+ | `tokensOut` | All | Output tokens generated |
205
+ | `reasoningTokens` | OpenAI, Anthropic, Gemini | Reasoning/thinking tokens (billed at reasoning rate) |
206
+ | `cachedTokens` | All | Prompt cache read tokens (reduced rate) |
207
+ | `cacheWrite5mTokens` | Anthropic, Bedrock | 5-min TTL cache creation tokens |
208
+ | `cacheWrite1hTokens` | Anthropic | 1-hour TTL cache creation tokens |
209
+ | `audioTokensIn` | OpenAI, Gemini | Audio input tokens (GPT-4o audio / Realtime; Gemini audio prompts) |
210
+ | `audioTokensOut` | OpenAI, Gemini | Audio output tokens |
211
+ | `imageTokensIn` | Gemini | Image/vision input tokens |
212
+ | `imageTokensOut` | Gemini | Image generation output tokens |
213
+ | `videoTokensIn` | Gemini | Video input tokens |
214
+ | `textTokensIn` | OpenAI, Gemini | Text-only input tokens (modality split) |
215
+ | `textTokensOut` | OpenAI, Gemini | Text-only output tokens |
216
+ | `webSearchRequests` | Anthropic, Gemini | Web search requests (server tools / grounding) |
217
+ | `acceptedPredictionTokens` | OpenAI | Predicted Outputs: accepted tokens |
218
+ | `rejectedPredictionTokens` | OpenAI | Predicted Outputs: rejected tokens (waste) |
219
+ | `serviceTier` | OpenAI | Service tier used (`default`, `flex`, `priority`) |
220
+ | `latencyMs` | All | SDK-measured request duration in milliseconds |
221
+ | `toolCalls` | All | Number of tool calls in the response |
222
+ | `model` | All | Model identifier as reported by the provider |
223
+
224
+ Cost is computed **server-side** from token counts and a rate card that auto-syncs daily from the LiteLLM registry (1,500+ models). Rate cards include per-token pricing for text, audio, image, video, cache, reasoning, and web search. Unknown models are priced at $0 and flagged in logs.
197
225
 
198
226
  ---
199
227
 
200
228
  ## Outcome-Based Pricing
201
229
 
202
- Under per-resolution pricing, only a **resolved** run earns revenue. An escalated or failed run earns $0 but its provider cost still counts. The pattern:
203
-
204
- 1. **Wrap** to meter cost on every LLM call (automatic).
205
- 2. **Resolve** once at the end to book the outcome.
230
+ Under per-resolution pricing, only a **resolved** run earns revenue. An escalated or failed run earns $0 but its provider cost still counts.
206
231
 
207
232
  ```ts
208
233
  const runId = 'ticket_8842';
@@ -211,7 +236,7 @@ const anthropic = wrapAnthropic(new Anthropic(), tollgate, {
211
236
  runId,
212
237
  });
213
238
 
214
- // multiple LLM calls within this run
239
+ // ... multiple LLM calls within this run ...
215
240
 
216
241
  await tollgate.resolve({
217
242
  runId,
@@ -227,12 +252,9 @@ For simple per-call billing, pass `revenueUnitCents` in the wrap options and ski
227
252
 
228
253
  ## External Tool Costs
229
254
 
230
- AI agents often call external services image generation, code sandboxes, search APIs, vector databases — that cost real money outside of LLM token pricing. Report these costs alongside the LLM call so Tollgate includes them in the margin calculation.
255
+ Report costs from external services (image generation, code sandboxes, search APIs) alongside LLM calls:
231
256
 
232
257
  ```ts
233
- // Agent generates an image during the run
234
- const image = await dalle.generate({ prompt: '...' }); // costs $0.04
235
-
236
258
  await tollgate.track({
237
259
  customerId: 'cust_acme',
238
260
  runId: 'ticket_8842',
@@ -240,35 +262,21 @@ await tollgate.track({
240
262
  model: 'gpt-4o',
241
263
  tokensIn: 500,
242
264
  tokensOut: 200,
243
- toolCalls: 1, // LLM tool_use count (auto-extracted by wrappers)
244
265
  externalCostCents: 4.0, // $0.04 for the DALL-E call
245
266
  idempotencyKey: 'ticket_8842#step_2',
246
267
  });
247
268
  ```
248
269
 
249
- Common examples:
250
-
251
- | Service | Typical Cost | How to report |
252
- |---|---|---|
253
- | DALL-E image generation | ~$0.04/image | `externalCostCents: 4` |
254
- | E2B code sandbox | ~$0.01/run | `externalCostCents: 1` |
255
- | Tavily search API | ~$0.01/search | `externalCostCents: 1` |
256
- | Pinecone vector query | ~$0.001/query | `externalCostCents: 0.1` |
257
-
258
- External costs flow into the **Tools** bucket in the Customer Drawer's cost-split chart — so you can see exactly what share of each customer's cost comes from external services vs. LLM tokens.
259
-
260
270
  ---
261
271
 
262
272
  ## Customer & Plan Setup
263
273
 
264
- Create customers and assign plans **before** sending usage so plan-priced revenue is recognized from the first event. Idempotent — safe to run on every boot.
274
+ Create customers and assign plans before sending usage so plan-priced revenue is recognized from the first event. Idempotent.
265
275
 
266
276
  ```ts
267
277
  await tollgate.upsertCustomer({
268
278
  customerId: 'cust_acme',
269
279
  name: 'Acme Corp',
270
- company: 'Acme Corp',
271
- seats: 5,
272
280
  plan: {
273
281
  name: 'Pro Plan',
274
282
  pricingModel: 'usage_based', // per_unit | per_resolution | usage_based | per_seat | flat | hybrid
@@ -279,57 +287,26 @@ await tollgate.upsertCustomer({
279
287
 
280
288
  ---
281
289
 
282
- ## Manual Tracking
283
-
284
- For full control, unusual providers, or non-LLM cost events:
285
-
286
- ```ts
287
- await tollgate.track({
288
- customerId: 'cust_acme',
289
- runId: 'run_12345',
290
- provider: 'anthropic',
291
- model: 'claude-sonnet-4-6',
292
- tokensIn: 1200,
293
- tokensOut: 450,
294
- reasoningTokens: 0,
295
- cachedTokens: 0,
296
- toolCalls: 2,
297
- revenueUnitCents: 50,
298
- idempotencyKey: 'run_12345#step_1',
299
- });
300
- ```
301
-
302
- ### Already have an exact cost?
303
-
304
- Pass `providerCostCents` (a number or a function of the response) and the server uses it verbatim, skipping the rate card entirely:
305
-
306
- ```ts
307
- const anthropic = wrapAnthropic(new Anthropic(), tollgate, {
308
- customerId: 'cust_acme',
309
- providerCostCents: 3.5, // or: (response) => computeMyOwnCost(response)
310
- });
311
- ```
312
-
313
- ---
314
-
315
290
  ## API Reference
316
291
 
317
292
  ### Exports
318
293
 
319
294
  ```ts
320
295
  // Client
321
- createTollgateClient(options?) // TollgateClient
296
+ createTollgateClient(options?) // -> TollgateClient
322
297
  TollgateError // Custom error with status & body
323
298
 
324
299
  // Auto-instrumentation wrappers
325
- wrapAnthropic(client, tollgate, options) // instrumented Anthropic client
326
- wrapOpenAI(client, tollgate, options) // instrumented OpenAI / compatible client
327
- wrapBedrock(client, tollgate, options) // instrumented Bedrock Runtime client
300
+ wrapAnthropic(client, tollgate, options) // -> instrumented Anthropic client
301
+ wrapOpenAI(client, tollgate, options) // -> instrumented OpenAI / compatible client
302
+ wrapBedrock(client, tollgate, options) // -> instrumented Bedrock Runtime client
303
+ wrapGemini(model, tollgate, options) // -> instrumented Gemini model
328
304
 
329
305
  // Low-level event builders (for manual track payloads)
330
- anthropicEventFrom(msg, options) // TrackEventInput | null
331
- openAIEventFrom(completion, options) // TrackEventInput | null
332
- bedrockEventFrom(usage, model, options) // TrackEventInput | null
306
+ anthropicEventFrom(msg, options) // -> TrackEventInput | null
307
+ openAIEventFrom(completion, options) // -> TrackEventInput | null
308
+ bedrockEventFrom(usage, model, options) // -> TrackEventInput | null
309
+ geminiEventFrom(response, options) // -> TrackEventInput | null
333
310
  ```
334
311
 
335
312
  ### TollgateClient
@@ -344,32 +321,54 @@ bedrockEventFrom(usage, model, options) // → TrackEventInput | null
344
321
 
345
322
  | Field | Type | Required | Description |
346
323
  |---|---|---|---|
347
- | `customerId` | `string` | Yes | Your end customer's stable identifier. |
348
- | `agentId` | `string` | No | Agent or workflow identifier. |
349
- | `runId` | `string \| () => string` | No | Logical run ID. Defaults to the provider response ID. |
350
- | `provider` | `Provider` | No | Override the reported provider (e.g. `'openai_compatible'`). |
351
- | `revenueUnitCents` | `number \| (response) => number` | No | Revenue per call in cents. |
352
- | `providerCostCents` | `number \| (response) => number` | No | Exact cost override skips rate card. |
353
- | `onError` | `(err) => void` | No | Error handler for background tracking (default: `console.warn`). |
324
+ | `customerId` | `string` | Yes | Your end customer's stable identifier |
325
+ | `agentId` | `string` | No | Agent or workflow identifier |
326
+ | `runId` | `string \| () => string` | No | Logical run ID (defaults to provider response ID) |
327
+ | `provider` | `Provider` | No | Override the reported provider |
328
+ | `revenueUnitCents` | `number \| (response) => number` | No | Revenue per call in cents |
329
+ | `providerCostCents` | `number \| (response) => number` | No | Exact cost override (skips rate card) |
330
+ | `onError` | `(err) => void` | No | Error handler for background tracking |
354
331
 
355
332
  ---
356
333
 
357
334
  ## How It Works
358
335
 
359
- 1. **Proxy wrappers** intercept `messages.create` / `chat.completions.create` / `send` without modifying the request or response.
360
- 2. After the provider responds, the wrapper extracts token counts, tool call counts, and metadata from the response's `usage` object and content blocks.
361
- 3. A `POST /api/track` is fired **in the background** — non-blocking, with automatic retries on transient failures.
362
- 4. The server computes cost from tokens via rate cards, joins it with your plan-configured revenue, and updates real-time margin rollups.
363
- 5. Events are **idempotent** on `idempotencyKey` (auto-set to the provider response ID), so retries and stream replays never double-count.
336
+ 1. **Proxy wrappers** intercept provider calls without modifying the request or response.
337
+ 2. After the provider responds, the wrapper extracts token counts (by modality), tool calls, service tier, and latency from the response.
338
+ 3. A `POST /api/track` fires **in the background** with automatic retries on transient failures.
339
+ 4. The server computes cost from tokens via rate cards (text, audio, image, video, cache, reasoning, web search), joins it with plan-configured revenue, and updates real-time margin rollups.
340
+ 5. Events are **idempotent** on `idempotencyKey` (auto-set to the provider response ID).
364
341
 
365
342
  ## Privacy & Security
366
343
 
367
344
  - **No prompt content is ever sent.** Only token counts, model name, and metadata.
368
- - Events are deduplicated server-side — safe to retry.
345
+ - Events are deduplicated server-side.
369
346
  - Background tracking never throws into your application code.
370
347
 
371
348
  ---
372
349
 
350
+ ## What's New in v0.6.0
351
+
352
+ - **Fix: Anthropic thinking token extraction** — `output_tokens_details.thinking_tokens` is now extracted and costed at the reasoning rate instead of the output rate. Previously, thinking tokens from extended thinking (Sonnet 4.x, Opus 4.x) were invisible to cost computation.
353
+ - **Fix: OpenAI double-counting** — `completion_tokens` includes reasoning and audio sub-totals; these are now subtracted from `tokensOut` so each token is costed at exactly one rate. Previously, reasoning tokens were billed at both the output rate and the reasoning rate.
354
+ - **Fix: OpenAI input double-counting** — `prompt_tokens` includes cached and audio sub-totals; these are now subtracted from `tokensIn`. Previously, cached tokens were billed at both the full input rate and the cached rate.
355
+ - **Fix: Multimodal-only events** — audio, image, video, and web search events now trigger rate-card lookup even when text token counts are zero.
356
+ - `reasoningTokens` is now extracted from **all three** providers: OpenAI, Anthropic, and Gemini.
357
+
358
+ ### v0.5.0
359
+
360
+ - Google Gemini / Vertex AI support (`wrapGemini`) with full multimodal extraction
361
+ - Audio token tracking (OpenAI GPT-4o audio / Realtime API)
362
+ - Image & video token tracking (Gemini per-modality breakdowns)
363
+ - Web search request tracking (Anthropic `server_tool_use`, Gemini grounding)
364
+ - Latency measurement on all wrappers (SDK-measured `latencyMs`)
365
+ - OpenAI Predicted Outputs (`acceptedPredictionTokens` / `rejectedPredictionTokens`)
366
+ - Service tier tracking (OpenAI `flex` / `priority`, Anthropic `priority`)
367
+ - Text modality split for accurate cost attribution in mixed-modal requests
368
+ - Expanded rate card sync: audio, image, video, and web search rates from LiteLLM
369
+
370
+ ---
371
+
373
372
  ## License
374
373
 
375
374
  Licensed for use with Tollgate.
package/dist/index.cjs CHANGED
@@ -162,6 +162,7 @@ function anthropicEventFrom(msg, opts) {
162
162
  const oneh = usage.cache_creation?.ephemeral_1h_input_tokens;
163
163
  const hasSplit = fivem !== void 0 || oneh !== void 0;
164
164
  const toolCalls = Array.isArray(msg.content) ? msg.content.filter((b) => b.type === "tool_use").length : 0;
165
+ const thinkingTokens = usage.output_tokens_details?.thinking_tokens ?? 0;
165
166
  const event = {
166
167
  customerId: opts.customerId,
167
168
  agentId: opts.agentId,
@@ -169,10 +170,12 @@ function anthropicEventFrom(msg, opts) {
169
170
  provider: opts.provider ?? "anthropic",
170
171
  model: msg.model ?? "unknown",
171
172
  tokensIn: usage.input_tokens ?? 0,
172
- tokensOut: usage.output_tokens ?? 0,
173
+ tokensOut: (usage.output_tokens ?? 0) - thinkingTokens,
174
+ reasoningTokens: thinkingTokens,
173
175
  cachedTokens: usage.cache_read_input_tokens ?? 0,
174
176
  cacheWrite5mTokens: hasSplit ? fivem ?? 0 : usage.cache_creation_input_tokens ?? 0,
175
177
  cacheWrite1hTokens: hasSplit ? oneh ?? 0 : 0,
178
+ webSearchRequests: usage.server_tool_use?.web_search_requests ?? 0,
176
179
  toolCalls,
177
180
  revenueUnitCents: resolveRevenue(opts, msg),
178
181
  idempotencyKey: msg.id ?? `${runId}#${randomId()}`
@@ -183,6 +186,7 @@ function wrapAnthropic(client, tollgate, opts) {
183
186
  const messages = client.messages;
184
187
  const original = messages.create.bind(messages);
185
188
  const create = async (...args) => {
189
+ const t0 = Date.now();
186
190
  const result = await original(...args);
187
191
  if (isAsyncIterable(result)) {
188
192
  const msg = {};
@@ -195,7 +199,11 @@ function wrapAnthropic(client, tollgate, opts) {
195
199
  msg.model = ev.message.model;
196
200
  msg.usage = { ...ev.message.usage };
197
201
  } else if (ev.type === "message_delta" && ev.usage) {
198
- msg.usage = { ...msg.usage ?? {}, output_tokens: ev.usage.output_tokens };
202
+ msg.usage = {
203
+ ...msg.usage ?? {},
204
+ output_tokens: ev.usage.output_tokens,
205
+ output_tokens_details: ev.usage.output_tokens_details
206
+ };
199
207
  } else if (ev.type === "content_block_start" && ev.content_block?.type === "tool_use") {
200
208
  toolUseBlocks.push(ev.content_block);
201
209
  }
@@ -203,12 +211,18 @@ function wrapAnthropic(client, tollgate, opts) {
203
211
  () => {
204
212
  msg.content = toolUseBlocks;
205
213
  const event2 = anthropicEventFrom(msg, opts);
206
- if (event2) fireAndForget(tollgate.track(event2), opts.onError);
214
+ if (event2) {
215
+ event2.latencyMs = Date.now() - t0;
216
+ fireAndForget(tollgate.track(event2), opts.onError);
217
+ }
207
218
  }
208
219
  );
209
220
  }
210
221
  const event = anthropicEventFrom(result, opts);
211
- if (event) fireAndForget(tollgate.track(event), opts.onError);
222
+ if (event) {
223
+ event.latencyMs = Date.now() - t0;
224
+ fireAndForget(tollgate.track(event), opts.onError);
225
+ }
212
226
  return result;
213
227
  };
214
228
  return new Proxy(client, {
@@ -227,16 +241,29 @@ function openAIEventFrom(completion, opts) {
227
241
  if (!usage) return null;
228
242
  const runId = resolveRunId(opts, completion.id);
229
243
  const toolCalls = completion.choices?.[0]?.message?.tool_calls?.length ?? 0;
244
+ const ptd = usage.prompt_tokens_details;
245
+ const ctd = usage.completion_tokens_details;
246
+ const cachedIn = ptd?.cached_tokens ?? 0;
247
+ const audioIn = ptd?.audio_tokens ?? 0;
248
+ const reasoningOut = ctd?.reasoning_tokens ?? 0;
249
+ const audioOut = ctd?.audio_tokens ?? 0;
230
250
  const event = {
231
251
  customerId: opts.customerId,
232
252
  agentId: opts.agentId,
233
253
  runId,
234
254
  provider: opts.provider ?? "openai",
235
255
  model: completion.model ?? "unknown",
236
- tokensIn: usage.prompt_tokens ?? 0,
237
- tokensOut: usage.completion_tokens ?? 0,
238
- reasoningTokens: usage.completion_tokens_details?.reasoning_tokens ?? 0,
239
- cachedTokens: usage.prompt_tokens_details?.cached_tokens ?? 0,
256
+ tokensIn: (usage.prompt_tokens ?? 0) - cachedIn - audioIn,
257
+ tokensOut: (usage.completion_tokens ?? 0) - reasoningOut - audioOut,
258
+ reasoningTokens: reasoningOut,
259
+ cachedTokens: cachedIn,
260
+ audioTokensIn: audioIn,
261
+ audioTokensOut: audioOut,
262
+ textTokensIn: ptd?.text_tokens ?? 0,
263
+ textTokensOut: ctd?.text_tokens ?? 0,
264
+ acceptedPredictionTokens: ctd?.accepted_prediction_tokens ?? 0,
265
+ rejectedPredictionTokens: ctd?.rejected_prediction_tokens ?? 0,
266
+ serviceTier: completion.service_tier,
240
267
  toolCalls,
241
268
  revenueUnitCents: resolveRevenue(opts, completion),
242
269
  idempotencyKey: completion.id ?? `${runId}#${randomId()}`
@@ -247,11 +274,13 @@ function wrapOpenAI(client, tollgate, opts) {
247
274
  const completions = client.chat.completions;
248
275
  const original = completions.create.bind(completions);
249
276
  const create = async (...args) => {
277
+ const t0 = Date.now();
250
278
  const result = await original(...args);
251
279
  if (isAsyncIterable(result)) {
252
280
  let id;
253
281
  let model;
254
282
  let usage;
283
+ let serviceTier;
255
284
  const toolCallIndices = /* @__PURE__ */ new Set();
256
285
  return instrumentStream(
257
286
  result,
@@ -259,6 +288,7 @@ function wrapOpenAI(client, tollgate, opts) {
259
288
  if (chunk.id) id = chunk.id;
260
289
  if (chunk.model) model = chunk.model;
261
290
  if (chunk.usage) usage = chunk.usage;
291
+ if (chunk.service_tier) serviceTier = chunk.service_tier;
262
292
  for (const c of chunk.choices ?? []) {
263
293
  for (const tc of c.delta?.tool_calls ?? []) {
264
294
  if (tc.index !== void 0) toolCallIndices.add(tc.index);
@@ -267,17 +297,23 @@ function wrapOpenAI(client, tollgate, opts) {
267
297
  },
268
298
  () => {
269
299
  if (!usage) return;
270
- const synth = { id, model, usage };
300
+ const synth = { id, model, usage, service_tier: serviceTier };
271
301
  if (toolCallIndices.size > 0) {
272
302
  synth.choices = [{ message: { tool_calls: new Array(toolCallIndices.size) } }];
273
303
  }
274
304
  const event2 = openAIEventFrom(synth, opts);
275
- if (event2) fireAndForget(tollgate.track(event2), opts.onError);
305
+ if (event2) {
306
+ event2.latencyMs = Date.now() - t0;
307
+ fireAndForget(tollgate.track(event2), opts.onError);
308
+ }
276
309
  }
277
310
  );
278
311
  }
279
312
  const event = openAIEventFrom(result, opts);
280
- if (event) fireAndForget(tollgate.track(event), opts.onError);
313
+ if (event) {
314
+ event.latencyMs = Date.now() - t0;
315
+ fireAndForget(tollgate.track(event), opts.onError);
316
+ }
281
317
  return result;
282
318
  };
283
319
  return new Proxy(client, {
@@ -316,6 +352,7 @@ function bedrockEventFrom(usage, model, opts, response = void 0, toolCalls = 0)
316
352
  function wrapBedrock(client, tollgate, opts) {
317
353
  const originalSend = client.send.bind(client);
318
354
  const send = async (command, ...rest) => {
355
+ const t0 = Date.now();
319
356
  const result = await originalSend(command, ...rest);
320
357
  const model = command?.input?.modelId ?? "unknown";
321
358
  if (result?.stream && isAsyncIterable(result.stream)) {
@@ -329,7 +366,10 @@ function wrapBedrock(client, tollgate, opts) {
329
366
  },
330
367
  () => {
331
368
  const event = bedrockEventFrom(usage, model, opts, result, streamToolCalls);
332
- if (event) fireAndForget(tollgate.track(event), opts.onError);
369
+ if (event) {
370
+ event.latencyMs = Date.now() - t0;
371
+ fireAndForget(tollgate.track(event), opts.onError);
372
+ }
333
373
  }
334
374
  );
335
375
  return result;
@@ -337,7 +377,10 @@ function wrapBedrock(client, tollgate, opts) {
337
377
  if (result?.usage) {
338
378
  const tc = result.output?.message?.content?.filter((b) => b.toolUse != null).length ?? 0;
339
379
  const event = bedrockEventFrom(result.usage, model, opts, result, tc);
340
- if (event) fireAndForget(tollgate.track(event), opts.onError);
380
+ if (event) {
381
+ event.latencyMs = Date.now() - t0;
382
+ fireAndForget(tollgate.track(event), opts.onError);
383
+ }
341
384
  }
342
385
  return result;
343
386
  };
@@ -348,14 +391,110 @@ function wrapBedrock(client, tollgate, opts) {
348
391
  }
349
392
  });
350
393
  }
394
/**
 * Sums `tokenCount` over the entries of a per-modality token breakdown whose
 * `modality` equals the requested one.
 *
 * @param {Array<{modality?: string, tokenCount?: number}>|null|undefined} details
 *   Per-modality breakdown (e.g. `usageMetadata.promptTokensDetails`).
 * @param {string} modality Modality label to match exactly (e.g. "TEXT", "AUDIO").
 * @returns {number} Total token count for that modality; 0 when there is nothing to sum.
 */
function modalityTokens(details, modality) {
  // This helper runs inside the user's LLM call path, so malformed provider
  // data must degrade to 0 rather than throw.
  if (!Array.isArray(details)) return 0;
  let total = 0;
  for (const entry of details) {
    if (entry?.modality !== modality) continue;
    const count = entry.tokenCount;
    // Ignore missing or non-numeric counts so the sum never becomes NaN or a string.
    if (typeof count === "number" && Number.isFinite(count)) total += count;
  }
  return total;
}
398
/**
 * Builds a Tollgate track event from a Gemini `GenerateContentResponse`.
 *
 * @param {object|null|undefined} response Gemini response carrying `usageMetadata`
 *   (and optionally `candidates`, `modelVersion`, `responseId`).
 * @param {object} opts Wrap options (`customerId`, `agentId`, `provider`, run/revenue/cost resolvers).
 * @returns {object|null} The event passed through `withCost`, or `null` when the
 *   response has no `usageMetadata`.
 */
function geminiEventFrom(response, opts) {
  const usage = response?.usageMetadata;
  if (!usage) return null;

  // Use the provider response id (when the API returns one) for the default run
  // id and the idempotency key, as the Anthropic/OpenAI builders do with `id`.
  const responseId = response.responseId;
  const runId = resolveRunId(opts, responseId);

  const candidates = response.candidates ?? [];
  let toolCalls = 0;
  let webSearchRequests = 0;
  for (const candidate of candidates) {
    for (const part of candidate.content?.parts ?? []) {
      if (part.functionCall != null) toolCalls++;
    }
    webSearchRequests += candidate.groundingMetadata?.webSearchQueries?.length ?? 0;
  }

  const promptDetails = usage.promptTokensDetails;
  const candidateDetails = usage.candidatesTokensDetails;
  const cachedIn = usage.cachedContentTokenCount ?? 0;
  const audioIn = modalityTokens(promptDetails, "AUDIO");
  const imageIn = modalityTokens(promptDetails, "IMAGE");
  const videoIn = modalityTokens(promptDetails, "VIDEO");
  const audioOut = modalityTokens(candidateDetails, "AUDIO");
  const imageOut = modalityTokens(candidateDetails, "IMAGE");

  // Per the Gemini API reference, promptTokenCount is the total prompt size
  // including cached content, and the per-modality details are a breakdown of
  // the same totals. Sub-totals reported in their own fields are removed from
  // tokensIn/tokensOut so each token is reported once, matching how
  // openAIEventFrom subtracts cached/audio/reasoning sub-totals.
  // NOTE(review): cached content that is itself audio/image/video would be
  // subtracted twice; the clamp keeps the result non-negative — confirm the
  // server-side expectation for that case.
  const tokensIn = Math.max(
    0,
    (usage.promptTokenCount ?? 0) - cachedIn - audioIn - imageIn - videoIn,
  );
  const tokensOut = Math.max(0, (usage.candidatesTokenCount ?? 0) - audioOut - imageOut);

  const event = {
    customerId: opts.customerId,
    agentId: opts.agentId,
    runId,
    provider: opts.provider ?? "google",
    // Prefer the model reported by the provider so manual callers of this
    // builder do not always end up with "unknown".
    model: response.modelVersion ?? "unknown",
    tokensIn,
    tokensOut,
    reasoningTokens: usage.thoughtsTokenCount ?? 0,
    cachedTokens: cachedIn,
    audioTokensIn: audioIn,
    audioTokensOut: audioOut,
    imageTokensIn: imageIn,
    imageTokensOut: imageOut,
    videoTokensIn: videoIn,
    textTokensIn: modalityTokens(promptDetails, "TEXT"),
    textTokensOut: modalityTokens(candidateDetails, "TEXT"),
    webSearchRequests,
    toolCalls,
    revenueUnitCents: resolveRevenue(opts, response),
    idempotencyKey: responseId ?? `${runId}#${randomId()}`
  };
  return withCost(event, opts, response);
}
436
/**
 * Wraps a Gemini generative model so every `generateContent` /
 * `generateContentStream` call reports usage to Tollgate in the background.
 *
 * Supported result shapes:
 *  - a response object carrying `usageMetadata` directly;
 *  - `{ response }` as resolved by `@google/generative-ai`'s `generateContent`;
 *  - an async-iterable stream of chunks;
 *  - `{ stream, response }` as resolved by `generateContentStream` — usage is
 *    reported once the caller has iterated `stream` to the end.
 *
 * @param {object} model Gemini model exposing `generateContent` (and optionally
 *   `generateContentStream` and a `model` name property).
 * @param {object} tollgate Tollgate client; only `track(event)` is used.
 * @param {object} opts Wrap options; `onError` receives tracking failures.
 * @returns {object} Proxy over `model` with the instrumented methods.
 */
function wrapGemini(model, tollgate, opts) {
  const modelName = model.model ?? "unknown";

  // Builds and sends one event. Any synchronous failure while building it (for
  // example a throwing revenue callback) goes to the error handler so tracking
  // never rejects the caller's LLM call.
  const report = (response, t0) => {
    try {
      const event = geminiEventFrom(response, opts);
      if (!event) return;
      event.model = modelName;
      event.latencyMs = Date.now() - t0;
      fireAndForget(tollgate.track(event), opts.onError);
    } catch (err) {
      (opts.onError ?? console.warn)(err);
    }
  };

  // Wraps a chunk stream: keeps the latest usage metadata, counts tool calls
  // and grounding searches, and reports once when the stream ends.
  const instrument = (stream, t0) => {
    const accumulated = {};
    let sawUsage = false;
    let responseId;
    let toolCallCount = 0;
    let searchCount = 0;
    return instrumentStream(
      stream,
      (chunk) => {
        if (chunk.usageMetadata) {
          sawUsage = true;
          Object.assign(accumulated, chunk.usageMetadata);
        }
        if (chunk.responseId) responseId = chunk.responseId;
        for (const c of chunk.candidates ?? []) {
          for (const p of c.content?.parts ?? []) {
            if (p.functionCall != null) toolCallCount++;
          }
          searchCount += c.groundingMetadata?.webSearchQueries?.length ?? 0;
        }
      },
      () => {
        // No usage metadata seen: there is nothing to bill, so do not emit an
        // all-zero event (an empty object would otherwise pass the usage check).
        if (!sawUsage) return;
        const synth = {
          responseId,
          usageMetadata: accumulated,
          candidates: searchCount > 0 || toolCallCount > 0 ? [{ content: { parts: new Array(toolCallCount).fill({ functionCall: {} }) }, groundingMetadata: { webSearchQueries: new Array(searchCount) } }] : []
        };
        report(synth, t0);
      }
    );
  };

  // Produces the instrumented version of one model method.
  const instrumented = (method) => {
    const original = method.bind(model);
    return async (...args) => {
      const t0 = Date.now();
      const result = await original(...args);
      if (isAsyncIterable(result)) return instrument(result, t0);
      if (result?.stream && isAsyncIterable(result.stream)) {
        result.stream = instrument(result.stream, t0);
        return result;
      }
      // `@google/generative-ai` nests the response under `.response`; fall back
      // to the result itself for SDKs that return the response directly.
      report(result?.usageMetadata ? result : (result?.response ?? result), t0);
      return result;
    };
  };

  const generateContent = instrumented(model.generateContent);
  const generateContentStream =
    typeof model.generateContentStream === "function"
      ? instrumented(model.generateContentStream)
      : void 0;

  return new Proxy(model, {
    get(target, prop, recv) {
      if (prop === "generateContent") return generateContent;
      if (prop === "generateContentStream" && generateContentStream) return generateContentStream;
      return Reflect.get(target, prop, recv);
    }
  });
}
351
488
 
352
489
  exports.TollgateError = TollgateError;
353
490
  exports.anthropicEventFrom = anthropicEventFrom;
354
491
  exports.bedrockEventFrom = bedrockEventFrom;
355
492
  exports.createTollgateClient = createTollgateClient;
493
+ exports.geminiEventFrom = geminiEventFrom;
356
494
  exports.openAIEventFrom = openAIEventFrom;
357
495
  exports.wrapAnthropic = wrapAnthropic;
358
496
  exports.wrapBedrock = wrapBedrock;
497
+ exports.wrapGemini = wrapGemini;
359
498
  exports.wrapOpenAI = wrapOpenAI;
360
499
  //# sourceMappingURL=index.cjs.map
361
500
  //# sourceMappingURL=index.cjs.map