@tollgateai/sdk 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +104 -105
- package/dist/index.cjs +152 -13
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +74 -2
- package/dist/index.d.ts +74 -2
- package/dist/index.js +151 -14
- package/dist/index.js.map +1 -1
- package/package.json +5 -3
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Tollgate
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -2,13 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
> Real-time gross-margin observability for AI agents. Track every LLM call's cost, attribute it to a customer, and see whether you're making money — before the invoice goes out.
|
|
4
4
|
|
|
5
|
-
**v0.
|
|
5
|
+
**v0.6.0** · [npm](https://www.npmjs.com/package/@tollgateai/sdk) · [Dashboard](https://tollgateai.vercel.app)
|
|
6
6
|
|
|
7
7
|
---
|
|
8
8
|
|
|
9
9
|
## Why Tollgate
|
|
10
10
|
|
|
11
|
-
You sell an AI-powered product. Each customer interaction triggers LLM calls that cost you real money — input tokens, output tokens, reasoning tokens, cached tokens, tool calls. Tollgate captures that cost automatically from provider responses, joins it with the revenue your pricing model defines, and shows you per-customer, per-agent, per-run gross margin in real time.
|
|
11
|
+
You sell an AI-powered product. Each customer interaction triggers LLM calls that cost you real money — input tokens, output tokens, reasoning tokens, audio tokens, cached tokens, web searches, tool calls. Tollgate captures that cost automatically from provider responses, joins it with the revenue your pricing model defines, and shows you per-customer, per-agent, per-run gross margin in real time.
|
|
12
12
|
|
|
13
13
|
## Installation
|
|
14
14
|
|
|
@@ -34,11 +34,11 @@ const anthropic = wrapAnthropic(new Anthropic(), tollgate, {
|
|
|
34
34
|
runId: 'ticket_8842',
|
|
35
35
|
});
|
|
36
36
|
|
|
37
|
-
// Every call is tracked automatically — tokens, cost, tool calls.
|
|
37
|
+
// Every call is tracked automatically — tokens, cost, latency, tool calls.
|
|
38
38
|
const msg = await anthropic.messages.create({
|
|
39
39
|
model: 'claude-sonnet-4-6',
|
|
40
40
|
max_tokens: 1024,
|
|
41
|
-
messages: [{ role: 'user', content: 'Resolve this billing dispute
|
|
41
|
+
messages: [{ role: 'user', content: 'Resolve this billing dispute...' }],
|
|
42
42
|
});
|
|
43
43
|
|
|
44
44
|
// Close the run and book revenue.
|
|
@@ -52,12 +52,13 @@ await tollgate.resolve({
|
|
|
52
52
|
|
|
53
53
|
## Provider Support
|
|
54
54
|
|
|
55
|
-
| Provider | Wrapper | Streaming |
|
|
55
|
+
| Provider | Wrapper | Streaming | What Gets Extracted |
|
|
56
56
|
|---|---|---|---|
|
|
57
|
-
| Anthropic | `wrapAnthropic` | Automatic |
|
|
58
|
-
| OpenAI | `wrapOpenAI` |
|
|
59
|
-
|
|
|
60
|
-
|
|
|
57
|
+
| **Anthropic** | `wrapAnthropic` | Automatic | Tokens, thinking/reasoning, cache (read + write by TTL), web search requests, tool calls, latency |
|
|
58
|
+
| **OpenAI** | `wrapOpenAI` | `stream_options: { include_usage: true }` | Tokens, reasoning, cached, audio in/out, text in/out, prediction tokens, service tier, tool calls, latency |
|
|
59
|
+
| **Google Gemini** | `wrapGemini` | Automatic | Tokens, thinking, cached, audio/image/video per-modality, web search (grounding), tool calls, latency |
|
|
60
|
+
| **OpenAI-compatible** | `wrapOpenAI` + `provider: 'openai_compatible'` | Same as OpenAI | Same as OpenAI |
|
|
61
|
+
| **AWS Bedrock** | `wrapBedrock` | Automatic | Tokens, cache (read + write), tool calls, latency |
|
|
61
62
|
|
|
62
63
|
## Configuration
|
|
63
64
|
|
|
@@ -81,7 +82,7 @@ const tollgate = createTollgateClient({
|
|
|
81
82
|
|
|
82
83
|
## Auto-Instrumentation
|
|
83
84
|
|
|
84
|
-
Wrap your provider client once. Every `create` call reports usage in the background — non-blocking, fire-and-forget. Failures go to `onError` (default: `console.warn`) and never break your LLM call.
|
|
85
|
+
Wrap your provider client once. Every `create` / `generateContent` call reports usage in the background — non-blocking, fire-and-forget. Failures go to `onError` (default: `console.warn`) and never break your LLM call.
|
|
85
86
|
|
|
86
87
|
### Anthropic
|
|
87
88
|
|
|
@@ -98,7 +99,7 @@ const anthropic = wrapAnthropic(new Anthropic(), tollgate, {
|
|
|
98
99
|
await anthropic.messages.create({
|
|
99
100
|
model: 'claude-sonnet-4-6',
|
|
100
101
|
max_tokens: 512,
|
|
101
|
-
messages: [{ role: 'user', content: 'Summarize this ticket
|
|
102
|
+
messages: [{ role: 'user', content: 'Summarize this ticket...' }],
|
|
102
103
|
});
|
|
103
104
|
```
|
|
104
105
|
|
|
@@ -117,6 +118,23 @@ await openai.chat.completions.create({
|
|
|
117
118
|
});
|
|
118
119
|
```
|
|
119
120
|
|
|
121
|
+
### Google Gemini
|
|
122
|
+
|
|
123
|
+
```ts
|
|
124
|
+
import { GoogleGenerativeAI } from '@google/generative-ai';
|
|
125
|
+
import { createTollgateClient, wrapGemini } from '@tollgateai/sdk';
|
|
126
|
+
|
|
127
|
+
const tollgate = createTollgateClient();
|
|
128
|
+
const genai = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
|
|
129
|
+
const model = wrapGemini(
|
|
130
|
+
genai.getGenerativeModel({ model: 'gemini-2.0-flash' }),
|
|
131
|
+
tollgate,
|
|
132
|
+
{ customerId: 'cust_acme' },
|
|
133
|
+
);
|
|
134
|
+
|
|
135
|
+
const result = await model.generateContent('Explain quantum computing');
|
|
136
|
+
```
|
|
137
|
+
|
|
120
138
|
### OpenAI-Compatible Gateways
|
|
121
139
|
|
|
122
140
|
Point the OpenAI SDK at any compatible endpoint and set `provider: 'openai_compatible'`:
|
|
@@ -159,9 +177,9 @@ await bedrock.send(new ConverseCommand({
|
|
|
159
177
|
|
|
160
178
|
### Streaming
|
|
161
179
|
|
|
162
|
-
Streaming is captured automatically
|
|
180
|
+
Streaming is captured automatically. Iterate the stream as usual — usage and latency are reported when the stream ends.
|
|
163
181
|
|
|
164
|
-
**OpenAI / compatible** requires `stream_options: { include_usage: true }
|
|
182
|
+
**OpenAI / compatible** requires `stream_options: { include_usage: true }`. **Anthropic**, **Gemini**, and **Bedrock** need no extra flags.
|
|
165
183
|
|
|
166
184
|
```ts
|
|
167
185
|
const stream = await openai.chat.completions.create({
|
|
@@ -171,38 +189,45 @@ const stream = await openai.chat.completions.create({
|
|
|
171
189
|
messages: [{ role: 'user', content: 'Hello' }],
|
|
172
190
|
});
|
|
173
191
|
for await (const chunk of stream) { /* render to UI */ }
|
|
174
|
-
// Usage reported automatically when stream ends.
|
|
192
|
+
// Usage + latency reported automatically when stream ends.
|
|
175
193
|
```
|
|
176
194
|
|
|
177
195
|
---
|
|
178
196
|
|
|
179
197
|
## What Gets Tracked
|
|
180
198
|
|
|
181
|
-
Every auto-instrumented call captures
|
|
199
|
+
Every auto-instrumented call captures these fields from the provider response:
|
|
182
200
|
|
|
183
|
-
| Field |
|
|
201
|
+
| Field | Providers | Description |
|
|
184
202
|
|---|---|---|
|
|
185
|
-
| `tokensIn` |
|
|
186
|
-
| `tokensOut` |
|
|
187
|
-
| `reasoningTokens` |
|
|
188
|
-
| `cachedTokens` |
|
|
189
|
-
| `cacheWrite5mTokens` |
|
|
190
|
-
| `cacheWrite1hTokens` |
|
|
191
|
-
| `
|
|
192
|
-
| `
|
|
193
|
-
| `
|
|
194
|
-
| `
|
|
195
|
-
|
|
196
|
-
|
|
203
|
+
| `tokensIn` | All | Input tokens consumed |
|
|
204
|
+
| `tokensOut` | All | Output tokens generated |
|
|
205
|
+
| `reasoningTokens` | OpenAI, Anthropic, Gemini | Reasoning/thinking tokens (billed at reasoning rate) |
|
|
206
|
+
| `cachedTokens` | All | Prompt cache read tokens (reduced rate) |
|
|
207
|
+
| `cacheWrite5mTokens` | Anthropic, Bedrock | 5-min TTL cache creation tokens |
|
|
208
|
+
| `cacheWrite1hTokens` | Anthropic | 1-hour TTL cache creation tokens |
|
|
209
|
+
| `audioTokensIn` | OpenAI, Gemini | Audio input tokens (GPT-4o audio / Realtime; Gemini audio prompts) |
|
|
210
|
+
| `audioTokensOut` | OpenAI, Gemini | Audio output tokens |
|
|
211
|
+
| `imageTokensIn` | Gemini | Image/vision input tokens |
|
|
212
|
+
| `imageTokensOut` | Gemini | Image generation output tokens |
|
|
213
|
+
| `videoTokensIn` | Gemini | Video input tokens |
|
|
214
|
+
| `textTokensIn` | OpenAI, Gemini | Text-only input tokens (modality split) |
|
|
215
|
+
| `textTokensOut` | OpenAI, Gemini | Text-only output tokens |
|
|
216
|
+
| `webSearchRequests` | Anthropic, Gemini | Web search requests (server tools / grounding) |
|
|
217
|
+
| `acceptedPredictionTokens` | OpenAI | Predicted Outputs: accepted tokens |
|
|
218
|
+
| `rejectedPredictionTokens` | OpenAI | Predicted Outputs: rejected tokens (waste) |
|
|
219
|
+
| `serviceTier` | OpenAI | Service tier used (`default`, `flex`, `priority`) |
|
|
220
|
+
| `latencyMs` | All | SDK-measured request duration in milliseconds |
|
|
221
|
+
| `toolCalls` | All | Number of tool calls in the response |
|
|
222
|
+
| `model` | All | Model identifier as reported by the provider |
|
|
223
|
+
|
|
224
|
+
Cost is computed **server-side** from token counts and a rate card that auto-syncs daily from the LiteLLM registry (1,500+ models). Rate cards include per-token pricing for text, audio, image, video, cache, reasoning, and web search. Unknown models are priced at $0 and flagged in logs.
|
|
197
225
|
|
|
198
226
|
---
|
|
199
227
|
|
|
200
228
|
## Outcome-Based Pricing
|
|
201
229
|
|
|
202
|
-
Under per-resolution pricing, only a **resolved** run earns revenue. An escalated or failed run earns $0 but its provider cost still counts.
|
|
203
|
-
|
|
204
|
-
1. **Wrap** to meter cost on every LLM call (automatic).
|
|
205
|
-
2. **Resolve** once at the end to book the outcome.
|
|
230
|
+
Under per-resolution pricing, only a **resolved** run earns revenue. An escalated or failed run earns $0 but its provider cost still counts.
|
|
206
231
|
|
|
207
232
|
```ts
|
|
208
233
|
const runId = 'ticket_8842';
|
|
@@ -211,7 +236,7 @@ const anthropic = wrapAnthropic(new Anthropic(), tollgate, {
|
|
|
211
236
|
runId,
|
|
212
237
|
});
|
|
213
238
|
|
|
214
|
-
//
|
|
239
|
+
// ... multiple LLM calls within this run ...
|
|
215
240
|
|
|
216
241
|
await tollgate.resolve({
|
|
217
242
|
runId,
|
|
@@ -227,12 +252,9 @@ For simple per-call billing, pass `revenueUnitCents` in the wrap options and ski
|
|
|
227
252
|
|
|
228
253
|
## External Tool Costs
|
|
229
254
|
|
|
230
|
-
|
|
255
|
+
Report costs from external services (image generation, code sandboxes, search APIs) alongside LLM calls:
|
|
231
256
|
|
|
232
257
|
```ts
|
|
233
|
-
// Agent generates an image during the run
|
|
234
|
-
const image = await dalle.generate({ prompt: '...' }); // costs $0.04
|
|
235
|
-
|
|
236
258
|
await tollgate.track({
|
|
237
259
|
customerId: 'cust_acme',
|
|
238
260
|
runId: 'ticket_8842',
|
|
@@ -240,35 +262,21 @@ await tollgate.track({
|
|
|
240
262
|
model: 'gpt-4o',
|
|
241
263
|
tokensIn: 500,
|
|
242
264
|
tokensOut: 200,
|
|
243
|
-
toolCalls: 1, // LLM tool_use count (auto-extracted by wrappers)
|
|
244
265
|
externalCostCents: 4.0, // $0.04 for the DALL-E call
|
|
245
266
|
idempotencyKey: 'ticket_8842#step_2',
|
|
246
267
|
});
|
|
247
268
|
```
|
|
248
269
|
|
|
249
|
-
Common examples:
|
|
250
|
-
|
|
251
|
-
| Service | Typical Cost | How to report |
|
|
252
|
-
|---|---|---|
|
|
253
|
-
| DALL-E image generation | ~$0.04/image | `externalCostCents: 4` |
|
|
254
|
-
| E2B code sandbox | ~$0.01/run | `externalCostCents: 1` |
|
|
255
|
-
| Tavily search API | ~$0.01/search | `externalCostCents: 1` |
|
|
256
|
-
| Pinecone vector query | ~$0.001/query | `externalCostCents: 0.1` |
|
|
257
|
-
|
|
258
|
-
External costs flow into the **Tools** bucket in the Customer Drawer's cost-split chart — so you can see exactly what share of each customer's cost comes from external services vs. LLM tokens.
|
|
259
|
-
|
|
260
270
|
---
|
|
261
271
|
|
|
262
272
|
## Customer & Plan Setup
|
|
263
273
|
|
|
264
|
-
Create customers and assign plans
|
|
274
|
+
Create customers and assign plans before sending usage so plan-priced revenue is recognized from the first event. Idempotent.
|
|
265
275
|
|
|
266
276
|
```ts
|
|
267
277
|
await tollgate.upsertCustomer({
|
|
268
278
|
customerId: 'cust_acme',
|
|
269
279
|
name: 'Acme Corp',
|
|
270
|
-
company: 'Acme Corp',
|
|
271
|
-
seats: 5,
|
|
272
280
|
plan: {
|
|
273
281
|
name: 'Pro Plan',
|
|
274
282
|
pricingModel: 'usage_based', // per_unit | per_resolution | usage_based | per_seat | flat | hybrid
|
|
@@ -279,57 +287,26 @@ await tollgate.upsertCustomer({
|
|
|
279
287
|
|
|
280
288
|
---
|
|
281
289
|
|
|
282
|
-
## Manual Tracking
|
|
283
|
-
|
|
284
|
-
For full control, unusual providers, or non-LLM cost events:
|
|
285
|
-
|
|
286
|
-
```ts
|
|
287
|
-
await tollgate.track({
|
|
288
|
-
customerId: 'cust_acme',
|
|
289
|
-
runId: 'run_12345',
|
|
290
|
-
provider: 'anthropic',
|
|
291
|
-
model: 'claude-sonnet-4-6',
|
|
292
|
-
tokensIn: 1200,
|
|
293
|
-
tokensOut: 450,
|
|
294
|
-
reasoningTokens: 0,
|
|
295
|
-
cachedTokens: 0,
|
|
296
|
-
toolCalls: 2,
|
|
297
|
-
revenueUnitCents: 50,
|
|
298
|
-
idempotencyKey: 'run_12345#step_1',
|
|
299
|
-
});
|
|
300
|
-
```
|
|
301
|
-
|
|
302
|
-
### Already have an exact cost?
|
|
303
|
-
|
|
304
|
-
Pass `providerCostCents` (a number or a function of the response) and the server uses it verbatim, skipping the rate card entirely:
|
|
305
|
-
|
|
306
|
-
```ts
|
|
307
|
-
const anthropic = wrapAnthropic(new Anthropic(), tollgate, {
|
|
308
|
-
customerId: 'cust_acme',
|
|
309
|
-
providerCostCents: 3.5, // or: (response) => computeMyOwnCost(response)
|
|
310
|
-
});
|
|
311
|
-
```
|
|
312
|
-
|
|
313
|
-
---
|
|
314
|
-
|
|
315
290
|
## API Reference
|
|
316
291
|
|
|
317
292
|
### Exports
|
|
318
293
|
|
|
319
294
|
```ts
|
|
320
295
|
// Client
|
|
321
|
-
createTollgateClient(options?) //
|
|
296
|
+
createTollgateClient(options?) // -> TollgateClient
|
|
322
297
|
TollgateError // Custom error with status & body
|
|
323
298
|
|
|
324
299
|
// Auto-instrumentation wrappers
|
|
325
|
-
wrapAnthropic(client, tollgate, options)
|
|
326
|
-
wrapOpenAI(client, tollgate, options)
|
|
327
|
-
wrapBedrock(client, tollgate, options)
|
|
300
|
+
wrapAnthropic(client, tollgate, options) // -> instrumented Anthropic client
|
|
301
|
+
wrapOpenAI(client, tollgate, options) // -> instrumented OpenAI / compatible client
|
|
302
|
+
wrapBedrock(client, tollgate, options) // -> instrumented Bedrock Runtime client
|
|
303
|
+
wrapGemini(model, tollgate, options) // -> instrumented Gemini model
|
|
328
304
|
|
|
329
305
|
// Low-level event builders (for manual track payloads)
|
|
330
|
-
anthropicEventFrom(msg, options)
|
|
331
|
-
openAIEventFrom(completion, options)
|
|
332
|
-
bedrockEventFrom(usage, model, options)
|
|
306
|
+
anthropicEventFrom(msg, options) // -> TrackEventInput | null
|
|
307
|
+
openAIEventFrom(completion, options) // -> TrackEventInput | null
|
|
308
|
+
bedrockEventFrom(usage, model, options) // -> TrackEventInput | null
|
|
309
|
+
geminiEventFrom(response, options) // -> TrackEventInput | null
|
|
333
310
|
```
|
|
334
311
|
|
|
335
312
|
### TollgateClient
|
|
@@ -344,32 +321,54 @@ bedrockEventFrom(usage, model, options) // → TrackEventInput | null
|
|
|
344
321
|
|
|
345
322
|
| Field | Type | Required | Description |
|
|
346
323
|
|---|---|---|---|
|
|
347
|
-
| `customerId` | `string` | Yes | Your end customer's stable identifier
|
|
348
|
-
| `agentId` | `string` | No | Agent or workflow identifier
|
|
349
|
-
| `runId` | `string \| () => string` | No | Logical run ID
|
|
350
|
-
| `provider` | `Provider` | No | Override the reported provider
|
|
351
|
-
| `revenueUnitCents` | `number \| (response) => number` | No | Revenue per call in cents
|
|
352
|
-
| `providerCostCents` | `number \| (response) => number` | No | Exact cost override
|
|
353
|
-
| `onError` | `(err) => void` | No | Error handler for background tracking
|
|
324
|
+
| `customerId` | `string` | Yes | Your end customer's stable identifier |
|
|
325
|
+
| `agentId` | `string` | No | Agent or workflow identifier |
|
|
326
|
+
| `runId` | `string \| () => string` | No | Logical run ID (defaults to provider response ID) |
|
|
327
|
+
| `provider` | `Provider` | No | Override the reported provider |
|
|
328
|
+
| `revenueUnitCents` | `number \| (response) => number` | No | Revenue per call in cents |
|
|
329
|
+
| `providerCostCents` | `number \| (response) => number` | No | Exact cost override (skips rate card) |
|
|
330
|
+
| `onError` | `(err) => void` | No | Error handler for background tracking |
|
|
354
331
|
|
|
355
332
|
---
|
|
356
333
|
|
|
357
334
|
## How It Works
|
|
358
335
|
|
|
359
|
-
1. **Proxy wrappers** intercept
|
|
360
|
-
2. After the provider responds, the wrapper extracts token counts, tool
|
|
361
|
-
3. A `POST /api/track`
|
|
362
|
-
4. The server computes cost from tokens via rate cards, joins it with
|
|
363
|
-
5. Events are **idempotent** on `idempotencyKey` (auto-set to the provider response ID)
|
|
336
|
+
1. **Proxy wrappers** intercept provider calls without modifying the request or response.
|
|
337
|
+
2. After the provider responds, the wrapper extracts token counts (by modality), tool calls, service tier, and latency from the response.
|
|
338
|
+
3. A `POST /api/track` fires **in the background** with automatic retries on transient failures.
|
|
339
|
+
4. The server computes cost from tokens via rate cards (text, audio, image, video, cache, reasoning, web search), joins it with plan-configured revenue, and updates real-time margin rollups.
|
|
340
|
+
5. Events are **idempotent** on `idempotencyKey` (auto-set to the provider response ID).
|
|
364
341
|
|
|
365
342
|
## Privacy & Security
|
|
366
343
|
|
|
367
344
|
- **No prompt content is ever sent.** Only token counts, model name, and metadata.
|
|
368
|
-
- Events are deduplicated server-side
|
|
345
|
+
- Events are deduplicated server-side.
|
|
369
346
|
- Background tracking never throws into your application code.
|
|
370
347
|
|
|
371
348
|
---
|
|
372
349
|
|
|
350
|
+
## What's New in v0.6.0
|
|
351
|
+
|
|
352
|
+
- **Fix: Anthropic thinking token extraction** — `output_tokens_details.thinking_tokens` is now extracted and costed at the reasoning rate instead of the output rate. Previously, thinking tokens from extended thinking (Sonnet 4.x, Opus 4.x) were invisible to cost computation.
|
|
353
|
+
- **Fix: OpenAI double-counting** — `completion_tokens` includes reasoning and audio sub-totals; these are now subtracted from `tokensOut` so each token is costed at exactly one rate. Previously, reasoning tokens were billed at both the output rate and the reasoning rate.
|
|
354
|
+
- **Fix: OpenAI input double-counting** — `prompt_tokens` includes cached and audio sub-totals; these are now subtracted from `tokensIn`. Previously, cached tokens were billed at both the full input rate and the cached rate.
|
|
355
|
+
- **Fix: Multimodal-only events** — audio, image, video, and web search events now trigger rate-card lookup even when text token counts are zero.
|
|
356
|
+
- `reasoningTokens` is now extracted from **all three** providers: OpenAI, Anthropic, and Gemini.
|
|
357
|
+
|
|
358
|
+
### v0.5.0
|
|
359
|
+
|
|
360
|
+
- Google Gemini / Vertex AI support (`wrapGemini`) with full multimodal extraction
|
|
361
|
+
- Audio token tracking (OpenAI GPT-4o audio / Realtime API)
|
|
362
|
+
- Image & video token tracking (Gemini per-modality breakdowns)
|
|
363
|
+
- Web search request tracking (Anthropic `server_tool_use`, Gemini grounding)
|
|
364
|
+
- Latency measurement on all wrappers (SDK-measured `latencyMs`)
|
|
365
|
+
- OpenAI Predicted Outputs (`acceptedPredictionTokens` / `rejectedPredictionTokens`)
|
|
366
|
+
- Service tier tracking (OpenAI `flex` / `priority`, Anthropic `priority`)
|
|
367
|
+
- Text modality split for accurate cost attribution in mixed-modal requests
|
|
368
|
+
- Expanded rate card sync: audio, image, video, and web search rates from LiteLLM
|
|
369
|
+
|
|
370
|
+
---
|
|
371
|
+
|
|
373
372
|
## License
|
|
374
373
|
|
|
375
374
|
Released under the MIT License — see [LICENSE](./LICENSE).
|
package/dist/index.cjs
CHANGED
|
@@ -162,6 +162,7 @@ function anthropicEventFrom(msg, opts) {
|
|
|
162
162
|
const oneh = usage.cache_creation?.ephemeral_1h_input_tokens;
|
|
163
163
|
const hasSplit = fivem !== void 0 || oneh !== void 0;
|
|
164
164
|
const toolCalls = Array.isArray(msg.content) ? msg.content.filter((b) => b.type === "tool_use").length : 0;
|
|
165
|
+
const thinkingTokens = usage.output_tokens_details?.thinking_tokens ?? 0;
|
|
165
166
|
const event = {
|
|
166
167
|
customerId: opts.customerId,
|
|
167
168
|
agentId: opts.agentId,
|
|
@@ -169,10 +170,12 @@ function anthropicEventFrom(msg, opts) {
|
|
|
169
170
|
provider: opts.provider ?? "anthropic",
|
|
170
171
|
model: msg.model ?? "unknown",
|
|
171
172
|
tokensIn: usage.input_tokens ?? 0,
|
|
172
|
-
tokensOut: usage.output_tokens ?? 0,
|
|
173
|
+
tokensOut: (usage.output_tokens ?? 0) - thinkingTokens,
|
|
174
|
+
reasoningTokens: thinkingTokens,
|
|
173
175
|
cachedTokens: usage.cache_read_input_tokens ?? 0,
|
|
174
176
|
cacheWrite5mTokens: hasSplit ? fivem ?? 0 : usage.cache_creation_input_tokens ?? 0,
|
|
175
177
|
cacheWrite1hTokens: hasSplit ? oneh ?? 0 : 0,
|
|
178
|
+
webSearchRequests: usage.server_tool_use?.web_search_requests ?? 0,
|
|
176
179
|
toolCalls,
|
|
177
180
|
revenueUnitCents: resolveRevenue(opts, msg),
|
|
178
181
|
idempotencyKey: msg.id ?? `${runId}#${randomId()}`
|
|
@@ -183,6 +186,7 @@ function wrapAnthropic(client, tollgate, opts) {
|
|
|
183
186
|
const messages = client.messages;
|
|
184
187
|
const original = messages.create.bind(messages);
|
|
185
188
|
const create = async (...args) => {
|
|
189
|
+
const t0 = Date.now();
|
|
186
190
|
const result = await original(...args);
|
|
187
191
|
if (isAsyncIterable(result)) {
|
|
188
192
|
const msg = {};
|
|
@@ -195,7 +199,11 @@ function wrapAnthropic(client, tollgate, opts) {
|
|
|
195
199
|
msg.model = ev.message.model;
|
|
196
200
|
msg.usage = { ...ev.message.usage };
|
|
197
201
|
} else if (ev.type === "message_delta" && ev.usage) {
|
|
198
|
-
msg.usage = {
|
|
202
|
+
msg.usage = {
|
|
203
|
+
...msg.usage ?? {},
|
|
204
|
+
output_tokens: ev.usage.output_tokens,
|
|
205
|
+
output_tokens_details: ev.usage.output_tokens_details
|
|
206
|
+
};
|
|
199
207
|
} else if (ev.type === "content_block_start" && ev.content_block?.type === "tool_use") {
|
|
200
208
|
toolUseBlocks.push(ev.content_block);
|
|
201
209
|
}
|
|
@@ -203,12 +211,18 @@ function wrapAnthropic(client, tollgate, opts) {
|
|
|
203
211
|
() => {
|
|
204
212
|
msg.content = toolUseBlocks;
|
|
205
213
|
const event2 = anthropicEventFrom(msg, opts);
|
|
206
|
-
if (event2)
|
|
214
|
+
if (event2) {
|
|
215
|
+
event2.latencyMs = Date.now() - t0;
|
|
216
|
+
fireAndForget(tollgate.track(event2), opts.onError);
|
|
217
|
+
}
|
|
207
218
|
}
|
|
208
219
|
);
|
|
209
220
|
}
|
|
210
221
|
const event = anthropicEventFrom(result, opts);
|
|
211
|
-
if (event)
|
|
222
|
+
if (event) {
|
|
223
|
+
event.latencyMs = Date.now() - t0;
|
|
224
|
+
fireAndForget(tollgate.track(event), opts.onError);
|
|
225
|
+
}
|
|
212
226
|
return result;
|
|
213
227
|
};
|
|
214
228
|
return new Proxy(client, {
|
|
@@ -227,16 +241,29 @@ function openAIEventFrom(completion, opts) {
|
|
|
227
241
|
if (!usage) return null;
|
|
228
242
|
const runId = resolveRunId(opts, completion.id);
|
|
229
243
|
const toolCalls = completion.choices?.[0]?.message?.tool_calls?.length ?? 0;
|
|
244
|
+
const ptd = usage.prompt_tokens_details;
|
|
245
|
+
const ctd = usage.completion_tokens_details;
|
|
246
|
+
const cachedIn = ptd?.cached_tokens ?? 0;
|
|
247
|
+
const audioIn = ptd?.audio_tokens ?? 0;
|
|
248
|
+
const reasoningOut = ctd?.reasoning_tokens ?? 0;
|
|
249
|
+
const audioOut = ctd?.audio_tokens ?? 0;
|
|
230
250
|
const event = {
|
|
231
251
|
customerId: opts.customerId,
|
|
232
252
|
agentId: opts.agentId,
|
|
233
253
|
runId,
|
|
234
254
|
provider: opts.provider ?? "openai",
|
|
235
255
|
model: completion.model ?? "unknown",
|
|
236
|
-
tokensIn: usage.prompt_tokens ?? 0,
|
|
237
|
-
tokensOut: usage.completion_tokens ?? 0,
|
|
238
|
-
reasoningTokens:
|
|
239
|
-
cachedTokens:
|
|
256
|
+
tokensIn: (usage.prompt_tokens ?? 0) - cachedIn - audioIn,
|
|
257
|
+
tokensOut: (usage.completion_tokens ?? 0) - reasoningOut - audioOut,
|
|
258
|
+
reasoningTokens: reasoningOut,
|
|
259
|
+
cachedTokens: cachedIn,
|
|
260
|
+
audioTokensIn: audioIn,
|
|
261
|
+
audioTokensOut: audioOut,
|
|
262
|
+
textTokensIn: ptd?.text_tokens ?? 0,
|
|
263
|
+
textTokensOut: ctd?.text_tokens ?? 0,
|
|
264
|
+
acceptedPredictionTokens: ctd?.accepted_prediction_tokens ?? 0,
|
|
265
|
+
rejectedPredictionTokens: ctd?.rejected_prediction_tokens ?? 0,
|
|
266
|
+
serviceTier: completion.service_tier,
|
|
240
267
|
toolCalls,
|
|
241
268
|
revenueUnitCents: resolveRevenue(opts, completion),
|
|
242
269
|
idempotencyKey: completion.id ?? `${runId}#${randomId()}`
|
|
@@ -247,11 +274,13 @@ function wrapOpenAI(client, tollgate, opts) {
|
|
|
247
274
|
const completions = client.chat.completions;
|
|
248
275
|
const original = completions.create.bind(completions);
|
|
249
276
|
const create = async (...args) => {
|
|
277
|
+
const t0 = Date.now();
|
|
250
278
|
const result = await original(...args);
|
|
251
279
|
if (isAsyncIterable(result)) {
|
|
252
280
|
let id;
|
|
253
281
|
let model;
|
|
254
282
|
let usage;
|
|
283
|
+
let serviceTier;
|
|
255
284
|
const toolCallIndices = /* @__PURE__ */ new Set();
|
|
256
285
|
return instrumentStream(
|
|
257
286
|
result,
|
|
@@ -259,6 +288,7 @@ function wrapOpenAI(client, tollgate, opts) {
|
|
|
259
288
|
if (chunk.id) id = chunk.id;
|
|
260
289
|
if (chunk.model) model = chunk.model;
|
|
261
290
|
if (chunk.usage) usage = chunk.usage;
|
|
291
|
+
if (chunk.service_tier) serviceTier = chunk.service_tier;
|
|
262
292
|
for (const c of chunk.choices ?? []) {
|
|
263
293
|
for (const tc of c.delta?.tool_calls ?? []) {
|
|
264
294
|
if (tc.index !== void 0) toolCallIndices.add(tc.index);
|
|
@@ -267,17 +297,23 @@ function wrapOpenAI(client, tollgate, opts) {
|
|
|
267
297
|
},
|
|
268
298
|
() => {
|
|
269
299
|
if (!usage) return;
|
|
270
|
-
const synth = { id, model, usage };
|
|
300
|
+
const synth = { id, model, usage, service_tier: serviceTier };
|
|
271
301
|
if (toolCallIndices.size > 0) {
|
|
272
302
|
synth.choices = [{ message: { tool_calls: new Array(toolCallIndices.size) } }];
|
|
273
303
|
}
|
|
274
304
|
const event2 = openAIEventFrom(synth, opts);
|
|
275
|
-
if (event2)
|
|
305
|
+
if (event2) {
|
|
306
|
+
event2.latencyMs = Date.now() - t0;
|
|
307
|
+
fireAndForget(tollgate.track(event2), opts.onError);
|
|
308
|
+
}
|
|
276
309
|
}
|
|
277
310
|
);
|
|
278
311
|
}
|
|
279
312
|
const event = openAIEventFrom(result, opts);
|
|
280
|
-
if (event)
|
|
313
|
+
if (event) {
|
|
314
|
+
event.latencyMs = Date.now() - t0;
|
|
315
|
+
fireAndForget(tollgate.track(event), opts.onError);
|
|
316
|
+
}
|
|
281
317
|
return result;
|
|
282
318
|
};
|
|
283
319
|
return new Proxy(client, {
|
|
@@ -316,6 +352,7 @@ function bedrockEventFrom(usage, model, opts, response = void 0, toolCalls = 0)
|
|
|
316
352
|
function wrapBedrock(client, tollgate, opts) {
|
|
317
353
|
const originalSend = client.send.bind(client);
|
|
318
354
|
const send = async (command, ...rest) => {
|
|
355
|
+
const t0 = Date.now();
|
|
319
356
|
const result = await originalSend(command, ...rest);
|
|
320
357
|
const model = command?.input?.modelId ?? "unknown";
|
|
321
358
|
if (result?.stream && isAsyncIterable(result.stream)) {
|
|
@@ -329,7 +366,10 @@ function wrapBedrock(client, tollgate, opts) {
|
|
|
329
366
|
},
|
|
330
367
|
() => {
|
|
331
368
|
const event = bedrockEventFrom(usage, model, opts, result, streamToolCalls);
|
|
332
|
-
if (event)
|
|
369
|
+
if (event) {
|
|
370
|
+
event.latencyMs = Date.now() - t0;
|
|
371
|
+
fireAndForget(tollgate.track(event), opts.onError);
|
|
372
|
+
}
|
|
333
373
|
}
|
|
334
374
|
);
|
|
335
375
|
return result;
|
|
@@ -337,7 +377,10 @@ function wrapBedrock(client, tollgate, opts) {
|
|
|
337
377
|
if (result?.usage) {
|
|
338
378
|
const tc = result.output?.message?.content?.filter((b) => b.toolUse != null).length ?? 0;
|
|
339
379
|
const event = bedrockEventFrom(result.usage, model, opts, result, tc);
|
|
340
|
-
if (event)
|
|
380
|
+
if (event) {
|
|
381
|
+
event.latencyMs = Date.now() - t0;
|
|
382
|
+
fireAndForget(tollgate.track(event), opts.onError);
|
|
383
|
+
}
|
|
341
384
|
}
|
|
342
385
|
return result;
|
|
343
386
|
};
|
|
@@ -348,14 +391,110 @@ function wrapBedrock(client, tollgate, opts) {
|
|
|
348
391
|
}
|
|
349
392
|
});
|
|
350
393
|
}
|
|
394
|
+
/**
 * Sum the token counts reported for a single modality in a per-modality
 * breakdown list (entries shaped like `{ modality, tokenCount }`).
 *
 * @param {Array<{modality?: string, tokenCount?: number}>|null|undefined} details
 *   Per-modality breakdown; anything that is not an array is treated as
 *   "no breakdown available".
 * @param {string} modality - Modality label to match exactly (callers in this
 *   file pass "TEXT", "AUDIO", "IMAGE" or "VIDEO").
 * @returns {number} Total `tokenCount` across matching entries; 0 when there
 *   is no breakdown or no matching entry.
 */
function modalityTokens(details, modality) {
  // Never throw on an unexpected shape: this runs while building a tracking
  // event, and a TypeError here would surface in the caller's LLM call.
  if (!Array.isArray(details)) return 0;
  let total = 0;
  for (const entry of details) {
    // A matching entry without a count contributes nothing.
    if (entry?.modality === modality) total += entry.tokenCount ?? 0;
  }
  return total;
}
|
|
398
|
+
/**
 * Build a Tollgate track payload from a Gemini `generateContent` response.
 *
 * @param {object} response - Gemini response object; must carry
 *   `usageMetadata`, otherwise no event is produced. `candidates`,
 *   `responseId` and `modelVersion` are read when present.
 * @param {object} opts - Wrap options (`customerId`, `agentId`, `provider`,
 *   plus whatever `resolveRunId`, `resolveRevenue` and `withCost` consume).
 * @returns {object|null} The event as returned by `withCost`, or `null` when
 *   the response has no usage metadata.
 */
function geminiEventFrom(response, opts) {
  const usage = response?.usageMetadata;
  if (!usage) return null;

  // Use the provider response id (when the API returns one) the same way the
  // Anthropic/OpenAI builders use `msg.id` / `completion.id`: as the default
  // run id and as a stable idempotency key, so a retried event is deduplicated.
  const responseId = response.responseId;
  const runId = resolveRunId(opts, responseId);

  // Count function calls and grounding (web search) queries across candidates.
  const candidates = response.candidates ?? [];
  let toolCalls = 0;
  let webSearchRequests = 0;
  for (const candidate of candidates) {
    const parts = candidate.content?.parts ?? [];
    toolCalls += parts.filter((p) => p.functionCall != null).length;
    webSearchRequests += candidate.groundingMetadata?.webSearchQueries?.length ?? 0;
  }

  const promptDetails = usage.promptTokensDetails;
  const candidateDetails = usage.candidatesTokensDetails;

  // Per the Gemini API reference, `promptTokenCount` already includes the
  // cached-content tokens. Report them only as `cachedTokens` so they are not
  // also counted in `tokensIn` (same rule the OpenAI builder applies).
  // NOTE(review): non-text modality tokens are still included in
  // tokensIn/tokensOut here — confirm how the server prices those for Gemini.
  const cachedIn = usage.cachedContentTokenCount ?? 0;
  const tokensIn = Math.max(0, (usage.promptTokenCount ?? 0) - cachedIn);

  const event = {
    customerId: opts.customerId,
    agentId: opts.agentId,
    runId,
    provider: opts.provider ?? "google",
    // Prefer the model the API says it served; callers (e.g. wrapGemini) may
    // still overwrite this with the configured model name.
    model: response.modelVersion ?? "unknown",
    tokensIn,
    tokensOut: usage.candidatesTokenCount ?? 0,
    reasoningTokens: usage.thoughtsTokenCount ?? 0,
    cachedTokens: cachedIn,
    audioTokensIn: modalityTokens(promptDetails, "AUDIO"),
    audioTokensOut: modalityTokens(candidateDetails, "AUDIO"),
    imageTokensIn: modalityTokens(promptDetails, "IMAGE"),
    imageTokensOut: modalityTokens(candidateDetails, "IMAGE"),
    videoTokensIn: modalityTokens(promptDetails, "VIDEO"),
    textTokensIn: modalityTokens(promptDetails, "TEXT"),
    textTokensOut: modalityTokens(candidateDetails, "TEXT"),
    webSearchRequests,
    toolCalls,
    revenueUnitCents: resolveRevenue(opts, response),
    // Fall back to a random suffix only when the response carries no id.
    idempotencyKey: responseId ?? `${runId}#${randomId()}`
  };
  return withCost(event, opts, response);
}
|
|
436
|
+
/**
 * Instrument a Gemini model object so every `generateContent` (and, when the
 * model exposes it, `generateContentStream`) call reports usage to Tollgate
 * in the background.
 *
 * @param {object} model - Object exposing `generateContent` and optionally
 *   `generateContentStream` and a `model` name property.
 * @param {object} tollgate - Tollgate client; only `track(event)` is used.
 * @param {object} opts - Wrap options forwarded to `geminiEventFrom`;
 *   `opts.onError` receives tracking failures.
 * @returns {object} A Proxy over `model` that behaves identically except for
 *   the instrumented methods.
 */
function wrapGemini(model, tollgate, opts) {
  const original = model.generateContent.bind(model);
  const originalStream = typeof model.generateContentStream === "function" ? model.generateContentStream.bind(model) : void 0;
  const modelName = model.model ?? "unknown";

  // Build and send one event. Tracking must never break the caller's LLM
  // call, so synchronous failures are routed to the error handler as well.
  const report = (payload, t0) => {
    try {
      const event = geminiEventFrom(payload, opts);
      if (event) {
        event.model = modelName;
        event.latencyMs = Date.now() - t0;
        fireAndForget(tollgate.track(event), opts.onError);
      }
    } catch (err) {
      (opts.onError ?? console.warn)(err);
    }
  };

  // Wrap an async-iterable of response chunks: merge usage metadata and count
  // tool calls / search queries per chunk, then report once the stream ends.
  const instrument = (stream, t0) => {
    const accumulated = {};
    let toolCallCount = 0;
    let searchCount = 0;
    return instrumentStream(
      stream,
      (chunk) => {
        if (chunk.usageMetadata) {
          Object.assign(accumulated, chunk.usageMetadata);
        }
        for (const c of chunk.candidates ?? []) {
          for (const p of c.content?.parts ?? []) {
            if (p.functionCall != null) toolCallCount++;
          }
          searchCount += c.groundingMetadata?.webSearchQueries?.length ?? 0;
        }
      },
      () => {
        // Synthesize a response-shaped object so geminiEventFrom can derive
        // the same counts it would from a non-streaming response.
        const synth = {
          usageMetadata: accumulated,
          candidates: searchCount > 0 || toolCallCount > 0 ? [{ content: { parts: new Array(toolCallCount).fill({ functionCall: {} }) }, groundingMetadata: { webSearchQueries: new Array(searchCount) } }] : []
        };
        report(synth, t0);
      }
    );
  };

  const generateContent = async (...args) => {
    const t0 = Date.now();
    const result = await original(...args);
    if (isAsyncIterable(result)) {
      return instrument(result, t0);
    }
    // `@google/generative-ai` resolves to `{ response }`; other clients return
    // the response itself. Use whichever object carries the usage metadata.
    const payload = result?.usageMetadata ? result : (result?.response ?? result);
    report(payload, t0);
    return result;
  };

  const generateContentStream = async (...args) => {
    const t0 = Date.now();
    const result = await originalStream(...args);
    if (isAsyncIterable(result)) {
      return instrument(result, t0);
    }
    // `@google/generative-ai` resolves to `{ stream, response }`; swap in the
    // instrumented stream and leave the rest of the result untouched.
    if (result?.stream && isAsyncIterable(result.stream)) {
      result.stream = instrument(result.stream, t0);
    }
    return result;
  };

  return new Proxy(model, {
    get(target, prop, recv) {
      if (prop === "generateContent") return generateContent;
      if (prop === "generateContentStream" && originalStream) return generateContentStream;
      return Reflect.get(target, prop, recv);
    }
  });
}
|
|
351
488
|
|
|
352
489
|
exports.TollgateError = TollgateError;
|
|
353
490
|
exports.anthropicEventFrom = anthropicEventFrom;
|
|
354
491
|
exports.bedrockEventFrom = bedrockEventFrom;
|
|
355
492
|
exports.createTollgateClient = createTollgateClient;
|
|
493
|
+
exports.geminiEventFrom = geminiEventFrom;
|
|
356
494
|
exports.openAIEventFrom = openAIEventFrom;
|
|
357
495
|
exports.wrapAnthropic = wrapAnthropic;
|
|
358
496
|
exports.wrapBedrock = wrapBedrock;
|
|
497
|
+
exports.wrapGemini = wrapGemini;
|
|
359
498
|
exports.wrapOpenAI = wrapOpenAI;
|
|
360
499
|
//# sourceMappingURL=index.cjs.map
|
|
361
500
|
//# sourceMappingURL=index.cjs.map
|